feat(embedding): add pdfplumber backend for multi-column PDF extraction

EU Official Journal PDFs (AI Act, CRA, NIS2, DSGVO, etc.) use
multi-column layouts that pypdf breaks into fragmented words
("Ar tik el" instead of "Artikel"). pdfplumber handles these correctly.

Backend priority: unstructured > pdfplumber > pypdf (auto mode).
Also increases D5 re-ingestion timeout to 3600s for large PDFs.

58 embedding-service tests passing. pdfplumber: MIT license.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 15:42:25 +02:00
parent a459636bc4
commit 75dda9ac92
3 changed files with 42 additions and 3 deletions
+1 -1
View File
@@ -44,7 +44,7 @@ from reingest_d5_config import (
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("d5-reingest") logger = logging.getLogger("d5-reingest")
UPLOAD_TIMEOUT = httpx.Timeout(timeout=1800.0, connect=30.0) UPLOAD_TIMEOUT = httpx.Timeout(timeout=3600.0, connect=30.0)
SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0) SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0)
+40 -2
View File
@@ -750,6 +750,12 @@ def detect_pdf_backends() -> List[str]:
except ImportError: except ImportError:
pass pass
try:
import pdfplumber # noqa: F401
available.append("pdfplumber")
except ImportError:
pass
try: try:
from pypdf import PdfReader # noqa: F401 from pypdf import PdfReader # noqa: F401
available.append("pypdf") available.append("pypdf")
@@ -815,8 +821,32 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
pass pass
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
    """Extract text from a PDF via pdfplumber.

    Chosen for multi-column layouts (EU regulation PDFs) where simpler
    backends fragment words across columns. Pages yielding no text are
    skipped; remaining page texts are joined with blank lines.
    """
    # Local imports keep pdfplumber an optional backend dependency.
    import io
    import pdfplumber

    collected = []
    with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
        num_pages = len(pdf.pages)
        for page in pdf.pages:
            # Tight x/y tolerances reduce spurious intra-word splits
            # in multi-column layouts.
            content = page.extract_text(x_tolerance=2, y_tolerance=3)
            if content:
                collected.append(content)

    return ExtractPDFResponse(
        text="\n\n".join(collected),
        backend_used="pdfplumber",
        pages=num_pages,
        table_count=0,
    )
def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse: def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
"""Extract PDF using pypdf.""" """Extract PDF using pypdf (fallback)."""
import io import io
from pypdf import PdfReader from pypdf import PdfReader
@@ -1039,11 +1069,19 @@ async def extract_pdf(file: UploadFile = File(...)):
backend = config.PDF_EXTRACTION_BACKEND backend = config.PDF_EXTRACTION_BACKEND
if backend == "auto": if backend == "auto":
backend = "unstructured" if "unstructured" in available else "pypdf" # Prefer: unstructured > pdfplumber > pypdf
if "unstructured" in available:
backend = "unstructured"
elif "pdfplumber" in available:
backend = "pdfplumber"
else:
backend = "pypdf"
try: try:
if backend == "unstructured" and "unstructured" in available: if backend == "unstructured" and "unstructured" in available:
return extract_pdf_unstructured(pdf_content) return extract_pdf_unstructured(pdf_content)
elif backend == "pdfplumber" and "pdfplumber" in available:
return extract_pdf_pdfplumber(pdf_content)
elif "pypdf" in available: elif "pypdf" in available:
return extract_pdf_pypdf(pdf_content) return extract_pdf_pypdf(pdf_content)
else: else:
+1
View File
@@ -14,6 +14,7 @@ sentence-transformers>=2.2.0
# PDF Extraction # PDF Extraction
unstructured>=0.12.0 unstructured>=0.12.0
pypdf>=4.0.0 pypdf>=4.0.0
pdfplumber>=0.11.0
python-magic>=0.4.27 python-magic>=0.4.27
# HTTP Client (for OpenAI/Cohere API calls) # HTTP Client (for OpenAI/Cohere API calls)