feat(embedding): add pdfplumber backend for multi-column PDF extraction

EU Official Journal PDFs (AI Act, CRA, NIS2, DSGVO, etc.) use multi-column layouts that pypdf breaks into fragmented words ("Ar tik el" instead of "Artikel"). pdfplumber handles these correctly. Backend priority: unstructured > pdfplumber > pypdf (auto mode). Also increases D5 re-ingestion timeout to 3600s for large PDFs. 58 embedding-service tests passing. pdfplumber: MIT license. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 15:42:25 +02:00
parent a459636bc4
commit 75dda9ac92
3 changed files with 42 additions and 3 deletions
@@ -750,6 +750,12 @@ def detect_pdf_backends() -> List[str]:
    except ImportError:
        pass

+    try:
+        import pdfplumber  # noqa: F401
+        available.append("pdfplumber")
+    except ImportError:
+        pass
+
    try:
        from pypdf import PdfReader  # noqa: F401
        available.append("pypdf")
@@ -815,8 +821,32 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
            pass


+def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
+    """Extract text from a PDF with pdfplumber (intended for multi-column EU regulation PDFs).
+
+    Every page is run through ``extract_text`` with ``x_tolerance=2`` and
+    ``y_tolerance=3``. Pages that yield no text are left out of the output,
+    and the remaining page texts are joined with a blank line between them.
+
+    Args:
+        pdf_content: Raw bytes of the PDF document.
+
+    Returns:
+        An ExtractPDFResponse whose ``pages`` is the total number of pages in
+        the document (including pages without text) and whose ``table_count``
+        is always 0, since this backend does not extract tables.
+    """
+    import io
+    import pdfplumber

+    with pdfplumber.open(io.BytesIO(pdf_content)) as document:
+        all_pages = document.pages
+        total_pages = len(all_pages)
+        # Extraction must happen while the document is still open.
+        raw_chunks = [
+            current.extract_text(x_tolerance=2, y_tolerance=3)
+            for current in all_pages
+        ]

+    # Drop pages where extract_text returned nothing (None or "").
+    page_texts = [chunk for chunk in raw_chunks if chunk]
+    combined_text = "\n\n".join(page_texts)

+    return ExtractPDFResponse(
+        text=combined_text,
+        backend_used="pdfplumber",
+        pages=total_pages,
+        table_count=0,
+    )
+
+
 def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
-    """Extract PDF using pypdf."""
+    """Extract PDF using pypdf (fallback)."""
    import io
    from pypdf import PdfReader

@@ -1039,11 +1069,19 @@ async def extract_pdf(file: UploadFile = File(...)):

    backend = config.PDF_EXTRACTION_BACKEND
    if backend == "auto":
-        backend = "unstructured" if "unstructured" in available else "pypdf"
+        # Prefer: unstructured > pdfplumber > pypdf
+        if "unstructured" in available:
+            backend = "unstructured"
+        elif "pdfplumber" in available:
+            backend = "pdfplumber"
+        else:
+            backend = "pypdf"

    try:
        if backend == "unstructured" and "unstructured" in available:
            return extract_pdf_unstructured(pdf_content)
+        elif backend == "pdfplumber" and "pdfplumber" in available:
+            return extract_pdf_pdfplumber(pdf_content)
        elif "pypdf" in available:
            return extract_pdf_pypdf(pdf_content)
        else: