feat(embedding): add pdfplumber backend for multi-column PDF extraction

EU Official Journal PDFs (AI Act, CRA, NIS2, DSGVO, etc.) use
multi-column layouts that pypdf breaks into fragmented words
("Ar tik el" instead of "Artikel"). pdfplumber handles these correctly.

Backend priority: unstructured > pdfplumber > pypdf (auto mode).
Also increases D5 re-ingestion timeout to 3600s for large PDFs.

58 embedding-service tests passing. pdfplumber: MIT license.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 15:42:25 +02:00
parent a459636bc4
commit 75dda9ac92
3 changed files with 42 additions and 3 deletions
+1 -1
View File
@@ -44,7 +44,7 @@ from reingest_d5_config import (
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("d5-reingest") logger = logging.getLogger("d5-reingest")
UPLOAD_TIMEOUT = httpx.Timeout(timeout=1800.0, connect=30.0) UPLOAD_TIMEOUT = httpx.Timeout(timeout=3600.0, connect=30.0)
SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0) SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0)
+40 -2
View File
@@ -750,6 +750,12 @@ def detect_pdf_backends() -> List[str]:
except ImportError: except ImportError:
pass pass
try:
import pdfplumber # noqa: F401
available.append("pdfplumber")
except ImportError:
pass
try: try:
from pypdf import PdfReader # noqa: F401 from pypdf import PdfReader # noqa: F401
available.append("pypdf") available.append("pypdf")
@@ -815,8 +821,32 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
pass pass
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
    """Extract text from a PDF via pdfplumber.

    Chosen for multi-column layouts (EU regulation PDFs) where simpler
    backends fragment words across columns. Pages yielding no text are
    skipped; remaining page texts are joined with blank lines.
    """
    # Local imports keep pdfplumber an optional backend dependency.
    import io
    import pdfplumber

    collected = []
    with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
        num_pages = len(pdf.pages)
        for page in pdf.pages:
            # Tight x/y tolerances reduce spurious intra-word splits
            # in multi-column layouts.
            content = page.extract_text(x_tolerance=2, y_tolerance=3)
            if content:
                collected.append(content)

    return ExtractPDFResponse(
        text="\n\n".join(collected),
        backend_used="pdfplumber",
        pages=num_pages,
        table_count=0,
    )
def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse: def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
"""Extract PDF using pypdf.""" """Extract PDF using pypdf (fallback)."""
import io import io
from pypdf import PdfReader from pypdf import PdfReader
@@ -1039,11 +1069,19 @@ async def extract_pdf(file: UploadFile = File(...)):
backend = config.PDF_EXTRACTION_BACKEND backend = config.PDF_EXTRACTION_BACKEND
if backend == "auto": if backend == "auto":
backend = "unstructured" if "unstructured" in available else "pypdf" # Prefer: unstructured > pdfplumber > pypdf
if "unstructured" in available:
backend = "unstructured"
elif "pdfplumber" in available:
backend = "pdfplumber"
else:
backend = "pypdf"
try: try:
if backend == "unstructured" and "unstructured" in available: if backend == "unstructured" and "unstructured" in available:
return extract_pdf_unstructured(pdf_content) return extract_pdf_unstructured(pdf_content)
elif backend == "pdfplumber" and "pdfplumber" in available:
return extract_pdf_pdfplumber(pdf_content)
elif "pypdf" in available: elif "pypdf" in available:
return extract_pdf_pypdf(pdf_content) return extract_pdf_pypdf(pdf_content)
else: else:
+1
View File
@@ -14,6 +14,7 @@ sentence-transformers>=2.2.0
# PDF Extraction # PDF Extraction
unstructured>=0.12.0 unstructured>=0.12.0
pypdf>=4.0.0 pypdf>=4.0.0
pdfplumber>=0.11.0
python-magic>=0.4.27 python-magic>=0.4.27
# HTTP Client (for OpenAI/Cohere API calls) # HTTP Client (for OpenAI/Cohere API calls)