feat(embedding): add pdfplumber backend for multi-column PDF extraction
EU Official Journal PDFs (AI Act, CRA, NIS2, DSGVO, etc.) use
multi-column layouts that pypdf breaks into fragmented words
("Ar tik el" instead of "Artikel"). pdfplumber handles these correctly.
Backend priority: unstructured > pdfplumber > pypdf (auto mode).
Also increases D5 re-ingestion timeout to 3600s for large PDFs.
58 embedding-service tests passing. pdfplumber: MIT license.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -750,6 +750,12 @@ def detect_pdf_backends() -> List[str]:
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pdfplumber # noqa: F401
|
||||
available.append("pdfplumber")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pypdf import PdfReader # noqa: F401
|
||||
available.append("pypdf")
|
||||
@@ -815,8 +821,32 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
pass
|
||||
|
||||
|
||||
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
    """Extract the text of a PDF with the pdfplumber backend.

    Intended for multi-column layouts such as EU regulation PDFs.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        An ExtractPDFResponse whose ``text`` is the per-page text joined by
        blank lines (pages yielding no text are skipped), whose ``pages`` is
        the total page count of the document, and whose ``table_count`` is
        always 0 because this backend performs no table detection here.
    """
    import io

    import pdfplumber

    with pdfplumber.open(io.BytesIO(pdf_content)) as document:
        pages = document.pages
        total_pages = len(pages)
        # Extract every page while the document is still open; the tolerances
        # control how close characters/lines must be to be merged.
        extracted = (
            page.extract_text(x_tolerance=2, y_tolerance=3) for page in pages
        )
        # Drop pages for which pdfplumber returned nothing (None or "").
        page_texts = [chunk for chunk in extracted if chunk]

    return ExtractPDFResponse(
        text="\n\n".join(page_texts),
        backend_used="pdfplumber",
        pages=total_pages,
        table_count=0,
    )
|
||||
|
||||
|
||||
def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
"""Extract PDF using pypdf."""
|
||||
"""Extract PDF using pypdf (fallback)."""
|
||||
import io
|
||||
from pypdf import PdfReader
|
||||
|
||||
@@ -1039,11 +1069,19 @@ async def extract_pdf(file: UploadFile = File(...)):
|
||||
|
||||
backend = config.PDF_EXTRACTION_BACKEND
|
||||
if backend == "auto":
|
||||
backend = "unstructured" if "unstructured" in available else "pypdf"
|
||||
# Prefer: unstructured > pdfplumber > pypdf
|
||||
if "unstructured" in available:
|
||||
backend = "unstructured"
|
||||
elif "pdfplumber" in available:
|
||||
backend = "pdfplumber"
|
||||
else:
|
||||
backend = "pypdf"
|
||||
|
||||
try:
|
||||
if backend == "unstructured" and "unstructured" in available:
|
||||
return extract_pdf_unstructured(pdf_content)
|
||||
elif backend == "pdfplumber" and "pdfplumber" in available:
|
||||
return extract_pdf_pdfplumber(pdf_content)
|
||||
elif "pypdf" in available:
|
||||
return extract_pdf_pypdf(pdf_content)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user