diff --git a/control-pipeline/scripts/reingest_d5.py b/control-pipeline/scripts/reingest_d5.py index cd40b5f..31d880a 100644 --- a/control-pipeline/scripts/reingest_d5.py +++ b/control-pipeline/scripts/reingest_d5.py @@ -44,7 +44,7 @@ from reingest_d5_config import ( logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger("d5-reingest") -UPLOAD_TIMEOUT = httpx.Timeout(timeout=1800.0, connect=30.0) +UPLOAD_TIMEOUT = httpx.Timeout(timeout=3600.0, connect=30.0) SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0) diff --git a/embedding-service/main.py b/embedding-service/main.py index 86f733d..5124881 100644 --- a/embedding-service/main.py +++ b/embedding-service/main.py @@ -750,6 +750,12 @@ def detect_pdf_backends() -> List[str]: except ImportError: pass + try: + import pdfplumber # noqa: F401 + available.append("pdfplumber") + except ImportError: + pass + try: from pypdf import PdfReader # noqa: F401 available.append("pypdf") @@ -815,8 +821,32 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse: pass +def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse: + """Extract PDF using pdfplumber (best for multi-column EU regulation PDFs).""" + import io + import pdfplumber + + pdf_file = io.BytesIO(pdf_content) + text_parts = [] + page_count = 0 + + with pdfplumber.open(pdf_file) as pdf: + page_count = len(pdf.pages) + for page in pdf.pages: + text = page.extract_text(x_tolerance=2, y_tolerance=3) + if text: + text_parts.append(text) + + return ExtractPDFResponse( + text="\n\n".join(text_parts), + backend_used="pdfplumber", + pages=page_count, + table_count=0, + ) + + def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse: - """Extract PDF using pypdf.""" + """Extract PDF using pypdf (fallback).""" import io from pypdf import PdfReader @@ -1039,11 +1069,19 @@ async def extract_pdf(file: UploadFile = File(...)): backend = config.PDF_EXTRACTION_BACKEND if backend == "auto": - backend = "unstructured" if "unstructured" in available else "pypdf" + # Prefer: unstructured > pdfplumber > pypdf + if "unstructured" in available: + backend = "unstructured" + elif "pdfplumber" in available: + backend = "pdfplumber" + else: + backend = "pypdf" try: if backend == "unstructured" and "unstructured" in available: return extract_pdf_unstructured(pdf_content) + elif backend == "pdfplumber" and "pdfplumber" in available: + return extract_pdf_pdfplumber(pdf_content) elif "pypdf" in available: return extract_pdf_pypdf(pdf_content) else: diff --git a/embedding-service/requirements.txt b/embedding-service/requirements.txt index 2d11c24..68b1442 100644 --- a/embedding-service/requirements.txt +++ b/embedding-service/requirements.txt @@ -14,6 +14,7 @@ sentence-transformers>=2.2.0 # PDF Extraction unstructured>=0.12.0 pypdf>=4.0.0 +pdfplumber>=0.11.0 python-magic>=0.4.27 # HTTP Client (for OpenAI/Cohere API calls)