"""
PDF Extraction Module

NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.

Provides enhanced PDF text extraction using multiple backends (in embedding-service):
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
2. pypdf - Modern, BSD-licensed PDF library (recommended default)

License Compliance:
- Default backends (unstructured, pypdf) are BSD/Apache licensed
"""

import logging
import os
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

# Configuration (for backward compatibility - actual config in embedding-service)
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
PDF_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")

# HTTP timeouts (seconds) used for the two embedding-service endpoints.
_MODELS_TIMEOUT_SECONDS = 5.0
_EXTRACT_TIMEOUT_SECONDS = 120.0


class PDFExtractionError(Exception):
    """Error during PDF extraction."""


class PDFExtractionResult:
    """Result of PDF extraction with metadata.

    Attributes:
        text: Extracted text.
        backend_used: Name of the backend reported for the extraction.
        pages: Page count (0 when not provided).
        elements: List of element dicts; a fresh empty list when omitted.
        tables: List of table dicts; a fresh empty list when omitted.
        metadata: Free-form metadata dict; a fresh empty dict when omitted.
    """

    def __init__(
        self,
        text: str,
        backend_used: str,
        pages: int = 0,
        elements: Optional[List[Dict]] = None,
        tables: Optional[List[Dict]] = None,
        metadata: Optional[Dict] = None,
    ):
        self.text = text
        self.backend_used = backend_used
        self.pages = pages
        # `or` replaces None (and any other falsy value) with a new container,
        # so instances never share a default list/dict.
        self.elements = elements or []
        self.tables = tables or []
        self.metadata = metadata or {}

    def to_dict(self) -> Dict:
        """Return a summary dict; elements and tables are reported as counts."""
        return {
            "text": self.text,
            "backend_used": self.backend_used,
            "pages": self.pages,
            "element_count": len(self.elements),
            "table_count": len(self.tables),
            "metadata": self.metadata,
        }


def _fetch_models_info() -> Optional[Dict]:
    """Fetch the JSON payload of the embedding-service ``/models`` endpoint.

    Returns:
        The decoded JSON object when the service answers with HTTP 200 and a
        JSON object body; ``None`` on any request failure, non-200 status, or
        unexpected payload shape. Failures are logged as warnings, never raised.
    """
    import httpx

    try:
        with httpx.Client(timeout=_MODELS_TIMEOUT_SECONDS) as client:
            response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
            if response.status_code != 200:
                logger.warning(
                    "embedding-service /models returned HTTP %s",
                    response.status_code,
                )
                return None
            data = response.json()
    except Exception as e:
        # Best-effort probe: callers fall back to defaults when this fails.
        logger.warning("Could not reach embedding-service: %s", e)
        return None

    if not isinstance(data, dict):
        logger.warning(
            "Could not reach embedding-service: unexpected /models payload type %s",
            type(data).__name__,
        )
        return None
    return data


def _detect_available_backends() -> List[str]:
    """Get available backends from embedding-service.

    Returns:
        The ``available_pdf_backends`` value reported by the service,
        ``["pypdf"]`` when the service responds without that key, or an empty
        list when the service could not be queried.
    """
    info = _fetch_models_info()
    if info is None:
        return []
    return info.get("available_pdf_backends", ["pypdf"])


def _result_from_payload(data: Dict) -> PDFExtractionResult:
    """Build a PDFExtractionResult from the ``/extract-pdf`` JSON payload."""
    # A missing or null table_count is treated as "no tables".
    table_count = data.get("table_count") or 0
    return PDFExtractionResult(
        text=data.get("text", ""),
        backend_used=data.get("backend_used", "unknown"),
        pages=data.get("pages", 0),
        # Only the count is available here, so tables holds a single summary entry.
        tables=[{"count": table_count}] if table_count > 0 else [],
        metadata={"embedding_service": True},
    )


def extract_text_from_pdf_enhanced(
    pdf_content: bytes,
    backend: str = PDF_BACKEND,
    fallback: bool = True,
) -> PDFExtractionResult:
    """
    Extract text from PDF using embedding-service.

    The PDF bytes are POSTed to ``{EMBEDDING_SERVICE_URL}/extract-pdf`` as
    ``application/octet-stream`` and the JSON response is converted into a
    PDFExtractionResult.

    Args:
        pdf_content: PDF file content as bytes
        backend: Preferred backend (auto, unstructured, pypdf). Accepted for
            backward compatibility; this function does not forward it to the
            service.
        fallback: If True, try other backends if preferred fails. Accepted for
            backward compatibility; this function does not forward it to the
            service.

    Returns:
        PDFExtractionResult with extracted text and metadata

    Raises:
        PDFExtractionError: On timeout, on an HTTP error status, or on any
            other failure while requesting or decoding the response. The
            underlying exception is attached as ``__cause__``.
    """
    import httpx

    try:
        with httpx.Client(timeout=_EXTRACT_TIMEOUT_SECONDS) as client:
            response = client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                content=pdf_content,
                headers={"Content-Type": "application/octet-stream"},
            )
            response.raise_for_status()
            data = response.json()
        return _result_from_payload(data)
    except httpx.TimeoutException as e:
        raise PDFExtractionError("PDF extraction timeout") from e
    except httpx.HTTPStatusError as e:
        raise PDFExtractionError(
            f"PDF extraction error: {e.response.status_code}"
        ) from e
    except Exception as e:
        # Boundary catch-all so callers only need to handle PDFExtractionError.
        raise PDFExtractionError(f"Failed to extract PDF: {e}") from e


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract text from PDF (simple interface).

    This is a drop-in replacement for the original function
    that uses the embedding-service internally.

    Args:
        pdf_content: PDF file content as bytes

    Returns:
        The extracted text only.

    Raises:
        PDFExtractionError: Propagated from extract_text_from_pdf_enhanced.
    """
    return extract_text_from_pdf_enhanced(pdf_content).text


def get_pdf_extraction_info() -> dict:
    """Get information about PDF extraction configuration.

    Returns:
        A dict describing the configured backend, the backends reported by the
        embedding-service, backend licenses, and whether the service was
        reachable. When the service cannot be queried, backend lists are empty
        and ``embedding_service_available`` is False.
    """
    info = _fetch_models_info()
    if info is None:
        # Fallback when embedding-service is not available
        return {
            "configured_backend": PDF_BACKEND,
            "available_backends": [],
            "recommended": None,
            "backend_licenses": {},
            "commercial_safe_backends": [],
            "embedding_service_url": EMBEDDING_SERVICE_URL,
            "embedding_service_available": False,
        }

    available = info.get("available_pdf_backends", [])
    return {
        "configured_backend": info.get("pdf_backend", PDF_BACKEND),
        "available_backends": available,
        "recommended": "unstructured" if "unstructured" in available else "pypdf",
        "backend_licenses": {
            "unstructured": "Apache-2.0",
            "pypdf": "BSD-3-Clause",
        },
        "commercial_safe_backends": available,
        "embedding_service_url": EMBEDDING_SERVICE_URL,
        "embedding_service_available": True,
    }