# breakpilot-pwa/klausur-service/backend/pdf_extraction.py

"""
PDF Extraction Module

NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.

Provides enhanced PDF text extraction using multiple backends (in embedding-service):
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
2. pypdf - Modern, BSD-licensed PDF library (recommended default)

License Compliance:
- Default backends (unstructured, pypdf) are BSD/Apache licensed
"""

import os
import logging
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

# Configuration (for backward compatibility - actual config in embedding-service)
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
PDF_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")


class PDFExtractionError(Exception):
    """Raised when text cannot be extracted from a PDF document."""


class PDFExtractionResult:
    """Extracted PDF text bundled with information about the extraction.

    Attributes:
        text: The extracted text.
        backend_used: Name of the backend that produced the text.
        pages: Page count (0 when not provided).
        elements: List of element dicts (empty when not provided).
        tables: List of table dicts (empty when not provided).
        metadata: Free-form metadata dict (empty when not provided).
    """

    def __init__(
        self,
        text: str,
        backend_used: str,
        pages: int = 0,
        elements: Optional[List[Dict]] = None,
        tables: Optional[List[Dict]] = None,
        metadata: Optional[Dict] = None,
    ):
        self.text = text
        self.backend_used = backend_used
        self.pages = pages
        # A falsy argument (None or an empty container) is swapped for a
        # fresh container, so instances never share a default object.
        self.elements = elements if elements else []
        self.tables = tables if tables else []
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return a summary dict; elements and tables are reported as counts."""
        return dict(
            text=self.text,
            backend_used=self.backend_used,
            pages=self.pages,
            element_count=len(self.elements),
            table_count=len(self.tables),
            metadata=self.metadata,
        )


def _detect_available_backends() -> List[str]:
    """Ask the embedding-service which PDF backends it offers.

    Issues ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5 second timeout.

    Returns:
        The ``available_pdf_backends`` value of a 200 response
        (``["pypdf"]`` if that key is missing). An empty list for any other
        status code, or when the request or JSON decoding fails; failures
        are logged as warnings rather than raised.
    """
    import httpx

    try:
        with httpx.Client(timeout=5.0) as http:
            reply = http.get(f"{EMBEDDING_SERVICE_URL}/models")
            if reply.status_code != 200:
                return []
            payload = reply.json()
            return payload.get("available_pdf_backends", ["pypdf"])
    except Exception as exc:
        # Best-effort probe: callers get an empty list instead of an error.
        logger.warning(f"Could not reach embedding-service: {exc}")

    return []


def extract_text_from_pdf_enhanced(
    pdf_content: bytes,
    backend: str = PDF_BACKEND,
    fallback: bool = True,
) -> PDFExtractionResult:
    """
    Extract text from a PDF by delegating to the embedding-service.

    The raw bytes are POSTed to ``{EMBEDDING_SERVICE_URL}/extract-pdf`` as
    ``application/octet-stream`` with a 120 second timeout, and the JSON
    reply is mapped onto a PDFExtractionResult.

    Args:
        pdf_content: PDF file content as bytes
        backend: Preferred backend (auto, unstructured, pypdf). Accepted for
            backward compatibility; this function does not forward it to the
            embedding-service.
        fallback: Accepted for backward compatibility; likewise not
            forwarded by this function.

    Returns:
        PDFExtractionResult with extracted text and metadata. ``tables``
        holds a single ``{"count": n}`` entry when the service reports a
        positive ``table_count`` and is empty otherwise.

    Raises:
        PDFExtractionError: On timeout, on an error HTTP status, or on any
            other failure while calling the service or reading its reply.
            The underlying exception is attached as ``__cause__``.
    """
    import httpx

    try:
        with httpx.Client(timeout=120.0) as client:
            response = client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                content=pdf_content,
                headers={"Content-Type": "application/octet-stream"}
            )
            response.raise_for_status()
            data = response.json()

            # Read the count once; a JSON null is treated as "no tables"
            # instead of failing the whole extraction on the comparison.
            table_count = data.get("table_count", 0) or 0

            return PDFExtractionResult(
                text=data.get("text", ""),
                backend_used=data.get("backend_used", "unknown"),
                pages=data.get("pages", 0),
                tables=[{"count": table_count}] if table_count > 0 else [],
                metadata={"embedding_service": True}
            )
    except httpx.TimeoutException as e:
        # Chain explicitly so the original cause stays visible in tracebacks.
        raise PDFExtractionError("PDF extraction timeout") from e
    except httpx.HTTPStatusError as e:
        raise PDFExtractionError(f"PDF extraction error: {e.response.status_code}") from e
    except Exception as e:
        raise PDFExtractionError(f"Failed to extract PDF: {str(e)}") from e


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Return only the extracted text of a PDF (simple interface).

    Drop-in replacement for the original function: it calls
    extract_text_from_pdf_enhanced with default settings and discards
    everything except the ``text`` attribute of the result.
    """
    return extract_text_from_pdf_enhanced(pdf_content).text


def get_pdf_extraction_info() -> dict:
    """Describe the current PDF extraction setup.

    Issues ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5 second timeout.
    On a 200 response the returned dict reflects what the service reports
    and has ``embedding_service_available`` set to True. For any other
    status, or when the request fails (logged as a warning), a fallback
    dict with empty backend information and
    ``embedding_service_available`` set to False is returned.
    """
    import httpx

    try:
        with httpx.Client(timeout=5.0) as http:
            reply = http.get(f"{EMBEDDING_SERVICE_URL}/models")
            if reply.status_code == 200:
                payload = reply.json()
                backends = payload.get("available_pdf_backends", [])
                configured = payload.get("pdf_backend", PDF_BACKEND)
                # Prefer unstructured whenever the service offers it.
                if "unstructured" in backends:
                    recommended = "unstructured"
                else:
                    recommended = "pypdf"
                licenses = {
                    "unstructured": "Apache-2.0",
                    "pypdf": "BSD-3-Clause",
                }
                return {
                    "configured_backend": configured,
                    "available_backends": backends,
                    "recommended": recommended,
                    "backend_licenses": licenses,
                    "commercial_safe_backends": backends,
                    "embedding_service_url": EMBEDDING_SERVICE_URL,
                    "embedding_service_available": True,
                }
    except Exception as exc:
        logger.warning(f"Could not reach embedding-service: {exc}")

    # Service unreachable or answered with a non-200 status.
    return {
        "configured_backend": PDF_BACKEND,
        "available_backends": [],
        "recommended": None,
        "backend_licenses": {},
        "commercial_safe_backends": [],
        "embedding_service_url": EMBEDDING_SERVICE_URL,
        "embedding_service_available": False,
    }