Files
breakpilot-lehrer/klausur-service/backend/pdf_extraction.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

165 lines
5.4 KiB
Python

"""
PDF Extraction Module
NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.
Provides enhanced PDF text extraction using multiple backends (in embedding-service):
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
2. pypdf - Modern, BSD-licensed PDF library (recommended default)
License Compliance:
- Default backends are permissively licensed: unstructured (Apache-2.0) and pypdf (BSD-3-Clause)
"""
import os
import logging
from typing import Dict, List, Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Configuration read from the environment once, at import time.
# Kept here for backward compatibility - the actual config lives in the
# embedding-service.
EMBEDDING_SERVICE_URL = os.environ.get(
    "EMBEDDING_SERVICE_URL", "http://embedding-service:8087"
)
PDF_BACKEND = os.environ.get("PDF_EXTRACTION_BACKEND", "auto")
class PDFExtractionError(Exception):
    """Raised when extracting text from a PDF fails."""
class PDFExtractionResult:
    """Extracted PDF text bundled with information about the extraction.

    Attributes:
        text: The extracted text.
        backend_used: Name of the backend that produced the text.
        pages: Number of pages (0 when not supplied).
        elements: List of element dicts; empty list when none were given.
        tables: List of table dicts; empty list when none were given.
        metadata: Free-form metadata dict; empty dict when none was given.
    """

    def __init__(
        self,
        text: str,
        backend_used: str,
        pages: int = 0,
        elements: Optional[List[Dict]] = None,
        tables: Optional[List[Dict]] = None,
        metadata: Optional[Dict] = None,
    ):
        self.text = text
        self.backend_used = backend_used
        self.pages = pages
        # Any falsy argument (None or an empty container) is replaced by a
        # fresh empty container, so instances never share mutable defaults.
        self.elements = elements if elements else []
        self.tables = tables if tables else []
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return a summary dict of this result.

        Elements and tables are reported only as counts (``element_count``
        and ``table_count``), not as the underlying lists.
        """
        summary: Dict = {}
        summary["text"] = self.text
        summary["backend_used"] = self.backend_used
        summary["pages"] = self.pages
        summary["element_count"] = len(self.elements)
        summary["table_count"] = len(self.tables)
        summary["metadata"] = self.metadata
        return summary
def _detect_available_backends() -> List[str]:
    """Ask the embedding-service which PDF backends it offers.

    Sends ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5 second timeout.

    Returns:
        For a 200 reply, the ``available_pdf_backends`` entry of the JSON
        body (``["pypdf"]`` when that key is missing). For any other status
        code, or when the request or JSON decoding raises, an empty list.
    """
    import httpx

    models_url = f"{EMBEDDING_SERVICE_URL}/models"
    try:
        with httpx.Client(timeout=5.0) as http:
            reply = http.get(models_url)
            if reply.status_code != 200:
                return []
            payload = reply.json()
            return payload.get("available_pdf_backends", ["pypdf"])
    except Exception as exc:
        # Best-effort probe: failures are logged and reported as "no backends".
        logger.warning(f"Could not reach embedding-service: {exc}")
        return []
def extract_text_from_pdf_enhanced(
    pdf_content: bytes,
    backend: str = PDF_BACKEND,
    fallback: bool = True,
) -> PDFExtractionResult:
    """
    Extract text from a PDF by POSTing it to the embedding-service.

    The raw bytes are sent to ``{EMBEDDING_SERVICE_URL}/extract-pdf`` as
    ``application/octet-stream`` with a 120 second timeout, and the JSON
    reply is converted into a PDFExtractionResult.

    Args:
        pdf_content: PDF file content as bytes.
        backend: Preferred backend (auto, unstructured, pypdf). Accepted for
            backward compatibility; this function does not include it in the
            request it sends.
        fallback: Accepted for backward compatibility; not used by this
            function.

    Returns:
        PDFExtractionResult with the extracted text and metadata. A missing
        or null ``text`` becomes ``""``; a missing or null ``table_count``
        is treated as 0.

    Raises:
        PDFExtractionError: On timeout, on a non-success HTTP status, or on
            any other failure while requesting or decoding the reply. The
            original exception is attached as ``__cause__``.
    """
    import httpx

    try:
        with httpx.Client(timeout=120.0) as client:
            response = client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                content=pdf_content,
                headers={"Content-Type": "application/octet-stream"},
            )
            response.raise_for_status()
            data = response.json()

            # ``or`` also covers an explicit JSON null, which ``.get`` with a
            # default would pass through and break the ``> 0`` comparison.
            table_count = data.get("table_count") or 0

            # NOTE(review): tables holds a single summary entry, so
            # PDFExtractionResult.to_dict() reports table_count == 1 whenever
            # the service reported any tables - confirm this is intended.
            tables = [{"count": table_count}] if table_count > 0 else []

            return PDFExtractionResult(
                text=data.get("text") or "",
                backend_used=data.get("backend_used", "unknown"),
                pages=data.get("pages", 0),
                tables=tables,
                metadata={"embedding_service": True},
            )
    except httpx.TimeoutException as e:
        raise PDFExtractionError("PDF extraction timeout") from e
    except httpx.HTTPStatusError as e:
        raise PDFExtractionError(
            f"PDF extraction error: {e.response.status_code}"
        ) from e
    except Exception as e:
        # Catch-all boundary: callers only need to handle PDFExtractionError.
        raise PDFExtractionError(f"Failed to extract PDF: {str(e)}") from e
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Return only the extracted text of a PDF (simple interface).

    Drop-in replacement for the original function: it forwards the bytes to
    extract_text_from_pdf_enhanced with that function's default arguments
    and returns the ``text`` attribute of the result. Exceptions raised by
    extract_text_from_pdf_enhanced propagate unchanged.
    """
    return extract_text_from_pdf_enhanced(pdf_content).text
def get_pdf_extraction_info() -> dict:
    """Describe the PDF extraction setup as reported by the embedding-service.

    Sends ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5 second timeout.

    Returns:
        A dict with the keys ``configured_backend``, ``available_backends``,
        ``recommended``, ``backend_licenses``, ``commercial_safe_backends``,
        ``embedding_service_url`` and ``embedding_service_available``.
        On a 200 reply the values come from the service's JSON body and
        ``embedding_service_available`` is True. On any other status, or when
        the request raises, a fallback dict with empty backend information
        and ``embedding_service_available`` False is returned.
    """
    import httpx

    try:
        with httpx.Client(timeout=5.0) as http:
            reply = http.get(f"{EMBEDDING_SERVICE_URL}/models")
            if reply.status_code == 200:
                payload = reply.json()
                backends = payload.get("available_pdf_backends", [])
                # unstructured is recommended whenever the service offers it.
                if "unstructured" in backends:
                    preferred = "unstructured"
                else:
                    preferred = "pypdf"
                return dict(
                    configured_backend=payload.get("pdf_backend", PDF_BACKEND),
                    available_backends=backends,
                    recommended=preferred,
                    backend_licenses={
                        "unstructured": "Apache-2.0",
                        "pypdf": "BSD-3-Clause",
                    },
                    # Every backend the service lists is reported here as-is.
                    commercial_safe_backends=backends,
                    embedding_service_url=EMBEDDING_SERVICE_URL,
                    embedding_service_available=True,
                )
    except Exception as exc:
        logger.warning(f"Could not reach embedding-service: {exc}")

    # Fallback when embedding-service is not available
    return dict(
        configured_backend=PDF_BACKEND,
        available_backends=[],
        recommended=None,
        backend_licenses={},
        commercial_safe_backends=[],
        embedding_service_url=EMBEDDING_SERVICE_URL,
        embedding_service_available=False,
    )