fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
164
klausur-service/backend/pdf_extraction.py
Normal file
164
klausur-service/backend/pdf_extraction.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
PDF Extraction Module
|
||||
|
||||
NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.
|
||||
|
||||
Provides enhanced PDF text extraction using multiple backends (in embedding-service):
|
||||
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
|
||||
2. pypdf - Modern, BSD-licensed PDF library (recommended default)
|
||||
|
||||
License Compliance:
|
||||
- Default backends (unstructured, pypdf) are BSD/Apache licensed
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)

# Connection settings read from the environment. They are kept in this module
# for backward compatibility; the actual configuration lives in the
# embedding-service.
EMBEDDING_SERVICE_URL = os.environ.get(
    "EMBEDDING_SERVICE_URL", "http://embedding-service:8087"
)
PDF_BACKEND = os.environ.get("PDF_EXTRACTION_BACKEND", "auto")
|
||||
|
||||
|
||||
class PDFExtractionError(Exception):
    """Signals that a PDF extraction attempt failed."""
|
||||
|
||||
|
||||
class PDFExtractionResult:
    """Holds the outcome of a PDF extraction together with its metadata.

    Attributes:
        text: The extracted text.
        backend_used: Name of the backend that produced the text.
        pages: Page count (0 when not supplied).
        elements: List of element dicts (empty list when not supplied).
        tables: List of table dicts (empty list when not supplied).
        metadata: Additional metadata (empty dict when not supplied).
    """

    def __init__(
        self,
        text: str,
        backend_used: str,
        pages: int = 0,
        elements: Optional[List[Dict]] = None,
        tables: Optional[List[Dict]] = None,
        metadata: Optional[Dict] = None,
    ):
        self.text, self.backend_used, self.pages = text, backend_used, pages
        # Any falsy argument (None or an empty container) is replaced by a
        # fresh empty container, so instances never share mutable defaults.
        self.elements = elements if elements else []
        self.tables = tables if tables else []
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return a summary dict; elements and tables are reported as counts."""
        summary = {
            "text": self.text,
            "backend_used": self.backend_used,
            "pages": self.pages,
        }
        summary["element_count"] = len(self.elements)
        summary["table_count"] = len(self.tables)
        summary["metadata"] = self.metadata
        return summary
|
||||
|
||||
|
||||
def _detect_available_backends() -> List[str]:
    """Ask the embedding-service which PDF backends it offers.

    Sends a GET to ``{EMBEDDING_SERVICE_URL}/models`` with a 5 second timeout.

    Returns:
        The ``available_pdf_backends`` entry of the JSON reply, or
        ``["pypdf"]`` when a 200 reply lacks that key. An empty list is
        returned for any non-200 status or when the request raises.
    """
    import httpx

    models_url = f"{EMBEDDING_SERVICE_URL}/models"
    try:
        with httpx.Client(timeout=5.0) as http:
            reply = http.get(models_url)
            if reply.status_code != 200:
                return []
            return reply.json().get("available_pdf_backends", ["pypdf"])
    except Exception as exc:
        # Best-effort probe: a failing service is logged, never raised.
        logger.warning(f"Could not reach embedding-service: {exc}")
        return []
|
||||
|
||||
|
||||
def extract_text_from_pdf_enhanced(
    pdf_content: bytes,
    backend: str = PDF_BACKEND,
    fallback: bool = True,
) -> PDFExtractionResult:
    """
    Extract text from PDF using embedding-service.

    The raw bytes are POSTed as ``application/octet-stream`` to
    ``{EMBEDDING_SERVICE_URL}/extract-pdf`` (120 second timeout) and the JSON
    reply is wrapped in a PDFExtractionResult.

    Args:
        pdf_content: PDF file content as bytes
        backend: Preferred backend (auto, unstructured, pypdf). Accepted for
            backward compatibility; this function does not forward it to the
            embedding-service.
        fallback: Accepted for backward compatibility; this function does not
            forward it to the embedding-service.

    Returns:
        PDFExtractionResult with extracted text and metadata. ``tables`` holds
        a single ``{"count": n}`` entry when the service reports a positive
        table count, otherwise it is empty.

    Raises:
        PDFExtractionError: On timeout, on a non-success HTTP status, or on
            any other failure while calling the service or reading its reply.
            The underlying exception is attached as ``__cause__``.
    """
    import httpx

    try:
        with httpx.Client(timeout=120.0) as client:
            response = client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                content=pdf_content,
                headers={"Content-Type": "application/octet-stream"},
            )
            response.raise_for_status()
            data = response.json()

            # A missing or null table_count is treated as "no tables" instead
            # of failing the whole extraction on a None > 0 comparison.
            table_count = data.get("table_count") or 0

            return PDFExtractionResult(
                text=data.get("text", ""),
                backend_used=data.get("backend_used", "unknown"),
                pages=data.get("pages", 0),
                tables=[{"count": table_count}] if table_count > 0 else [],
                metadata={"embedding_service": True},
            )
    except httpx.TimeoutException as e:
        raise PDFExtractionError("PDF extraction timeout") from e
    except httpx.HTTPStatusError as e:
        raise PDFExtractionError(
            f"PDF extraction error: {e.response.status_code}"
        ) from e
    except Exception as e:
        # Boundary handler: callers only need to catch PDFExtractionError;
        # the original error stays reachable through exception chaining.
        raise PDFExtractionError(f"Failed to extract PDF: {str(e)}") from e
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Return only the extracted text of a PDF (simple interface).

    Thin wrapper that calls extract_text_from_pdf_enhanced with its default
    arguments and hands back the ``text`` attribute of the result. It serves
    as a drop-in replacement for the original function while using the
    embedding-service internally.
    """
    return extract_text_from_pdf_enhanced(pdf_content).text
|
||||
|
||||
|
||||
def get_pdf_extraction_info() -> dict:
    """Get information about PDF extraction configuration.

    Queries ``{EMBEDDING_SERVICE_URL}/models`` (5 second timeout) and reports
    the configured and available PDF backends.

    Returns:
        A dict with the keys ``configured_backend``, ``available_backends``,
        ``recommended``, ``backend_licenses``, ``commercial_safe_backends``,
        ``embedding_service_url`` and ``embedding_service_available``.
        ``commercial_safe_backends`` lists only those available backends whose
        license is recorded in ``backend_licenses``; a backend with no license
        entry is not reported as commercially safe. When the service does not
        answer with HTTP 200 or the request fails, a fallback dict with empty
        backend lists and ``embedding_service_available=False`` is returned.
    """
    import httpx

    # Licenses of the backends this module knows about.
    known_licenses = {
        "unstructured": "Apache-2.0",
        "pypdf": "BSD-3-Clause",
    }

    try:
        with httpx.Client(timeout=5.0) as client:
            response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
            if response.status_code == 200:
                data = response.json()
                available = data.get("available_pdf_backends", [])
                return {
                    "configured_backend": data.get("pdf_backend", PDF_BACKEND),
                    "available_backends": available,
                    "recommended": "unstructured" if "unstructured" in available else "pypdf",
                    "backend_licenses": dict(known_licenses),
                    # Only vouch for backends whose license is actually known.
                    "commercial_safe_backends": [
                        name for name in available if name in known_licenses
                    ],
                    "embedding_service_url": EMBEDDING_SERVICE_URL,
                    "embedding_service_available": True,
                }
            # Leave a trace instead of silently reporting "unavailable".
            logger.warning(
                "embedding-service returned HTTP %s for /models",
                response.status_code,
            )
    except Exception as e:
        logger.warning(f"Could not reach embedding-service: {e}")

    # Fallback when embedding-service is not available
    return {
        "configured_backend": PDF_BACKEND,
        "available_backends": [],
        "recommended": None,
        "backend_licenses": {},
        "commercial_safe_backends": [],
        "embedding_service_url": EMBEDDING_SERVICE_URL,
        "embedding_service_available": False,
    }
|
||||
Reference in New Issue
Block a user