Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
165 lines
5.4 KiB
Python
165 lines
5.4 KiB
Python
"""
|
|
PDF Extraction Module
|
|
|
|
NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.
|
|
|
|
Provides enhanced PDF text extraction using multiple backends (in embedding-service):
|
|
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
|
|
2. pypdf - Modern, BSD-licensed PDF library (recommended default)
|
|
|
|
License Compliance:
|
|
- Default backends (unstructured, pypdf) are BSD/Apache licensed
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
from typing import Dict, List, Optional
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Settings read from the environment once, at import time. Kept here for
# backward compatibility - the actual configuration lives in embedding-service.
EMBEDDING_SERVICE_URL = os.environ.get(
    "EMBEDDING_SERVICE_URL", "http://embedding-service:8087"
)
PDF_BACKEND = os.environ.get("PDF_EXTRACTION_BACKEND", "auto")
|
|
|
|
|
|
class PDFExtractionError(Exception):
    """Signals that a PDF extraction attempt failed."""
|
|
|
|
|
|
class PDFExtractionResult:
    """Holds the text extracted from a PDF together with extraction details.

    Attributes:
        text: The extracted text.
        backend_used: Name of the backend reported for this extraction.
        pages: Page count (0 when not supplied).
        elements: List of element dicts (empty list when not supplied).
        tables: List of table dicts (empty list when not supplied).
        metadata: Additional metadata dict (empty dict when not supplied).
    """

    def __init__(
        self,
        text: str,
        backend_used: str,
        pages: int = 0,
        elements: Optional[List[Dict]] = None,
        tables: Optional[List[Dict]] = None,
        metadata: Optional[Dict] = None,
    ):
        self.text = text
        self.backend_used = backend_used
        self.pages = pages
        # Any falsy argument (None or an empty container) is swapped for a
        # fresh empty container, so instances never share a default object.
        self.elements = elements if elements else []
        self.tables = tables if tables else []
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return a summary dict; elements and tables appear only as counts."""
        return dict(
            text=self.text,
            backend_used=self.backend_used,
            pages=self.pages,
            element_count=len(self.elements),
            table_count=len(self.tables),
            metadata=self.metadata,
        )
|
|
|
|
|
|
def _detect_available_backends() -> List[str]:
    """Ask embedding-service which PDF backends it offers.

    Sends ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5-second timeout.

    Returns:
        The ``available_pdf_backends`` entry of the JSON response, or
        ``["pypdf"]`` when a 200 response carries no such key. An empty list
        when the service answers with a non-200 status or when the request or
        response parsing raises; both cases are logged as warnings.
    """
    import httpx  # Imported at call time rather than at module import.

    try:
        with httpx.Client(timeout=5.0) as client:
            response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
            if response.status_code == 200:
                data = response.json()
                return data.get("available_pdf_backends", ["pypdf"])
            # Previously a non-200 answer fell through without any trace.
            logger.warning(
                "embedding-service /models returned HTTP %s",
                response.status_code,
            )
    except Exception as e:
        # Best-effort probe: any failure is reported as "no backends".
        logger.warning("Could not reach embedding-service: %s", e)

    return []
|
|
|
|
|
|
def extract_text_from_pdf_enhanced(
    pdf_content: bytes,
    backend: str = PDF_BACKEND,
    fallback: bool = True,
) -> PDFExtractionResult:
    """
    Extract text from a PDF by posting it to embedding-service.

    The raw bytes are sent to ``{EMBEDDING_SERVICE_URL}/extract-pdf`` as
    ``application/octet-stream`` with a 120-second timeout.

    Args:
        pdf_content: PDF file content as bytes
        backend: Preferred backend (auto, unstructured, pypdf).
            NOTE: this function does not forward the value to the service.
        fallback: NOTE: not used by this function.

    Returns:
        PDFExtractionResult built from the JSON response: ``text``,
        ``backend_used`` and ``pages`` are copied (defaulting to ``""``,
        ``"unknown"`` and ``0``). ``tables`` holds a single
        ``{"count": n}`` entry when the response reports a positive
        ``table_count`` and is empty otherwise. ``metadata`` is
        ``{"embedding_service": True}``.

    Raises:
        PDFExtractionError: On timeout, on an HTTP error status, or on any
            other failure while calling the service or reading its response.
            The original exception is attached as ``__cause__``.
    """
    import httpx  # Imported at call time rather than at module import.

    try:
        with httpx.Client(timeout=120.0) as client:
            response = client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                content=pdf_content,
                headers={"Content-Type": "application/octet-stream"},
            )
            response.raise_for_status()
            data = response.json()

            # A missing or null table_count means "no tables" instead of
            # failing the whole extraction on a None > 0 comparison.
            table_count = data.get("table_count") or 0

            return PDFExtractionResult(
                text=data.get("text", ""),
                backend_used=data.get("backend_used", "unknown"),
                pages=data.get("pages", 0),
                tables=[{"count": table_count}] if table_count > 0 else [],
                metadata={"embedding_service": True},
            )
    except httpx.TimeoutException as e:
        raise PDFExtractionError("PDF extraction timeout") from e
    except httpx.HTTPStatusError as e:
        raise PDFExtractionError(
            f"PDF extraction error: {e.response.status_code}"
        ) from e
    except Exception as e:
        # Catch-all so callers only ever need to handle PDFExtractionError.
        raise PDFExtractionError(f"Failed to extract PDF: {str(e)}") from e
|
|
|
|
|
|
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Return just the text extracted from a PDF (simple interface).

    Drop-in replacement for the original function: it calls
    extract_text_from_pdf_enhanced with default settings, which uses the
    embedding-service internally, and keeps only the ``text`` of the result.
    """
    return extract_text_from_pdf_enhanced(pdf_content).text
|
|
|
|
|
|
def get_pdf_extraction_info() -> dict:
    """Get information about PDF extraction configuration.

    Sends ``GET {EMBEDDING_SERVICE_URL}/models`` with a 5-second timeout and
    summarises the answer.

    Returns:
        A dict with the keys ``configured_backend``, ``available_backends``,
        ``recommended``, ``backend_licenses``, ``commercial_safe_backends``,
        ``embedding_service_url`` and ``embedding_service_available``.
        When the service answers with a non-200 status or the request fails,
        a fallback dict is returned with empty backend information and
        ``embedding_service_available`` set to ``False``; both cases are
        logged as warnings.
    """
    import httpx  # Imported at call time rather than at module import.

    # Licenses of the backends this module knows about.
    backend_licenses = {
        "unstructured": "Apache-2.0",
        "pypdf": "BSD-3-Clause",
    }

    try:
        with httpx.Client(timeout=5.0) as client:
            response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
            if response.status_code == 200:
                data = response.json()
                available = data.get("available_pdf_backends", [])
                return {
                    "configured_backend": data.get("pdf_backend", PDF_BACKEND),
                    "available_backends": available,
                    "recommended": "unstructured" if "unstructured" in available else "pypdf",
                    "backend_licenses": backend_licenses,
                    # Only vouch for backends whose license is listed above.
                    # This is a separate list, so mutating it cannot alter
                    # "available_backends". A backend the service reports
                    # but that has no known license is not labelled safe.
                    "commercial_safe_backends": [
                        name for name in available if name in backend_licenses
                    ],
                    "embedding_service_url": EMBEDDING_SERVICE_URL,
                    "embedding_service_available": True,
                }
            # Previously a non-200 answer fell through without any trace.
            logger.warning(
                "embedding-service /models returned HTTP %s",
                response.status_code,
            )
    except Exception as e:
        # Best-effort probe: any failure leads to the fallback below.
        logger.warning("Could not reach embedding-service: %s", e)

    # Fallback when embedding-service is not available
    return {
        "configured_backend": PDF_BACKEND,
        "available_backends": [],
        "recommended": None,
        "backend_licenses": {},
        "commercial_safe_backends": [],
        "embedding_service_url": EMBEDDING_SERVICE_URL,
        "embedding_service_available": False,
    }
|