Files
breakpilot-compliance/document-crawler/extractors/dispatcher.py
Benjamin Boenisch 364d2c69ff feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00

26 lines
729 B
Python

"""Routes files to the appropriate extractor by extension."""
from .pdf_extractor import extract_pdf
from .docx_extractor import extract_docx
from .xlsx_extractor import extract_xlsx
from .pptx_extractor import extract_pptx
# Registry consulted by extract_text(): keys are lowercase, dot-prefixed file
# extensions; values are the extractor callables imported above.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}
def extract_text(file_path: str, extension: str) -> str:
    """Extract text from a file based on its extension.

    The extension is matched case-insensitively against the keys of
    ``EXTRACTORS``. Surrounding whitespace is ignored and a missing leading
    dot is added, so ``"PDF"``, ``"pdf"`` and ``".pdf"`` all select the same
    extractor.

    Args:
        file_path: Path passed unchanged to the selected extractor.
        extension: File extension, with or without the leading dot.

    Returns:
        The value returned by the selected extractor for ``file_path``.

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    ext = extension.strip().lower()
    # Registry keys are dot-prefixed; accept bare extensions such as "pdf"
    # instead of rejecting a supported type over a formatting difference.
    if ext and not ext.startswith("."):
        ext = f".{ext}"

    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(f"Unsupported file extension: {ext}")
    return extractor(file_path)