feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
25
document-crawler/extractors/dispatcher.py
Normal file
25
document-crawler/extractors/dispatcher.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Routes files to the appropriate extractor by extension."""
|
||||
|
||||
from .pdf_extractor import extract_pdf
|
||||
from .docx_extractor import extract_docx
|
||||
from .xlsx_extractor import extract_xlsx
|
||||
from .pptx_extractor import extract_pptx
|
||||
|
||||
# Dispatch table: lowercase, dot-prefixed file extension -> extractor callable.
# Each extractor is called with the file path as its only argument.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str, extension: str) -> str:
    """Dispatch *file_path* to the extractor registered for *extension*.

    The extension is lowercased before it is looked up in ``EXTRACTORS``,
    so matching is case-insensitive. The registered keys include the
    leading dot (for example ``".pdf"``).

    Args:
        file_path: Path passed unchanged to the selected extractor.
        extension: File extension used as the lookup key.

    Returns:
        The value returned by the selected extractor for ``file_path``.

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    ext = extension.lower()
    handler = EXTRACTORS.get(ext)
    if handler is not None:
        return handler(file_path)
    raise ValueError(f"Unsupported file extension: {ext}")
|
||||
Reference in New Issue
Block a user