feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions
@@ -0,0 +1,25 @@
+"""Routes files to the appropriate extractor by extension."""
+
+from .pdf_extractor import extract_pdf
+from .docx_extractor import extract_docx
+from .xlsx_extractor import extract_xlsx
+from .pptx_extractor import extract_pptx
+
+EXTRACTORS = {  # lower-case, dot-prefixed extension -> extractor function
+    ".pdf": extract_pdf,
+    ".docx": extract_docx,
+    ".xlsx": extract_xlsx,
+    ".pptx": extract_pptx,
+}
+
+
+def extract_text(file_path: str, extension: str) -> str:
+    """Extract text from a file based on its extension.
+
+    Returns extracted text or raises ValueError for unsupported types.
+    """
+    ext = extension.lower()
+    extractor = EXTRACTORS.get(ext)
+    if extractor is None:
+        raise ValueError(f"Unsupported file extension: {ext}")
+    return extractor(file_path)