feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
document-crawler/extractors/__init__.py
Normal file
1
document-crawler/extractors/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .dispatcher import extract_text
|
||||
25
document-crawler/extractors/dispatcher.py
Normal file
25
document-crawler/extractors/dispatcher.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Routes files to the appropriate extractor by extension."""
|
||||
|
||||
from .pdf_extractor import extract_pdf
|
||||
from .docx_extractor import extract_docx
|
||||
from .xlsx_extractor import extract_xlsx
|
||||
from .pptx_extractor import extract_pptx
|
||||
|
||||
# Dispatch table: lowercased file extension (with leading dot) -> extractor
# callable taking a file path and returning the extracted plain text.
# extract_text() below looks extensions up here; unknown ones raise ValueError.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str, extension: str | None = None) -> str:
    """Extract text from a file based on its extension.

    Args:
        file_path: Path of the file to extract from.
        extension: File extension including the leading dot (e.g. ``".pdf"``),
            matched case-insensitively. If ``None``, the extension is derived
            from ``file_path`` itself, so callers may pass just a path.

    Returns:
        The extracted plain text.

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    if extension is None:
        # Local import keeps the module's dependency surface unchanged.
        import os

        extension = os.path.splitext(file_path)[1]
    ext = extension.lower()
    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(f"Unsupported file extension: {ext}")
    return extractor(file_path)
|
||||
18
document-crawler/extractors/docx_extractor.py
Normal file
18
document-crawler/extractors/docx_extractor.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""DOCX text extraction using python-docx."""
|
||||
|
||||
from docx import Document
|
||||
|
||||
|
||||
def extract_docx(file_path: str) -> str:
    """Extract text from a DOCX file."""
    document = Document(file_path)

    # Non-empty body paragraphs come first, in document order.
    chunks = [para.text for para in document.paragraphs if para.text.strip()]

    # Table content follows: one line per non-empty row, cells pipe-joined.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            row_text = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
            if row_text:
                chunks.append(" | ".join(row_text))

    return "\n\n".join(chunks)
|
||||
15
document-crawler/extractors/pdf_extractor.py
Normal file
15
document-crawler/extractors/pdf_extractor.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""PDF text extraction using PyMuPDF (fitz)."""
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
def extract_pdf(file_path: str) -> str:
    """Extract text from a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Text of all pages with non-blank content, pages separated by a
        blank line.
    """
    # Context manager guarantees the document handle is released even if
    # get_text() raises mid-iteration (the previous version leaked it then).
    with fitz.open(file_path) as doc:
        pages = []
        for page in doc:
            text = page.get_text()
            if text.strip():
                pages.append(text)
    return "\n\n".join(pages)
|
||||
22
document-crawler/extractors/pptx_extractor.py
Normal file
22
document-crawler/extractors/pptx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""PPTX text extraction using python-pptx."""
|
||||
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
def extract_pptx(file_path: str) -> str:
    """Extract text from a PPTX file."""
    deck = Presentation(file_path)
    rendered = []

    for number, slide in enumerate(deck.slides, 1):
        lines = []
        for shape in slide.shapes:
            # Skip shapes without a text frame (pictures, charts, ...).
            if not shape.has_text_frame:
                continue
            lines.extend(
                para.text.strip()
                for para in shape.text_frame.paragraphs
                if para.text.strip()
            )
        if lines:
            # "[Folie N]" ("slide" in German) labels each slide's text block.
            rendered.append(f"[Folie {number}]\n" + "\n".join(lines))

    return "\n\n".join(rendered)
|
||||
22
document-crawler/extractors/xlsx_extractor.py
Normal file
22
document-crawler/extractors/xlsx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""XLSX text extraction using openpyxl."""
|
||||
|
||||
from openpyxl import load_workbook
|
||||
|
||||
|
||||
def extract_xlsx(file_path: str) -> str:
    """Extract text from an XLSX file.

    Each non-empty sheet is rendered as a ``[sheet name]`` header followed
    by one line per non-empty row, cell values joined with ``" | "``.

    Args:
        file_path: Path to the XLSX workbook.

    Returns:
        Sheet blocks separated by blank lines.
    """
    # read_only streams rows (bounded memory); data_only yields cached
    # formula results rather than formula strings.
    wb = load_workbook(file_path, read_only=True, data_only=True)
    try:
        sheets = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = []
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) for c in row if c is not None]
                if cells:
                    rows.append(" | ".join(cells))
            if rows:
                sheets.append(f"[{sheet_name}]\n" + "\n".join(rows))
    finally:
        # read-only mode keeps the file handle open until close(); the
        # previous version leaked it when iteration raised.
        wb.close()
    return "\n\n".join(sheets)
|
||||
Reference in New Issue
Block a user