feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions
@@ -0,0 +1,22 @@
+"""PPTX text extraction using python-pptx."""
+
+from pptx import Presentation
+
+
+def _iter_shape_texts(shapes):
+    """Yield stripped, non-empty paragraph texts from *shapes* in order.
+
+    Plain text frames are read directly. Table cells and the members of
+    grouped shapes are visited as well, because their text is not exposed
+    through ``has_text_frame`` on the containing shape.
+    """
+    for shape in shapes:
+        if shape.has_text_frame:
+            paragraphs = shape.text_frame.paragraphs
+        elif getattr(shape, "has_table", False):
+            # Table graphic frame: collect the paragraphs of every cell,
+            # row by row.
+            paragraphs = [
+                para
+                for row in shape.table.rows
+                for cell in row.cells
+                for para in cell.text_frame.paragraphs
+            ]
+        elif hasattr(shape, "shapes"):
+            # Group shape: its members are only reachable via ``.shapes``.
+            yield from _iter_shape_texts(shape.shapes)
+            continue
+        else:
+            # Pictures, charts, connectors etc. carry no paragraph text.
+            continue
+
+        for para in paragraphs:
+            text = para.text.strip()
+            if text:
+                yield text
+
+
+def extract_pptx(file_path: str) -> str:
+    """Extract text from a PPTX file.
+
+    Args:
+        file_path: Path of the presentation to open.
+
+    Returns:
+        One section per slide that contains text: a ``[Folie N]`` header
+        (1-based slide number) followed by the stripped text of each
+        non-empty paragraph, one per line. Sections are separated by a
+        blank line; slides without text are skipped, so the result may be
+        an empty string.
+
+    Errors raised by python-pptx while opening the file are not caught here.
+    """
+    prs = Presentation(file_path)
+    slides = []
+
+    for i, slide in enumerate(prs.slides, 1):
+        texts = list(_iter_shape_texts(slide.shapes))
+        if texts:
+            slides.append(f"[Folie {i}]\n" + "\n".join(texts))
+
+    return "\n\n".join(slides)