feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
22
document-crawler/extractors/pptx_extractor.py
Normal file
22
document-crawler/extractors/pptx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""PPTX text extraction using python-pptx."""
|
||||
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
def _iter_shape_texts(shape):
    """Yield every stripped, non-empty paragraph text found in *shape*.

    Handles three kinds of shapes:

    * group shapes (anything exposing a ``shapes`` collection) are walked
      recursively so nested text is not lost,
    * shapes with a text frame contribute one entry per non-empty paragraph,
    * graphic frames holding a table contribute the paragraphs of each cell,
      row by row.
    """
    # Group shapes carry no text of their own; their content lives in the
    # child shapes, so descend instead of inspecting the group itself.
    children = getattr(shape, "shapes", None)
    if children is not None:
        for child in children:
            yield from _iter_shape_texts(child)
        return

    if getattr(shape, "has_text_frame", False):
        for para in shape.text_frame.paragraphs:
            text = para.text.strip()
            if text:
                yield text

    # Tables sit inside graphic frames, which report has_text_frame == False,
    # so they need to be read explicitly through the table's cells.
    if getattr(shape, "has_table", False):
        for row in shape.table.rows:
            for cell in row.cells:
                for para in cell.text_frame.paragraphs:
                    text = para.text.strip()
                    if text:
                        yield text


def extract_pptx(file_path: str) -> str:
    """Extract text from a PPTX file.

    Args:
        file_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        One section per slide that contains text, joined by blank lines.
        Each section starts with a ``[Folie N]`` header (N is the 1-based
        position of the slide in the deck) followed by one line per
        non-empty paragraph. Text in plain text shapes, in table cells and
        in grouped shapes is included. Slides without any text are omitted,
        so the slide numbers in the output may have gaps. An empty string is
        returned when no slide contains text.
    """
    prs = Presentation(file_path)
    slides = []

    for i, slide in enumerate(prs.slides, 1):
        texts = [
            text
            for shape in slide.shapes
            for text in _iter_shape_texts(shape)
        ]
        if texts:
            # The header is part of the emitted text and is kept verbatim.
            slides.append(f"[Folie {i}]\n" + "\n".join(texts))

    return "\n\n".join(slides)
|
||||
Reference in New Issue
Block a user