feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

@@ -0,0 +1 @@
from .dispatcher import extract_text

View File

@@ -0,0 +1,25 @@
"""Routes files to the appropriate extractor by extension."""
from .pdf_extractor import extract_pdf
from .docx_extractor import extract_docx
from .xlsx_extractor import extract_xlsx
from .pptx_extractor import extract_pptx
# Dispatch table mapping a lowercase file extension (including the leading
# dot) to the extractor function that handles that format.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}
def extract_text(file_path: str, extension: str) -> str:
    """Dispatch ``file_path`` to the extractor registered for ``extension``.

    The extension is lowercased before it is looked up in ``EXTRACTORS``.

    Args:
        file_path: Path of the file; handed unchanged to the extractor.
        extension: File extension used as the lookup key (e.g. ``".pdf"``).

    Returns:
        The text returned by the selected extractor.

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    key = extension.lower()
    handler = EXTRACTORS.get(key)
    if handler is not None:
        return handler(file_path)
    raise ValueError(f"Unsupported file extension: {key}")

View File

@@ -0,0 +1,18 @@
"""DOCX text extraction using python-docx."""
from docx import Document
def extract_docx(file_path: str) -> str:
    """Return the text of a DOCX file as blank-line-separated chunks.

    Non-blank body paragraphs come first, followed by one chunk per table
    row. Within a row, the stripped texts of the non-empty cells are joined
    with ``" | "``; rows without any non-empty cell are omitted.
    """
    document = Document(file_path)
    chunks = [para.text for para in document.paragraphs if para.text.strip()]

    # Table rows are appended after all body paragraphs.
    for table in document.tables:
        row_lines = (
            " | ".join(filter(None, (cell.text.strip() for cell in row.cells)))
            for row in table.rows
        )
        # An empty joined line means the row had no non-empty cells.
        chunks.extend(line for line in row_lines if line)

    return "\n\n".join(chunks)

View File

@@ -0,0 +1,15 @@
"""PDF text extraction using PyMuPDF (fitz)."""
import fitz
def extract_pdf(file_path: str) -> str:
    """Extract the text of a PDF file, one chunk per page.

    Pages whose text is empty or whitespace-only are skipped; the remaining
    page texts are joined with a blank line.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The concatenated page texts, or an empty string if no page
        yielded any text.
    """
    # Use the document as a context manager so it is closed even when
    # extracting a page raises; previously close() was skipped on error.
    with fitz.open(file_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "\n\n".join(text for text in page_texts if text.strip())

View File

@@ -0,0 +1,22 @@
"""PPTX text extraction using python-pptx."""
from pptx import Presentation
def extract_pptx(file_path: str) -> str:
    """Return the text of a PPTX file, grouped by slide.

    For every slide, the stripped, non-empty paragraph texts of all shapes
    that have a text frame are collected. Slides with at least one such line
    are emitted as a ``[Folie N]`` header (N is the 1-based slide number)
    followed by the lines; slide sections are separated by a blank line.
    """
    presentation = Presentation(file_path)
    sections = []
    for number, slide in enumerate(presentation.slides, start=1):
        lines = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            stripped = (para.text.strip() for para in shape.text_frame.paragraphs)
            lines.extend(line for line in stripped if line)
        if not lines:
            # Slides without any text produce no section at all.
            continue
        body = "\n".join(lines)
        sections.append(f"[Folie {number}]\n{body}")
    return "\n\n".join(sections)

View File

@@ -0,0 +1,22 @@
"""XLSX text extraction using openpyxl."""
from openpyxl import load_workbook
def extract_xlsx(file_path: str) -> str:
    """Extract the cell values of an XLSX workbook as text, grouped by sheet.

    The workbook is opened read-only with ``data_only=True``. Each row is
    rendered as its non-``None`` cell values converted with ``str`` and
    joined with ``" | "``; rows without values are skipped. Each worksheet
    that has at least one such row is emitted as a ``[sheet title]`` header
    followed by its rows, and sheets are separated by a blank line.

    Args:
        file_path: Path of the XLSX file to open.

    Returns:
        The concatenated sheet sections, or an empty string if no
        worksheet contained any value.
    """
    wb = load_workbook(file_path, read_only=True, data_only=True)
    sheets = []
    try:
        # Iterate wb.worksheets rather than wb.sheetnames: the latter also
        # lists chart sheets, which have no iter_rows() and would raise.
        for ws in wb.worksheets:
            rows = []
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) for c in row if c is not None]
                if cells:
                    rows.append(" | ".join(cells))
            if rows:
                sheets.append(f"[{ws.title}]\n" + "\n".join(rows))
    finally:
        # A read-only workbook keeps the file open until it is closed, so
        # close it even when reading a sheet fails.
        wb.close()
    return "\n\n".join(sheets)