feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
from .analyzer import generate_gap_analysis
from .compliance_matrix import COMPLIANCE_MATRIX

View File

@@ -0,0 +1,59 @@
"""Gap detection logic — compares found documents against compliance matrix."""
import uuid
from .compliance_matrix import COMPLIANCE_MATRIX, RequiredDocument
def generate_gap_analysis(
classification_counts: dict[str, int],
company_profiles: list[str] | None = None,
) -> dict:
"""Analyze gaps between found documents and required compliance matrix.
Args:
classification_counts: e.g. {"VVT": 2, "TOM": 1, "DSE": 0}
company_profiles: list of applicable profiles.
Default: ["universal", "data_processor", "ai_user"]
Returns dict with compliance_score, gaps list, classification_breakdown.
"""
if company_profiles is None:
company_profiles = ["universal", "data_processor", "ai_user"]
applicable = [
req for req in COMPLIANCE_MATRIX
if req.applies_to in company_profiles
]
gaps = []
covered = 0
for req in applicable:
count = classification_counts.get(req.category, 0)
if count == 0:
gaps.append({
"id": str(uuid.uuid4()),
"category": req.category,
"description": req.description,
"severity": req.severity,
"regulation": req.regulation,
"requiredAction": f"{req.category} erstellen und dokumentieren",
"relatedStepId": None,
})
else:
covered += 1
total_required = len(applicable)
compliance_score = (covered / total_required * 100) if total_required > 0 else 0
return {
"compliance_score": round(compliance_score, 1),
"total_required": total_required,
"covered": covered,
"gaps": gaps,
"gap_summary": {
"critical": sum(1 for g in gaps if g["severity"] == "CRITICAL"),
"high": sum(1 for g in gaps if g["severity"] == "HIGH"),
"medium": sum(1 for g in gaps if g["severity"] == "MEDIUM"),
},
}

View File

@@ -0,0 +1,75 @@
"""Required documents per regulation and company type."""
from dataclasses import dataclass
@dataclass(frozen=True)
class RequiredDocument:
    """One document requirement tied to a regulation and a company profile.

    Instances are frozen so that entries of a shared, module-level
    requirement list cannot be modified by accident; as a side effect they
    are also hashable.
    """

    # Short category key of the required document, e.g. "VVT".
    category: str
    # Human-readable text describing the finding when the document is missing.
    description: str
    # Legal basis that demands the document, e.g. "Art. 30 DSGVO".
    regulation: str
    severity: str  # CRITICAL, HIGH, MEDIUM
    applies_to: str  # universal, data_processor, ai_user, large_company
# Required documents, listed in evaluation order. Each row below is
# (applies_to, category, severity, regulation, description) and is expanded
# into one RequiredDocument.
COMPLIANCE_MATRIX: list[RequiredDocument] = [
    RequiredDocument(
        category=category,
        description=description,
        regulation=regulation,
        severity=severity,
        applies_to=profile,
    )
    for profile, category, severity, regulation, description in (
        # Universal: applies to every company
        (
            "universal", "VVT", "CRITICAL", "Art. 30 DSGVO",
            "Verzeichnis von Verarbeitungstaetigkeiten fehlt",
        ),
        (
            "universal", "TOM", "CRITICAL", "Art. 32 DSGVO",
            "Technisch-organisatorische Massnahmen nicht dokumentiert",
        ),
        (
            "universal", "DSE", "CRITICAL", "Art. 13/14 DSGVO",
            "Datenschutzerklaerung fehlt oder unvollstaendig",
        ),
        (
            "universal", "Loeschkonzept", "HIGH",
            "Art. 17 DSGVO / Art. 5 Abs. 1e DSGVO",
            "Kein Loeschkonzept / keine Loeschfristen definiert",
        ),
        (
            "universal", "Richtlinie", "MEDIUM", "Art. 24 DSGVO",
            "Interne Datenschutzrichtlinie fehlt",
        ),
        (
            "universal", "Schulungsnachweis", "MEDIUM", "Art. 39 Abs. 1b DSGVO",
            "Keine Datenschutz-Schulungsnachweise vorhanden",
        ),
        # Data processors
        (
            "data_processor", "AVV", "CRITICAL", "Art. 28 DSGVO",
            "Auftragsverarbeitungsvertrag fehlt",
        ),
        # AI users
        (
            "ai_user", "DSFA", "HIGH", "Art. 35 DSGVO / EU AI Act",
            "Datenschutz-Folgenabschaetzung fuer KI-Systeme fehlt",
        ),
    )
]