feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions
@@ -0,0 +1,2 @@
+from .llm_classifier import classify_document
+from .keyword_fallback import keyword_classify
@@ -0,0 +1,80 @@
+"""Heuristic keyword-based classification fallback."""
+
+# Keyword patterns per category.
+#
+# Each entry is (category label, substrings to look for). All keywords are
+# lowercase and use the ASCII transliteration of German umlauts ("ae", "oe",
+# "ue", "ss"); keyword_classify() folds the input the same way before matching.
+# Order matters: on a tie the category listed first wins, because a later
+# category only replaces the current best with a strictly higher score.
+KEYWORD_MAP: list[tuple[str, list[str]]] = [
+    ("VVT", [
+        "verarbeitungsverzeichnis", "verzeichnis von verarbeitungstaetigkeiten",
+        "verarbeitungstaetigkeit", "art. 30", "art 30", "zweck der verarbeitung",
+        "kategorie betroffener personen", "datenkategorien",
+    ]),
+    ("TOM", [
+        "technisch-organisatorische massnahmen", "technische und organisatorische",
+        "art. 32", "art 32", "zutrittskontrolle", "zugangskontrolle",
+        "zugriffskontrolle", "verschluesselungskonzept", "pseudonymisierung",
+    ]),
+    ("DSE", [
+        "datenschutzerklaerung", "datenschutzhinweise", "privacy policy",
+        "informationspflichten", "art. 13", "art. 14", "art 13", "art 14",
+        "betroffenenrechte", "verantwortlicher im sinne",
+    ]),
+    ("AVV", [
+        "auftragsverarbeitung", "auftragsverarbeitungsvertrag",
+        "art. 28", "art 28", "weisungsgebundenheit", "unterauftragnehmer",
+        "subunternehmer",
+    ]),
+    ("DSFA", [
+        "datenschutz-folgenabschaetzung", "folgenabschaetzung",
+        "art. 35", "art 35", "risikoanalyse", "hohes risiko",
+        "systematische beschreibung",
+    ]),
+    ("Loeschkonzept", [
+        "loeschkonzept", "loeschfristen", "aufbewahrungsfrist",
+        "loeschung personenbezogener", "speicherdauer", "vorhaltefrist",
+    ]),
+    ("Einwilligung", [
+        "einwilligung", "einwilligungserklaerung", "consent",
+        "freiwillige zustimmung", "widerruf der einwilligung",
+    ]),
+    ("Vertrag", [
+        "vertrag", "vereinbarung", "vertragspartner",
+        "leistungsbeschreibung", "vertragsgegenstand",
+    ]),
+    ("Richtlinie", [
+        "richtlinie", "policy", "datenschutzrichtlinie", "leitlinie",
+        "verhaltensregeln", "organisationsanweisung",
+    ]),
+    ("Schulungsnachweis", [
+        "schulungsnachweis", "schulung", "training", "datenschutzschulung",
+        "teilnahmebestaetigung", "fortbildung datenschutz",
+    ]),
+]
+
+# Upper bound for the confidence reported by the keyword heuristic.
+MAX_KEYWORD_CONFIDENCE = 0.3
+
+# Maps real umlauts / sharp s to the transliteration used in KEYWORD_MAP so
+# that e.g. "Löschkonzept" and "Maßnahmen" match "loeschkonzept"/"massnahmen".
+_UMLAUT_TRANSLATION = str.maketrans(
+    {"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"}
+)
+
+
+def keyword_classify(text: str, filename: str) -> dict:
+    """Classify a document by counting keyword hits per category.
+
+    The filename and text are joined, lower-cased and umlaut-folded
+    (ä->ae, ö->oe, ü->ue, ß->ss); every keyword of KEYWORD_MAP that occurs
+    as a substring counts as one hit for its category.
+
+    Args:
+        text: Extracted document text. Empty or None is treated as "".
+        filename: Name of the document file. Empty or None is treated as "".
+
+    Returns:
+        Dict with the keys ``classification`` (category label, "Sonstiges"
+        when nothing matched), ``confidence`` (0.1 without hits, otherwise
+        0.1 per hit capped at MAX_KEYWORD_CONFIDENCE) and ``reasoning``.
+    """
+    haystack = (
+        f"{filename or ''} {text or ''}".lower().translate(_UMLAUT_TRANSLATION)
+    )
+
+    best_category = "Sonstiges"
+    best_score = 0
+
+    for category, keywords in KEYWORD_MAP:
+        score = sum(1 for kw in keywords if kw in haystack)
+        # Strict comparison keeps the earlier category on a tie.
+        if score > best_score:
+            best_score = score
+            best_category = category
+
+    if best_score == 0:
+        return {
+            "classification": "Sonstiges",
+            "confidence": 0.1,
+            "reasoning": "Keine Schluesselwoerter gefunden (Keyword-Fallback)",
+        }
+
+    confidence = min(best_score * 0.1, MAX_KEYWORD_CONFIDENCE)
+    return {
+        "classification": best_category,
+        "confidence": confidence,
+        "reasoning": f"Keyword-Fallback: {best_score} Treffer fuer {best_category}",
+    }
@@ -0,0 +1,73 @@
+"""LLM-based document classification via ai-compliance-sdk."""
+
+import json
+import httpx
+
+from config import settings
+from .prompts import (
+    CLASSIFICATION_SYSTEM_PROMPT,
+    CLASSIFICATION_USER_PROMPT,
+    VALID_CLASSIFICATIONS,
+)
+from .keyword_fallback import keyword_classify
+
+
+def _extract_message_content(data: object) -> str:
+    """Return the assistant text from a gateway response, or "" if absent.
+
+    Looks, in order, at top-level ``content``, ``message.content`` and
+    ``choices[0].message.content`` and returns the first non-empty string.
+    """
+    if not isinstance(data, dict):
+        return ""
+
+    candidates = [data.get("content")]
+
+    message = data.get("message")
+    if isinstance(message, dict):
+        candidates.append(message.get("content"))
+
+    choices = data.get("choices")
+    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
+        choice_message = choices[0].get("message")
+        if isinstance(choice_message, dict):
+            candidates.append(choice_message.get("content"))
+
+    return next((c for c in candidates if isinstance(c, str) and c), "")
+
+
+def _parse_llm_result(content: str) -> dict:
+    """Parse the model reply into a dict; raise ValueError if that fails.
+
+    Only the span from the first "{" to the last "}" is parsed, so replies
+    wrapped in Markdown code fences or surrounded by prose are still accepted.
+    """
+    start = content.find("{")
+    end = content.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError("LLM reply contains no JSON object")
+
+    result = json.loads(content[start : end + 1])
+    if not isinstance(result, dict):
+        raise ValueError("LLM reply is not a JSON object")
+    return result
+
+
+def _clamp_confidence(raw: object) -> float:
+    """Convert *raw* to a float in [0.0, 1.0]; use 0.5 when it is unusable."""
+    try:
+        value = float(raw)
+    except (TypeError, ValueError):
+        return 0.5
+    if value != value:  # NaN compares unequal to itself
+        return 0.5
+    return min(max(value, 0.0), 1.0)
+
+
+async def classify_document(
+    text: str,
+    filename: str,
+    tenant_id: str,
+    user_id: str = "system",
+) -> dict:
+    """Classify a document using the LLM gateway.
+
+    Args:
+        text: Extracted document text; only the first
+            ``settings.LLM_TEXT_LIMIT`` characters are sent to the model.
+        filename: Name of the document file, included in the prompt.
+        tenant_id: Sent to the gateway as the ``X-Tenant-ID`` header.
+        user_id: Sent to the gateway as the ``X-User-ID`` header.
+
+    Returns:
+        Dict with keys: classification, confidence, reasoning.
+        Labels outside VALID_CLASSIFICATIONS become "Sonstiges"; confidence
+        is clamped to [0.0, 1.0]. Falls back to the keyword heuristic if the
+        request fails, the gateway does not answer with HTTP 200, or the
+        reply cannot be parsed as a JSON object.
+    """
+    truncated = text[: settings.LLM_TEXT_LIMIT]
+    user_prompt = CLASSIFICATION_USER_PROMPT.format(
+        filename=filename, text=truncated
+    )
+
+    try:
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            resp = await client.post(
+                f"{settings.LLM_GATEWAY_URL}/sdk/v1/llm/chat",
+                json={
+                    "messages": [
+                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    "temperature": 0.1,
+                    "max_tokens": 300,
+                },
+                headers={
+                    "X-Tenant-ID": tenant_id,
+                    "X-User-ID": user_id,
+                    "Content-Type": "application/json",
+                },
+            )
+
+            if resp.status_code != 200:
+                return keyword_classify(text, filename)
+
+            content = _extract_message_content(resp.json())
+            result = _parse_llm_result(content)
+
+    # ValueError also covers json.JSONDecodeError (its subclass), raised by
+    # resp.json() or json.loads() on a malformed body.
+    except (httpx.RequestError, ValueError, TypeError, KeyError, IndexError):
+        return keyword_classify(text, filename)
+
+    classification = result.get("classification", "Sonstiges")
+    if classification not in VALID_CLASSIFICATIONS:
+        classification = "Sonstiges"
+
+    return {
+        "classification": classification,
+        "confidence": _clamp_confidence(result.get("confidence", 0.5)),
+        "reasoning": result.get("reasoning", ""),
+    }
@@ -0,0 +1,33 @@
+"""Classification prompt templates for LLM-based document classification."""
+
+# System prompt (German): lists the allowed category labels and tells the model
+# to answer exclusively with a JSON object. The JSON example at the end contains
+# literal, unescaped braces, so this string must be sent as-is and must not be
+# passed through str.format().
+CLASSIFICATION_SYSTEM_PROMPT = """Du bist ein Experte fuer DSGVO-Compliance-Dokumentation.
+Deine Aufgabe ist es, Dokumente anhand ihres Inhalts in eine der folgenden Kategorien einzuordnen.
+
+Kategorien:
+- VVT: Verzeichnis von Verarbeitungstaetigkeiten (Art. 30 DSGVO)
+- TOM: Technisch-organisatorische Massnahmen (Art. 32 DSGVO)
+- DSE: Datenschutzerklaerung (Art. 13/14 DSGVO)
+- AVV: Auftragsverarbeitungsvertrag (Art. 28 DSGVO)
+- DSFA: Datenschutz-Folgenabschaetzung (Art. 35 DSGVO)
+- Loeschkonzept: Loeschfristen und Loeschregeln
+- Einwilligung: Einwilligungserklaerungen und Consent-Formulare
+- Vertrag: Vertraege mit Datenschutzbezug
+- Richtlinie: Interne Datenschutz-Richtlinien und Policies
+- Schulungsnachweis: Datenschutz-Schulungen und Nachweise
+- Sonstiges: Dokument mit anderem oder unklarem Inhalt
+
+Antworte AUSSCHLIESSLICH im folgenden JSON-Format:
+{"classification": "<KATEGORIE>", "confidence": <0.0-1.0>, "reasoning": "<kurze Begruendung>"}"""
+
+# User prompt template. The {filename} and {text} placeholders are filled in
+# with str.format() by classify_document().
+CLASSIFICATION_USER_PROMPT = """Klassifiziere das folgende Dokument:
+
+Dateiname: {filename}
+
+Textinhalt (Auszug):
+{text}"""
+
+# Labels accepted from the model; classify_document() maps any other value to
+# "Sonstiges". Keep this list in sync with the categories named in
+# CLASSIFICATION_SYSTEM_PROMPT.
+VALID_CLASSIFICATIONS = [
+    "VVT", "TOM", "DSE", "AVV", "DSFA",
+    "Loeschkonzept", "Einwilligung", "Vertrag",
+    "Richtlinie", "Schulungsnachweis", "Sonstiges",
+]