feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2
document-crawler/classifiers/__init__.py
Normal file
2
document-crawler/classifiers/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .llm_classifier import classify_document
|
||||
from .keyword_fallback import keyword_classify
|
||||
80
document-crawler/classifiers/keyword_fallback.py
Normal file
80
document-crawler/classifiers/keyword_fallback.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Heuristic keyword-based classification fallback."""
|
||||
|
||||
# Keyword patterns per category — order matters (first match wins on tie).
# All keywords are lowercase ASCII transliterations (ae/oe/ue/ss); the input
# is normalized the same way in keyword_classify() before matching.
KEYWORD_MAP: list[tuple[str, list[str]]] = [
    ("VVT", [
        "verarbeitungsverzeichnis", "verzeichnis von verarbeitungstaetigkeiten",
        "verarbeitungstaetigkeit", "art. 30", "art 30", "zweck der verarbeitung",
        "kategorie betroffener personen", "datenkategorien",
    ]),
    ("TOM", [
        "technisch-organisatorische massnahmen", "technische und organisatorische",
        "art. 32", "art 32", "zutrittskontrolle", "zugangskontrolle",
        "zugriffskontrolle", "verschluesselungskonzept", "pseudonymisierung",
    ]),
    ("DSE", [
        "datenschutzerklaerung", "datenschutzhinweise", "privacy policy",
        "informationspflichten", "art. 13", "art. 14", "art 13", "art 14",
        "betroffenenrechte", "verantwortlicher im sinne",
    ]),
    ("AVV", [
        "auftragsverarbeitung", "auftragsverarbeitungsvertrag",
        "art. 28", "art 28", "weisungsgebundenheit", "unterauftragnehmer",
        "subunternehmer",
    ]),
    ("DSFA", [
        "datenschutz-folgenabschaetzung", "folgenabschaetzung",
        "art. 35", "art 35", "risikoanalyse", "hohes risiko",
        "systematische beschreibung",
    ]),
    ("Loeschkonzept", [
        "loeschkonzept", "loeschfristen", "aufbewahrungsfrist",
        "loeschung personenbezogener", "speicherdauer", "vorhaltefrist",
    ]),
    ("Einwilligung", [
        "einwilligung", "einwilligungserklaerung", "consent",
        "freiwillige zustimmung", "widerruf der einwilligung",
    ]),
    ("Vertrag", [
        "vertrag", "vereinbarung", "vertragspartner",
        "leistungsbeschreibung", "vertragsgegenstand",
    ]),
    ("Richtlinie", [
        "richtlinie", "policy", "datenschutzrichtlinie", "leitlinie",
        "verhaltensregeln", "organisationsanweisung",
    ]),
    ("Schulungsnachweis", [
        "schulungsnachweis", "schulung", "training", "datenschutzschulung",
        "teilnahmebestaetigung", "fortbildung datenschutz",
    ]),
]

# Upper bound for any confidence reported by the keyword heuristic.
MAX_KEYWORD_CONFIDENCE = 0.3

# Maps German umlauts / sharp s onto the ASCII spelling used in KEYWORD_MAP.
_UMLAUT_TRANSLITERATION = str.maketrans(
    {"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"}
)


def _normalize_for_matching(value: str) -> str:
    """Return *value* lower-cased with umlauts/sharp s transliterated to ASCII."""
    import unicodedata

    # NFC first: decomposed input (e.g. "o" + combining diaeresis, as some
    # filesystems store filenames) must become a single code point before
    # the translation table can see it.
    composed = unicodedata.normalize("NFC", value)
    return composed.lower().translate(_UMLAUT_TRANSLITERATION)


def keyword_classify(text: str, filename: str) -> dict:
    """Classify a document by counting keyword hits per category.

    The filename and text are joined, lower-cased and transliterated
    (ä→ae, ö→oe, ü→ue, ß→ss) so that documents written with real umlauts
    match the ASCII keywords in KEYWORD_MAP. Matching is by substring.

    Args:
        text: Extracted document text; ``None`` is treated as empty.
        filename: Name of the document; ``None`` is treated as empty.

    Returns:
        Dict with keys ``classification``, ``confidence`` and ``reasoning``.
        The category with the most matching keywords wins; on a tie the
        category listed first in KEYWORD_MAP wins. Confidence is 0.1 per
        hit, capped at MAX_KEYWORD_CONFIDENCE. Without any hit the result
        is "Sonstiges" with confidence 0.1.
    """
    combined = _normalize_for_matching((filename or "") + " " + (text or ""))

    best_category = "Sonstiges"
    best_score = 0

    for category, keywords in KEYWORD_MAP:
        score = sum(1 for kw in keywords if kw in combined)
        # Strict ">" keeps the earlier category when scores are equal.
        if score > best_score:
            best_score = score
            best_category = category

    if best_score == 0:
        return {
            "classification": "Sonstiges",
            "confidence": 0.1,
            "reasoning": "Keine Schluesselwoerter gefunden (Keyword-Fallback)",
        }

    confidence = min(best_score * 0.1, MAX_KEYWORD_CONFIDENCE)
    return {
        "classification": best_category,
        "confidence": confidence,
        "reasoning": f"Keyword-Fallback: {best_score} Treffer fuer {best_category}",
    }
|
||||
73
document-crawler/classifiers/llm_classifier.py
Normal file
73
document-crawler/classifiers/llm_classifier.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""LLM-based document classification via ai-compliance-sdk."""
|
||||
|
||||
import json
|
||||
import httpx
|
||||
|
||||
from config import settings
|
||||
from .prompts import (
|
||||
CLASSIFICATION_SYSTEM_PROMPT,
|
||||
CLASSIFICATION_USER_PROMPT,
|
||||
VALID_CLASSIFICATIONS,
|
||||
)
|
||||
from .keyword_fallback import keyword_classify
|
||||
|
||||
|
||||
async def classify_document(
    text: str,
    filename: str,
    tenant_id: str,
    user_id: str = "system",
) -> dict:
    """Classify a document using the LLM gateway.

    Sends the filename and the first ``settings.LLM_TEXT_LIMIT`` characters
    of *text* to the gateway's chat endpoint and parses the JSON answer.

    Args:
        text: Extracted document text.
        filename: Name of the document, included in the prompt.
        tenant_id: Sent as the ``X-Tenant-ID`` header.
        user_id: Sent as the ``X-User-ID`` header.

    Returns:
        Dict with keys ``classification``, ``confidence``, ``reasoning``.
        Falls back to ``keyword_classify(text, filename)`` when the request
        fails, the gateway answers with a non-200 status, or the answer
        cannot be interpreted as a classification result.
    """
    truncated = text[: settings.LLM_TEXT_LIMIT]
    user_prompt = CLASSIFICATION_USER_PROMPT.format(
        filename=filename, text=truncated
    )

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                f"{settings.LLM_GATEWAY_URL}/sdk/v1/llm/chat",
                json={
                    "messages": [
                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 300,
                },
                headers={
                    "X-Tenant-ID": tenant_id,
                    "X-User-ID": user_id,
                    "Content-Type": "application/json",
                },
            )

        if resp.status_code != 200:
            return keyword_classify(text, filename)

        # resp.json() raises a ValueError subclass on a non-JSON body, and
        # the helpers raise ValueError for every malformed-payload case, so
        # one handler covers all "LLM answer unusable" situations.
        return _parse_llm_result(resp.json())

    except (httpx.RequestError, ValueError):
        return keyword_classify(text, filename)


def _extract_llm_content(data: object) -> str:
    """Return the assistant text from the gateway payload or raise ValueError.

    Checks, in order: ``content``, ``message.content`` and
    ``choices[0].message.content``. The first non-blank string wins.
    """
    if not isinstance(data, dict):
        raise ValueError("LLM response is not a JSON object")

    candidates = [data.get("content")]

    message = data.get("message")
    if isinstance(message, dict):
        candidates.append(message.get("content"))

    choices = data.get("choices")
    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
        choice_message = choices[0].get("message")
        if isinstance(choice_message, dict):
            candidates.append(choice_message.get("content"))

    for candidate in candidates:
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    raise ValueError("LLM response contains no assistant content")


def _coerce_confidence(raw: object, default: float = 0.5) -> float:
    """Convert *raw* to a float in [0.0, 1.0]; use *default* if not numeric."""
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return default
    if value != value:  # NaN compares unequal to itself and defeats min/max
        return default
    return min(max(value, 0.0), 1.0)


def _parse_llm_result(data: object) -> dict:
    """Turn the gateway payload into a classification dict or raise ValueError."""
    content = _extract_llm_content(data)

    try:
        result = json.loads(content)
    except ValueError:
        # Models sometimes wrap the JSON object in Markdown fences or prose;
        # retry on the outermost {...} span before giving up.
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end <= start:
            raise
        result = json.loads(content[start : end + 1])

    if not isinstance(result, dict):
        raise ValueError("LLM answer is not a JSON object")

    classification = result.get("classification", "Sonstiges")
    if (
        not isinstance(classification, str)
        or classification not in VALID_CLASSIFICATIONS
    ):
        classification = "Sonstiges"

    reasoning = result.get("reasoning", "")

    return {
        "classification": classification,
        "confidence": _coerce_confidence(result.get("confidence", 0.5)),
        "reasoning": "" if reasoning is None else str(reasoning),
    }
|
||||
33
document-crawler/classifiers/prompts.py
Normal file
33
document-crawler/classifiers/prompts.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""Classification prompt templates for LLM-based document classification."""
|
||||
|
||||
# System prompt (German). Lists the eleven category labels with a short
# description each and instructs the model to answer only with a JSON object
# containing "classification", "confidence" and "reasoning".
# This string is sent as-is (it is not passed through str.format), so the
# literal braces in the JSON example need no escaping.
CLASSIFICATION_SYSTEM_PROMPT = """Du bist ein Experte fuer DSGVO-Compliance-Dokumentation.
Deine Aufgabe ist es, Dokumente anhand ihres Inhalts in eine der folgenden Kategorien einzuordnen.

Kategorien:
- VVT: Verzeichnis von Verarbeitungstaetigkeiten (Art. 30 DSGVO)
- TOM: Technisch-organisatorische Massnahmen (Art. 32 DSGVO)
- DSE: Datenschutzerklaerung (Art. 13/14 DSGVO)
- AVV: Auftragsverarbeitungsvertrag (Art. 28 DSGVO)
- DSFA: Datenschutz-Folgenabschaetzung (Art. 35 DSGVO)
- Loeschkonzept: Loeschfristen und Loeschregeln
- Einwilligung: Einwilligungserklaerungen und Consent-Formulare
- Vertrag: Vertraege mit Datenschutzbezug
- Richtlinie: Interne Datenschutz-Richtlinien und Policies
- Schulungsnachweis: Datenschutz-Schulungen und Nachweise
- Sonstiges: Dokument mit anderem oder unklarem Inhalt

Antworte AUSSCHLIESSLICH im folgenden JSON-Format:
{"classification": "<KATEGORIE>", "confidence": <0.0-1.0>, "reasoning": "<kurze Begruendung>"}"""

# User prompt template (German). Filled via str.format with the keyword
# arguments `filename` and `text`; any additional literal braces added here
# would have to be doubled.
CLASSIFICATION_USER_PROMPT = """Klassifiziere das folgende Dokument:

Dateiname: {filename}

Textinhalt (Auszug):
{text}"""

# Category labels accepted from the LLM; same labels as listed in the system
# prompt above. classify_document() replaces any other label with "Sonstiges".
VALID_CLASSIFICATIONS = [
    "VVT", "TOM", "DSE", "AVV", "DSFA",
    "Loeschkonzept", "Einwilligung", "Vertrag",
    "Richtlinie", "Schulungsnachweis", "Sonstiges",
]
|
||||
Reference in New Issue
Block a user