feat: Multi-URL Document Check with full checklist visibility
New "Dokumenten-Pruefung" tab in Compliance Agent: - User adds multiple URLs with document type (DSI, AGB, Impressum, Cookie, Widerruf) - Each document loaded via Playwright, accordions expanded, text extracted - Checked against type-specific legal checklist - Optional: Cookie banner check via checkbox Checklisten-UX (solves "100% looks like nothing was checked"): - All checks shown per document: green checkmark + matched text excerpt - Red X for missing fields with legal reference - Builds user trust: "9 Punkte geprueft, alle bestanden" - Expandable per document with completeness bar New checklists: - Impressum: §5 TMG (6 fields: name, address, contact, register, VAT, representative) - Cookie-Richtlinie: §25 TDDDG (5 fields: types, purposes, retention, third-party, opt-out) Backend: - POST /agent/doc-check — async with polling (same pattern as /scan) - DocCheckResult includes checks[] with passed/failed + matched_text - dsi_document_checker returns all_checks in SCORE finding - Email report shows per-document checklist Files: agent_doc_check_routes.py (280 LOC), DocCheckTab.tsx (248 LOC), ChecklistView.tsx (130 LOC), dsi_document_checker.py (+70 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -163,6 +163,36 @@ AGB_CHECKLIST = [
|
||||
"patterns": [r"gerichtsstand", r"anwendbares\s+recht", r"jurisdiction", r"governing\s+law"]},
|
||||
]
|
||||
|
||||
# §5 DDG (formerly §5 TMG, superseded in May 2024) / §18 MStV Impressum requirements
|
||||
# One entry per required Impressum field. check_document_completeness() runs
# re.search() for each pattern against `text_lower`, so every pattern is
# written in lower case; the first pattern that matches makes the field pass.
# Entries carry no "severity" key, so the checker's "MEDIUM" default applies.
IMPRESSUM_CHECKLIST = [
    {"id": "name", "label": "Name des Anbieters",
     # Legal-form abbreviations are anchored with word boundaries: a bare
     # "ag"/"ug"/"kg"/"inc" also occurs inside ordinary words ("anfrage",
     # "zug", "include"), which made this field pass on nearly any text.
     # "e.v." is a separate pattern because a trailing "." cannot be
     # followed by \b when whitespace comes next.
     "patterns": [r"\b(?:gmbh|ag|ohg|kg|gbr|ug|mbh|inc|ltd)\b", r"\be\.\s?v\.",
                  r"firma", r"unternehmen"]},
    {"id": "address", "label": "Anschrift",
     # "straße" (with sharp s) is the regular German spelling and must be
     # recognised alongside "strasse" and "str.".
     "patterns": [r"(?:str(?:asse|aße|\.)|weg|platz|allee)\s*\d", r"d-\d{5}", r"\d{5}\s+\w+"]},
    {"id": "contact", "label": "Kontaktdaten (E-Mail + Telefon)",
     "patterns": [r"(?:e-?mail|mail).*@", r"telefon|phone|tel\.", r"\+?\d[\d\s/\-]{8,}"]},
    {"id": "register", "label": "Handelsregister / Registernummer",
     # "hrb"/"hra" must start a word and must not be followed by a letter,
     # otherwise "lehrbuch" or "fahrausweis" count as a register entry.
     # A directly attached number ("hrb12345") is still accepted.
     "patterns": [r"(?:handelsregister|\bhr[ab](?![a-zäöüß])|registergericht|amtsgericht)",
                  r"register.*(?:nr|nummer)"]},
    {"id": "vat", "label": "USt-IdNr.",
     # The former "ust.*id" / "vat.*id" matched unrelated words on one line
     # ("august ... widerruf", "privat ... identität"). The abbreviations are
     # now anchored at a word start and tied to the following "id".
     "patterns": [r"\bust\.?[\s\-]*id", r"umsatzsteuer.*identifikation",
                  r"umsatzsteuer[\s\-]*id", r"\bvat\b.*\bid", r"\bvatid",
                  r"\bde\s*\d{9}\b"]},
    {"id": "representative", "label": "Vertretungsberechtigte",
     # Accept both the umlaut spelling and its ASCII transliteration.
     "patterns": [r"vertretungsberechtigt", r"gesch(?:ä|ae)ftsf(?:ü|ue)hr", r"vorstand", r"inhaber"]},
]
|
||||
|
||||
# §25 TDDDG Cookie policy requirements
|
||||
# One entry per item a cookie policy is expected to cover.
# check_document_completeness() runs re.search() for each pattern against
# `text_lower`, so every pattern is written in lower case; the first pattern
# that matches makes the item pass. Entries carry no "severity" key, so the
# checker's "MEDIUM" default applies.
#
# German keywords accept both the umlaut spelling and the ASCII
# transliteration ("wofür"/"wofuer"): real page text normally uses umlauts,
# which the transliteration-only patterns never matched.
COOKIE_CHECKLIST = [
    {"id": "cookie_types", "label": "Arten der Cookies",
     # "essenziell" is the current spelling, "essentiell" the older one.
     "patterns": [r"(?:notwendig|essen[tz]iell|funktional|statistik|marketing|tracking)",
                  r"cookie.*(?:art|typ|kategori)"]},
    {"id": "purposes", "label": "Zwecke der Cookies",
     "patterns": [r"zweck.*cookie", r"cookie.*zweck", r"(?:wof(?:ü|ue)r|wozu|warum).*cookie"]},
    {"id": "retention", "label": "Speicherdauer der Cookies",
     "patterns": [r"(?:speicherdauer|laufzeit|g(?:ü|ue)ltigk|ablauf).*cookie",
                  r"cookie.*(?:\d+\s+(?:tag|monat|jahr)|session)"]},
    {"id": "third_party", "label": "Drittanbieter-Cookies",
     "patterns": [r"drittanbieter", r"third.?party", r"(?:google|facebook|meta|microsoft).*cookie"]},
    {"id": "opt_out", "label": "Widerspruchsmoeglichkeit",
     "patterns": [r"(?:widerspruch|opt.?out|ablehnen|deaktivieren).*cookie",
                  r"cookie.*(?:ablehnen|deaktivieren|l(?:ö|oe)schen)"]},
]
|
||||
|
||||
|
||||
def check_document_completeness(
|
||||
text: str,
|
||||
@@ -215,15 +245,36 @@ def check_document_completeness(
|
||||
elif doc_type in ("agb", "terms", "nutzungsbedingungen"):
|
||||
checklist = AGB_CHECKLIST
|
||||
label = "§305ff BGB"
|
||||
elif doc_type in ("impressum", "imprint"):
|
||||
checklist = IMPRESSUM_CHECKLIST
|
||||
label = "§5 TMG / §18 MStV"
|
||||
elif doc_type in ("cookie",):
|
||||
checklist = COOKIE_CHECKLIST
|
||||
label = "§25 TDDDG"
|
||||
else:
|
||||
checklist = ART13_CHECKLIST # Default: check as DSE
|
||||
label = "Art. 13 DSGVO"
|
||||
|
||||
present = 0
|
||||
total = len(checklist)
|
||||
all_checks: list[dict] = []
|
||||
|
||||
for check in checklist:
|
||||
found = any(re.search(p, text_lower) for p in check["patterns"])
|
||||
if not found:
|
||||
match = None
|
||||
for p in check["patterns"]:
|
||||
m = re.search(p, text_lower)
|
||||
if m:
|
||||
match = m
|
||||
break
|
||||
|
||||
passed = match is not None
|
||||
matched_text = ""
|
||||
if match:
|
||||
start = max(0, match.start() - 30)
|
||||
end = min(len(text_lower), match.end() + 30)
|
||||
matched_text = text_lower[start:end].strip()
|
||||
present += 1
|
||||
else:
|
||||
findings.append({
|
||||
"code": f"DSI-MISSING-{check['id'].upper()}",
|
||||
"severity": check.get("severity", "MEDIUM"),
|
||||
@@ -236,8 +287,14 @@ def check_document_completeness(
|
||||
"doc_type": doc_type,
|
||||
"check_id": check["id"],
|
||||
})
|
||||
else:
|
||||
present += 1
|
||||
|
||||
all_checks.append({
|
||||
"id": check["id"],
|
||||
"label": check["label"],
|
||||
"passed": passed,
|
||||
"severity": check.get("severity", "MEDIUM"),
|
||||
"matched_text": matched_text,
|
||||
})
|
||||
|
||||
# Always add summary finding (even at 100% — needed for completeness tracking)
|
||||
if total > 0:
|
||||
@@ -252,6 +309,7 @@ def check_document_completeness(
|
||||
"doc_title": doc_title,
|
||||
"doc_url": doc_url,
|
||||
"doc_type": doc_type,
|
||||
"all_checks": all_checks,
|
||||
})
|
||||
|
||||
return findings
|
||||
|
||||
Reference in New Issue
Block a user