b363c28539
Split dsi_document_checker.py (466 LOC) into doc_checks/ package (9 files). Two-pass L1→L2 logic: L1 checks "Is it mentioned?", L2 checks "Is it correct?" (e.g. controller has full address, specific Art. 6 lit., concrete time periods). 138 total checks (62 L1 + 76 L2) across 7 doc types: - DSE Art. 13: 31, Impressum §5 TMG: 16, Cookie §25 TDDDG: 15 - Widerruf §355: 15, AGB §305ff: 21, Social Media Art. 26: 20, DSFA Art. 35: 18 Frontend: hierarchical L1→L2 display with dual progress bars (green=completeness, blue=correctness). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
228 lines
8.6 KiB
Python
228 lines
8.6 KiB
Python
"""
Document check runner — two-pass L1/L2 logic.

Pass 1: Run all L1 checks ("Is it mentioned?")
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
"""
|
|
|
|
import logging
|
|
import re
|
|
|
|
from .dse_checks import ART13_CHECKLIST
|
|
from .widerruf_checks import WIDERRUF_CHECKLIST
|
|
from .agb_checks import AGB_CHECKLIST
|
|
from .impressum_checks import IMPRESSUM_CHECKLIST
|
|
from .cookie_checks import COOKIE_CHECKLIST
|
|
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
|
|
from .dsfa_checks import DSFA_CHECKLIST
|
|
|
|
logger = logging.getLogger(__name__)

# Registry of supported document types.  Maps a doc_type string (the values
# returned by classify_document_type, plus common aliases) to a
# (checklist, label) pair: the checks to run and the legal basis quoted in
# finding texts.  Unknown doc types fall back to the Art. 13 checklist in
# check_document_completeness.
_CHECKLIST_MAP = {
    # Privacy statements — Art. 13 DSGVO
    "dse": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "datenschutz": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "privacy": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    # Withdrawal / cancellation notices — §355 BGB
    "widerruf": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "withdrawal": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "cancellation": (WIDERRUF_CHECKLIST, "§355 BGB"),
    # Terms and conditions — §305ff BGB
    "agb": (AGB_CHECKLIST, "§305ff BGB"),
    "terms": (AGB_CHECKLIST, "§305ff BGB"),
    "nutzungsbedingungen": (AGB_CHECKLIST, "§305ff BGB"),
    # Imprint — §5 TMG / §18 MStV
    "impressum": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    "imprint": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    # Cookie policy — §25 TDDDG
    "cookie": (COOKIE_CHECKLIST, "§25 TDDDG"),
    # Social-media presences (joint controllership) — Art. 26 DSGVO
    "social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    "joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    # Data protection impact assessment — Art. 35 DSGVO
    "dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"),
}
|
|
|
|
|
|
def _match_patterns(patterns: list[str], text_lower: str):
|
|
"""Try each regex pattern against text, return first Match or None."""
|
|
for p in patterns:
|
|
m = re.search(p, text_lower)
|
|
if m:
|
|
return m
|
|
return None
|
|
|
|
|
|
def _extract_context(text_lower: str, match) -> str:
|
|
"""Extract ~30 chars around a match for evidence display."""
|
|
if not match:
|
|
return ""
|
|
start = max(0, match.start() - 30)
|
|
end = min(len(text_lower), match.end() + 30)
|
|
return text_lower[start:end].strip()
|
|
|
|
|
|
def check_document_completeness(
    text: str,
    doc_type: str,
    doc_title: str,
    doc_url: str,
) -> list[dict]:
    """Check a legal document against its type-specific requirements.

    Two-pass approach:
        L1 — Is the mandatory field mentioned at all?
        L2 — Is it correct/complete? (only checked if its L1 parent passed)

    Args:
        text: Full plain text of the document (may be empty or None).
        doc_type: Document type key (see _CHECKLIST_MAP); unknown types
            fall back to the Art. 13 DSGVO checklist.
        doc_title: Human-readable title, embedded in finding texts.
        doc_url: Source URL, carried through into each finding dict.

    Returns:
        A list of finding dicts.  Index 0 is the summary finding (carries
        ``all_checks``, ``completeness_pct``, ``correctness_pct``); it is
        followed by one finding per failed L1/L2 check.  Early returns for
        empty/short documents yield a single finding.
    """
    findings: list[dict] = []

    # Guard BEFORE touching text: the original lowered `text` first, which
    # raises AttributeError for None instead of reporting the empty doc.
    if not text or len(text) < 50:
        findings.append({
            "code": f"DSI-EMPTY-{doc_type.upper()}",
            "severity": "HIGH",
            "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.",
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
            "all_checks": [],  # consistent with the short-DSE early return below
        })
        return findings

    word_count = len(text.split())
    if word_count < 200 and doc_type == "dse":
        # Very short privacy blurbs are not standalone DSI documents;
        # report informationally instead of failing most Art. 13 checks.
        findings.append({
            "code": f"DSI-SCORE-{doc_type.upper()}",
            "severity": "LOW",
            "text": (
                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
            ),
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
            "all_checks": [],
        })
        return findings

    # Lowercase once, after the guards; all pattern matching is case-insensitive
    # by virtue of matching against this lowered copy.
    text_lower = text.lower()

    # Unknown doc types fall back to the Art. 13 checklist.
    checklist, label = _CHECKLIST_MAP.get(doc_type, (ART13_CHECKLIST, "Art. 13 DSGVO"))

    # Checks without an explicit "level" default to L1.
    l1_checks = [c for c in checklist if c.get("level", 1) == 1]
    l2_checks = [c for c in checklist if c.get("level", 1) == 2]

    # ── Pass 1: L1 checks ("Is it mentioned?") ───────────────────────
    passed_l1_ids: set[str] = set()
    all_checks: list[dict] = []
    l1_present = 0

    for check in l1_checks:
        match = _match_patterns(check["patterns"], text_lower)
        passed = match is not None
        if passed:
            passed_l1_ids.add(check["id"])
            l1_present += 1
        else:
            findings.append({
                "code": f"DSI-MISSING-{check['id'].upper()}",
                "severity": check.get("severity", "MEDIUM"),
                "text": (
                    f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. "
                    f"Erforderlich nach {label}."
                ),
                "doc_title": doc_title, "doc_url": doc_url,
                "doc_type": doc_type, "check_id": check["id"],
            })
        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": _extract_context(text_lower, match),
            "level": 1, "parent": None, "skipped": False,
        })

    # ── Pass 2: L2 checks ("Is it correct?"), gated on the L1 parent ─
    l2_total = 0
    l2_passed = 0

    for check in l2_checks:
        parent = check.get("parent")
        # No point testing correctness of a field that was never mentioned.
        skipped = parent not in passed_l1_ids
        passed = False
        matched_text = ""

        if not skipped:
            l2_total += 1
            match = _match_patterns(check["patterns"], text_lower)
            passed = match is not None
            if passed:
                l2_passed += 1
                matched_text = _extract_context(text_lower, match)
            else:
                findings.append({
                    "code": f"DSI-DETAIL-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Detailpruefung '{check['label']}' "
                        f"nicht bestanden. Empfohlen nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })

        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": matched_text,
            "level": 2, "parent": parent, "skipped": skipped,
        })

    # ── Summary finding (inserted at index 0) ────────────────────────
    l1_total = len(l1_checks)
    completeness_pct = round(l1_present / l1_total * 100) if l1_total else 0
    # Skipped L2 checks are excluded from the denominator.
    correctness_pct = round(l2_passed / l2_total * 100) if l2_total else 0

    severity = (
        "OK" if completeness_pct == 100
        else "LOW" if completeness_pct >= 80
        else "MEDIUM" if completeness_pct >= 50
        else "HIGH"
    )

    summary_text = (
        f"'{doc_title}': {l1_present}/{l1_total} Pflichtangaben vorhanden "
        f"({completeness_pct}%)."
    )
    if completeness_pct < 100:
        summary_text += f" Fehlend: {l1_total - l1_present} Angaben nach {label}."
    if l2_total > 0:
        summary_text += (
            f" Detailpruefung: {l2_passed}/{l2_total} bestanden "
            f"({correctness_pct}%)."
        )

    findings.insert(0, {
        "code": f"DSI-SCORE-{doc_type.upper()}",
        "severity": severity,
        "text": summary_text,
        "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        "all_checks": all_checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    })

    return findings
|
|
|
|
|
|
def classify_document_type(title: str, url: str) -> str:
    """Classify a document by its title/URL into a legal document type."""
    haystack = f"{title} {url}".lower()

    def seen(*keywords: str) -> bool:
        # True when any keyword occurs in the lowered title+URL haystack.
        return any(kw in haystack for kw in keywords)

    if seen("datenschutzfolge", "dsfa", "risikoanalyse für nutzung"):
        return "dsfa"
    # Social-media privacy notices need BOTH a platform hint and a
    # privacy-statement hint; a platform hint alone falls through.
    if seen("social media", "facebook", "instagram", "linkedin", "fanpage") and seen(
        "datenschutzerkl", "datenschutz für", "datenschutzinformation"
    ):
        return "social_media"
    if seen("datenschutz", "privacy", "dsgvo", "data protection", "données"):
        return "dse"
    if seen("widerruf", "withdrawal", "rétractation", "desistimiento"):
        return "widerruf"
    if seen("agb", "allgemeine geschäftsbedingungen", "terms",
            "nutzungsbedingungen", "conditions"):
        return "agb"
    if seen("cookie", "slapuk", "evästeet", "kakor"):
        return "cookie"
    if seen("impressum", "imprint", "legal notice", "mentions légales"):
        return "impressum"
    return "other"
|