feat: Add 76 Level-2 regex checks for document correctness verification
Split dsi_document_checker.py (466 LOC) into doc_checks/ package (9 files). Two-pass L1→L2 logic: L1 checks "Is it mentioned?", L2 checks "Is it correct?" (e.g. controller has full address, specific Art. 6 lit., concrete time periods). 138 total checks (62 L1 + 76 L2) across 7 doc types: - DSE Art. 13: 31, Impressum §5 TMG: 16, Cookie §25 TDDDG: 15 - Widerruf §355: 15, AGB §305ff: 21, Social Media Art. 26: 20, DSFA Art. 35: 18 Frontend: hierarchical L1→L2 display with dual progress bars (green=completeness, blue=correctness). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
Document check runner — two-pass L1/L2 logic.
|
||||
|
||||
Pass 1: Run all L1 checks ("Is it mentioned?")
|
||||
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from .dse_checks import ART13_CHECKLIST
|
||||
from .widerruf_checks import WIDERRUF_CHECKLIST
|
||||
from .agb_checks import AGB_CHECKLIST
|
||||
from .impressum_checks import IMPRESSUM_CHECKLIST
|
||||
from .cookie_checks import COOKIE_CHECKLIST
|
||||
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
|
||||
from .dsfa_checks import DSFA_CHECKLIST
|
||||
|
||||
logger = logging.getLogger(__name__)

# Map doc_type strings to (checklist, label).
# Multiple keys are aliases (German/English synonyms produced by document
# classification) pointing at the same checklist; the label is the legal
# basis string interpolated into finding texts. Unknown doc_types are
# handled by the caller's .get() fallback, not by this map.
_CHECKLIST_MAP = {
    "dse": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "datenschutz": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "privacy": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "widerruf": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "withdrawal": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "cancellation": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "agb": (AGB_CHECKLIST, "§305ff BGB"),
    "terms": (AGB_CHECKLIST, "§305ff BGB"),
    "nutzungsbedingungen": (AGB_CHECKLIST, "§305ff BGB"),
    "impressum": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    "imprint": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    "cookie": (COOKIE_CHECKLIST, "§25 TDDDG"),
    "social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    "joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    "dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"),
}
|
||||
|
||||
|
||||
def _match_patterns(patterns: list[str], text_lower: str):
    """Return the first regex Match of any pattern in *text_lower*, else None."""
    hits = (re.search(pattern, text_lower) for pattern in patterns)
    return next((hit for hit in hits if hit is not None), None)
|
||||
|
||||
|
||||
def _extract_context(text_lower: str, match) -> str:
    """Extract ~30 chars on each side of *match* for evidence display.

    Returns "" when *match* is None (i.e. the check did not hit).
    """
    if match is None:
        return ""
    lo = match.start() - 30
    hi = match.end() + 30
    return text_lower[max(lo, 0):min(hi, len(text_lower))].strip()
|
||||
|
||||
|
||||
def check_document_completeness(
    text: str,
    doc_type: str,
    doc_title: str,
    doc_url: str,
) -> list[dict]:
    """Check a legal document against its type-specific requirements.

    Two-pass approach:
    L1 — Is the mandatory field mentioned at all?
    L2 — Is it correct/complete? (only checked if L1 parent passed)

    Args:
        text: Full plain text of the document.
        doc_type: Document type key (see _CHECKLIST_MAP); unknown types
            fall back to the Art. 13 DSGVO checklist.
        doc_title: Human-readable title, echoed into finding messages.
        doc_url: Source URL, echoed into each finding.

    Returns:
        List of finding dicts. On a full run the first element is a summary
        finding carrying all per-check results and both percentages.
    """
    findings: list[dict] = []
    text_lower = text.lower()

    # Guard: empty or trivially short documents cannot be checked at all.
    if not text or len(text) < 50:
        findings.append({
            "code": f"DSI-EMPTY-{doc_type.upper()}",
            "severity": "HIGH",
            "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.",
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        })
        return findings

    # Guard: very short privacy notices are flagged instead of fully checked —
    # a full Art. 13 run would produce mostly false "missing" findings.
    word_count = len(text.split())
    if word_count < 200 and doc_type == "dse":
        findings.append({
            "code": f"DSI-SCORE-{doc_type.upper()}",
            "severity": "LOW",
            "text": (
                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
            ),
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
            "all_checks": [],
        })
        return findings

    checklist, label = _CHECKLIST_MAP.get(doc_type, (ART13_CHECKLIST, "Art. 13 DSGVO"))

    # Checks without an explicit "level" default to level 1.
    l1_checks = [c for c in checklist if c.get("level", 1) == 1]
    l2_checks = [c for c in checklist if c.get("level", 1) == 2]

    all_checks: list[dict] = []

    # ── Pass 1: L1 checks ────────────────────────────────────────────
    passed_l1_ids, l1_present = _run_l1_pass(
        l1_checks, text_lower, doc_title, doc_url, doc_type, label,
        findings, all_checks,
    )

    # ── Pass 2: L2 checks (only if parent L1 passed) ─────────────────
    l2_passed, l2_total = _run_l2_pass(
        l2_checks, text_lower, doc_title, doc_url, doc_type, label,
        passed_l1_ids, findings, all_checks,
    )

    # ── Summary ───────────────────────────────────────────────────────
    l1_total = len(l1_checks)
    completeness_pct = round(l1_present / l1_total * 100) if l1_total else 0
    correctness_pct = round(l2_passed / l2_total * 100) if l2_total else 0

    severity = (
        "OK" if completeness_pct == 100
        else "LOW" if completeness_pct >= 80
        else "MEDIUM" if completeness_pct >= 50
        else "HIGH"
    )

    summary_text = (
        f"'{doc_title}': {l1_present}/{l1_total} Pflichtangaben vorhanden "
        f"({completeness_pct}%)."
    )
    if completeness_pct < 100:
        summary_text += f" Fehlend: {l1_total - l1_present} Angaben nach {label}."
    if l2_total > 0:
        summary_text += (
            f" Detailpruefung: {l2_passed}/{l2_total} bestanden "
            f"({correctness_pct}%)."
        )

    # Summary goes first so consumers can read the score without scanning.
    findings.insert(0, {
        "code": f"DSI-SCORE-{doc_type.upper()}",
        "severity": severity,
        "text": summary_text,
        "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        "all_checks": all_checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    })

    return findings


def _run_l1_pass(
    l1_checks: list[dict],
    text_lower: str,
    doc_title: str,
    doc_url: str,
    doc_type: str,
    label: str,
    findings: list[dict],
    all_checks: list[dict],
) -> tuple[set[str], int]:
    """Pass 1: presence checks ("Is it mentioned?").

    Appends one DSI-MISSING finding per failed check to *findings* and one
    result entry per check to *all_checks* (both mutated in place).

    Returns:
        (ids of passed L1 checks, number of passed L1 checks).
    """
    passed_l1_ids: set[str] = set()
    l1_present = 0
    for check in l1_checks:
        match = _match_patterns(check["patterns"], text_lower)
        passed = match is not None
        if passed:
            passed_l1_ids.add(check["id"])
            l1_present += 1
        else:
            findings.append({
                "code": f"DSI-MISSING-{check['id'].upper()}",
                "severity": check.get("severity", "MEDIUM"),
                "text": (
                    f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. "
                    f"Erforderlich nach {label}."
                ),
                "doc_title": doc_title, "doc_url": doc_url,
                "doc_type": doc_type, "check_id": check["id"],
            })
        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": _extract_context(text_lower, match),
            "level": 1, "parent": None, "skipped": False,
        })
    return passed_l1_ids, l1_present


def _run_l2_pass(
    l2_checks: list[dict],
    text_lower: str,
    doc_title: str,
    doc_url: str,
    doc_type: str,
    label: str,
    passed_l1_ids: set[str],
    findings: list[dict],
    all_checks: list[dict],
) -> tuple[int, int]:
    """Pass 2: detail checks ("Is it correct?").

    A check runs only when its L1 parent passed; otherwise it is recorded
    as skipped and excluded from the totals. Appends one DSI-DETAIL finding
    per failed check to *findings* and one result entry per check to
    *all_checks* (both mutated in place).

    Returns:
        (number of passed L2 checks, number of L2 checks actually run).
    """
    l2_total = 0
    l2_passed = 0
    for check in l2_checks:
        parent = check.get("parent")
        skipped = parent not in passed_l1_ids
        passed = False
        matched_text = ""

        if not skipped:
            l2_total += 1
            match = _match_patterns(check["patterns"], text_lower)
            passed = match is not None
            if passed:
                l2_passed += 1
                matched_text = _extract_context(text_lower, match)
            else:
                findings.append({
                    "code": f"DSI-DETAIL-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Detailpruefung '{check['label']}' "
                        f"nicht bestanden. Empfohlen nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })

        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": matched_text,
            "level": 2, "parent": parent, "skipped": skipped,
        })
    return l2_passed, l2_total
|
||||
|
||||
|
||||
def classify_document_type(title: str, url: str) -> str:
    """Classify a document by its title/URL into a legal document type.

    Branch order matters: the first matching category wins, and a
    social-media match without privacy wording falls through to the
    remaining categories.
    """
    haystack = f"{title} {url}".lower()

    def contains(*keywords: str) -> bool:
        # Case-insensitive substring test against the combined title+URL.
        return any(kw in haystack for kw in keywords)

    if contains("datenschutzfolge", "dsfa", "risikoanalyse für nutzung"):
        return "dsfa"
    if (contains("social media", "facebook", "instagram", "linkedin", "fanpage")
            and contains("datenschutzerkl", "datenschutz für", "datenschutzinformation")):
        return "social_media"
    if contains("datenschutz", "privacy", "dsgvo", "data protection", "données"):
        return "dse"
    if contains("widerruf", "withdrawal", "rétractation", "desistimiento"):
        return "widerruf"
    if contains("agb", "allgemeine geschäftsbedingungen", "terms",
                "nutzungsbedingungen", "conditions"):
        return "agb"
    if contains("cookie", "slapuk", "evästeet", "kakor"):
        return "cookie"
    if contains("impressum", "imprint", "legal notice", "mentions légales"):
        return "impressum"
    return "other"
|
||||
Reference in New Issue
Block a user