feat: Add 76 Level-2 regex checks for document correctness verification

Split dsi_document_checker.py (466 LOC) into doc_checks/ package (9 files).
Two-pass L1→L2 logic: L1 checks "Is it mentioned?", L2 checks "Is it correct?"
(e.g. controller has full address, specific Art. 6 lit., concrete time periods).

138 total checks (62 L1 + 76 L2) across 7 doc types (NOTE: the per-type counts listed below sum to 136, not 138 — verify):
- DSE Art. 13: 31, Impressum §5 TMG: 16, Cookie §25 TDDDG: 15
- Widerruf §355: 15, AGB §305ff: 21, Social Media Art. 26: 20, DSFA Art. 35: 18

Frontend: hierarchical L1→L2 display with dual progress bars
(green=completeness, blue=correctness).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-07 12:37:03 +02:00
parent 3c12e06faf
commit b363c28539
12 changed files with 2083 additions and 496 deletions
@@ -48,6 +48,9 @@ class CheckItem(BaseModel):
passed: bool
severity: str
matched_text: str = ""
level: int = 1
parent: str | None = None
skipped: bool = False
class DocCheckResult(BaseModel):
@@ -56,6 +59,7 @@ class DocCheckResult(BaseModel):
doc_type: str
word_count: int = 0
completeness_pct: int = 0
correctness_pct: int = 0
checks: list[CheckItem] = []
findings_count: int = 0
error: str = ""
@@ -229,27 +233,30 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
    """Run the checklist against ``text`` and return a structured result.

    Findings whose ``code`` contains ``"SCORE"`` carry the per-check details
    (``all_checks``) and the aggregate percentages; every other finding is
    only counted.

    Args:
        text: Document text to check.
        doc_type: Document type passed through to the checker.
        label: Label copied into the result.
        url: URL copied into the result.
        word_count: Word count to report; when falsy, the whitespace-split
            length of ``text`` is used instead.

    Returns:
        A ``DocCheckResult`` with the collected checks, the completeness and
        correctness percentages, and the number of non-SCORE findings.
    """
    import re as _re

    findings = check_document_completeness(text, doc_type, label, url)
    all_checks: list[CheckItem] = []
    completeness = 0
    correctness = 0
    for f in findings:
        if "SCORE" not in f.get("code", ""):
            continue
        for c in f.get("all_checks", []):
            all_checks.append(CheckItem(
                id=c["id"], label=c["label"], passed=c["passed"],
                severity=c["severity"], matched_text=c.get("matched_text", ""),
                level=c.get("level", 1),
                parent=c.get("parent"),
                skipped=c.get("skipped", False),
            ))
        if "completeness_pct" in f:
            # Prefer the structured field when the finding provides it.
            completeness = f["completeness_pct"]
        else:
            # Fall back to the percentage embedded in the finding text so a
            # SCORE finding without the structured key is not reported as 0%.
            pct_match = _re.search(r"(\d+)%", f.get("text", ""))
            completeness = int(pct_match.group(1)) if pct_match else 0
        correctness = f.get("correctness_pct", 0)
    non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
    return DocCheckResult(
        label=label, url=url, doc_type=doc_type,
        word_count=word_count or len(text.split()),
        completeness_pct=completeness,
        correctness_pct=correctness,
        checks=all_checks, findings_count=len(non_score),
    )
@@ -374,11 +381,15 @@ def _build_report(results: list[DocCheckResult], cookie_result: dict | None) ->
status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT"
if r.error:
status = "FEHLER"
parts.append(f"[{status}] {r.label} ({r.completeness_pct}%, {r.word_count} Woerter)")
detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else ""
parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)")
for check in r.checks:
if check.skipped:
continue
icon = "+" if check.passed else "!!"
parts.append(f" [{icon}] {check.label}")
indent = " " if check.level == 2 else " "
parts.append(f"{indent}[{icon}] {check.label}")
if r.error:
parts.append(f" FEHLER: {r.error}")