feat: HTML email report with hints + fix duplicate Social Media sections

1. Email report now renders as styled HTML (matching the frontend design):
   - Progress bars (green = completeness, blue = correctness)
   - Hierarchical L1→L2 check display
   - Red hint boxes under failed checks explaining what to fix
   - Matched text evidence for passed checks

2. Section splitter deduplicates: sections that classify to the same doc_type (e.g. two "Social Media" headings on the same page) are merged into one section instead of creating duplicates.

3. Extracted the report builder to agent_doc_check_report.py (175 LOC) to keep the routes file under 500 LOC (now 386 LOC).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 15:13:00 +02:00
parent 56892cf7dc
commit a3287cd5e6
2 changed files with 211 additions and 67 deletions
@@ -141,7 +141,7 @@ async def _run_doc_check(check_id: str, req: DocCheckRequest):
        email_result = send_email(
            recipient=req.recipient,
            subject=f"[DOKUMENTEN-PRUEFUNG] {len(results)} Dokumente geprueft",
-            body_html=f"<pre>{summary}</pre>",
+            body_html=summary,
        )

        response = DocCheckResponse(
@@ -284,40 +284,49 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:

    Detects sections like 'Cookies', 'Social Media', 'Dienste von Drittanbietern'
    and classifies each by document type for separate checking.
+    Deduplicates: if the same doc_type appears twice, texts are merged.
    """
    import re as _re
-    sections = []
+    sections: list[dict] = []
+    seen_types: dict[str, int] = {}  # doc_type -> index in sections

-    # Split by lines that look like headings (short, followed by longer content)
    lines = text.split("\n")
    current_heading = ""
-    current_text = []
+    current_text: list[str] = []
+
+    def _save_section(heading: str, text_lines: list[str]) -> None:
+        sec_text = "\n".join(text_lines)
+        if len(sec_text.split()) < 100:
+            return
+        sec_type = _classify_section(heading)
+        if not sec_type:
+            return
+        # Merge duplicate doc_types (e.g. two "Social Media" headings)
+        if sec_type in seen_types:
+            idx = seen_types[sec_type]
+            sections[idx]["text"] += "\n\n" + sec_text
+            sections[idx]["word_count"] = len(sections[idx]["text"].split())
+        else:
+            seen_types[sec_type] = len(sections)
+            sections.append({
+                "title": f"{parent_label} > {heading}",
+                "text": sec_text,
+                "doc_type": sec_type,
+                "word_count": len(sec_text.split()),
+            })

    for line in lines:
        stripped = line.strip()
-        # Detect heading: short line (< 80 chars), not empty, followed by content
        is_heading = (
            5 < len(stripped) < 80
            and not stripped.endswith(".")
            and not stripped.endswith(",")
            and stripped[0].isupper()
        )
-
-        # Skip-headings should NOT start a new section — their text
-        # belongs to the previous section (e.g. "Risikoabwägung" inside DSFA)
        is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS

-        if is_heading and not is_skip and current_heading and len("\n".join(current_text)) > 200:
-            # Save previous section
-            sec_text = "\n".join(current_text)
-            sec_type = _classify_section(current_heading)
-            if sec_type and sec_type != "skip":
-                sections.append({
-                    "title": f"{parent_label} > {current_heading}",
-                    "text": sec_text,
-                    "doc_type": sec_type,
-                    "word_count": len(sec_text.split()),
-                })
+        if is_heading and not is_skip and current_heading:
+            _save_section(current_heading, current_text)

        if is_heading and not is_skip:
            current_heading = stripped
@@ -326,16 +335,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
            current_text.append(line)

    # Last section
-    if current_heading and len("\n".join(current_text)) > 200:
-        sec_text = "\n".join(current_text)
-        sec_type = _classify_section(current_heading)
-        if sec_type and sec_type != "skip":
-            sections.append({
-                "title": f"{parent_label} > {current_heading}",
-                "text": sec_text,
-                "doc_type": sec_type,
-                "word_count": len(sec_text.split()),
-            })
+    if current_heading:
+        _save_section(current_heading, current_text)

    return sections

@@ -347,6 +348,10 @@ SKIP_HEADINGS = {
    "risikoabwaegung und datenschutzfolgenabschaetzung",
 }

+# Section doc types that should yield at most one sub-document per page
+# (e.g. two "Social Media" headings on the same page); presumably used for dedup — verify.
+_DEDUP_TYPES = {"social_media", "cookie", "dsfa", "widerruf", "impressum"}
+

 def _classify_section(heading: str) -> str | None:
    """Classify a section heading into a document type."""
@@ -377,41 +382,5 @@ async def _check_cookie_banner(url: str) -> dict | None:


 def _build_report(results: list[DocCheckResult], cookie_result: dict | None) -> str:
-    """Build email report."""
-    parts = [
-        "DOKUMENTEN-PRUEFUNG",
-        f"Dokumente geprueft: {len(results)}",
-        "",
-    ]
-    for r in results:
-        status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT"
-        if r.error:
-            status = "FEHLER"
-        detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else ""
-        parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)")
-
-        for check in r.checks:
-            if check.skipped:
-                continue
-            icon = "+" if check.passed else "!!"
-            indent = "    " if check.level == 2 else "  "
-            parts.append(f"{indent}[{icon}] {check.label}")
-
-        if r.error:
-            parts.append(f"  FEHLER: {r.error}")
-        parts.append("")
-
-    if cookie_result:
-        parts.extend([
-            "Cookie-Banner Pruefung:",
-            f"  Banner erkannt: {cookie_result.get('banner_detected', False)}",
-            f"  Anbieter: {cookie_result.get('banner_provider', 'unbekannt')}",
-        ])
-        violations = cookie_result.get("banner_checks", {}).get("violations", [])
-        if violations:
-            for v in violations[:10]:
-                parts.append(f"  [!!] {v.get('text', '')[:80]}")
-        else:
-            parts.append("  Keine Verstoesse erkannt.")
-
-    return "\n".join(parts)
+    from .agent_doc_check_report import build_html_report
+    return build_html_report(results, cookie_result)