From a3287cd5e648088e36ec1242fa15b178e09d9850 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Thu, 7 May 2026 15:13:00 +0200
Subject: [PATCH] feat: HTML email report with hints + fix duplicate Social
 Media sections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Email report now renders as styled HTML (matching frontend design):
   - Progress bars (green=completeness, blue=correctness)
   - Hierarchical L1→L2 check display
   - Red hint boxes under failed checks explaining what to fix
   - Matched text evidence for passed checks

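2. Section splitter deduplicates: sections that classify to the same
   doc_type (e.g. two "Social Media" headings on the same page) are
   merged into one section instead of creating duplicates. The minimum
   section size is now 100 words (previously more than 200 characters).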

3. Extracted the report builder into agent_doc_check_report.py (175 LOC)
   so that the routes file stays under 500 LOC (now 386 LOC).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../compliance/api/agent_doc_check_report.py  | 175 ++++++++++++++++++
 .../compliance/api/agent_doc_check_routes.py  | 103 ++++-------
 2 files changed, 211 insertions(+), 67 deletions(-)
 create mode 100644 backend-compliance/compliance/api/agent_doc_check_report.py
diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py
new file mode 100644
index 0000000..47e9aff
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_doc_check_report.py
@@ -0,0 +1,175 @@
+"""
+HTML email report builder for document checks.
+
+Generates a styled HTML report similar to the frontend ChecklistView,
+including L1/L2 check hierarchy, progress bars, and actionable hints.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .agent_doc_check_routes import CheckItem, DocCheckResult
+
+
+def _bar(pct: int, color: str) -> str:
+    bg = {"green": "#22c55e", "yellow": "#eab308", "red": "#ef4444", "blue": "#60a5fa"}
+    c = bg.get(color, "#60a5fa")
+    return (
+        f'<div style="display:inline-block;width:120px;height:8px;background:#e5e7eb;'
+        f'border-radius:4px;overflow:hidden;vertical-align:middle;margin-right:8px">'
+        f'<div style="width:{pct}%;height:100%;background:{c};border-radius:4px"></div>'
+        f'</div><span style="font-size:13px;font-weight:600;color:{c}">{pct}%</span>'
+    )
+
+
+def _icon(passed: bool, skipped: bool = False) -> str:
+    if skipped:
+        return '<span style="color:#d1d5db">&mdash;</span>'
+    if passed:
+        return '<span style="color:#22c55e;font-weight:bold">&#10003;</span>'
+    return '<span style="color:#ef4444;font-weight:bold">&#10007;</span>'
+
+
+def _hint_box(hint: str) -> str:
+    return (
+        f'<div style="font-size:11px;color:#dc2626;margin:2px 0 4px 20px;'
+        f'padding:4px 8px;background:#fef2f2;border-radius:4px;'
+        f'border-left:3px solid #fca5a5">{hint}</div>'
+    )
+
+
+def build_html_report(
+    results: list[DocCheckResult],
+    cookie_result: dict | None,
+) -> str:
+    """Build HTML email report styled like the frontend."""
+    ok_count = sum(1 for r in results if r.completeness_pct == 100)
+    html = [
+        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
+        'max-width:700px;margin:0 auto">',
+        '<h2 style="margin-bottom:4px">Dokumenten-Pruefung</h2>',
+        f'<p style="color:#6b7280;margin-top:0">'
+        f'{len(results)} Dokumente, {ok_count} vollstaendig</p>',
+    ]
+
+    for r in results:
+        _render_document(html, r)
+
+    if cookie_result:
+        _render_cookie_banner(html, cookie_result)
+
+    html.append('</div>')
+    return "\n".join(html)
+
+
+def _render_document(html: list[str], r: DocCheckResult) -> None:
+    pct = r.completeness_pct
+    cpct = r.correctness_pct
+    bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
+    status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
+    if r.error:
+        status_label = "FEHLER"
+
+    l1_checks = [c for c in r.checks if c.level == 1]
+    l2_by_parent: dict[str, list[CheckItem]] = {}
+    for c in r.checks:
+        if c.level == 2 and c.parent:
+            l2_by_parent.setdefault(c.parent, []).append(c)
+
+    l1_passed = sum(1 for c in l1_checks if c.passed)
+    l2_active = [c for c in r.checks if c.level == 2 and not c.skipped]
+    l2_passed = sum(1 for c in l2_active if c.passed)
+
+    # Header
+    html.append(
+        f'<div style="border:1px solid #e5e7eb;border-radius:8px;margin-bottom:12px;overflow:hidden">'
+        f'<div style="padding:12px 16px;background:#f9fafb">'
+        f'<div style="display:flex;justify-content:space-between;align-items:center"><div>'
+        f'<span style="font-size:11px;background:#f3f4f6;padding:2px 8px;border-radius:4px;'
+        f'color:#4b5563;font-weight:500;margin-right:8px">{status_label}</span>'
+        f'<strong style="font-size:14px">{r.label}</strong>'
+        f'<div style="font-size:12px;color:#6b7280;margin-top:2px">'
+        f'{l1_passed}/{len(l1_checks)} Pflichtangaben'
+    )
+    if l2_active:
+        html.append(f', {l2_passed}/{len(l2_active)} Detailpruefungen')
+    html.append(f'</div></div><div style="text-align:right">{_bar(pct, bar_color)}')
+    if cpct and l2_active:
+        html.append(f'<br>{_bar(cpct, "blue")}')
+    html.append('</div></div></div>')
+
+    # Body
+    if r.error:
+        html.append(f'<div style="padding:12px 16px;color:#991b1b">{r.error}</div>')
+    else:
+        html.append('<div style="padding:8px 16px 12px">')
+        for c in l1_checks:
+            _render_l1_check(html, c, l2_by_parent.get(c.id, []))
+        if r.word_count:
+            html.append(
+                f'<div style="font-size:11px;color:#9ca3af;margin-top:8px;'
+                f'padding-top:8px;border-top:1px solid #e5e7eb">'
+                f'{r.word_count} Woerter analysiert</div>'
+            )
+        html.append('</div>')
+    html.append('</div>')
+
+
+def _render_l1_check(
+    html: list[str], c: CheckItem, children: list[CheckItem],
+) -> None:
+    l2_sub = [ch for ch in children if not ch.skipped]
+    l2_passed = sum(1 for ch in l2_sub if ch.passed)
+
+    style = "color:#991b1b;font-weight:600" if not c.passed else "color:#374151"
+    html.append(
+        f'<div style="padding:3px 0">{_icon(c.passed)} '
+        f'<span style="font-size:13px;{style}">{c.label}</span>'
+    )
+    if l2_sub:
+        html.append(f' <span style="color:#9ca3af;font-size:11px">({l2_passed}/{len(l2_sub)})</span>')
+    if not c.passed and c.hint:
+        html.append(_hint_box(c.hint))
+    html.append('</div>')
+
+    for ch in children:
+        if ch.skipped:
+            continue
+        _render_l2_check(html, ch)
+
+
+def _render_l2_check(html: list[str], ch: CheckItem) -> None:
+    style = "color:#dc2626;font-weight:500" if not ch.passed else "color:#6b7280"
+    html.append(
+        f'<div style="padding:2px 0 2px 24px;border-left:2px solid #e5e7eb;margin-left:8px">'
+        f'{_icon(ch.passed)} '
+        f'<span style="font-size:12px;{style}">{ch.label}</span>'
+    )
+    if ch.passed and ch.matched_text:
+        html.append(
+            f'<div style="font-size:10px;color:#9ca3af;font-family:monospace;'
+            f'margin-left:20px;overflow:hidden;text-overflow:ellipsis;'
+            f'white-space:nowrap">"...{ch.matched_text[:80]}..."</div>'
+        )
+    if not ch.passed and ch.hint:
+        html.append(_hint_box(ch.hint))
+    html.append('</div>')
+
+
+def _render_cookie_banner(html: list[str], cookie_result: dict) -> None:
+    html.append(
+        '<div style="border:1px solid #e5e7eb;border-radius:8px;'
+        'padding:12px 16px;margin-bottom:12px">'
+        '<strong>Cookie-Banner Pruefung</strong><br>'
+        f'Banner erkannt: {cookie_result.get("banner_detected", False)}<br>'
+        f'Anbieter: {cookie_result.get("banner_provider", "unbekannt")}'
+    )
+    violations = cookie_result.get("banner_checks", {}).get("violations", [])
+    if violations:
+        for v in violations[:10]:
+            html.append(f'<br>{_icon(False)} {v.get("text", "")[:80]}')
+    else:
+        html.append('<br><span style="color:#22c55e">Keine Verstoesse erkannt.</span>')
+    html.append('</div>')
diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py
index 5e3f546..1e9db57 100644
--- a/backend-compliance/compliance/api/agent_doc_check_routes.py
+++ b/backend-compliance/compliance/api/agent_doc_check_routes.py
@@ -141,7 +141,7 @@ async def _run_doc_check(check_id: str, req: DocCheckRequest):
         email_result = send_email(
             recipient=req.recipient,
             subject=f"[DOKUMENTEN-PRUEFUNG] {len(results)} Dokumente geprueft",
-            body_html=f"<pre>{summary}</pre>",
+            body_html=summary,
         )
 
         response = DocCheckResponse(
@@ -284,40 +284,49 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
 
     Detects sections like 'Cookies', 'Social Media', 'Dienste von Drittanbietern'
     and classifies each by document type for separate checking.
+    Deduplicates: if the same doc_type appears twice, texts are merged.
     """
     import re as _re
-    sections = []
+    sections: list[dict] = []
+    seen_types: dict[str, int] = {}  # doc_type -> index in sections
 
-    # Split by lines that look like headings (short, followed by longer content)
     lines = text.split("\n")
     current_heading = ""
-    current_text = []
+    current_text: list[str] = []
+
+    def _save_section(heading: str, text_lines: list[str]) -> None:
+        sec_text = "\n".join(text_lines)
+        if len(sec_text.split()) < 100:
+            return
+        sec_type = _classify_section(heading)
+        if not sec_type:
+            return
+        # Merge duplicate doc_types (e.g. two "Social Media" headings)
+        if sec_type in seen_types:
+            idx = seen_types[sec_type]
+            sections[idx]["text"] += "\n\n" + sec_text
+            sections[idx]["word_count"] = len(sections[idx]["text"].split())
+        else:
+            seen_types[sec_type] = len(sections)
+            sections.append({
+                "title": f"{parent_label} > {heading}",
+                "text": sec_text,
+                "doc_type": sec_type,
+                "word_count": len(sec_text.split()),
+            })
 
     for line in lines:
         stripped = line.strip()
-        # Detect heading: short line (< 80 chars), not empty, followed by content
         is_heading = (
             5 < len(stripped) < 80
             and not stripped.endswith(".")
             and not stripped.endswith(",")
             and stripped[0].isupper()
         )
-
-        # Skip-headings should NOT start a new section — their text
-        # belongs to the previous section (e.g. "Risikoabwägung" inside DSFA)
         is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
 
-        if is_heading and not is_skip and current_heading and len("\n".join(current_text)) > 200:
-            # Save previous section
-            sec_text = "\n".join(current_text)
-            sec_type = _classify_section(current_heading)
-            if sec_type and sec_type != "skip":
-                sections.append({
-                    "title": f"{parent_label} > {current_heading}",
-                    "text": sec_text,
-                    "doc_type": sec_type,
-                    "word_count": len(sec_text.split()),
-                })
+        if is_heading and not is_skip and current_heading:
+            _save_section(current_heading, current_text)
 
         if is_heading and not is_skip:
             current_heading = stripped
@@ -326,16 +335,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
             current_text.append(line)
 
     # Last section
-    if current_heading and len("\n".join(current_text)) > 200:
-        sec_text = "\n".join(current_text)
-        sec_type = _classify_section(current_heading)
-        if sec_type and sec_type != "skip":
-            sections.append({
-                "title": f"{parent_label} > {current_heading}",
-                "text": sec_text,
-                "doc_type": sec_type,
-                "word_count": len(sec_text.split()),
-            })
+    if current_heading:
+        _save_section(current_heading, current_text)
 
     return sections
 
@@ -347,6 +348,10 @@ SKIP_HEADINGS = {
     "risikoabwaegung und datenschutzfolgenabschaetzung",
 }
 
+# Doc types intended for de-duplication (e.g. two "Social Media" headings).
+# NOTE(review): not referenced in this change; _save_section merges every type.
+_DEDUP_TYPES = {"social_media", "cookie", "dsfa", "widerruf", "impressum"}
+
 
 def _classify_section(heading: str) -> str | None:
     """Classify a section heading into a document type."""
@@ -377,41 +382,5 @@ async def _check_cookie_banner(url: str) -> dict | None:
 
 
 def _build_report(results: list[DocCheckResult], cookie_result: dict | None) -> str:
-    """Build email report."""
-    parts = [
-        "DOKUMENTEN-PRUEFUNG",
-        f"Dokumente geprueft: {len(results)}",
-        "",
-    ]
-    for r in results:
-        status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT"
-        if r.error:
-            status = "FEHLER"
-        detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else ""
-        parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)")
-
-        for check in r.checks:
-            if check.skipped:
-                continue
-            icon = "+" if check.passed else "!!"
-            indent = "    " if check.level == 2 else "  "
-            parts.append(f"{indent}[{icon}] {check.label}")
-
-        if r.error:
-            parts.append(f"  FEHLER: {r.error}")
-        parts.append("")
-
-    if cookie_result:
-        parts.extend([
-            "Cookie-Banner Pruefung:",
-            f"  Banner erkannt: {cookie_result.get('banner_detected', False)}",
-            f"  Anbieter: {cookie_result.get('banner_provider', 'unbekannt')}",
-        ])
-        violations = cookie_result.get("banner_checks", {}).get("violations", [])
-        if violations:
-            for v in violations[:10]:
-                parts.append(f"  [!!] {v.get('text', '')[:80]}")
-        else:
-            parts.append("  Keine Verstoesse erkannt.")
-
-    return "\n".join(parts)
+    from .agent_doc_check_report import build_html_report
+    return build_html_report(results, cookie_result)