From a3287cd5e648088e36ec1242fa15b178e09d9850 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 7 May 2026 15:13:00 +0200 Subject: [PATCH] feat: HTML email report with hints + fix duplicate Social Media sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Email report now renders as styled HTML (matching frontend design): - Progress bars (green=completeness, blue=correctness) - Hierarchical L1→L2 check display - Red hint boxes under failed checks explaining what to fix - Matched text evidence for passed checks 2. Section splitter deduplicates: two "Social Media" headings on the same page are merged into one section instead of creating duplicates. 3. Extracted report builder to agent_doc_check_report.py (175 LOC) to keep routes file under 500 LOC (386 LOC). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_doc_check_report.py | 175 ++++++++++++++++++ .../compliance/api/agent_doc_check_routes.py | 103 ++++------- 2 files changed, 211 insertions(+), 67 deletions(-) create mode 100644 backend-compliance/compliance/api/agent_doc_check_report.py diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py new file mode 100644 index 0000000..47e9aff --- /dev/null +++ b/backend-compliance/compliance/api/agent_doc_check_report.py @@ -0,0 +1,175 @@ +""" +HTML email report builder for document checks. + +Generates a styled HTML report similar to the frontend ChecklistView, +including L1/L2 check hierarchy, progress bars, and actionable hints. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .agent_doc_check_routes import CheckItem, DocCheckResult + + +def _bar(pct: int, color: str) -> str: + bg = {"green": "#22c55e", "yellow": "#eab308", "red": "#ef4444", "blue": "#60a5fa"} + c = bg.get(color, "#60a5fa") + return ( + f'
' + f'
' + f'
{pct}%' + ) + + +def _icon(passed: bool, skipped: bool = False) -> str: + if skipped: + return '' + if passed: + return '' + return '' + + +def _hint_box(hint: str) -> str: + return ( + f'
{hint}
' + ) + + +def build_html_report( + results: list[DocCheckResult], + cookie_result: dict | None, +) -> str: + """Build HTML email report styled like the frontend.""" + ok_count = sum(1 for r in results if r.completeness_pct == 100) + html = [ + '
', + '

Dokumenten-Pruefung

', + f'

' + f'{len(results)} Dokumente, {ok_count} vollstaendig

', + ] + + for r in results: + _render_document(html, r) + + if cookie_result: + _render_cookie_banner(html, cookie_result) + + html.append('
') + return "\n".join(html) + + +def _render_document(html: list[str], r: DocCheckResult) -> None: + pct = r.completeness_pct + cpct = r.correctness_pct + bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red" + status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT" + if r.error: + status_label = "FEHLER" + + l1_checks = [c for c in r.checks if c.level == 1] + l2_by_parent: dict[str, list[CheckItem]] = {} + for c in r.checks: + if c.level == 2 and c.parent: + l2_by_parent.setdefault(c.parent, []).append(c) + + l1_passed = sum(1 for c in l1_checks if c.passed) + l2_active = [c for c in r.checks if c.level == 2 and not c.skipped] + l2_passed = sum(1 for c in l2_active if c.passed) + + # Header + html.append( + f'
' + f'
' + f'
' + f'{status_label}' + f'{r.label}' + f'
' + f'{l1_passed}/{len(l1_checks)} Pflichtangaben' + ) + if l2_active: + html.append(f', {l2_passed}/{len(l2_active)} Detailpruefungen') + html.append(f'
{_bar(pct, bar_color)}') + if cpct and l2_active: + html.append(f'
{_bar(cpct, "blue")}') + html.append('
') + + # Body + if r.error: + html.append(f'
{r.error}
') + else: + html.append('
') + for c in l1_checks: + _render_l1_check(html, c, l2_by_parent.get(c.id, [])) + if r.word_count: + html.append( + f'
' + f'{r.word_count} Woerter analysiert
' + ) + html.append('
') + html.append('
') + + +def _render_l1_check( + html: list[str], c: CheckItem, children: list[CheckItem], +) -> None: + l2_sub = [ch for ch in children if not ch.skipped] + l2_passed = sum(1 for ch in l2_sub if ch.passed) + + style = "color:#991b1b;font-weight:600" if not c.passed else "color:#374151" + html.append( + f'
{_icon(c.passed)} ' + f'{c.label}' + ) + if l2_sub: + html.append(f' ({l2_passed}/{len(l2_sub)})') + if not c.passed and c.hint: + html.append(_hint_box(c.hint)) + html.append('
') + + for ch in children: + if ch.skipped: + continue + _render_l2_check(html, ch) + + +def _render_l2_check(html: list[str], ch: CheckItem) -> None: + style = "color:#dc2626;font-weight:500" if not ch.passed else "color:#6b7280" + html.append( + f'
' + f'{_icon(ch.passed)} ' + f'{ch.label}' + ) + if ch.passed and ch.matched_text: + html.append( + f'
"...{ch.matched_text[:80]}..."
' + ) + if not ch.passed and ch.hint: + html.append(_hint_box(ch.hint)) + html.append('
') + + +def _render_cookie_banner(html: list[str], cookie_result: dict) -> None: + html.append( + '
' + 'Cookie-Banner Pruefung
' + f'Banner erkannt: {cookie_result.get("banner_detected", False)}
' + f'Anbieter: {cookie_result.get("banner_provider", "unbekannt")}' + ) + violations = cookie_result.get("banner_checks", {}).get("violations", []) + if violations: + for v in violations[:10]: + html.append(f'
{_icon(False)} {v.get("text", "")[:80]}') + else: + html.append('
Keine Verstoesse erkannt.') + html.append('
') diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py index 5e3f546..1e9db57 100644 --- a/backend-compliance/compliance/api/agent_doc_check_routes.py +++ b/backend-compliance/compliance/api/agent_doc_check_routes.py @@ -141,7 +141,7 @@ async def _run_doc_check(check_id: str, req: DocCheckRequest): email_result = send_email( recipient=req.recipient, subject=f"[DOKUMENTEN-PRUEFUNG] {len(results)} Dokumente geprueft", - body_html=f"
{summary}
", + body_html=summary, ) response = DocCheckResponse( @@ -284,40 +284,49 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]: Detects sections like 'Cookies', 'Social Media', 'Dienste von Drittanbietern' and classifies each by document type for separate checking. + Deduplicates: if the same doc_type appears twice, texts are merged. """ import re as _re - sections = [] + sections: list[dict] = [] + seen_types: dict[str, int] = {} # doc_type -> index in sections - # Split by lines that look like headings (short, followed by longer content) lines = text.split("\n") current_heading = "" - current_text = [] + current_text: list[str] = [] + + def _save_section(heading: str, text_lines: list[str]) -> None: + sec_text = "\n".join(text_lines) + if len(sec_text.split()) < 100: + return + sec_type = _classify_section(heading) + if not sec_type: + return + # Merge duplicate doc_types (e.g. two "Social Media" headings) + if sec_type in seen_types: + idx = seen_types[sec_type] + sections[idx]["text"] += "\n\n" + sec_text + sections[idx]["word_count"] = len(sections[idx]["text"].split()) + else: + seen_types[sec_type] = len(sections) + sections.append({ + "title": f"{parent_label} > {heading}", + "text": sec_text, + "doc_type": sec_type, + "word_count": len(sec_text.split()), + }) for line in lines: stripped = line.strip() - # Detect heading: short line (< 80 chars), not empty, followed by content is_heading = ( 5 < len(stripped) < 80 and not stripped.endswith(".") and not stripped.endswith(",") and stripped[0].isupper() ) - - # Skip-headings should NOT start a new section — their text - # belongs to the previous section (e.g. "Risikoabwägung" inside DSFA) is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS - if is_heading and not is_skip and current_heading and len("\n".join(current_text)) > 200: - # Save previous section - sec_text = "\n".join(current_text) - sec_type = _classify_section(current_heading) - if sec_type and sec_type != "skip": - sections.append({ - "title": f"{parent_label} > {current_heading}", - "text": sec_text, - "doc_type": sec_type, - "word_count": len(sec_text.split()), - }) + if is_heading and not is_skip and current_heading: + _save_section(current_heading, current_text) if is_heading and not is_skip: current_heading = stripped @@ -326,16 +335,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]: current_text.append(line) # Last section - if current_heading and len("\n".join(current_text)) > 200: - sec_text = "\n".join(current_text) - sec_type = _classify_section(current_heading) - if sec_type and sec_type != "skip": - sections.append({ - "title": f"{parent_label} > {current_heading}", - "text": sec_text, - "doc_type": sec_type, - "word_count": len(sec_text.split()), - }) + if current_heading: + _save_section(current_heading, current_text) return sections @@ -347,6 +348,10 @@ SKIP_HEADINGS = { "risikoabwaegung und datenschutzfolgenabschaetzung", } +# Track already-seen section types to avoid duplicate sub-documents +# (e.g. two "Social Media" headings on the same page) +_DEDUP_TYPES = {"social_media", "cookie", "dsfa", "widerruf", "impressum"} + def _classify_section(heading: str) -> str | None: """Classify a section heading into a document type.""" @@ -377,41 +382,5 @@ async def _check_cookie_banner(url: str) -> dict | None: def _build_report(results: list[DocCheckResult], cookie_result: dict | None) -> str: - """Build email report.""" - parts = [ - "DOKUMENTEN-PRUEFUNG", - f"Dokumente geprueft: {len(results)}", - "", - ] - for r in results: - status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT" - if r.error: - status = "FEHLER" - detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else "" - parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)") - - for check in r.checks: - if check.skipped: - continue - icon = "+" if check.passed else "!!" - indent = " " if check.level == 2 else " " - parts.append(f"{indent}[{icon}] {check.label}") - - if r.error: - parts.append(f" FEHLER: {r.error}") - parts.append("") - - if cookie_result: - parts.extend([ - "Cookie-Banner Pruefung:", - f" Banner erkannt: {cookie_result.get('banner_detected', False)}", - f" Anbieter: {cookie_result.get('banner_provider', 'unbekannt')}", - ]) - violations = cookie_result.get("banner_checks", {}).get("violations", []) - if violations: - for v in violations[:10]: - parts.append(f" [!!] {v.get('text', '')[:80]}") - else: - parts.append(" Keine Verstoesse erkannt.") - - return "\n".join(parts) + from .agent_doc_check_report import build_html_report + return build_html_report(results, cookie_result)