Files
breakpilot-compliance/backend-compliance/compliance/api/agent_doc_check_report.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

500 lines
21 KiB
Python

"""
HTML email report builder for document checks.
Generates a styled HTML report similar to the frontend ChecklistView,
including L1/L2 check hierarchy, progress bars, and actionable hints.
"""
from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from .agent_doc_check_routes import CheckItem, DocCheckResult
def _bar(pct: int, color: str) -> str:
bg = {"green": "#22c55e", "yellow": "#eab308", "red": "#ef4444", "blue": "#60a5fa"}
c = bg.get(color, "#60a5fa")
return (
f'<div style="display:inline-block;width:120px;height:8px;background:#e5e7eb;'
f'border-radius:4px;overflow:hidden;vertical-align:middle;margin-right:8px">'
f'<div style="width:{pct}%;height:100%;background:{c};border-radius:4px"></div>'
f'</div><span style="font-size:13px;font-weight:600;color:{c}">{pct}%</span>'
)
def _icon(passed: bool, skipped: bool = False) -> str:
if skipped:
return '<span style="color:#d1d5db">&mdash;</span>'
if passed:
return '<span style="color:#22c55e;font-weight:bold">&#10003;</span>'
return '<span style="color:#ef4444;font-weight:bold">&#10007;</span>'
def _first_sentence(text: str, max_chars: int = 300) -> str:
"""Erster vollstaendiger Satz statt erste Zeile — robust gegen
mehrzeilige Fix-Texte die mit Bullet-Listen anfangen."""
if not text:
return ""
# Suche Satz-Endezeichen vor max_chars
snippet = text[:max_chars]
m = re.search(r"^(.+?[\.\?\!])(?:\s|$)", snippet, re.DOTALL)
if m:
first = m.group(1).strip()
# Wenn der "Satz" eine Variant-Header wie "Variante A:" ist, nimm
# weiter — der echte Inhalt kommt erst danach
if re.fullmatch(r"(Variante [A-Z]\s*\([^\)]+\):?|Beispiel\s*\d*:?)",
first, re.IGNORECASE):
rest = text[m.end():].lstrip()
return _first_sentence(rest, max_chars)
return first
# Kein Satz-Endezeichen — nimm bis max_chars
line = (text.splitlines() or [""])[0]
return line[:max_chars] + ("" if len(line) > max_chars else "")
def _hint_box(hint: str, check_label: str = "", doc_text: str = "",
doc_id: str | None = None) -> str:
"""Hint-Block mit angereichertem Recipe + Doc-Anchor wenn moeglich."""
base = (
f'<div style="font-size:11px;color:#dc2626;margin:2px 0 4px 20px;'
f'padding:4px 8px;background:#fef2f2;border-radius:4px;'
f'border-left:3px solid #fca5a5">{hint}'
)
# Recipe + Anker hinzufuegen wenn check_label bekannt
if check_label:
try:
from compliance.services.finding_action_recipes import recipe_for
from compliance.services.doc_anchor_locator import locate_anchor
rec = recipe_for(check_label)
if rec and rec.get("fix_text"):
first_sentence = _first_sentence(rec["fix_text"], 300)
full = rec["fix_text"]
# Statt <details> ein einfaches Inline-Block-Layout —
# robuster bei Plain-Text-Mail-Render
more = ""
if len(full) > len(first_sentence) + 10:
more = (
f'<div style="margin-top:4px;padding:6px 8px;background:#fff;'
f'border:1px solid #fcd5d5;border-radius:4px;font-size:10px;'
f'white-space:pre-wrap;color:#1e293b">'
f'<strong style="display:block;margin-bottom:3px;color:#475569">'
f'Vollstaendiger Textbaustein zum Einfuegen:</strong>'
f'{full}</div>'
)
base += (
f'<div style="margin-top:6px;padding-top:6px;border-top:1px solid #fecaca">'
f'<strong style="color:#7c3aed;font-size:10px">Konkrete Massnahme:</strong> '
f'<span style="color:#1e293b">{first_sentence}</span>'
f'{more}'
)
# Anker via Embedding-Locator (mit doc_id-Cache)
if doc_text:
anchor = locate_anchor(check_label, doc_text, doc_id)
if anchor and anchor.get("anchor_phrase") and anchor.get("confidence") != "low":
conf_label = anchor.get("confidence", "")
conf_badge = (
f' <span style="color:#94a3b8;font-size:9px">'
f'(Match-Konfidenz {conf_label}, '
f'Score {anchor.get("score", "")})</span>'
)
base += (
f'<div style="margin-top:4px;color:#475569;font-size:10px">'
f'<strong>Einfuegen:</strong> {anchor["position_hint"]}'
f'{conf_badge}</div>'
)
elif rec.get("where"):
# Kein guter Anchor-Match — zeige generischen Fallback
base += (
f'<div style="margin-top:4px;color:#475569;font-size:10px">'
f'<strong>Einfuegen:</strong> {rec["where"]} '
f'<span style="color:#94a3b8;font-size:9px">'
f'(kein eindeutiger Absatz im Dokument gefunden — '
f'Anweisung allgemein)</span></div>'
)
base += '</div>'
except Exception as e:
logger.debug("Hint-box enrichment failed: %s", e)
pass # Recipes optional — Hint-Box muss nie crashen
base += '</div>'
return base
def build_management_summary(results: list[DocCheckResult]) -> str:
    """Build a plain-language management summary for the CEO/GF.

    No legal jargon — concrete actions that can be delegated to staff,
    lawyers, or the DPO.

    Only documents without an error are counted as evaluated. Documents
    whose error starts with "Nicht anwendbar" are mentioned in a separate
    note and are deliberately excluded from the "x von y" figures, so that
    a site whose remaining documents are all complete is reported as
    complete. At most three failed level-1 checks per document and ten
    actions overall are listed.

    Returns the summary as an HTML fragment.
    """
    ok = [r for r in results if r.completeness_pct == 100 and not r.error]
    fixable = [r for r in results if 0 < r.completeness_pct < 100 and not r.error]
    critical = [r for r in results if r.completeness_pct == 0 and not r.error]
    not_applicable = [r for r in results if r.error
                      and r.error.startswith("Nicht anwendbar")]
    html = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 20px;padding:16px 20px;'
        'background:#f8fafc;border:1px solid #e2e8f0;border-radius:12px">',
        '<h2 style="margin:0 0 12px;font-size:18px;color:#1e293b">'
        'Zusammenfassung fuer die Geschaeftsfuehrung</h2>',
    ]
    na_note = (
        f' Zusaetzlich {len(not_applicable)} Dokument{"" if len(not_applicable) == 1 else "e"} '
        f'als NICHT ANWENDBAR markiert (kein Direkt-Vertrieb — '
        f'OEM-Konfigurator-Pattern).' if not_applicable else ""
    )
    # Overall status: the denominator is the number of documents that were
    # actually evaluated, i.e. neither failed nor marked as not applicable.
    total = sum(1 for r in results if not r.error)
    if total == 0:
        html.append(f'<p>Keine Dokumente geprueft.{na_note}</p></div>')
        return "\n".join(html)
    if len(ok) == total:
        html.append(
            f'<p style="color:#16a34a;font-weight:600;font-size:15px">'
            f'Alle Dokumente sind vollstaendig. Keine dringenden Massnahmen noetig.'
            f'{na_note}</p>'
        )
    else:
        html.append(
            f'<p style="font-size:14px;color:#475569">'
            f'{len(ok)} von {total} Dokumenten sind vollstaendig. '
            f'{len(fixable)} brauchen Korrekturen'
            f'{f", {len(critical)} fehlen oder sind unbrauchbar" if critical else ""}.'
            f'{na_note}</p>'
        )
    # Concrete actions
    actions: list[str] = []
    for r in results:
        if r.error or r.completeness_pct == 100:
            continue
        failed_checks = [
            c for c in r.checks
            if c.level == 1 and not c.passed and not c.skipped
            and c.severity != "INFO"
        ]
        for c in failed_checks[:3]:  # Max 3 per document
            action = _check_to_action(r.label, c.label, c.hint)
            if action:
                actions.append(action)
    if actions:
        html.append(
            '<h3 style="font-size:14px;color:#334155;margin:16px 0 8px">'
            'Konkrete Aufgaben:</h3>'
            '<ol style="font-size:13px;color:#475569;padding-left:20px;margin:0">'
        )
        for a in actions[:10]:  # Max 10 actions
            html.append(f'<li style="margin-bottom:6px">{a}</li>')
        html.append('</ol>')
    html.append('</div>')
    return "\n".join(html)
def _check_to_action(doc_label: str, check_label: str, hint: str) -> str:
"""Convert a failed check into a plain-language action item."""
# Map technical check labels to business-language actions
label_lower = check_label.lower()
if "datenschutzbeauftragter" in label_lower or "dsb" in label_lower:
return (f"<strong>{doc_label}:</strong> Ihren Datenschutzbeauftragten "
f"mit Kontaktdaten erwaehnen. Pflicht ab 20 Mitarbeitern.")
if "beschwerderecht" in label_lower or "art. 77" in label_lower:
return (f"<strong>{doc_label}:</strong> Hinweis auf das Beschwerderecht "
f"bei der Aufsichtsbehoerde ergaenzen (Name + Kontakt der Behoerde).")
if "betroffenenrechte" in label_lower:
return (f"<strong>{doc_label}:</strong> Alle Betroffenenrechte "
f"(Auskunft, Berichtigung, Loeschung, etc.) einzeln auffuehren.")
if "verantwortlicher" in label_lower:
return (f"<strong>{doc_label}:</strong> Vollstaendige Firmenbezeichnung "
f"mit Rechtsform, Adresse, E-Mail und Telefon eintragen.")
if "interessenabwaegung" in label_lower:
return (f"<strong>{doc_label}:</strong> Bei 'berechtigtem Interesse' "
f"die Abwaegung dokumentieren. Aufgabe fuer den DSB/Rechtsanwalt.")
if "widerrufsbelehrung" in label_lower or "widerruf" in label_lower:
return (f"<strong>{doc_label}:</strong> Gesetzliche Widerrufsbelehrung "
f"mit 14-Tage-Frist und Musterformular bereitstellen.")
if "loeschkonzept" in label_lower:
return (f"<strong>{doc_label}:</strong> Loeschfristen und -prozess "
f"dokumentieren. Aufgabe fuer den DSB.")
if "profiling" in label_lower or "art. 22" in label_lower:
return (f"<strong>{doc_label}:</strong> Hinweis ergaenzen ob "
f"automatisierte Entscheidungen stattfinden oder nicht.")
if "nicht im eingereichten text" in label_lower:
return (f"<strong>{doc_label}:</strong> Das eingereichte Dokument "
f"enthaelt nicht den erwarteten Inhalt. Bitte korrekte URL pruefen.")
# Generic fallback
if hint and len(hint) < 150:
return f"<strong>{doc_label}:</strong> {hint[:120]}"
return f"<strong>{doc_label}:</strong> '{check_label}' muss ergaenzt werden."
def build_html_report(
    results: list[DocCheckResult],
    cookie_result: dict | None,
    doc_texts: dict[str, str] | None = None,
) -> str:
    """Assemble the HTML email report, styled like the frontend.

    ``doc_texts`` maps a document type to the document's text so that the
    hint boxes can point at the relevant paragraph of the original document
    for the insertion recommendation. One card is rendered per result, and
    a cookie-banner card follows when ``cookie_result`` is non-empty.
    """
    texts_by_type = doc_texts if doc_texts else {}
    complete = len([res for res in results if res.completeness_pct == 100])
    parts = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto">',
        '<h2 style="margin-bottom:4px">Dokumenten-Pruefung</h2>',
        f'<p style="color:#6b7280;margin-top:0">'
        f'{len(results)} Dokumente, {complete} vollstaendig</p>',
    ]
    for res in results:
        source_text = texts_by_type.get(res.doc_type, "")
        _render_document(parts, res, source_text)
    if cookie_result:
        _render_cookie_banner(parts, cookie_result)
    parts.append('</div>')
    return "\n".join(parts)
def _render_document(html: list[str], r: DocCheckResult, doc_text: str = "") -> None:
    """Append the HTML card for one checked document to ``html``.

    The card consists of a header (status badge, document label, pass
    counters, completeness bar and, when available, a second blue bar for
    the correctness value) and a body. The body is either an explanatory
    message (document missing, not applicable, or failed with an error) or
    the list of level-1 checks followed by a Master-Control summary and the
    analysed word count.

    ``doc_text`` is handed to the check renderers unchanged.
    """
    pct = r.completeness_pct
    cpct = r.correctness_pct
    bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
    status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
    # The kind of problem is derived from the prefix of the error text.
    is_missing = bool(r.error) and (
        r.error.startswith("Nicht eingereicht")
        or r.error.startswith("Auf der Website nicht gefunden")
    )
    is_not_applicable = bool(r.error) and r.error.startswith("Nicht anwendbar")
    # An error overrides the percentage-based status label.
    if is_missing:
        status_label = ("NICHT GEFUNDEN"
                        if r.error.startswith("Auf der Website")
                        else "NICHT EINGEREICHT")
    elif is_not_applicable:
        status_label = "NICHT ANWENDBAR"
    elif r.error:
        status_label = "FEHLER"
    l1_checks = [c for c in r.checks if c.level == 1]
    # Group level-2 checks under the value of their `parent` attribute.
    l2_by_parent: dict[str, list[CheckItem]] = {}
    for c in r.checks:
        if c.level == 2 and c.parent:
            l2_by_parent.setdefault(c.parent, []).append(c)
    l1_passed = sum(1 for c in l1_checks if c.passed)
    # Skipped level-2 checks are left out of the detail counters.
    l2_active = [c for c in r.checks if c.level == 2 and not c.skipped]
    l2_passed = sum(1 for c in l2_active if c.passed)
    # Header
    html.append(
        f'<div style="border:1px solid #e5e7eb;border-radius:8px;margin-bottom:12px;overflow:hidden">'
        f'<div style="padding:12px 16px;background:#f9fafb">'
        f'<div style="display:flex;justify-content:space-between;align-items:center"><div>'
        f'<span style="font-size:11px;background:#f3f4f6;padding:2px 8px;border-radius:4px;'
        f'color:#4b5563;font-weight:500;margin-right:8px">{status_label}</span>'
        f'<strong style="font-size:14px">{r.label}</strong>'
        f'<div style="font-size:12px;color:#6b7280;margin-top:2px">'
        f'{l1_passed}/{len(l1_checks)} Pflichtangaben'
    )
    if l2_active:
        html.append(f', {l2_passed}/{len(l2_active)} Detailpruefungen')
    html.append(f'</div></div><div style="text-align:right">{_bar(pct, bar_color)}')
    # The second bar is only shown for a truthy correctness value and when
    # active level-2 checks exist.
    if cpct and l2_active:
        html.append(f'<br>{_bar(cpct, "blue")}')
    html.append('</div></div></div>')
    # Body
    if is_missing:
        # Two wordings: not found on the website vs. nothing submitted.
        body_msg = (
            "Wir haben die Hauptseite durchsucht, aber kein Dokument fuer "
            "diese Pflichtangabe gefunden. Pruefen Sie, ob es auf der "
            "Website existiert und tragen Sie die URL manuell nach."
            if r.error.startswith("Auf der Website")
            else "Keine URL oder Text fuer dieses Dokument angegeben. "
                 "Tragen Sie die Quelle im Compliance-Check Formular nach, "
                 "um diese Pflichtangabe zu pruefen."
        )
        html.append(
            '<div style="padding:12px 16px;color:#6b7280;font-size:12px;'
            'background:#fafafa;border-top:1px solid #f3f4f6">'
            + body_msg + '</div>'
        )
    elif is_not_applicable:
        # The error text itself is shown, in a neutral grey box.
        html.append(
            '<div style="padding:12px 16px;color:#475569;font-size:12px;'
            'background:#f1f5f9;border-top:1px solid #cbd5e1;border-left:'
            '3px solid #94a3b8">'
            + r.error + '</div>'
        )
    elif r.error:
        html.append(f'<div style="padding:12px 16px;color:#991b1b">{r.error}</div>')
    else:
        html.append('<div style="padding:8px 16px 12px">')
        # Children are looked up by the level-1 check's id.
        for c in l1_checks:
            _render_l1_check(html, c, l2_by_parent.get(c.id, []), doc_text)
        # Master-Control aggregation: with 1874 MCs evaluated per run,
        # rendering every L2 check inline produces ~600 rows per doc and
        # makes the email unreadable. Show only top-N severe fails plus a
        # one-line summary. Full results live in /sdk/agent/audit/<id>.
        from compliance.api.agent_doc_check_scorecard import build_top_fails_html
        from compliance.services.mc_scorecard import top_fails
        # Master-Control checks are recognised by their "mc-" id prefix.
        mc_results = [
            {"id": c.id, "label": c.label, "passed": c.passed,
             "severity": c.severity, "skipped": c.skipped, "hint": c.hint,
             "regulation": c.regulation}
            for c in r.checks
            if c.id.startswith("mc-")
        ]
        if mc_results:
            n_total = len(mc_results)
            n_passed = sum(1 for x in mc_results if x["passed"])
            n_skipped = sum(1 for x in mc_results if x["skipped"])
            # Whatever is neither passed nor skipped counts as failed.
            n_failed = n_total - n_passed - n_skipped
            html.append(
                f'<div style="margin-top:12px;padding-top:8px;'
                f'border-top:1px solid #e5e7eb;font-size:11px;color:#475569">'
                f'<strong>Master-Controls:</strong> {n_passed}/'
                f'{n_total - n_skipped} bestanden '
                f'<span style="color:#dc2626">({n_failed} Fail)</span>'
                f'{f" + {n_skipped} nicht anwendbar" if n_skipped else ""}.'
                f'</div>'
            )
            top = top_fails(mc_results, n=10)
            html.append(build_top_fails_html(top, r.label))
        if r.word_count:
            html.append(
                f'<div style="font-size:11px;color:#9ca3af;margin-top:8px;'
                f'padding-top:8px;border-top:1px solid #e5e7eb">'
                f'{r.word_count} Woerter analysiert</div>'
            )
        # Closes the check-list container opened at the top of this branch.
        html.append('</div>')
    # Closes the outer card opened in the header.
    html.append('</div>')
def _render_l1_check(
    html: list[str], c: CheckItem, children: list[CheckItem],
    doc_text: str = "",
) -> None:
    """Append one level-1 check row and its level-2 children to ``html``.

    The row shows the status icon, the check label and — if there are
    non-skipped children — a "(passed/total)" counter over them. A failed
    check with a hint gets a hint box. Skipped children are not rendered.

    A skipped level-1 check is shown with the neutral skip icon and without
    failure styling or hint box, matching how skipped checks are left out
    of the management summary's action list.
    """
    l2_sub = [ch for ch in children if not ch.skipped]
    l2_passed = sum(1 for ch in l2_sub if ch.passed)
    # Only a check that actually ran and did not pass counts as failed.
    failed = not c.passed and not c.skipped
    style = "color:#991b1b;font-weight:600" if failed else "color:#374151"
    html.append(
        f'<div style="padding:3px 0">{_icon(c.passed, c.skipped)} '
        f'<span style="font-size:13px;{style}">{c.label}</span>'
    )
    if l2_sub:
        html.append(f' <span style="color:#9ca3af;font-size:11px">({l2_passed}/{len(l2_sub)})</span>')
    if failed and c.hint:
        html.append(_hint_box(c.hint, c.label, doc_text))
    html.append('</div>')
    for ch in l2_sub:
        _render_l2_check(html, ch, doc_text)
def _render_l2_check(html: list[str], ch: CheckItem, doc_text: str = "") -> None:
    """Append one indented level-2 check row to ``html``.

    A passed check with a matched text shows the first 80 characters of
    that text as a monospace excerpt; a failed check with a hint gets a
    hint box. The excerpt is HTML-escaped before it is embedded, because
    characters such as ``<`` or ``&`` in it would otherwise be interpreted
    as markup and could break the email layout.
    """
    # Imported by name: the parameter `html` shadows the module name here.
    from html import escape

    style = "color:#dc2626;font-weight:500" if not ch.passed else "color:#6b7280"
    html.append(
        f'<div style="padding:2px 0 2px 24px;border-left:2px solid #e5e7eb;margin-left:8px">'
        f'{_icon(ch.passed)} '
        f'<span style="font-size:12px;{style}">{ch.label}</span>'
    )
    if ch.passed and ch.matched_text:
        # Cut first, then escape, so an entity is never split in half.
        excerpt = escape(ch.matched_text[:80], quote=False)
        html.append(
            f'<div style="font-size:10px;color:#9ca3af;font-family:monospace;'
            f'margin-left:20px;overflow:hidden;text-overflow:ellipsis;'
            f'white-space:nowrap">"...{excerpt}..."</div>'
        )
    if not ch.passed and ch.hint:
        html.append(_hint_box(ch.hint, ch.label, doc_text))
    html.append('</div>')
def _render_cookie_banner(html: list[str], cookie_result: dict) -> None:
html.append(
'<div style="border:1px solid #e5e7eb;border-radius:8px;'
'padding:12px 16px;margin-bottom:12px">'
'<strong>Cookie-Banner Pruefung</strong><br>'
f'Banner erkannt: {cookie_result.get("banner_detected", False)}<br>'
f'Anbieter: {cookie_result.get("banner_provider", "unbekannt")}'
)
violations = cookie_result.get("banner_checks", {}).get("violations", [])
if violations:
for v in violations[:10]:
html.append(f'<br>{_icon(False)} {v.get("text", "")[:80]}')
else:
html.append('<br><span style="color:#22c55e">Keine Verstoesse erkannt.</span>')
html.append('</div>')
# Re-export the helpers extracted to agent_doc_check_extras.py so existing
# callers that did `from .agent_doc_check_report import build_scanned_urls_html`
# keep working.
from .agent_doc_check_extras import ( # noqa: E402,F401
build_provider_list_html,
build_scanned_urls_html,
)
def build_profile_html(profile) -> str:
    """Render a small HTML box describing the detected business profile.

    The box is a four-row table: business type (upper-cased) with industry,
    the feature flags that are set, up to ten detected services, and the
    confidence as an integer percentage. Empty flag and service lists are
    shown as "keine" and "keine erkannt" respectively.
    """
    services = ", ".join(profile.detected_services[:10]) or "keine erkannt"
    # One slot per flag; unset flags leave an empty string that is dropped.
    markers = [
        "Online-Shop" if profile.has_online_shop else "",
        "Redaktionelle Inhalte" if profile.has_editorial_content else "",
        (f"Regulierter Beruf ({profile.regulated_profession_type})"
         if profile.is_regulated_profession else ""),
        "ODR-pflichtig" if profile.needs_odr else "",
    ]
    features = ", ".join(marker for marker in markers if marker) or "keine"

    def row(caption: str, value: str) -> str:
        # One table row: grey caption cell followed by the value cell.
        return (
            f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">{caption}:</td>'
            f'<td>{value}</td></tr>'
        )

    kind = f'<strong>{profile.business_type.upper()}</strong> ({profile.industry})'
    rows = [
        row("Typ", kind),
        row("Merkmale", features),
        row("Dienste", services),
        row("Konfidenz", f'{int(profile.confidence * 100)}%'),
    ]
    opening = (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#0369a1">'
        'Erkanntes Geschaeftsmodell</h3>'
        '<table style="font-size:13px;color:#374151">'
    )
    return opening + "".join(rows) + '</table></div>'