@router.get("/snapshots/{snapshot_id}")
async def get_snapshot(snapshot_id: str):
    """P80: return the full raw data stored for one snapshot.

    Args:
        snapshot_id: Identifier handed to ``load_snapshot``.

    Returns:
        Whatever ``load_snapshot`` returns for the id (passed through as-is).

    Raises:
        HTTPException: 404 when ``load_snapshot`` returns a falsy value.
    """
    from fastapi import HTTPException
    from database import SessionLocal
    from compliance.services.check_snapshot import load_snapshot

    db = SessionLocal()
    try:
        snap = load_snapshot(db, snapshot_id)
        if not snap:
            raise HTTPException(status_code=404, detail="snapshot not found")
        return snap
    finally:
        db.close()


@router.post("/snapshots/{snapshot_id}/replay")
async def replay_snapshot(
    snapshot_id: str,
    recipient: str = "",
    dry_run: bool = True,
):
    """P80: replay the audit-mail render from a stored snapshot.

    With the default ``dry_run=True`` the mail is only rendered and the
    result dict of ``replay_from_snapshot`` (HTML size, per-section sizes,
    preview) is returned. Pass ``recipient`` together with ``dry_run=False``
    to actually send a ``[REPLAY]`` mail.

    Args:
        snapshot_id: Identifier of the snapshot to replay.
        recipient: Mail address for the replay mail; empty or
            whitespace-only means "do not send".
        dry_run: When true, never send a mail.

    Returns:
        The result dict produced by ``replay_from_snapshot``.

    Raises:
        HTTPException: 404 when the snapshot does not exist, mirroring the
            behaviour of ``GET /snapshots/{snapshot_id}``.
    """
    from fastapi import HTTPException
    from database import SessionLocal
    from compliance.services.check_replay import replay_from_snapshot

    db = SessionLocal()
    try:
        outcome = replay_from_snapshot(
            db,
            snapshot_id=snapshot_id,
            recipient=recipient.strip() or None,
            dry_run=dry_run,
        )
    finally:
        db.close()

    # The service signals a missing snapshot with an error dict instead of
    # raising; translate it so clients get a real 404 rather than a 200
    # carrying an error payload.
    if isinstance(outcome, dict) and outcome.get("error") == "snapshot not found":
        raise HTTPException(status_code=404, detail="snapshot not found")
    return outcome
def replay_from_snapshot(
    db: Session,
    snapshot_id: str,
    recipient: str | None = None,
    dry_run: bool = False,
) -> dict:
    """Re-render the audit mail from a persisted snapshot and optionally send it.

    The sections (critical findings, scope disclaimer, banner deep-dive, VVT
    table) are rendered with the current renderer code. Each section is
    best-effort: a failing renderer is logged, recorded under
    ``section_errors`` and skipped, so one broken section never prevents the
    remaining ones from rendering.

    Only snapshot fields that a v1 section consumes are read; document texts
    and the business profile stored in the snapshot are not used here.

    Args:
        db: SQLAlchemy session used to load the snapshot.
        snapshot_id: Identifier of the snapshot to replay.
        recipient: Mail address to send the rendered mail to. ``None`` or
            empty means no mail is sent.
        dry_run: If true, render only and never send a mail.

    Returns:
        ``{"error", "snapshot_id"}`` when the snapshot cannot be loaded.
        Otherwise a dict with ``snapshot_id``, ``check_id``, ``site_domain``,
        ``html_size``, ``sections`` (section name -> rendered length),
        ``section_errors`` (section name -> truncated error text),
        ``mail_sent`` and ``preview`` (first 500 characters), plus
        ``mail_status`` or ``mail_send_error`` when a send was attempted.
    """
    from types import SimpleNamespace

    snap = load_snapshot(db, snapshot_id)
    if not snap:
        return {"error": "snapshot not found", "snapshot_id": snapshot_id}

    doc_entries = snap.get("doc_entries") or []
    banner_result = snap.get("banner_result") or {}
    cmp_vendors = snap.get("cmp_vendors") or []
    site_label = (
        snap.get("site_label") or snap.get("site_domain") or "unknown site"
    )

    def _entry_to_result(entry: dict) -> Any:
        """Build a minimal stand-in for a document check result.

        The snapshot stores plain dicts, so only the attributes set here are
        available to the renderers; ``checks`` is always empty.
        """
        return SimpleNamespace(
            doc_type=entry.get("doc_type", "other"),
            label=entry.get("doc_type", "Dokument"),
            completeness_pct=entry.get("completeness_pct", 0),
            correctness_pct=entry.get("correctness_pct"),
            checks=[],
            error=entry.get("error", ""),
        )

    results = [_entry_to_result(entry) for entry in doc_entries]

    # Renderer imports stay inside the builders so that a missing or broken
    # renderer module only costs its own section.
    def _critical():
        from compliance.api.agent_doc_check_critical import (
            build_critical_findings_html,
        )
        return build_critical_findings_html(banner_result, None, results)

    def _disclaimer():
        from compliance.api.scope_disclaimer import build_scope_disclaimer_html
        return build_scope_disclaimer_html()

    def _banner():
        from compliance.api.agent_doc_check_banner import build_banner_deep_html
        return build_banner_deep_html(banner_result)

    def _vvt():
        from compliance.api.vvt_table_renderer import build_vvt_table_html
        return build_vvt_table_html(cmp_vendors)

    # (result key, label used in the log message, builder) in mail order.
    section_builders = (
        ("critical", "critical-block", _critical),
        ("disclaimer", "disclaimer", _disclaimer),
        ("banner", "banner-block", _banner),
        ("vvt", "vvt", _vvt),
    )

    section_sizes: dict[str, int] = {}
    section_errors: dict[str, str] = {}
    parts: list[str] = []

    for key, log_label, build in section_builders:
        try:
            html = build() or ""
        except Exception as exc:  # best-effort: keep rendering other sections
            logger.warning("Replay: %s failed: %s", log_label, exc)
            section_errors[key] = str(exc)[:200]
            continue
        # Append only after the section rendered successfully, so a renderer
        # returning None can never end up in ``parts`` and break the join.
        parts.append(html)
        section_sizes[key] = len(html)

    full_html = "".join(parts)
    preview = (full_html[:500] + "...") if len(full_html) > 500 else full_html

    result = {
        "snapshot_id": snapshot_id,
        "check_id": snap.get("check_id"),
        "site_domain": snap.get("site_domain"),
        "html_size": len(full_html),
        "sections": section_sizes,
        "section_errors": section_errors,
        "mail_sent": False,
        "preview": preview,
    }

    if recipient and not dry_run:
        try:
            from compliance.services.email_sender import send_email
            email_res = send_email(
                recipient=recipient,
                subject=f"[REPLAY] {site_label} (Snapshot {snapshot_id[:8]})",
                body_html=full_html,
            )
            result["mail_sent"] = (email_res.get("status") == "sent")
            result["mail_status"] = email_res.get("status")
        except Exception as e:  # a failed send must not lose the render result
            logger.warning("Replay: mail send failed: %s", e)
            result["mail_send_error"] = str(e)[:200]

    return result
# Every rule is anchored with a "not preceded by a word character" lookbehind.
# For alternatives that start with a letter or digit this is the same as a
# leading \b, but unlike \b it also lets alternatives starting with the
# paragraph sign match after a space or at the start of the text.
_WORD_START = r"(?<!\w)"

# Patterns are written with ASCII transliterations (ae/oe/ue/ss); the text is
# transliterated the same way before matching.
_UMLAUT_TRANSLATION = str.maketrans(
    {"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"}
)

# Ordered (scope, pattern) rules; the first matching rule wins, so narrower
# scopes come before the broad DSE / process buckets.
RULES_V2: list[tuple[str, Pattern]] = [
    # Banner UI (deliberately narrow)
    (
        "banner_implementation",
        re.compile(
            _WORD_START
            + r"(banner|cookie[-\s]?wall|pre[-\s]?ticked|"
            r"vorausgewaehlt(e|en)?\s*checkbox|"
            r"browser[-\s]?(default|standard|einstellung).{0,30}einwilligung|"
            r"dark[-\s]?pattern|reject.{0,20}button|ablehn.{0,20}button|"
            r"floating.{0,20}icon|cookie[-\s]?einstellungen)"
        ),
    ),
    # CMP audit trail
    (
        "cmp_audit",
        re.compile(
            _WORD_START
            + r"(consent[-\s]?(log|trail|audit)|"
            r"einwilligungs(nachweis|log|trail|protokoll)|"
            r"datensaetze?.{0,30}einwilligung|"
            r"zeitstempel.{0,30}einwilligung)"
        ),
    ),
    # Processor agreement (Art. 28); "adv" is bounded so that words such as
    # "advanced" or "advertising" do not land in this bucket.
    (
        "avv",
        re.compile(
            _WORD_START
            + r"(art\.?\s*28|auftragsverarbeit|adv\b|avv|"
            r"data[-\s]?processing[-\s]?agreement|dpa|"
            r"weisungsgebunden(er)?\s*auftragsverarbeit)"
        ),
    ),
    # Joint controllership (Art. 26)
    (
        "jc",
        re.compile(
            _WORD_START
            + r"(art\.?\s*26|joint[-\s]?controller|"
            r"gemeinsam(e|er)\s*verantwortlich|"
            r"konzern.{0,40}(verantwortlich|verarbeit)|"
            r"gemeinsame.{0,20}verarbeitung)"
        ),
    ),
    # Imprint
    (
        "impressum",
        re.compile(
            _WORD_START
            + r"((paragraph|§)\s*5\s*(tmg|ddg)|"
            r"§\s*18\s*mstv|"
            r"impressum|anbieterkennzeichnung|"
            r"geschaeftsbrief|firma.{0,20}kaufmann|"
            r"vollstaendige.{0,20}geschaeftsadresse|"
            r"postalische?.{0,30}adresse|"
            r"handelsregister.{0,30}eintrag|"
            r"§\s*55\s*rstv)"
        ),
    ),
    # Terms and conditions
    (
        "agb",
        re.compile(
            _WORD_START
            + r"(agb|allgemeine\s*geschaeftsbedingungen|"
            r"vertragsbedingungen|"
            r"§\s*305.{0,5}(bgb)?|"
            r"klausel.{0,30}wirksam|"
            r"vertragsabschluss.{0,40}online)"
        ),
    ),
    # Right of withdrawal
    (
        "widerruf",
        re.compile(
            _WORD_START
            + r"(widerrufsbelehrung|widerrufsrecht|"
            r"14.{0,10}tage.{0,10}frist|"
            r"musterwiderruf|"
            r"§\s*355\s*bgb)"
        ),
    ),
    # Accounting / invoicing
    (
        "accounting",
        re.compile(
            _WORD_START
            + r"((rechnung|invoice).{0,30}(angeben|enthalten|fuehren)|"
            r"§\s*14\s*ustg|umsatzsteueridentifikation\s+nummer.{0,30}rechnung|"
            r"buchhaltung|"
            r"steuernummer.{0,30}rechnung)"
        ),
    ),
    # Cookie policy (extended)
    (
        "cookie_richtlinie",
        re.compile(
            _WORD_START
            + r"(cookie[-\s]?(richtlinie|policy|liste|tabelle|verzeichnis)|"
            r"§\s*25\s*(tddg|tdddg|ttdsg)|"
            r"§\s*165.{0,5}tkg|"
            r"tracking[-\s]?technologi|"
            r"tracking[-\s]?(pixel|skript)|"
            r"webstorage|local[-\s]?storage.{0,30}einwilligung|"
            r"cookie.{0,30}(zweck|speicherdauer|drittland|anbieter)|"
            r"google\s+analytics|matomo|piwik|hotjar|"
            r"facebook\s+pixel|meta\s+pixel|google\s+tag\s+manager)"
        ),
    ),
    # Technical and organisational measures (considerably extended); "tom" is
    # bounded (optional plural "s") so that e.g. "tomcat" does not match.
    (
        "tom",
        re.compile(
            _WORD_START
            + r"(art\.?\s*32|"
            r"verschluesselung|encryption|kryptograph|"
            r"backup|wiederherstell|recovery|"
            r"pseudonymisier|anonymisier|"
            r"zugriffskontrolle|berechtigungskonzept|"
            r"benutzerverwaltung|identity[-\s]?management|"
            r"penetrationstest|security[-\s]?incident|"
            r"intrusion[-\s]?detection|firewall|"
            r"toms?\b|technisch[-\s]?organisatorische|"
            r"iso[-\s]?2700[12]|bsi[-\s]?grundschutz|"
            r"protokollier(ung)?|audit[-\s]?log|"
            r"datensicherheit|netzwerksicherheit|"
            r"patch[-\s]?management|update[-\s]?prozess|"
            r"physische?\s+sicherheit|zutrittskontrolle)"
        ),
    ),
    # Privacy notice (considerably extended; most important bucket for
    # current audits)
    (
        "dse",
        re.compile(
            _WORD_START
            + r"(art\.?\s*1[34]|"
            r"datenschutzerklaerung|datenschutzhinweis|datenschutzinformation|"
            r"informationspflicht|"
            r"empfaenger(\s*oder\s*empfaengerkategorien)?|"
            r"drittland.{0,30}(transfer|uebermittlung)|"
            r"verantwortlich(er|en)\s+benennen|"
            r"rechtsgrundlage.{0,30}(verarbeitung|nennen|angeben)|"
            r"betroffenenrecht|"
            r"art\.?\s*1[5-9]\s*dsgvo|art\.?\s*2[0-2]\s*dsgvo|"
            r"art\.?\s*1[5-9]|art\.?\s*2[0-2]|"
            r"auskunftsrecht|berichtigungsrecht|loeschungsrecht|"
            r"einschraenkungsrecht|datenportabilitaet|widerspruchsrecht|"
            r"einwilligung.{0,40}widerruf|"
            r"datenschutzbeauftragt(er|en)?|dsb|"
            r"aufsichtsbehoerde.{0,30}(benennen|nennen|kontakt)|"
            r"beschwerderecht|art\.?\s*77|"
            r"speicherdauer|loeschfrist|aufbewahrungsfrist|"
            r"besondere\s+kategorien|art\.?\s*9\s*dsgvo|"
            r"profiling|automatisierte\s+entscheidung|art\.?\s*22)"
        ),
    ),
    # Process controls (extended)
    (
        "process",
        re.compile(
            _WORD_START
            + r"(prozess|verfahren|workflow|"
            r"durchfuehren|umsetzen|implementieren|"
            r"schulung|mitarbeiterunterweis|sensibilisier|"
            r"regelmaessig.{0,30}pruefen|"
            r"kontinuierlich|laufend|fortlaufend|"
            r"datenpannenmeldung|art\.?\s*3[34]|"
            r"meldepflicht.{0,30}(behoerd|datenpannen|breach)|"
            r"loeschkonzept|loeschroutine|"
            r"hinweisgebersystem|whistleblow|hinschg|"
            r"dsfa|datenschutz[-\s]?folgenabschaetz|art\.?\s*35|"
            r"vvt|verzeichnis.{0,30}verarbeitung|art\.?\s*30|"
            r"risiko(analyse|bewertung|management)|"
            r"penetration.{0,30}durchfuehr|"
            r"audit.{0,30}durchfuehr|"
            r"kontrolle.{0,30}(durchfuehr|etablier)|"
            r"nachweis(fuehrung|pflicht)|rechenschaft)"
        ),
    ),
]


def _as_text(value) -> str:
    """Return *value* as plain text; lists/tuples are joined with spaces."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return " ".join(str(item) for item in value if item is not None)
    return str(value)


def classify_v2(title: str, objective: str, tags: str | None = None) -> str:
    """Return the scope bucket for a control, or ``"other"`` if no rule hits.

    Title, objective and tags are joined, lower-cased and transliterated
    (ä->ae, ö->oe, ü->ue, ß->ss) so that text written with real umlauts
    matches the ASCII patterns in ``RULES_V2``. Rules are tried in order and
    the first match wins.

    Args:
        title: Control title; may be empty or ``None``.
        objective: Control objective; may be empty or ``None``.
        tags: Optional tags. A string is used as-is; a list or tuple of tags
            is also tolerated and joined with spaces.

    Returns:
        The scope name of the first matching rule, otherwise ``"other"``.
    """
    fragments = [_as_text(part) for part in (title, objective, tags)]
    text = " ".join(fragment for fragment in fragments if fragment)
    text = text.lower().translate(_UMLAUT_TRANSLATION)
    for scope, pattern in RULES_V2:
        if pattern.search(text):
            return scope
    return "other"


def main() -> int:
    """Re-classify the 'other' bucket of canonical controls with ``classify_v2``.

    Reads the connection string from ``DATABASE_URL``, loads all non-merged
    controls whose ``scope_doc_type`` is ``'other'``, and writes the new
    scope for every control that now matches a rule. Updates are committed
    in batches of 1000; progress goes to stderr, the summary to stdout.

    Returns:
        Process exit code: ``1`` when ``DATABASE_URL`` is not set, else ``0``.
    """
    from collections import Counter
    from contextlib import closing

    dsn = os.environ.get("DATABASE_URL")
    if not dsn:
        print("DATABASE_URL missing", file=sys.stderr)
        return 1

    update_sql = (
        "UPDATE compliance.canonical_controls SET scope_doc_type=%s WHERE id=%s"
    )
    batch_size = 1000

    # closing() guarantees cursor and connection are released even when a
    # statement fails part-way through the run.
    with closing(psycopg2.connect(dsn)) as conn, closing(conn.cursor()) as cur:
        # Only the 'other' bucket left over from v1 is re-classified.
        cur.execute("""
            SELECT id, title, objective, tags
            FROM compliance.canonical_controls
            WHERE scope_doc_type = 'other'
              AND merged_into_uuid IS NULL
        """)
        rows = cur.fetchall()
        print(
            f"Re-classifying {len(rows):,} 'other'-bucket MCs with v2 heuristic...",
            file=sys.stderr,
        )

        def _flush(pending: list) -> None:
            """Write one batch of (scope, id) pairs and commit it."""
            cur.executemany(update_sql, pending)
            conn.commit()

        stats = Counter()
        moved = 0
        batch = []
        for processed, (row_id, title, objective, tags) in enumerate(rows, 1):
            new_scope = classify_v2(title or "", objective or "", tags)
            if new_scope == "other":
                continue
            moved += 1
            stats[new_scope] += 1
            batch.append((new_scope, row_id))
            if len(batch) >= batch_size:
                _flush(batch)
                print(
                    f"  processed {processed:,}/{len(rows):,}, moved={moved:,}",
                    file=sys.stderr,
                )
                batch = []
        if batch:
            _flush(batch)

    print(f"\n=== v2 Re-classification: moved {moved:,} / {len(rows):,} from 'other' ===")
    for scope, n in sorted(stats.items(), key=lambda x: -x[1]):
        pct = 100 * n / max(1, len(rows))
        print(f"  {scope:25s} {n:>7,} ({pct:>4.1f}%)")
    return 0


if __name__ == "__main__":
    sys.exit(main())