""" P80 — Snapshot + Replay-Helper. Persistiert die Roh-Daten eines Compliance-Check-Laufs (DSE-Text, Banner-HTML, Cookies, CMP-Vendors, Profile), damit die Audit-Pipeline spaeter ohne erneuten Browser-Crawl die Mail-Render-/MC-Scoring-Logik neu laufen kann. Use Cases: * Logik-Iteration (MC-Filter P72, Mail-Layout, Action-Recipes) ohne 7min Re-Crawl. * Regression-Test: Golden-Truth-Library (P81). * Diff-Mode: "was hat sich seit letztem Snapshot geaendert" (P84). """ from __future__ import annotations import json import logging from typing import Any from urllib.parse import urlparse from sqlalchemy import text from sqlalchemy.orm import Session logger = logging.getLogger(__name__) def _to_jsonb(obj: Any) -> str: """Serialize to JSON-string for psycopg2 JSONB insertion.""" return json.dumps(obj, default=str, ensure_ascii=False) def _derive_site_domain(doc_entries: list[dict]) -> str: for e in doc_entries or []: url = (e.get("url") or "").strip() if url: try: netloc = urlparse(url).netloc.lower().replace("www.", "") if netloc: return netloc except Exception: continue return "unknown" def save_snapshot( db: Session, check_id: str, doc_entries: list[dict], banner_result: dict | None, profile: Any, cmp_vendors: list[dict] | None = None, scan_context: dict | None = None, site_label: str | None = None, notes: str | None = None, ) -> str | None: """Persist scan raw data. Returns snapshot UUID on success.""" try: profile_dict: dict = {} if profile is not None: if hasattr(profile, "__dict__"): profile_dict = {k: v for k, v in profile.__dict__.items() if not k.startswith("_")} elif isinstance(profile, dict): profile_dict = profile domain = _derive_site_domain(doc_entries or []) result = db.execute( text(""" INSERT INTO compliance.compliance_check_snapshots (check_id, site_domain, site_label, doc_entries, banner_result, profile, scan_context, cmp_vendors, notes) VALUES (:cid, :dom, :lbl, CAST(:de AS JSONB), CAST(:br AS JSONB), CAST(:pr AS JSONB), CAST(:sc AS JSONB), CAST(:cv AS JSONB), :nt) RETURNING id """), { "cid": check_id, "dom": domain, "lbl": site_label, "de": _to_jsonb(doc_entries or []), "br": _to_jsonb(banner_result) if banner_result else None, "pr": _to_jsonb(profile_dict) if profile_dict else None, "sc": _to_jsonb(scan_context) if scan_context else None, "cv": _to_jsonb(cmp_vendors) if cmp_vendors else None, "nt": notes, }, ) snapshot_id = str(result.fetchone()[0]) db.commit() logger.info( "P80: snapshot saved id=%s check=%s domain=%s docs=%d", snapshot_id, check_id, domain, len(doc_entries or []), ) return snapshot_id except Exception as e: logger.warning("P80 snapshot save failed for %s: %s", check_id, e) try: db.rollback() except Exception: pass return None def load_snapshot(db: Session, snapshot_id: str) -> dict | None: """Load a snapshot by UUID. Returns dict with all fields or None.""" try: row = db.execute( text(""" SELECT id, check_id, site_domain, site_label, doc_entries, banner_result, profile, scan_context, cmp_vendors, created_at, replay_count, notes FROM compliance.compliance_check_snapshots WHERE id = CAST(:sid AS uuid) """), {"sid": snapshot_id}, ).fetchone() if not row: return None db.execute( text(""" UPDATE compliance.compliance_check_snapshots SET replay_count = replay_count + 1, last_replay_at = now() WHERE id = CAST(:sid AS uuid) """), {"sid": snapshot_id}, ) db.commit() return { "id": str(row[0]), "check_id": row[1], "site_domain": row[2], "site_label": row[3], "doc_entries": row[4] or [], "banner_result": row[5], "profile": row[6] or {}, "scan_context": row[7] or {}, "cmp_vendors": row[8] or [], "created_at": str(row[9]), "replay_count": row[10], "notes": row[11], } except Exception as e: logger.warning("P80 snapshot load failed for %s: %s", snapshot_id, e) return None def list_snapshots_for_domain(db: Session, domain: str, limit: int = 20) -> list[dict]: """List recent snapshots for a domain (for diff-mode P84).""" try: rows = db.execute( text(""" SELECT id, check_id, site_domain, created_at, replay_count, notes FROM compliance.compliance_check_snapshots WHERE site_domain = :dom ORDER BY created_at DESC LIMIT :lim """), {"dom": domain.lower().replace("www.", ""), "lim": limit}, ).fetchall() return [ { "id": str(r[0]), "check_id": r[1], "site_domain": r[2], "created_at": str(r[3]), "replay_count": r[4], "notes": r[5], } for r in rows ] except Exception as e: logger.warning("P80 list_snapshots failed for %s: %s", domain, e) return []