c7fde93061
- check_snapshot: update_browser_matrix/load_browser_matrix — migrationsfrei
in banner_result.browser_matrix (JSONB jsonb_set, eigener scanned_at)
- snapshot_check_routes: POST /snapshots/{id}/browser-behavior/run laeuft
/scan-matrix LIVE (Re-Crawl je Engine, nur live messbar), persistiert das
Ergebnis; GET /snapshots/{id}/browser-behavior liefert die gespeicherte
Matrix ohne Re-Crawl. Profil-Set = 4 Default-Engines + Brave/Chrome/Edge.
- consent-tester multi_browser_scanner: Semaphore(2) gegen OOM (7 Browser
parallel sprengten das 2g-mem_limit)
- Pydantic-Modell mit Optional[List[...]] (nicht `| None`) → Py3.9-sicher
- Tests: _snapshot_scan_url + Request-Defaults (5)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
226 lines
7.7 KiB
Python
226 lines
7.7 KiB
Python
"""
|
|
P80 — Snapshot + Replay-Helper.
|
|
|
|
Persistiert die Roh-Daten eines Compliance-Check-Laufs (DSE-Text,
|
|
Banner-HTML, Cookies, CMP-Vendors, Profile), damit die Audit-Pipeline
|
|
spaeter ohne erneuten Browser-Crawl die Mail-Render-/MC-Scoring-Logik
|
|
neu laufen kann.
|
|
|
|
Use Cases:
|
|
* Logik-Iteration (MC-Filter P72, Mail-Layout, Action-Recipes) ohne
|
|
7min Re-Crawl.
|
|
* Regression-Test: Golden-Truth-Library (P81).
|
|
* Diff-Mode: "was hat sich seit letztem Snapshot geaendert" (P84).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _to_jsonb(obj: Any) -> str:
    """Encode *obj* as a JSON string for binding to a JSONB parameter.

    Non-ASCII characters are emitted verbatim (not ``\\uXXXX``-escaped), and
    any value the JSON encoder cannot handle natively is converted with
    ``str()`` instead of raising.
    """
    encoded = json.dumps(
        obj,
        ensure_ascii=False,
        default=str,
    )
    return encoded
|
|
|
|
|
|
def _derive_site_domain(doc_entries: list[dict]) -> str:
    """Return the normalised host of the first entry that has a usable URL.

    Entries are scanned in order; the first ``url`` value whose parsed
    network location is non-empty wins. The host is lower-cased and the
    substring ``"www."`` is removed. Malformed entries (not a mapping, a
    non-string ``url``, or a URL that ``urlparse`` rejects) are skipped
    instead of aborting the scan.

    Args:
        doc_entries: Document entries; each is expected to carry a ``url``
            key. ``None`` or an empty list is accepted.

    Returns:
        The normalised host, or ``"unknown"`` if no entry yields one.
    """
    for entry in doc_entries or []:
        try:
            url = (entry.get("url") or "").strip()
            if not url:
                continue
            # NOTE(review): replace() strips "www." anywhere in the host, not
            # only as a prefix. Kept as-is because list_snapshots_for_domain
            # normalises its lookup key the same way and stored rows already
            # use this form.
            netloc = urlparse(url).netloc.lower().replace("www.", "")
        except (AttributeError, TypeError, ValueError):
            # AttributeError: entry is not a mapping / url is not a string.
            # TypeError: bytes url mixed with str arguments.
            # ValueError: urlparse rejected the URL (e.g. invalid IPv6 literal).
            continue
        if netloc:
            return netloc
    return "unknown"
|
|
|
|
|
|
def save_snapshot(
    db: Session,
    check_id: str,
    doc_entries: list[dict],
    banner_result: dict | None,
    profile: Any,
    cmp_vendors: list[dict] | None = None,
    scan_context: dict | None = None,
    site_label: str | None = None,
    notes: str | None = None,
) -> str | None:
    """Persist the raw data of one check run as a snapshot row.

    Inserts into ``compliance.compliance_check_snapshots`` and commits.
    ``doc_entries`` is always stored as a JSON array; the other JSON
    payloads are stored as SQL NULL when they are empty or ``None``.

    Args:
        db: Session used for the insert; committed on success, rolled back
            on failure.
        check_id: Identifier of the check run the snapshot belongs to.
        doc_entries: Document entries; also used to derive ``site_domain``.
        banner_result: Banner scan result, if any.
        profile: Either a dict or an object whose public (non-underscore)
            instance attributes are stored.
        cmp_vendors: CMP vendor list, if any.
        scan_context: Additional scan context, if any.
        site_label: Optional label for the site.
        notes: Optional free-text notes.

    Returns:
        The new snapshot id as a string, or ``None`` if anything failed
        (the failure is logged, never raised).
    """
    try:
        # Test for dict first: instances of dict subclasses also expose a
        # __dict__, so checking hasattr() first would store their (usually
        # empty) attribute dict instead of the mapping content.
        if isinstance(profile, dict):
            profile_dict: dict = profile
        elif hasattr(profile, "__dict__"):
            profile_dict = {
                k: v for k, v in profile.__dict__.items()
                if not k.startswith("_")
            }
        else:
            profile_dict = {}

        entries = doc_entries or []
        domain = _derive_site_domain(entries)
        params = {
            "cid": check_id,
            "dom": domain,
            "lbl": site_label,
            "de": _to_jsonb(entries),
            "br": _to_jsonb(banner_result) if banner_result else None,
            "pr": _to_jsonb(profile_dict) if profile_dict else None,
            "sc": _to_jsonb(scan_context) if scan_context else None,
            "cv": _to_jsonb(cmp_vendors) if cmp_vendors else None,
            "nt": notes,
        }
        result = db.execute(
            text("""
                INSERT INTO compliance.compliance_check_snapshots
                (check_id, site_domain, site_label,
                doc_entries, banner_result, profile,
                scan_context, cmp_vendors, notes)
                VALUES (:cid, :dom, :lbl,
                CAST(:de AS JSONB), CAST(:br AS JSONB), CAST(:pr AS JSONB),
                CAST(:sc AS JSONB), CAST(:cv AS JSONB), :nt)
                RETURNING id
            """),
            params,
        )
        snapshot_id = str(result.fetchone()[0])
        db.commit()
        logger.info(
            "P80: snapshot saved id=%s check=%s domain=%s docs=%d",
            snapshot_id, check_id, domain, len(entries),
        )
        return snapshot_id
    except Exception as e:
        # Snapshotting is best-effort: report the failure, never raise.
        logger.warning("P80 snapshot save failed for %s: %s", check_id, e)
        try:
            db.rollback()
        except Exception:
            logger.debug("P80 rollback after failed snapshot save also failed",
                         exc_info=True)
        return None
|
|
|
|
|
|
def load_snapshot(db: Session, snapshot_id: str) -> dict | None:
    """Load one snapshot by UUID and record the access as a replay.

    After a successful read, the row's ``replay_count`` is incremented and
    ``last_replay_at`` is set to the database's ``now()``; the change is
    committed. The returned ``replay_count`` is the value read *before*
    the increment.

    Args:
        db: Session used for the read and the bookkeeping update.
        snapshot_id: Snapshot UUID as a string.

    Returns:
        A dict with all snapshot fields (missing JSON payloads default to
        ``[]`` / ``{}``, except ``banner_result`` which may be ``None``),
        or ``None`` if the snapshot does not exist or any step failed.
    """
    params = {"sid": snapshot_id}
    try:
        row = db.execute(
            text("""
                SELECT id, check_id, site_domain, site_label,
                doc_entries, banner_result, profile,
                scan_context, cmp_vendors, created_at,
                replay_count, notes
                FROM compliance.compliance_check_snapshots
                WHERE id = CAST(:sid AS uuid)
            """),
            params,
        ).fetchone()
        if not row:
            return None

        # Replay bookkeeping for the row that was just read.
        db.execute(
            text("""
                UPDATE compliance.compliance_check_snapshots
                SET replay_count = replay_count + 1,
                last_replay_at = now()
                WHERE id = CAST(:sid AS uuid)
            """),
            params,
        )
        db.commit()
        return {
            "id": str(row[0]),
            "check_id": row[1],
            "site_domain": row[2],
            "site_label": row[3],
            "doc_entries": row[4] or [],
            "banner_result": row[5],
            "profile": row[6] or {},
            "scan_context": row[7] or {},
            "cmp_vendors": row[8] or [],
            "created_at": str(row[9]),
            "replay_count": row[10],
            "notes": row[11],
        }
    except Exception as e:
        logger.warning("P80 snapshot load failed for %s: %s", snapshot_id, e)
        # A failed statement (e.g. the uuid CAST of a malformed id) leaves the
        # database transaction aborted. Roll back, as save_snapshot and
        # update_browser_matrix do, so the session stays usable.
        try:
            db.rollback()
        except Exception:
            logger.debug("P80 rollback after failed snapshot load also failed",
                         exc_info=True)
        return None
|
|
|
|
|
|
def update_browser_matrix(db: Session, snapshot_id: str, matrix: dict) -> bool:
    """Store a browser-behaviour matrix on an existing snapshot, migration-free.

    The matrix is written into the existing ``banner_result`` JSONB column
    under the key ``browser_matrix`` (via ``jsonb_set``; a NULL
    ``banner_result`` is treated as an empty object), so no schema change
    is needed.

    The matrix object carries its own timestamp (``scanned_at``), which may
    differ from the snapshot's capture time: the matrix is run live and on
    demand, because browser behaviour can only be measured live, unlike
    the text modules.

    Args:
        db: Session used for the update; committed on success, rolled back
            on failure.
        snapshot_id: Snapshot UUID as a string.
        matrix: Matrix result to persist.

    Returns:
        ``True`` if the update was committed, ``False`` if the statement
        failed or matched no snapshot row.
    """
    try:
        result = db.execute(
            text("""
                UPDATE compliance.compliance_check_snapshots
                SET banner_result = jsonb_set(
                COALESCE(banner_result, '{}'::jsonb),
                '{browser_matrix}', CAST(:bm AS JSONB), true)
                WHERE id = CAST(:sid AS uuid)
            """),
            {"sid": snapshot_id, "bm": _to_jsonb(matrix)},
        )
        # Read before commit. An unknown count (attribute missing, or -1 from
        # the driver) is still treated as success; only a definite zero means
        # nothing was written.
        matched = getattr(result, "rowcount", -1)
        db.commit()
    except Exception as e:
        logger.warning("browser-matrix persist failed for %s: %s", snapshot_id, e)
        try:
            db.rollback()
        except Exception:
            logger.debug("rollback after failed browser-matrix persist also failed",
                         exc_info=True)
        return False

    if matched == 0:
        # No row has this id: nothing was persisted, so do not report success.
        logger.warning("browser-matrix persist matched no snapshot for %s",
                       snapshot_id)
        return False
    return True
|
|
|
|
|
|
def load_browser_matrix(db: Session, snapshot_id: str) -> dict | None:
    """Return only the persisted ``browser_matrix`` sub-object (no re-crawl).

    Args:
        db: Session used for the read.
        snapshot_id: Snapshot UUID as a string.

    Returns:
        The value stored under ``banner_result -> 'browser_matrix'``, or
        ``None`` if the snapshot does not exist, the value is missing or
        empty, or the query failed.
    """
    try:
        row = db.execute(
            text("""
                SELECT banner_result -> 'browser_matrix'
                FROM compliance.compliance_check_snapshots
                WHERE id = CAST(:sid AS uuid)
            """),
            {"sid": snapshot_id},
        ).fetchone()
        return row[0] if row and row[0] else None
    except Exception as e:
        logger.warning("browser-matrix load failed for %s: %s", snapshot_id, e)
        # A failed statement (e.g. the uuid CAST of a malformed id) leaves the
        # database transaction aborted. Roll back, as the write helpers in
        # this module do, so the session stays usable.
        try:
            db.rollback()
        except Exception:
            logger.debug("rollback after failed browser-matrix load also failed",
                         exc_info=True)
        return None
|
|
|
|
|
|
def list_snapshots_for_domain(db: Session, domain: str, limit: int = 20) -> list[dict]:
    """List the most recent snapshots of a domain (for diff mode, P84).

    The lookup key is normalised like the stored ``site_domain``:
    lower-cased, with the substring ``"www."`` removed.

    Args:
        db: Session used for the read.
        domain: Domain to look up.
        limit: Maximum number of rows to return, newest first.

    Returns:
        One summary dict per snapshot (``id``, ``check_id``,
        ``site_domain``, ``created_at``, ``replay_count``, ``notes``),
        ordered by ``created_at`` descending; ``[]`` on any failure.
    """
    try:
        rows = db.execute(
            text("""
                SELECT id, check_id, site_domain, created_at, replay_count, notes
                FROM compliance.compliance_check_snapshots
                WHERE site_domain = :dom
                ORDER BY created_at DESC
                LIMIT :lim
            """),
            {"dom": domain.lower().replace("www.", ""), "lim": limit},
        ).fetchall()
        return [
            {
                "id": str(r[0]),
                "check_id": r[1],
                "site_domain": r[2],
                "created_at": str(r[3]),
                "replay_count": r[4],
                "notes": r[5],
            }
            for r in rows
        ]
    except Exception as e:
        logger.warning("P80 list_snapshots failed for %s: %s", domain, e)
        # A statement the database rejects (e.g. a negative LIMIT) leaves the
        # transaction aborted. Roll back, as the write helpers in this module
        # do, so the session stays usable.
        try:
            db.rollback()
        except Exception:
            logger.debug("P80 rollback after failed list_snapshots also failed",
                         exc_info=True)
        return []
|