feat(audit-pipeline): P72 MC-Scope-Classifier + P80 Snapshot/Replay-Foundation [migration-approved]
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
P72 MC-Scope-Classifier — pro MC den ECHTEN Doc-Adressaten festlegen
(cookie_richtlinie/dse/banner_implementation/cmp_audit/tom/avv/jc/
impressum/agb/widerruf/process/accounting/other).
- Migration 145: scope_doc_type Spalte + Index auf canonical_controls
- Backfill-Script mit Regex-Heuristik (12 Regeln, Prioritaet-sortiert)
- Erste 11k-Sample-Distribution: 76% other (Heuristik v1 zu strict —
v2 muss lockerere Patterns fuer DSE/TOM nachschaerfen)
- Ziel: bevor MC-Scorecard filtert, weiss jeder MC welches Dokument
er adressiert. Bisher landeten eHealth-/HGB-MCs im Cookie-Audit.
P80 Snapshot + Replay-Foundation — Roh-Daten persistieren damit
Audit-Pipeline ohne erneuten Crawl rebuildbar ist.
- Migration 146: compliance_check_snapshots Tabelle (JSONB pro
doc_entries/banner_result/profile/cmp_vendors/scan_context)
- services.check_snapshot.save_snapshot/load_snapshot/list_snapshots_for_domain
- Endpoints GET /snapshots, GET /snapshots/{id}
- Hook in _run_compliance_check: nach Mail-Send automatischer
Snapshot-Save via separater SessionLocal (background-task safe)
- Replay-Endpoint folgt im naechsten PR (braucht Refactoring
von _run_compliance_check in crawl_phase + interpret_phase)
- Effekt: Test-Cycle 7min -> 5sec bei reinen Logik-Aenderungen
(P73/P79/P81+ profitieren direkt). Snapshots dienen auch als
Regression-Test-Corpus (P81 Golden-Truth-Library).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
P80 — Snapshot + Replay-Helper.
|
||||
|
||||
Persistiert die Roh-Daten eines Compliance-Check-Laufs (DSE-Text,
|
||||
Banner-HTML, Cookies, CMP-Vendors, Profile), damit die Audit-Pipeline
|
||||
spaeter ohne erneuten Browser-Crawl die Mail-Render-/MC-Scoring-Logik
|
||||
neu ausfuehren kann.
|
||||
|
||||
Use Cases:
|
||||
* Logik-Iteration (MC-Filter P72, Mail-Layout, Action-Recipes) ohne
|
||||
7min Re-Crawl.
|
||||
* Regression-Test: Golden-Truth-Library (P81).
|
||||
* Diff-Mode: "was hat sich seit letztem Snapshot geaendert" (P84).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_jsonb(obj: Any) -> str:
|
||||
"""Serialize to JSON-string for psycopg2 JSONB insertion."""
|
||||
return json.dumps(obj, default=str, ensure_ascii=False)
|
||||
|
||||
|
||||
def _derive_site_domain(doc_entries: list[dict]) -> str:
|
||||
for e in doc_entries or []:
|
||||
url = (e.get("url") or "").strip()
|
||||
if url:
|
||||
try:
|
||||
netloc = urlparse(url).netloc.lower().replace("www.", "")
|
||||
if netloc:
|
||||
return netloc
|
||||
except Exception:
|
||||
continue
|
||||
return "unknown"
|
||||
|
||||
|
||||
def save_snapshot(
    db: Session,
    check_id: str,
    doc_entries: list[dict],
    banner_result: dict | None,
    profile: Any,
    cmp_vendors: list[dict] | None = None,
    scan_context: dict | None = None,
    site_label: str | None = None,
    notes: str | None = None,
) -> str | None:
    """Persist the raw data of one compliance-check run as a snapshot row.

    Best-effort: any failure is logged, the session is rolled back and
    ``None`` is returned; no exception propagates to the caller.

    Args:
        db: SQLAlchemy session. Committed on success, rolled back on failure.
        check_id: Identifier of the check run, stored in ``check_id``.
        doc_entries: Document entries. Stored as JSONB (``None`` becomes an
            empty list) and used to derive the ``site_domain`` column.
        banner_result: Stored as JSONB; NULL when ``None`` or empty.
        profile: Either a dict (stored as is) or an object whose instance
            attributes not starting with ``"_"`` are stored. NULL when the
            resulting mapping is empty.
        cmp_vendors: Stored as JSONB; NULL when ``None`` or empty.
        scan_context: Stored as JSONB; NULL when ``None`` or empty.
        site_label: Stored unchanged in ``site_label``.
        notes: Stored unchanged in ``notes``.

    Returns:
        The ``id`` of the inserted row as a string, or ``None`` on failure.
    """
    try:
        # Check for dict first: dict subclasses (e.g. OrderedDict) also expose
        # an instance __dict__, which would otherwise be serialized instead of
        # the mapping's actual content.
        if isinstance(profile, dict):
            profile_dict: dict = profile
        elif profile is not None and hasattr(profile, "__dict__"):
            profile_dict = {
                key: value
                for key, value in vars(profile).items()
                if not key.startswith("_")
            }
        else:
            profile_dict = {}

        entries = doc_entries or []
        domain = _derive_site_domain(entries)
        result = db.execute(
            text("""
                INSERT INTO compliance.compliance_check_snapshots
                    (check_id, site_domain, site_label,
                     doc_entries, banner_result, profile,
                     scan_context, cmp_vendors, notes)
                VALUES (:cid, :dom, :lbl,
                        CAST(:de AS JSONB), CAST(:br AS JSONB), CAST(:pr AS JSONB),
                        CAST(:sc AS JSONB), CAST(:cv AS JSONB), :nt)
                RETURNING id
            """),
            {
                "cid": check_id,
                "dom": domain,
                "lbl": site_label,
                "de": _to_jsonb(entries),
                # Falsy payloads are bound as NULL rather than as empty JSON.
                "br": _to_jsonb(banner_result) if banner_result else None,
                "pr": _to_jsonb(profile_dict) if profile_dict else None,
                "sc": _to_jsonb(scan_context) if scan_context else None,
                "cv": _to_jsonb(cmp_vendors) if cmp_vendors else None,
                "nt": notes,
            },
        )
        snapshot_id = str(result.fetchone()[0])
        db.commit()
        logger.info(
            "P80: snapshot saved id=%s check=%s domain=%s docs=%d",
            snapshot_id, check_id, domain, len(entries),
        )
        return snapshot_id
    except Exception as exc:
        # Deliberately broad: saving a snapshot must never raise into the
        # caller, whatever went wrong.
        logger.warning("P80 snapshot save failed for %s: %s", check_id, exc)
        try:
            db.rollback()
        except Exception:
            # Still best-effort, but leave a trace instead of hiding it.
            logger.warning(
                "P80 snapshot rollback failed for %s", check_id, exc_info=True
            )
        return None
|
||||
|
||||
|
||||
def load_snapshot(db: Session, snapshot_id: str) -> dict | None:
    """Load one snapshot by UUID and record the access.

    A successful load also increments the row's ``replay_count``, sets
    ``last_replay_at`` to ``now()`` and commits. The ``replay_count`` in the
    returned dict is the value read *before* that increment.

    Best-effort: any failure (including a ``snapshot_id`` the database cannot
    cast to ``uuid``) is logged, the session is rolled back and ``None`` is
    returned.

    Args:
        db: SQLAlchemy session. Committed on success, rolled back on failure.
        snapshot_id: Snapshot UUID as a string.

    Returns:
        A dict with the keys ``id``, ``check_id``, ``site_domain``,
        ``site_label``, ``doc_entries``, ``banner_result``, ``profile``,
        ``scan_context``, ``cmp_vendors``, ``created_at`` (stringified),
        ``replay_count`` and ``notes``; or ``None`` when the snapshot does not
        exist or loading failed. NULL ``doc_entries``/``cmp_vendors`` become
        ``[]`` and NULL ``profile``/``scan_context`` become ``{}``.
    """
    try:
        params = {"sid": snapshot_id}
        row = db.execute(
            text("""
                SELECT id, check_id, site_domain, site_label,
                       doc_entries, banner_result, profile,
                       scan_context, cmp_vendors, created_at,
                       replay_count, notes
                FROM compliance.compliance_check_snapshots
                WHERE id = CAST(:sid AS uuid)
            """),
            params,
        ).fetchone()
        if row is None:
            return None

        db.execute(
            text("""
                UPDATE compliance.compliance_check_snapshots
                SET replay_count = replay_count + 1,
                    last_replay_at = now()
                WHERE id = CAST(:sid AS uuid)
            """),
            params,
        )
        db.commit()
        return {
            "id": str(row[0]),
            "check_id": row[1],
            "site_domain": row[2],
            "site_label": row[3],
            "doc_entries": row[4] or [],
            "banner_result": row[5],
            "profile": row[6] or {},
            "scan_context": row[7] or {},
            "cmp_vendors": row[8] or [],
            "created_at": str(row[9]),
            "replay_count": row[10],
            "notes": row[11],
        }
    except Exception as exc:
        logger.warning("P80 snapshot load failed for %s: %s", snapshot_id, exc)
        # A failed statement leaves the database transaction aborted; without
        # a rollback every later statement on this session would fail too.
        try:
            db.rollback()
        except Exception:
            logger.warning(
                "P80 snapshot rollback failed for %s", snapshot_id, exc_info=True
            )
        return None
|
||||
|
||||
|
||||
def list_snapshots_for_domain(db: Session, domain: str, limit: int = 20) -> list[dict]:
    """List the most recent snapshots stored for a domain, newest first.

    The domain is lower-cased and a single leading ``"www."`` is removed
    before it is compared with the ``site_domain`` column.

    Best-effort: any failure is logged, the session is rolled back and an
    empty list is returned.

    Args:
        db: SQLAlchemy session used for the query.
        domain: Domain to look up.
        limit: Maximum number of rows to return.

    Returns:
        One dict per snapshot with the keys ``id``, ``check_id``,
        ``site_domain``, ``created_at`` (stringified), ``replay_count`` and
        ``notes``, ordered by ``created_at`` descending; ``[]`` on failure.
    """
    try:
        normalized = domain.lower()
        # Strip only a leading "www."; str.replace would also cut the
        # sequence out of the middle of a host such as "www.newwww.de".
        if normalized.startswith("www."):
            normalized = normalized[len("www."):]

        rows = db.execute(
            text("""
                SELECT id, check_id, site_domain, created_at, replay_count, notes
                FROM compliance.compliance_check_snapshots
                WHERE site_domain = :dom
                ORDER BY created_at DESC
                LIMIT :lim
            """),
            {"dom": normalized, "lim": limit},
        ).fetchall()
        return [
            {
                "id": str(r[0]),
                "check_id": r[1],
                "site_domain": r[2],
                "created_at": str(r[3]),
                "replay_count": r[4],
                "notes": r[5],
            }
            for r in rows
        ]
    except Exception as exc:
        logger.warning("P80 list_snapshots failed for %s: %s", domain, exc)
        # A failed statement leaves the database transaction aborted; roll
        # back so the session stays usable for whatever runs next.
        try:
            db.rollback()
        except Exception:
            logger.warning(
                "P80 list_snapshots rollback failed for %s", domain, exc_info=True
            )
        return []
|
||||
Reference in New Issue
Block a user