From c7fde93061d03dc764f9f6fd20659455c3c758f7 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 12 Jun 2026 23:03:28 +0200
Subject: [PATCH] feat(backend): On-demand Browser-Verhaltens-Matrix + Snapshot-Persistenz (Phase 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- check_snapshot: update_browser_matrix/load_browser_matrix — migrationsfrei
  in banner_result.browser_matrix (JSONB jsonb_set, eigener scanned_at)
- snapshot_check_routes: POST /snapshots/{id}/browser-behavior/run laeuft
  /scan-matrix LIVE (Re-Crawl je Engine, nur live messbar), persistiert das
  Ergebnis; GET /snapshots/{id}/browser-behavior liefert die gespeicherte
  Matrix ohne Re-Crawl. Profil-Set = 4 Default-Engines + Brave/Chrome/Edge.
- consent-tester multi_browser_scanner: Semaphore(2) gegen OOM (7 Browser
  parallel sprengten das 2g-mem_limit)
- Pydantic-Modell mit Optional[List[...]] (nicht `| None`) → Py3.9-sicher
- Tests: _snapshot_scan_url + Request-Defaults (5)

Co-Authored-By: Claude Opus 4.7
---
Review notes (reviewer commentary only; the commit message ends at the
three-dash line above, so nothing in this section becomes part of the commit):

  * Formatting: this patch had lost its line breaks. Line structure,
    indentation and blank lines below were rebuilt from Python syntax and
    checked against every hunk's line counts and the diffstat. Whitespace
    inside string literals (the SQL text, the blank-URL test value) and the
    wrap points of the message body are best guesses.
  * NOTE(review): the From:/Co-Authored-By: lines carry no address and the
    _snapshot_scan_url docstring ends in "sonst https://)." - looks like
    angle-bracket spans were stripped upstream; confirm against the commit.
  * snapshot_browser_behavior_run ignores the bool returned by
    update_browser_matrix. That helper catches every exception and returns
    False, so a failed persist still answers with the matrix while a later
    GET reports browser_matrix = null.
  * update_browser_matrix returns True after commit without checking how
    many rows the UPDATE matched.
  * The POST route opens its SessionLocal() before the /scan-matrix call
    (client timeout 360 s) and closes it only afterwards; load_snapshot has
    already run on it. Presumably that keeps a pooled connection checked
    out for the whole scan - verify against the SessionLocal/pool setup.
  * timeout_per_phase is forwarded as given; the request model sets no
    upper bound while the httpx timeout is fixed at 360 s.
  * The 502 handler re-raises without "from e" and copies the exception
    text into the response detail.
  * GET /browser-behavior answers {"browser_matrix": null} for an unknown
    snapshot id as well as for a snapshot without a stored matrix, because
    load_browser_matrix maps a missing row, an empty value and a DB error
    all to None.
  * run_matrix builds a new Semaphore(_MAX_CONCURRENCY) on every call, so
    the cap of 2 browsers applies per request; concurrent /scan-matrix
    requests are not bounded against each other.

 .../compliance/api/snapshot_check_routes.py   | 92 +++++++++++++++++++
 .../compliance/services/check_snapshot.py     | 46 ++++++++++
 .../tests/test_snapshot_browser_behavior.py   | 42 +++++++++
 .../services/multi_browser_scanner.py         | 13 ++-
 4 files changed, 192 insertions(+), 1 deletion(-)
 create mode 100644 backend-compliance/tests/test_snapshot_browser_behavior.py

diff --git a/backend-compliance/compliance/api/snapshot_check_routes.py b/backend-compliance/compliance/api/snapshot_check_routes.py
index 8da99352..c360ab12 100644
--- a/backend-compliance/compliance/api/snapshot_check_routes.py
+++ b/backend-compliance/compliance/api/snapshot_check_routes.py
@@ -8,13 +8,46 @@ gespeicherten Snapshot-Texten. Ausgelagert aus agent_compliance_check_routes.py
 from __future__ import annotations
 
 import logging
+from typing import List, Optional
 
 from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/compliance/agent", tags=["agent-snapshots"])
 
+# Lokal wie in agent_doc_check_routes/vendor_assessment_routes (kein Import-Zyklus).
+CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
+
+# Browser-Matrix-Profilsatz: 4 Default-Engines (alle Arches) + 3 echte Browser
+# (amd64-only, nur Prod). Auf arm64-Dev scheitern die 3 Extras schnell beim
+# Launch → Fehlerzeile in der Matrix (Frontend zeigt „nicht verfügbar").
+_BROWSER_MATRIX_PROFILES = [
+    "chromium-headed-de", "firefox-headed-de", "webkit-headed-de",
+    "iphone-mobile-safari-de",
+    "brave-default-de", "chrome-channel-desktop-de", "edge-channel-desktop-de",
+]
+
+
+class BrowserBehaviorRunRequest(BaseModel):
+    browser_profiles: Optional[List[str]] = None
+    timeout_per_phase: int = 10
+
+
+def _snapshot_scan_url(snap: dict) -> str:
+    """Scanbare Homepage-URL aus dem Snapshot ableiten (doc_entry-Origin,
+    sonst https://)."""
+    from urllib.parse import urlparse
+    for e in snap.get("doc_entries") or []:
+        u = (e.get("url") or "").strip()
+        if u:
+            p = urlparse(u)
+            if p.scheme and p.netloc:
+                return f"{p.scheme}://{p.netloc}"
+    dom = (snap.get("site_domain") or "").strip()
+    return f"https://{dom}" if dom and dom != "unknown" else ""
+
 
 async def _run_doc_agent(snapshot_id: str, doc_type: str, agent_id: str) -> dict:
     """Lädt den Snapshot, baut den AgentInput für doc_type und läuft den
@@ -115,3 +148,62 @@ async def snapshot_dse_check(snapshot_id: str):
 async def snapshot_agb_check(snapshot_id: str):
     """AGB-Analyse (kuratierter AGBAgent, §§ 305 ff. BGB) auf dem gespeicherten Text."""
     return await _run_doc_agent(snapshot_id, "agb", "agb")
+
+
+@router.post("/snapshots/{snapshot_id}/browser-behavior/run")
+async def snapshot_browser_behavior_run(
+    snapshot_id: str, req: Optional[BrowserBehaviorRunRequest] = None,
+):
+    """On-demand: Browser-Verhaltens-Matrix LIVE laufen lassen (Re-Crawl der
+    Site je Engine — Browser-Verhalten ist nur live messbar) und das Ergebnis
+    migrationsfrei in den Snapshot (banner_result.browser_matrix) persistieren.
+    Teuer (mehrere Browser × 3 Phasen) → bewusst nur per Button, nicht je Check."""
+    import httpx
+    from database import SessionLocal
+    from compliance.services.check_snapshot import (
+        load_snapshot, update_browser_matrix,
+    )
+    db = SessionLocal()
+    try:
+        snap = load_snapshot(db, snapshot_id)
+        if not snap:
+            raise HTTPException(status_code=404, detail="snapshot not found")
+        url = _snapshot_scan_url(snap)
+        if not url:
+            raise HTTPException(
+                status_code=422, detail="keine scanbare URL im Snapshot")
+        profiles = (req.browser_profiles if req and req.browser_profiles
+                    else list(_BROWSER_MATRIX_PROFILES))
+        payload = {
+            "url": url, "browser_profiles": profiles,
+            "timeout_per_phase": req.timeout_per_phase if req else 10,
+        }
+        try:
+            async with httpx.AsyncClient(timeout=360.0) as client:
+                r = await client.post(
+                    f"{CONSENT_TESTER_URL}/scan-matrix", json=payload)
+                r.raise_for_status()
+                matrix = r.json()
+        except Exception as e:
+            logger.warning("browser-matrix scan failed for %s: %s",
+                           snapshot_id, e)
+            raise HTTPException(
+                status_code=502,
+                detail=f"consent-tester /scan-matrix fehlgeschlagen: {e}")
+        update_browser_matrix(db, snapshot_id, matrix)
+        return matrix
+    finally:
+        db.close()
+
+
+@router.get("/snapshots/{snapshot_id}/browser-behavior")
+async def snapshot_browser_behavior(snapshot_id: str):
+    """Liefert die persistierte Browser-Matrix (kein Re-Crawl). `browser_matrix`
+    ist null, solange der On-demand-Lauf noch nie ausgelöst wurde."""
+    from database import SessionLocal
+    from compliance.services.check_snapshot import load_browser_matrix
+    db = SessionLocal()
+    try:
+        return {"browser_matrix": load_browser_matrix(db, snapshot_id)}
+    finally:
+        db.close()
diff --git a/backend-compliance/compliance/services/check_snapshot.py b/backend-compliance/compliance/services/check_snapshot.py
index cde1869e..630acfad 100644
--- a/backend-compliance/compliance/services/check_snapshot.py
+++ b/backend-compliance/compliance/services/check_snapshot.py
@@ -150,6 +150,52 @@ def load_snapshot(db: Session, snapshot_id: str) -> dict | None:
         return None
 
 
+def update_browser_matrix(db: Session, snapshot_id: str, matrix: dict) -> bool:
+    """Persistiert das Browser-Verhaltens-Matrix-Ergebnis MIGRATIONSFREI in die
+    bestehende `banner_result`-JSONB-Spalte unter dem Key `browser_matrix`.
+
+    Eigener Zeitstempel steckt im Matrix-Objekt (`scanned_at`) — der kann von
+    der Snapshot-Aufnahmezeit abweichen, weil die Matrix on-demand LIVE läuft
+    (Browser-Verhalten ist nur live messbar, anders als die Textmodule)."""
+    try:
+        db.execute(
+            text("""
+                UPDATE compliance.compliance_check_snapshots
+                SET banner_result = jsonb_set(
+                    COALESCE(banner_result, '{}'::jsonb),
+                    '{browser_matrix}', CAST(:bm AS JSONB), true)
+                WHERE id = CAST(:sid AS uuid)
+            """),
+            {"sid": snapshot_id, "bm": _to_jsonb(matrix)},
+        )
+        db.commit()
+        return True
+    except Exception as e:
+        logger.warning("browser-matrix persist failed for %s: %s", snapshot_id, e)
+        try:
+            db.rollback()
+        except Exception:
+            pass
+        return False
+
+
+def load_browser_matrix(db: Session, snapshot_id: str) -> dict | None:
+    """Nur das persistierte `browser_matrix`-Sub-Objekt (kein Re-Crawl)."""
+    try:
+        row = db.execute(
+            text("""
+                SELECT banner_result -> 'browser_matrix'
+                FROM compliance.compliance_check_snapshots
+                WHERE id = CAST(:sid AS uuid)
+            """),
+            {"sid": snapshot_id},
+        ).fetchone()
+        return row[0] if row and row[0] else None
+    except Exception as e:
+        logger.warning("browser-matrix load failed for %s: %s", snapshot_id, e)
+        return None
+
+
 def list_snapshots_for_domain(db: Session, domain: str, limit: int = 20) -> list[dict]:
     """List recent snapshots for a domain (for diff-mode P84)."""
     try:
diff --git a/backend-compliance/tests/test_snapshot_browser_behavior.py b/backend-compliance/tests/test_snapshot_browser_behavior.py
new file mode 100644
index 00000000..7dbfc88b
--- /dev/null
+++ b/backend-compliance/tests/test_snapshot_browser_behavior.py
@@ -0,0 +1,42 @@
+"""Browser-Verhaltens-Endpoint: URL-Ableitung + Request-Defaults (Phase 2).
+
+Reine Logik ohne DB/HTTP — die Live-Matrix + Persistenz werden im macmini-E2E
+verifiziert. Sichert v.a.: scanbare Homepage-URL korrekt aus dem Snapshot
+abgeleitet und das Request-Modell ist auf Python 3.9 baubar (Optional statt
+`| None` — siehe Pydantic-v2-Falle mit `from __future__ import annotations`)."""
+
+from compliance.api.snapshot_check_routes import (
+    _snapshot_scan_url,
+    BrowserBehaviorRunRequest,
+)
+
+
+def test_url_from_doc_entry_origin():
+    snap = {"site_domain": "bmw.de",
+            "doc_entries": [{"url": "https://www.bmw.de/de/impressum.html"}]}
+    assert _snapshot_scan_url(snap) == "https://www.bmw.de"
+
+
+def test_url_falls_back_to_site_domain():
+    assert _snapshot_scan_url({"site_domain": "example.com",
+                               "doc_entries": []}) == "https://example.com"
+
+
+def test_url_skips_blank_entries_then_uses_domain():
+    snap = {"site_domain": "shop.de", "doc_entries": [{"url": ""}, {"url": " "}]}
+    assert _snapshot_scan_url(snap) == "https://shop.de"
+
+
+def test_url_empty_when_unknown_and_no_entries():
+    assert _snapshot_scan_url({"site_domain": "unknown", "doc_entries": []}) == ""
+    assert _snapshot_scan_url({"site_domain": "", "doc_entries": []}) == ""
+
+
+def test_request_model_defaults_build_on_py39():
+    m = BrowserBehaviorRunRequest()
+    assert m.browser_profiles is None
+    assert m.timeout_per_phase == 10
+    m2 = BrowserBehaviorRunRequest(browser_profiles=["brave-default-de"],
+                                   timeout_per_phase=20)
+    assert m2.browser_profiles == ["brave-default-de"]
+    assert m2.timeout_per_phase == 20
diff --git a/consent-tester/services/multi_browser_scanner.py b/consent-tester/services/multi_browser_scanner.py
index dbc9c7e5..88fe0b3f 100644
--- a/consent-tester/services/multi_browser_scanner.py
+++ b/consent-tester/services/multi_browser_scanner.py
@@ -37,6 +37,11 @@ _HARD_FAIL_CAP = 55
 # Banner-Design / Dark 20%
 _WEIGHTS = {"pre_consent": 0.5, "reject_respect": 0.3, "banner_design": 0.2}
 
+# Nebenlaeufigkeit kappen: jeder Playwright-Browser braucht 300-500 MB; bei 7
+# Profilen wuerde paralleles Starten das 2g-mem_limit des Containers sprengen
+# (OOM-Kill). 2 gleichzeitig → Peak ~1 GB, Wall-Time ~Profile/2.
+_MAX_CONCURRENCY = 2
+
 
 def _extract_dimensions(banner_result: dict) -> dict[str, float]:
     """Best-effort: derive 3 sub-scores from the existing scan output.
@@ -149,7 +154,13 @@ async def run_matrix(
             "verbal": _verbal(score),
         }
 
-    results = await asyncio.gather(*[_run_one(p) for p in profiles])
+    _sem = asyncio.Semaphore(_MAX_CONCURRENCY)
+
+    async def _bounded(prof: dict) -> dict:
+        async with _sem:
+            return await _run_one(prof)
+
+    results = await asyncio.gather(*[_bounded(p) for p in profiles])
     sorted_by_score = sorted(results, key=lambda r: r["score"])
     worst = sorted_by_score[0]
     best = sorted_by_score[-1]