feat(backend): On-demand Browser-Verhaltens-Matrix + Snapshot-Persistenz (Phase 2)

- check_snapshot: update_browser_matrix/load_browser_matrix — migrationsfrei
  in banner_result.browser_matrix (JSONB jsonb_set, eigener scanned_at)
- snapshot_check_routes: POST /snapshots/{id}/browser-behavior/run laeuft
  /scan-matrix LIVE (Re-Crawl je Engine, nur live messbar), persistiert das
  Ergebnis; GET /snapshots/{id}/browser-behavior liefert die gespeicherte
  Matrix ohne Re-Crawl. Profil-Set = 4 Default-Engines + Brave/Chrome/Edge.
- consent-tester multi_browser_scanner: Semaphore(2) gegen OOM (7 Browser
  parallel sprengten das 2g-mem_limit)
- Pydantic-Modell mit Optional[List[...]] (nicht `| None`) → Py3.9-sicher
- Tests: _snapshot_scan_url + Request-Defaults (5)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-12 23:03:28 +02:00
parent de140e564e
commit c7fde93061
4 changed files with 192 additions and 1 deletions
@@ -8,13 +8,46 @@ gespeicherten Snapshot-Texten. Ausgelagert aus agent_compliance_check_routes.py
from __future__ import annotations
import logging
from typing import List, Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/agent", tags=["agent-snapshots"])
# Defined locally, mirroring agent_doc_check_routes / vendor_assessment_routes
# (avoids an import cycle).
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
# Browser-matrix profile set: 4 default engines (all architectures) + 3 real
# browsers (amd64-only, production only). On arm64 dev machines the 3 extras
# fail quickly at launch -> error row in the matrix (the frontend shows
# "nicht verfügbar", i.e. "not available").
_BROWSER_MATRIX_PROFILES = [
    "chromium-headed-de", "firefox-headed-de", "webkit-headed-de",
    "iphone-mobile-safari-de",
    "brave-default-de", "chrome-channel-desktop-de", "edge-channel-desktop-de",
]
# Optional request body for the on-demand browser-behaviour run.
# Both fields have defaults, so the endpoint can be called without a body.
class BrowserBehaviorRunRequest(BaseModel):
    # Browser profile names to scan; None (or an empty list) makes the route
    # fall back to _BROWSER_MATRIX_PROFILES.
    browser_profiles: Optional[List[str]] = None
    # Forwarded unchanged to the consent-tester as "timeout_per_phase"
    # (presumably seconds — confirm against the consent-tester API).
    timeout_per_phase: int = 10
def _snapshot_scan_url(snap: dict) -> str:
"""Scanbare Homepage-URL aus dem Snapshot ableiten (doc_entry-Origin,
sonst https://<site_domain>)."""
from urllib.parse import urlparse
for e in snap.get("doc_entries") or []:
u = (e.get("url") or "").strip()
if u:
p = urlparse(u)
if p.scheme and p.netloc:
return f"{p.scheme}://{p.netloc}"
dom = (snap.get("site_domain") or "").strip()
return f"https://{dom}" if dom and dom != "unknown" else ""
async def _run_doc_agent(snapshot_id: str, doc_type: str, agent_id: str) -> dict:
"""Lädt den Snapshot, baut den AgentInput für doc_type und läuft den
@@ -115,3 +148,62 @@ async def snapshot_dse_check(snapshot_id: str):
async def snapshot_agb_check(snapshot_id: str):
    """Run the terms-and-conditions (AGB) analysis on the stored snapshot text.

    Uses the curated AGBAgent (§§ 305 ff. BGB) via ``_run_doc_agent`` with
    doc type ``"agb"`` and agent id ``"agb"``.
    """
    agb_result = await _run_doc_agent(snapshot_id, "agb", "agb")
    return agb_result
@router.post("/snapshots/{snapshot_id}/browser-behavior/run")
async def snapshot_browser_behavior_run(
    snapshot_id: str, req: Optional[BrowserBehaviorRunRequest] = None,
):
    """Run the browser-behaviour matrix LIVE and persist it on the snapshot.

    On demand only: the site is re-crawled once per engine (browser behaviour
    can only be measured live) and the result is stored migration-free in the
    snapshot (``banner_result.browser_matrix``). Expensive (several browsers
    x 3 phases), so it is deliberately triggered by a button rather than on
    every check.

    Args:
        snapshot_id: Id of the stored snapshot to scan.
        req: Optional overrides; without a body the default profile set and
            ``timeout_per_phase=10`` are used.

    Returns:
        The matrix JSON exactly as returned by the consent-tester.

    Raises:
        HTTPException: 404 if the snapshot does not exist, 422 if no scannable
            URL can be derived from it, 502 if the consent-tester call fails.
    """
    import httpx
    from database import SessionLocal
    from compliance.services.check_snapshot import (
        load_snapshot, update_browser_matrix,
    )

    # First short-lived session: only resolve the scan URL. It is closed
    # before the scan so no DB connection is held while waiting (up to 360 s)
    # on the consent-tester.
    db = SessionLocal()
    try:
        snap = load_snapshot(db, snapshot_id)
        if not snap:
            raise HTTPException(status_code=404, detail="snapshot not found")
        url = _snapshot_scan_url(snap)
    finally:
        db.close()
    if not url:
        raise HTTPException(
            status_code=422, detail="keine scanbare URL im Snapshot")

    # An empty or missing profile list falls back to the full default set.
    profiles = (req.browser_profiles if req and req.browser_profiles
                else list(_BROWSER_MATRIX_PROFILES))
    payload = {
        "url": url,
        "browser_profiles": profiles,
        "timeout_per_phase": req.timeout_per_phase if req else 10,
    }
    try:
        async with httpx.AsyncClient(timeout=360.0) as client:
            r = await client.post(
                f"{CONSENT_TESTER_URL}/scan-matrix", json=payload)
            r.raise_for_status()
            matrix = r.json()
    except Exception as e:
        # Boundary to an external service: any failure (transport, HTTP
        # status, invalid JSON) is reported to the caller as 502.
        logger.warning("browser-matrix scan failed for %s: %s",
                       snapshot_id, e)
        raise HTTPException(
            status_code=502,
            detail=f"consent-tester /scan-matrix fehlgeschlagen: {e}") from e

    # Second short-lived session: persist the fresh matrix.
    db = SessionLocal()
    try:
        update_browser_matrix(db, snapshot_id, matrix)
    finally:
        db.close()
    return matrix
@router.get("/snapshots/{snapshot_id}/browser-behavior")
async def snapshot_browser_behavior(snapshot_id: str):
    """Return the persisted browser matrix without triggering a re-crawl.

    ``browser_matrix`` is null as long as the on-demand run has never been
    triggered for this snapshot.
    """
    from compliance.services.check_snapshot import load_browser_matrix
    from database import SessionLocal

    session = SessionLocal()
    try:
        stored_matrix = load_browser_matrix(session, snapshot_id)
        return {"browser_matrix": stored_matrix}
    finally:
        # Always release the session, also when loading raises.
        session.close()