feat(backend): On-demand Browser-Verhaltens-Matrix + Snapshot-Persistenz (Phase 2)
- check_snapshot: update_browser_matrix/load_browser_matrix — migrationsfrei
in banner_result.browser_matrix (JSONB jsonb_set, eigener scanned_at)
- snapshot_check_routes: POST /snapshots/{id}/browser-behavior/run laeuft
/scan-matrix LIVE (Re-Crawl je Engine, nur live messbar), persistiert das
Ergebnis; GET /snapshots/{id}/browser-behavior liefert die gespeicherte
Matrix ohne Re-Crawl. Profil-Set = 4 Default-Engines + Brave/Chrome/Edge.
- consent-tester multi_browser_scanner: Semaphore(2) gegen OOM (7 Browser
parallel sprengten das 2g-mem_limit)
- Pydantic-Modell mit Optional[List[...]] (nicht `| None`) → Py3.9-sicher
- Tests: _snapshot_scan_url + Request-Defaults (5)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -8,13 +8,46 @@ gespeicherten Snapshot-Texten. Ausgelagert aus agent_compliance_check_routes.py
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/compliance/agent", tags=["agent-snapshots"])
|
||||
|
||||
# Lokal wie in agent_doc_check_routes/vendor_assessment_routes (kein Import-Zyklus).
|
||||
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
|
||||
|
||||
# Browser-Matrix-Profilsatz: 4 Default-Engines (alle Arches) + 3 echte Browser
|
||||
# (amd64-only, nur Prod). Auf arm64-Dev scheitern die 3 Extras schnell beim
|
||||
# Launch → Fehlerzeile in der Matrix (Frontend zeigt „nicht verfügbar").
|
||||
_BROWSER_MATRIX_PROFILES = [
|
||||
"chromium-headed-de", "firefox-headed-de", "webkit-headed-de",
|
||||
"iphone-mobile-safari-de",
|
||||
"brave-default-de", "chrome-channel-desktop-de", "edge-channel-desktop-de",
|
||||
]
|
||||
|
||||
|
||||
class BrowserBehaviorRunRequest(BaseModel):
    # Optional request body of the on-demand browser-behaviour run endpoint.
    # The annotations deliberately use Optional[List[...]] instead of
    # `list[...] | None` so the model can be built on Python 3.9 even though
    # the module uses `from __future__ import annotations`.

    # Profile names forwarded to the consent tester. When missing or empty,
    # the route falls back to _BROWSER_MATRIX_PROFILES.
    browser_profiles: Optional[List[str]] = None

    # Forwarded unchanged as `timeout_per_phase` to /scan-matrix
    # (presumably seconds; the unit is defined by the consent tester).
    timeout_per_phase: int = 10
|
||||
|
||||
|
||||
def _snapshot_scan_url(snap: dict) -> str:
|
||||
"""Scanbare Homepage-URL aus dem Snapshot ableiten (doc_entry-Origin,
|
||||
sonst https://<site_domain>)."""
|
||||
from urllib.parse import urlparse
|
||||
for e in snap.get("doc_entries") or []:
|
||||
u = (e.get("url") or "").strip()
|
||||
if u:
|
||||
p = urlparse(u)
|
||||
if p.scheme and p.netloc:
|
||||
return f"{p.scheme}://{p.netloc}"
|
||||
dom = (snap.get("site_domain") or "").strip()
|
||||
return f"https://{dom}" if dom and dom != "unknown" else ""
|
||||
|
||||
|
||||
async def _run_doc_agent(snapshot_id: str, doc_type: str, agent_id: str) -> dict:
|
||||
"""Lädt den Snapshot, baut den AgentInput für doc_type und läuft den
|
||||
@@ -115,3 +148,62 @@ async def snapshot_dse_check(snapshot_id: str):
|
||||
async def snapshot_agb_check(snapshot_id: str):
    """Run the AGB (terms and conditions) analysis on the stored snapshot text.

    Uses the curated AGBAgent (§§ 305 ff. BGB) via ``_run_doc_agent``.
    """
    agb_report = await _run_doc_agent(snapshot_id, "agb", "agb")
    return agb_report
|
||||
|
||||
|
||||
@router.post("/snapshots/{snapshot_id}/browser-behavior/run")
async def snapshot_browser_behavior_run(
    snapshot_id: str, req: Optional[BrowserBehaviorRunRequest] = None,
):
    """Run the browser-behaviour matrix LIVE on demand and persist the result.

    The site is re-crawled once per engine (browser behaviour can only be
    measured live) and the result is stored without a schema migration inside
    the snapshot (``banner_result.browser_matrix``). The run is expensive
    (several browsers x 3 phases), so it is triggered only explicitly and not
    as part of every check.

    Args:
        snapshot_id: ID of the stored snapshot.
        req: Optional overrides; without ``browser_profiles`` the full
            ``_BROWSER_MATRIX_PROFILES`` set is scanned.

    Returns:
        The matrix JSON exactly as returned by the consent tester.

    Raises:
        HTTPException: 404 if the snapshot does not exist, 422 if no scannable
            URL can be derived from it, 502 if the consent tester call fails.
    """
    import httpx
    from database import SessionLocal
    from compliance.services.check_snapshot import (
        load_snapshot, update_browser_matrix,
    )

    # Use a short-lived session just for the lookup: the crawl below may take
    # up to 360 s and must not keep a DB session open for that long.
    db = SessionLocal()
    try:
        snap = load_snapshot(db, snapshot_id)
    finally:
        db.close()
    if not snap:
        raise HTTPException(status_code=404, detail="snapshot not found")

    url = _snapshot_scan_url(snap)
    if not url:
        raise HTTPException(
            status_code=422, detail="keine scanbare URL im Snapshot")

    # A missing or empty profile list means "scan the full default set".
    profiles = (req.browser_profiles if req and req.browser_profiles
                else list(_BROWSER_MATRIX_PROFILES))
    payload = {
        "url": url, "browser_profiles": profiles,
        "timeout_per_phase": req.timeout_per_phase if req else 10,
    }

    try:
        async with httpx.AsyncClient(timeout=360.0) as client:
            r = await client.post(
                f"{CONSENT_TESTER_URL}/scan-matrix", json=payload)
            r.raise_for_status()
            matrix = r.json()
    except Exception as e:
        logger.warning("browser-matrix scan failed for %s: %s",
                       snapshot_id, e)
        # Chain the cause so the original traceback stays available.
        raise HTTPException(
            status_code=502,
            detail=f"consent-tester /scan-matrix fehlgeschlagen: {e}") from e

    # Fresh session for the write. Persistence stays best-effort (the caller
    # still receives the matrix), but a failure is made visible in the log.
    db = SessionLocal()
    try:
        if not update_browser_matrix(db, snapshot_id, matrix):
            logger.warning(
                "browser-matrix for %s was scanned but could not be persisted",
                snapshot_id)
    finally:
        db.close()
    return matrix
|
||||
|
||||
|
||||
@router.get("/snapshots/{snapshot_id}/browser-behavior")
async def snapshot_browser_behavior(snapshot_id: str):
    """Return the persisted browser matrix without triggering a re-crawl.

    ``browser_matrix`` is null as long as the on-demand run has never been
    triggered for this snapshot.
    """
    from database import SessionLocal
    from compliance.services.check_snapshot import load_browser_matrix

    session = SessionLocal()
    try:
        stored_matrix = load_browser_matrix(session, snapshot_id)
        return {"browser_matrix": stored_matrix}
    finally:
        # Always release the session, even if the lookup raised.
        session.close()
|
||||
|
||||
@@ -150,6 +150,52 @@ def load_snapshot(db: Session, snapshot_id: str) -> dict | None:
|
||||
return None
|
||||
|
||||
|
||||
def update_browser_matrix(db: Session, snapshot_id: str, matrix: dict) -> bool:
    """Persist the browser-behaviour matrix without a schema migration.

    The matrix is written into the existing ``banner_result`` JSONB column
    under the key ``browser_matrix``. Its own timestamp lives inside the
    matrix object (``scanned_at``) and may differ from the snapshot capture
    time, because the matrix runs LIVE on demand (browser behaviour can only
    be measured live, unlike the text modules).

    Args:
        db: Open session; committed on success, rolled back on failure.
        snapshot_id: Snapshot ID, cast to ``uuid`` in SQL.
        matrix: Matrix result to store.

    Returns:
        True if the update was committed and matched a row; False if the
        statement failed or no snapshot with this ID exists.
    """
    try:
        result = db.execute(
            text("""
                UPDATE compliance.compliance_check_snapshots
                SET banner_result = jsonb_set(
                    COALESCE(banner_result, '{}'::jsonb),
                    '{browser_matrix}', CAST(:bm AS JSONB), true)
                WHERE id = CAST(:sid AS uuid)
            """),
            {"sid": snapshot_id, "bm": _to_jsonb(matrix)},
        )
        # Read the count before commit, while the cursor state is still fresh.
        matched = result.rowcount
        db.commit()
    except Exception as e:
        logger.warning("browser-matrix persist failed for %s: %s", snapshot_id, e)
        try:
            db.rollback()
        except Exception as rollback_err:
            # Best effort: the session may already be unusable.
            logger.debug("browser-matrix rollback failed for %s: %s",
                         snapshot_id, rollback_err)
        return False

    if matched == 0:
        # Nothing was written: report it instead of claiming success.
        logger.warning("browser-matrix persist matched no snapshot %s",
                       snapshot_id)
        return False
    # A driver that cannot report a count yields -1; treat that as success.
    return True
|
||||
|
||||
|
||||
def load_browser_matrix(db: Session, snapshot_id: str) -> dict | None:
    """Return only the persisted ``browser_matrix`` sub-object (no re-crawl).

    Yields None when the snapshot is missing, the key is absent or empty, or
    the query fails (the failure is logged as a warning).
    """
    try:
        record = db.execute(
            text("""
                SELECT banner_result -> 'browser_matrix'
                FROM compliance.compliance_check_snapshots
                WHERE id = CAST(:sid AS uuid)
            """),
            {"sid": snapshot_id},
        ).fetchone()
        if record and record[0]:
            return record[0]
        return None
    except Exception as e:
        logger.warning("browser-matrix load failed for %s: %s", snapshot_id, e)
        return None
|
||||
|
||||
|
||||
def list_snapshots_for_domain(db: Session, domain: str, limit: int = 20) -> list[dict]:
|
||||
"""List recent snapshots for a domain (for diff-mode P84)."""
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
"""Browser-Verhaltens-Endpoint: URL-Ableitung + Request-Defaults (Phase 2).
|
||||
|
||||
Reine Logik ohne DB/HTTP — die Live-Matrix + Persistenz werden im macmini-E2E
|
||||
verifiziert. Sichert v.a.: scanbare Homepage-URL korrekt aus dem Snapshot
|
||||
abgeleitet und das Request-Modell ist auf Python 3.9 baubar (Optional statt
|
||||
`| None` — siehe Pydantic-v2-Falle mit `from __future__ import annotations`)."""
|
||||
|
||||
from compliance.api.snapshot_check_routes import (
|
||||
_snapshot_scan_url,
|
||||
BrowserBehaviorRunRequest,
|
||||
)
|
||||
|
||||
|
||||
def test_url_from_doc_entry_origin():
    """The origin of a doc entry URL takes precedence over ``site_domain``."""
    entries = [{"url": "https://www.bmw.de/de/impressum.html"}]
    snapshot = {"site_domain": "bmw.de", "doc_entries": entries}
    assert _snapshot_scan_url(snapshot) == "https://www.bmw.de"
|
||||
|
||||
|
||||
def test_url_falls_back_to_site_domain():
    """Without doc entries the URL is built from ``site_domain``."""
    snapshot = {"site_domain": "example.com", "doc_entries": []}
    derived = _snapshot_scan_url(snapshot)
    assert derived == "https://example.com"
|
||||
|
||||
|
||||
def test_url_skips_blank_entries_then_uses_domain():
    """Empty and whitespace-only entry URLs are ignored before the fallback."""
    blank_entries = [{"url": ""}, {"url": " "}]
    snapshot = {"site_domain": "shop.de", "doc_entries": blank_entries}
    assert _snapshot_scan_url(snapshot) == "https://shop.de"
|
||||
|
||||
|
||||
def test_url_empty_when_unknown_and_no_entries():
    """An unknown or empty domain without doc entries yields an empty URL."""
    for domain in ("unknown", ""):
        snapshot = {"site_domain": domain, "doc_entries": []}
        assert _snapshot_scan_url(snapshot) == ""
|
||||
|
||||
|
||||
def test_request_model_defaults_build_on_py39():
    """The request model builds with its defaults and accepts overrides."""
    defaults = BrowserBehaviorRunRequest()
    assert defaults.browser_profiles is None
    assert defaults.timeout_per_phase == 10

    overrides = {"browser_profiles": ["brave-default-de"],
                 "timeout_per_phase": 20}
    custom = BrowserBehaviorRunRequest(**overrides)
    assert custom.browser_profiles == ["brave-default-de"]
    assert custom.timeout_per_phase == 20
|
||||
@@ -37,6 +37,11 @@ _HARD_FAIL_CAP = 55
|
||||
# Banner-Design / Dark 20%
|
||||
_WEIGHTS = {"pre_consent": 0.5, "reject_respect": 0.3, "banner_design": 0.2}
|
||||
|
||||
# Nebenlaeufigkeit kappen: jeder Playwright-Browser braucht 300-500 MB; bei 7
|
||||
# Profilen wuerde paralleles Starten das 2g-mem_limit des Containers sprengen
|
||||
# (OOM-Kill). 2 gleichzeitig → Peak ~1 GB, Wall-Time ~Profile/2.
|
||||
_MAX_CONCURRENCY = 2
|
||||
|
||||
|
||||
def _extract_dimensions(banner_result: dict) -> dict[str, float]:
|
||||
"""Best-effort: derive 3 sub-scores from the existing scan output.
|
||||
@@ -149,7 +154,13 @@ async def run_matrix(
|
||||
"verbal": _verbal(score),
|
||||
}
|
||||
|
||||
results = await asyncio.gather(*[_run_one(p) for p in profiles])
|
||||
_sem = asyncio.Semaphore(_MAX_CONCURRENCY)
|
||||
|
||||
async def _bounded(prof: dict) -> dict:
|
||||
async with _sem:
|
||||
return await _run_one(prof)
|
||||
|
||||
results = await asyncio.gather(*[_bounded(p) for p in profiles])
|
||||
sorted_by_score = sorted(results, key=lambda r: r["score"])
|
||||
worst = sorted_by_score[0]
|
||||
best = sorted_by_score[-1]
|
||||
|
||||
Reference in New Issue
Block a user