feat(consent-tester): /scan-matrix echt — Profil je Engine + Per-Engine-Summary (Phase 1.2)

- _scanner_run reicht browser_profile an run_consent_test durch (statt Single-Chromium-Shim)
- neue scan_matrix_summary.matrix_scan_dict: ConsentTestResult -> schlanke
  Matrix-dict-Form (phases fuer _extract_dimensions + kompakter `summary`:
  cookies_before_consent/after_reject, reject_respected-Heuristik [keine
  Verstoesse UND kein neuer Tracker], surface, screenshot)
- multi_browser_scanner._run_one hebt summary + engine + is_mobile an die
  Zeile, verwirft die vollen Cookie-Listen (JSONB-Persistenz schlank)
- consent_scanner: _ctx_base mit Mobile-Device-Emulation (iPhone-Profil ->
  echtes Mobile-Viewport/Touch), alle 5 new_context auf **_ctx_base
- Tests: test_scan_matrix_summary (6) inkl. _extract_dimensions-Vertrag

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-12 22:46:42 +02:00
parent c816827720
commit 881e9c28de
5 changed files with 208 additions and 44 deletions
+17 -12
View File
@@ -5,11 +5,9 @@ returns the aggregated robustness-score per browser plus a
worst-of/best-of summary. Kept in its own module so main.py stays
under the 500-LOC cap.
KNOWN LIMITATION (stage 1.a):
The underlying `run_consent_test` does not yet accept a
`browser_profile` kwarg — all profiles currently execute on the
same Chromium instance. Engine diversity (real Firefox/WebKit
contexts) ships in stage 1.b once consent_scanner is split.
Stage 1.b (erledigt): `run_consent_test` nimmt jetzt einen `browser_profile`
kwarg → echte Engine-Diversität (Firefox/Gecko, WebKit/Safari, Blink inkl.
Chrome-/Edge-Channel + Brave). `_scanner_run` reicht das Profil durch.
"""
from __future__ import annotations
@@ -22,6 +20,7 @@ from pydantic import BaseModel
from services.consent_scanner import run_consent_test
from services.multi_browser_scanner import run_matrix
from services.scan_matrix_summary import matrix_scan_dict
logger = logging.getLogger(__name__)
router = APIRouter()
@@ -36,12 +35,18 @@ class MatrixScanRequest(BaseModel):
browser_profiles: list[str] | None = None
async def _scanner_shim(
    url: str,
    browser_profile: dict | None = None,
    timeout_per_phase: int = 10,
    categories: list[str] | None = None,
):
    """Run the consent test while deliberately dropping ``browser_profile``.

    The parameter is accepted but not forwarded: ``run_consent_test`` is
    called with the URL, the per-phase timeout and the category list only.
    A missing or empty ``categories`` value is passed on as an empty list.
    """
    selected = categories if categories else []
    scan = await run_consent_test(url, timeout_per_phase, selected)
    return scan
async def _scanner_run(
    url: str,
    browser_profile: dict | None = None,
    timeout_per_phase: int = 10,
    categories: list[str] | None = None,
):
    """Adapter between the matrix runner and ``run_consent_test``.

    Forwards the resolved ``browser_profile`` to ``run_consent_test`` as a
    keyword argument and converts the returned result with
    ``matrix_scan_dict`` (phases plus a compact ``summary``). A missing or
    empty ``categories`` value is passed on as an empty list.
    """
    selected = categories if categories else []
    consent_result = await run_consent_test(
        url,
        timeout_per_phase,
        selected,
        browser_profile=browser_profile,
    )
    projected = matrix_scan_dict(consent_result)
    return projected
@router.post("/scan-matrix")
@@ -50,7 +55,7 @@ async def scan_matrix(req: MatrixScanRequest):
logger.info("Matrix scan for %s profiles=%s", req.url,
req.browser_profiles or "default")
matrix = await run_matrix(
_scanner_shim,
_scanner_run,
req.url,
requested_profiles=req.browser_profiles,
timeout_per_phase=req.timeout_per_phase,
+23 -30
View File
@@ -172,15 +172,28 @@ async def run_consent_test(
_launch["executable_path"] = _prof["executable_path"]
browser = await p.chromium.launch(**_launch)
# Gemeinsame Context-Optionen. Bei Mobile-Profilen (Profil nennt ein
# Playwright-`device`, z.B. „iPhone 15") echte Mobile-Emulation
# (Viewport/UA/Touch) statt Desktop — sonst wäre die Mobile-Matrix-
# Zeile nur Desktop-WebKit. Nur bekannte new_context-kwargs kopieren
# (NICHT das volle Device-dict spreaden → default_browser_type bricht).
_device = p.devices.get(_prof["device"]) if _prof.get("device") else None
_ctx_base: dict = {
"user_agent": USER_AGENT,
"viewport": {"width": 1920, "height": 1080},
"locale": "de-DE",
"timezone_id": "Europe/Berlin",
}
if _device:
for _k in ("user_agent", "viewport", "device_scale_factor",
"is_mobile", "has_touch"):
if _k in _device:
_ctx_base[_k] = _device[_k]
try:
# ── Phase A: Before consent ──────────────────────────
logger.info("Phase A: First visit (no interaction)")
ctx_a = await browser.new_context(
user_agent=USER_AGENT,
viewport={"width": 1920, "height": 1080},
locale="de-DE",
timezone_id="Europe/Berlin",
)
ctx_a = await browser.new_context(**_ctx_base)
page_a = await ctx_a.new_page()
await page_a.add_init_script(_INTERCEPTOR_INIT)
if HAS_STEALTH:
@@ -271,12 +284,7 @@ async def run_consent_test(
# ── Phase B: After rejecting ─────────────────────────
logger.info("Phase B: Reject consent (%s)", banner.provider)
ctx_b = await browser.new_context(
user_agent=USER_AGENT,
viewport={"width": 1920, "height": 1080},
locale="de-DE",
timezone_id="Europe/Berlin",
)
ctx_b = await browser.new_context(**_ctx_base)
page_b = await ctx_b.new_page()
await page_b.add_init_script(_INTERCEPTOR_INIT)
if HAS_STEALTH:
@@ -338,12 +346,7 @@ async def run_consent_test(
# ── Phase C: After accepting ─────────────────────────
logger.info("Phase C: Accept consent (%s)", banner.provider)
ctx_c = await browser.new_context(
user_agent=USER_AGENT,
viewport={"width": 1920, "height": 1080},
locale="de-DE",
timezone_id="Europe/Berlin",
)
ctx_c = await browser.new_context(**_ctx_base)
page_c = await ctx_c.new_page()
await page_c.add_init_script(_INTERCEPTOR_INIT)
if HAS_STEALTH:
@@ -411,12 +414,7 @@ async def run_consent_test(
try:
from services.category_tester import detect_categories, test_single_category
ctx_cat = await browser.new_context(
user_agent=USER_AGENT,
viewport={"width": 1920, "height": 1080},
locale="de-DE",
timezone_id="Europe/Berlin",
)
ctx_cat = await browser.new_context(**_ctx_base)
page_cat = await ctx_cat.new_page()
if HAS_STEALTH:
await stealth_async(page_cat)
@@ -461,12 +459,7 @@ async def run_consent_test(
"skipping remaining %d categories",
len(unique_cats) - len(result.category_tests))
break
cat_ctx = await browser.new_context(
user_agent=USER_AGENT,
viewport={"width": 1920, "height": 1080},
locale="de-DE",
timezone_id="Europe/Berlin",
)
cat_ctx = await browser.new_context(**_ctx_base)
try:
cat_result = await asyncio.wait_for(
test_single_category(cat_ctx, url, cat, banner, wait_ms),
@@ -128,16 +128,24 @@ async def run_matrix(
logger.warning("matrix profile %s failed: %s", prof["id"], e)
return {
"profile_id": prof["id"], "label": prof["label"],
"scan": None, "error": str(e)[:200],
"engine": prof.get("engine"),
"is_mobile": bool(prof.get("device")),
"summary": None, "error": str(e)[:200],
"dimensions": {"pre_consent": 0, "reject_respect": 0,
"banner_design": 0},
"score": 0, "verbal": "Scan fehlgeschlagen",
}
dims = _extract_dimensions(scan or {})
score = _score(dims)
# Nur den kompakten `summary` an die Zeile heben — die vollen
# phases/Cookie-Listen werden für das Scoring konsumiert und dann
# verworfen (sonst bläht 6× volle Cookie-Liste die JSONB-Persistenz).
summary = (scan or {}).get("summary") if isinstance(scan, dict) else None
return {
"profile_id": prof["id"], "label": prof["label"],
"scan": scan, "dimensions": dims, "score": score,
"engine": prof.get("engine"),
"is_mobile": bool(prof.get("device")),
"summary": summary, "dimensions": dims, "score": score,
"verbal": _verbal(score),
}
@@ -0,0 +1,85 @@
"""Kompakte Per-Engine-Projektion eines ConsentTestResult für die Browser-Matrix.
Die Matrix braucht NICHT die volle `/scan`-Antwort — nur die Felder, die je
Browser-Zeile angezeigt + persistiert werden: Cookies vor Consent / nach
Ablehnen, ob „Ablehnen" respektiert wurde, Oberflächen-Signale, Screenshot.
Bewusst schlank gehalten, damit der in `banner_result.browser_matrix` (JSONB)
persistierte Block klein bleibt — 6 Engines × voller Cookie-Liste + Screenshot
würde sonst schnell mehrere MB groß (BMW: ~780 Cookies je Phase).
"""
from __future__ import annotations
from typing import Any
# Cap the cookie names kept per phase — the matrix shows counts plus a few
# examples, not the complete list (that one lives in the text-based cookie
# module).
_NAME_CAP = 40
_TRACK_CAP = 20


def _vdict(v: Any) -> dict:
    """Convert a violation (dict, plain object or dataclass) into a new dict.

    The returned dict is always a shallow copy, so callers may modify the
    projected row without mutating the original violation object.

    Resolution order:
      1. dict input              -> shallow copy of that dict
      2. object with ``__dict__`` -> shallow copy of its attributes
      3. dataclass without ``__dict__`` (e.g. ``slots=True``) -> its fields
      4. anything else           -> ``{"text": str(v)}``
    """
    if isinstance(v, dict):
        return dict(v)
    attrs = getattr(v, "__dict__", None)
    if attrs:
        return dict(attrs)
    # Function-scope import keeps this block self-contained.
    import dataclasses
    if dataclasses.is_dataclass(v) and not isinstance(v, type):
        by_field = {f.name: getattr(v, f.name, None)
                    for f in dataclasses.fields(v)}
        if by_field:
            return by_field
    return {"text": str(v)}


def _read(result: Any, name: str, default: Any = None) -> Any:
    """Read ``name`` from ``result``: dict key for dicts, attribute otherwise."""
    if isinstance(result, dict):
        return result.get(name, default)
    return getattr(result, name, default)


def matrix_scan_dict(result: Any) -> dict:
    """Project a consent-test result onto the slim matrix dict form.

    ``result`` may be an object exposing the fields as attributes or an
    already-serialized dict using the same flat field names; missing fields
    fall back to empty/false defaults. (Reading dict keys matters: with plain
    ``getattr`` a dict input would silently produce an all-clear row.)

    Returns a dict with:
      * ``banner_detected`` / ``banner_provider``
      * ``phases``: ``before_consent`` and ``after_reject``, each holding the
        full ``cookies`` list (the shape intended for
        ``multi_browser_scanner._extract_dimensions``)
      * ``banner_checks``: banner text violations as dicts
      * ``summary``: compact block with cookie counts, capped name lists,
        the ``reject_respected`` flag, surface signals, the screenshot and
        violation counts.
    """
    def _as_list(name: str) -> list:
        # ``or []`` also covers fields that are present but None.
        return list(_read(result, name, None) or [])

    before = _as_list("before_cookies")
    after = _as_list("reject_cookies")
    before_violations = _as_list("before_violations")
    reject_violations = _as_list("reject_violations")
    reject_new_tracking = _as_list("reject_new_tracking")
    banner_text_violations = _as_list("banner_text_violations")
    provider = _read(result, "banner_provider", "") or ""
    banner_detected = bool(_read(result, "banner_detected", False))

    summary = {
        "cookies_before_consent": len(before),
        "cookies_after_reject": len(after),
        "cookies_before_names": before[:_NAME_CAP],
        "cookies_after_reject_names": after[:_NAME_CAP],
        # "Reject respected" = no violations AND no new tracker after the
        # reject click. Remaining essential cookies (e.g. the stored consent
        # decision itself) are allowed, so the bare cookie count is NOT used
        # here (it would produce false positives).
        "reject_respected": not reject_violations and not reject_new_tracking,
        "reject_new_tracking": reject_new_tracking[:_TRACK_CAP],
        "banner_detected": banner_detected,
        "banner_provider": provider,
        "banner_screenshot_b64": _read(result, "banner_screenshot_b64", "") or "",
        "surface": {
            "has_impressum_link": bool(
                _read(result, "banner_has_impressum_link", False)),
            "has_dse_link": bool(
                _read(result, "banner_has_dse_link", False)),
            "banner_text_issues": len(banner_text_violations),
        },
        "violations": {
            "before_consent": len(before_violations),
            "after_reject": len(reject_violations),
            "banner_text": len(banner_text_violations),
        },
    }
    return {
        "banner_detected": banner_detected,
        "banner_provider": provider,
        # Minimal form for _extract_dimensions (cookie lists + violations):
        "phases": {
            "before_consent": {"cookies": before},
            "after_reject": {"cookies": after},
        },
        "banner_checks": {
            "violations": [_vdict(v) for v in banner_text_violations],
        },
        "summary": summary,
    }
@@ -0,0 +1,73 @@
"""Per-Engine-Projektion der Browser-Matrix (`scan_matrix_summary`).
Sichert: ConsentTestResult → schlanke Matrix-dict-Form mit (a) phases, die
`multi_browser_scanner._extract_dimensions` lesen kann, und (b) kompaktem
`summary` (cookies_before/after_reject, reject_respected-Heuristik, Surface).
"""
from types import SimpleNamespace
from services.scan_matrix_summary import matrix_scan_dict
from services.multi_browser_scanner import _extract_dimensions
def _result(**kw):
    """Build a fake scan result as a ``SimpleNamespace``.

    Starts from a clean default (banner detected, no cookies, no violations,
    both links present, fake screenshot) and overrides any field passed as a
    keyword argument.
    """
    defaults = {
        "banner_detected": True,
        "banner_provider": "Usercentrics",
        "before_cookies": [],
        "reject_cookies": [],
        "before_violations": [],
        "reject_violations": [],
        "reject_new_tracking": [],
        "banner_text_violations": [],
        "banner_has_impressum_link": True,
        "banner_has_dse_link": True,
        "banner_screenshot_b64": "iVBOR_fake",
    }
    return SimpleNamespace(**{**defaults, **kw})
def test_cookie_counts_and_names_capped():
    """Counts report the full totals while the name lists are capped."""
    many_cookies = [f"c{i}" for i in range(50)]
    fake = _result(before_cookies=many_cookies, reject_cookies=["a", "b"])
    summary = matrix_scan_dict(fake)["summary"]
    assert summary["cookies_before_consent"] == 50
    assert summary["cookies_after_reject"] == 2
    # The name list is capped even though the count is not.
    assert len(summary["cookies_before_names"]) == 40
    assert summary["cookies_after_reject_names"] == ["a", "b"]
def test_reject_respected_true_when_no_violation_no_tracking():
    """A remaining essential cookie alone must not count as a violation."""
    # Only the essential consent cookie survives the reject click.
    fake = _result(reject_cookies=["consent_choice"])
    summary = matrix_scan_dict(fake)["summary"]
    assert summary["reject_respected"] is True
def test_reject_respected_false_on_reject_violation():
    """Any violation recorded after rejecting flips the flag to False."""
    violation = SimpleNamespace(severity="HIGH", text="x")
    summary = matrix_scan_dict(_result(reject_violations=[violation]))["summary"]
    assert summary["reject_respected"] is False
def test_reject_respected_false_on_new_tracking():
    """A newly appearing tracker after rejecting flips the flag to False."""
    fake = _result(reject_new_tracking=["google-analytics"])
    summary = matrix_scan_dict(fake)["summary"]
    assert summary["reject_respected"] is False
def test_surface_and_screenshot_passthrough():
    """Surface signals and the screenshot are copied into the summary."""
    text_issue = SimpleNamespace(severity="LOW", text="y")
    fake = _result(banner_has_impressum_link=False,
                   banner_text_violations=[text_issue])
    summary = matrix_scan_dict(fake)["summary"]
    surface = summary["surface"]
    assert surface["has_impressum_link"] is False
    assert surface["has_dse_link"] is True
    assert surface["banner_text_issues"] == 1
    assert summary["banner_screenshot_b64"] == "iVBOR_fake"
def test_phases_shape_readable_by_extract_dimensions():
    """Contract: the projection must be consumable by ``_extract_dimensions``."""
    fake = _result(before_cookies=["a", "b", "c"], reject_cookies=["d"])
    projected = matrix_scan_dict(fake)
    dims = _extract_dimensions(projected)
    expected_keys = {"pre_consent", "reject_respect", "banner_design"}
    assert set(dims) == expected_keys
    # Only the value ranges are pinned here, not the exact scores.
    assert 0.0 <= dims["pre_consent"] <= 1.0
    assert dims["reject_respect"] <= 1.0