feat(b17): Playwright Audit-Walk-Video (Stufe 1, #7)

Nimmt einen kompletten Site-Walk als WebKit-Browser-Session
inkl. Video auf. Reviewer kann nachträglich exakt nachvollziehen,
wie die Engine zum Befund kam.

consent-tester:
  - services/audit_walk_recorder.py: Playwright record_video_dir,
    Desktop-Viewport 1280×800 (ohne iPhone-Emulation). Goto homepage → Banner-Accept
    (Best-Effort: 12 Text-Phrasen + 5 CMP-Fallback-Selektoren) →
    Footer-Links sammeln (compliance-relevant gefiltert) →
    pro Link navigate + Dwell-Time → JSON-Action-Index mit
    UTC-Timestamps + SHA-256 vom Video als Manipulation-Schutz.
  - routes_audit_walk.py: POST /scan-audit-walk; statische
    Serves für /audit-walks/{walk_id}/video.webm + walk.json.
  - main.py: Router registriert.

backend:
  - _b17_wiring.py: Triggert /scan-audit-walk, speichert
    Walk-Metadata in state["audit_walk"]. Render-Block mit
    HTML-Tabelle aller Actions (HH:MM:SS + Aktion + Detail) +
    Links zu Video und walk.json.
  - _orchestrator.py: run_b17 nach run_b16, async-aufgerufen.
  - mail_render_v2/_compose.py: audit_walk_html im V2-Layout.
  - test_b17_audit_walk.py: 8 Tests (Render-Pfade + Wiring).

Stufe-2 (Akkordeon-Expansion) und Stufe-3 (DSMS-CID-Anchor)
folgen separat.

Real-World-Smoke gegen Elli:
  - 581 KB Video, SHA-256 verifizierbar
  - 3 Footer-Links besucht (Impressum, Datenschutzerkl., Nutzungs-)
  - 6 Actions im JSON-Index

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-07 17:20:13 +02:00
parent 529c032641
commit cb4b352846
7 changed files with 562 additions and 0 deletions
@@ -0,0 +1,133 @@
"""B17 wiring — Audit-Walk-Recorder.
Triggert beim consent-tester einen kompletten Playwright-Site-Walk
mit Video-Aufzeichnung. Result: Video + JSON-Action-Index mit
Timestamps + SHA-256-Hash für Manipulation-Schutz.
Speichert nur die Walk-Metadata + Video-URL im state. Der eigentliche
File-Body bleibt im consent-tester-Volume (Stufe 1). Stufe 3 wird das
Video zu DSMS-IPFS hochladen und die CID hier einbinden.
"""
from __future__ import annotations
import html
import logging
from urllib.parse import urlparse
import httpx
from ._constants import CONSENT_TESTER_URL
logger = logging.getLogger(__name__)
async def run_b17(state: dict) -> None:
    """Trigger an audit-walk recording and store its metadata in ``state``.

    The site root (``scheme://host/``) is derived from the first document URL
    on ``state["req"]`` that parses to an absolute URL. It is POSTed to the
    consent-tester's ``/scan-audit-walk`` endpoint. On success two keys are
    written:

    * ``state["audit_walk"]`` — the walk metadata returned by the service
    * ``state["audit_walk_html"]`` — the rendered report block

    The step is best-effort: a missing request, no usable URL, a transport
    error, a non-200 answer, an unusable payload or a rendering failure leave
    ``state`` untouched and never raise.
    """
    req = state.get("req")
    if req is None:
        return

    # Only scheme + host are kept; the walk always starts at the site root.
    homepage = ""
    for doc in req.documents:
        if not doc.url:
            continue
        parsed = urlparse(doc.url)
        if parsed.scheme and parsed.netloc:
            homepage = f"{parsed.scheme}://{parsed.netloc}/"
            break
    if not homepage:
        return

    walk = None
    try:
        async with httpx.AsyncClient(timeout=180.0) as c:
            r = await c.post(
                f"{CONSENT_TESTER_URL}/scan-audit-walk",
                json={"url": homepage, "dwell_s": 4.0, "max_links": 8},
                timeout=180.0,
            )
        if r.status_code == 200:
            walk = r.json()
        else:
            # Previously dropped silently, which made a broken consent-tester
            # indistinguishable from "no walk requested".
            logger.warning(
                "B17 audit-walk request returned HTTP %s", r.status_code,
            )
            return
    except Exception as e:
        logger.warning("B17 audit-walk request failed: %s", e)
        return

    # The payload comes from another service: anything that is not a dict
    # carrying a walk_id is unusable and must not crash the caller.
    if not isinstance(walk, dict) or not walk.get("walk_id"):
        logger.warning(
            "B17 audit-walk: unusable response (%s)",
            walk.get("error") if isinstance(walk, dict) else type(walk).__name__,
        )
        return

    # Render before touching state so a malformed action list cannot leave a
    # half-populated state behind or abort the surrounding run.
    try:
        rendered = _render(walk)
    except Exception:
        logger.exception("B17 audit-walk: rendering failed")
        return

    state["audit_walk"] = walk
    state["audit_walk_html"] = rendered
    logger.info(
        "B17 audit-walk: %s · %d actions · video %d bytes · sha256 %s",
        walk.get("walk_id"),
        len(walk.get("actions") or []),
        (walk.get("video") or {}).get("size_bytes", 0),
        ((walk.get("video") or {}).get("sha256") or "")[:12],
    )
def _video_link(walk_id: str) -> str:
    """Return the consent-tester URL under which the walk's video is served.

    The link only works for a reviewer who can reach the consent-tester
    service itself.
    """
    base_url = CONSENT_TESTER_URL
    return f"{base_url}/audit-walks/{walk_id}/video.webm"
def _render(walk: dict) -> str:
    """Render the audit-walk metadata as an HTML block for the report.

    The block contains links to the recorded video and to ``walk.json``, the
    video size and the first 12 characters of its SHA-256, and a table with
    one row per recorded action (time of day, action name, detail text).

    Every dynamic value is passed through ``html.escape`` before it is placed
    into the markup, because URLs, button labels and error texts originate
    from the audited website.

    Failed ``goto``/``navigate`` events (those carrying an ``error``) show the
    error text in the detail column and are not counted as visited pages, so
    the summary line does not overstate what the recording contains.
    """
    wid = walk.get("walk_id") or ""
    video = walk.get("video") or {}
    actions = walk.get("actions") or []
    # Only navigations that did not record an error count as visited.
    nav_count = sum(
        1 for a in actions
        if a.get("action") == "navigate" and not a.get("error")
    )
    sha = (video.get("sha256") or "")[:12]
    size_kb = round((video.get("size_bytes") or 0) / 1024, 1)
    walk_link = _video_link(wid)
    meta_link = f"{CONSENT_TESTER_URL}/audit-walks/{wid}/walk.json"

    rows = []
    for a in actions:
        # Characters 11..18 of an ISO-8601 timestamp are HH:MM:SS.
        ts = (a.get("timestamp") or "")[11:19]
        act = a.get("action") or ""
        detail = ""
        if act in ("goto", "navigate"):
            detail = (a.get("url") or "")[:120]
            if a.get("status"):
                detail += f" → HTTP {a['status']}"
            if a.get("error"):
                # Surface load failures instead of showing a bare URL that
                # reads like a successful visit.
                detail += f" — Fehler: {str(a['error'])[:120]}"
        elif act == "accept_banner":
            r = a.get("result") or ""
            if r == "clicked":
                detail = f"Banner akzeptiert ({a.get('phrase') or a.get('selector') or ''})"
            else:
                detail = "Kein Accept-Button gefunden"
        elif act == "discover_footer_links":
            detail = f"{a.get('count', 0)} Compliance-Links im Footer"
        rows.append(
            f"<tr><td style='padding:4px 8px;font-family:monospace;"
            f"color:#475569;'>{html.escape(ts)}</td>"
            f"<td style='padding:4px 8px;'>{html.escape(act)}</td>"
            f"<td style='padding:4px 8px;color:#475569;'>"
            f"{html.escape(detail)}</td></tr>"
        )

    return (
        "<div style='margin:24px 0;padding:16px;border-left:4px solid #0ea5e9;"
        "background:#f0f9ff;border-radius:4px;'>"
        "<h2 style='margin:0 0 8px;color:#0c4a6e;font-size:16px;'>"
        "🎥 Audit-Walk-Video (Beweis-Aufzeichnung)"
        "</h2>"
        "<p style='margin:0 0 8px;font-size:13px;color:#475569;'>"
        f"<strong>Video:</strong> "
        f"<a href='{html.escape(walk_link)}' style='color:#0369a1;'>video.webm</a> "
        f"({size_kb} KB, SHA-256 <code>{html.escape(sha)}…</code>) · "
        f"<strong>Metadata:</strong> "
        f"<a href='{html.escape(meta_link)}' style='color:#0369a1;'>walk.json</a>"
        "</p>"
        "<p style='margin:0 0 8px;font-size:13px;color:#475569;'>"
        f"{nav_count} Compliance-Seiten besucht, jede 4 Sek "
        "verweilt — Reviewer kann den Audit-Walk nachverfolgen."
        "</p>"
        "<table style='font-size:12px;width:100%;border-collapse:collapse;"
        "background:#fff;border-radius:4px;'>"
        "<thead><tr style='background:#e0f2fe;'>"
        "<th style='padding:6px 8px;text-align:left;'>Zeit (UTC)</th>"
        "<th style='padding:6px 8px;text-align:left;'>Aktion</th>"
        "<th style='padding:6px 8px;text-align:left;'>Detail</th>"
        "</tr></thead><tbody>" + "".join(rows) + "</tbody></table>"
        "</div>"
    )
@@ -27,6 +27,7 @@ from ._b13_wiring import run_b13
from ._b14_wiring import run_b14
from ._b15_wiring import run_b15
from ._b16_wiring import run_b16
from ._b17_wiring import run_b17
from ._constants import _compliance_check_jobs
from ._phase_a_resolve import run_phase_a
from ._phase_b_profile_check import run_phase_b
@@ -78,6 +79,7 @@ async def run_compliance_check(check_id: str, req) -> None:
run_b14(state) # Widersprüchliche Speicherdauer im selben Doc
run_b15(state) # AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
run_b16(state) # Footer-Label-vs-URL-Slug-Drift
await run_b17(state) # Audit-Walk-Video (Beweis-Aufzeichnung)
# Phase D-3 top/mid/bot: Step 5 HTML blocks
await run_phase_d3_top(state)
await run_phase_d3_mid(state)
@@ -56,6 +56,8 @@ def compose_v2(state: dict) -> str:
state.get("ai_legal_basis_html", ""),
# B16 Footer-Label-vs-URL-Slug-Drift (SEO / Bookmarks)
state.get("url_slug_drift_html", ""),
# B17 Audit-Walk-Video (Beweis-Aufzeichnung)
state.get("audit_walk_html", ""),
# Browser-Matrix (Stage 1.c)
state.get("browser_matrix_html", ""),
# All legacy build_*_html() wrapped in V2 sections — preserves
@@ -0,0 +1,95 @@
"""Tests for B17 Audit-Walk-Wiring (Stufe 1)."""
import asyncio
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
from compliance.api.agent_check._b17_wiring import _render, run_b17
# Canned consent-tester response used by all tests in this module: one event
# for each action type that _render formats (goto, accept_banner,
# discover_footer_links, navigate) plus the video metadata block.
_FAKE_WALK = {
    "walk_id": "abc123def456",
    "url": "https://example.com/",
    "started_at": "2026-06-07T10:00:00+00:00",
    "completed_at": "2026-06-07T10:00:30+00:00",
    "engine": "playwright/webkit",
    "viewport": "1280x800",
    "actions": [
        {"timestamp": "2026-06-07T10:00:00+00:00", "action": "goto",
         "url": "https://example.com/", "status": 200},
        {"timestamp": "2026-06-07T10:00:02+00:00", "action": "accept_banner",
         "result": "clicked", "phrase": "alle akzeptieren"},
        {"timestamp": "2026-06-07T10:00:04+00:00",
         "action": "discover_footer_links", "count": 3, "links": []},
        # The only "navigate" event, so the rendered page count is 1.
        {"timestamp": "2026-06-07T10:00:06+00:00", "action": "navigate",
         "url": "https://example.com/datenschutz",
         "anchor_text": "Datenschutz", "status": 200,
         "title": "Datenschutzerklärung"},
    ],
    "video": {
        "filename": "video.webm",
        "size_bytes": 512000,
        # _render shows only the first 12 characters of this digest.
        "sha256": "a1b2c3d4e5f67890fedcba0987654321ffffeeeeddddccccbbbbaaaa00001111",
    },
}
class TestRender:
    """Checks on the HTML block that ``_render`` builds from walk metadata."""

    def test_renders_walk_id_and_link(self):
        rendered = _render(_FAKE_WALK)
        for expected in ("abc123def456", "video.webm", "walk.json"):
            assert expected in rendered

    def test_includes_sha_prefix(self):
        # Only the first 12 characters of the digest are displayed.
        rendered = _render(_FAKE_WALK)
        assert "a1b2c3d4e5f6" in rendered

    def test_action_table_lists_all_actions(self):
        rendered = _render(_FAKE_WALK)
        # Each action row opens with a bare "<tr>"; the header row carries a
        # style attribute and therefore does not match this literal.
        row_count = rendered.count("<tr>")
        assert row_count >= 4  # incl. header

    def test_nav_count_reflects_navigate_actions(self):
        # The fixture contains exactly one "navigate" event.
        rendered = _render(_FAKE_WALK)
        assert "1 Compliance-Seiten" in rendered
class TestRunB17:
    """Checks on ``run_b17``: skip conditions, failure handling, success."""

    # Patch target for the HTTP client used inside the wiring module.
    _CLIENT_PATH = "compliance.api.agent_check._b17_wiring.httpx.AsyncClient"

    def test_no_request_skipped(self):
        # Without a "req" entry there is nothing to derive a homepage from.
        empty_state = {}
        asyncio.run(run_b17(empty_state))
        assert "audit_walk" not in empty_state

    def test_no_url_skipped(self):
        # A document with an empty URL yields no homepage.
        doc = MagicMock(url="")
        state = {"req": MagicMock(documents=[doc])}
        asyncio.run(run_b17(state))
        assert "audit_walk" not in state

    def test_consent_tester_failure_skipped(self):
        doc = MagicMock(url="https://example.com/dse")
        state = {"req": MagicMock(documents=[doc])}
        with patch(self._CLIENT_PATH) as client_cls:
            # The object bound by "async with httpx.AsyncClient(...) as c".
            http = client_cls.return_value.__aenter__.return_value
            http.post = AsyncMock(side_effect=Exception("nope"))
            asyncio.run(run_b17(state))
        assert "audit_walk" not in state

    def test_success_populates_state(self):
        doc = MagicMock(url="https://example.com/dse")
        state = {"req": MagicMock(documents=[doc])}
        response = MagicMock(status_code=200)
        response.json = MagicMock(return_value=_FAKE_WALK)
        with patch(self._CLIENT_PATH) as client_cls:
            http = client_cls.return_value.__aenter__.return_value
            http.post = AsyncMock(return_value=response)
            asyncio.run(run_b17(state))
        assert state["audit_walk"]["walk_id"] == "abc123def456"
        assert "video.webm" in state["audit_walk_html"]
+2
View File
@@ -63,9 +63,11 @@ class ScanResponse(BaseModel):
from routes_matrix import router as matrix_router
from routes_mobile import router as mobile_router
from routes_cookie_matrix import router as cookie_matrix_router
from routes_audit_walk import router as audit_walk_router
app.include_router(matrix_router)
app.include_router(mobile_router)
app.include_router(cookie_matrix_router)
app.include_router(audit_walk_router)
@app.get("/health")
+53
View File
@@ -0,0 +1,53 @@
"""Routes für Audit-Walk-Recorder (POST /scan-audit-walk + Video-Serve)."""
from __future__ import annotations
import os
from pathlib import Path
from fastapi import APIRouter, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel
from services.audit_walk_recorder import WALK_ROOT, record_audit_walk
router = APIRouter()
class AuditWalkReq(BaseModel):
    """Request body for ``POST /scan-audit-walk``."""

    # Page the walk starts from; the route rejects anything that does not
    # begin with "http://" or "https://".
    url: str
    # Requested dwell time per visited link in seconds; the route clamps the
    # value to the range 1.0–10.0 before recording.
    dwell_s: float = 5.0
    # Requested number of footer links to visit; the route clamps the value
    # to the range 1–12 before recording.
    max_links: int = 8
@router.post("/scan-audit-walk")
async def scan_audit_walk(req: AuditWalkReq) -> dict:
    """Record an audit walk for ``req.url`` and return the walk metadata.

    Responds with HTTP 400 when the URL is empty or not http(s). The dwell
    time is clamped to 1.0–10.0 seconds and the link limit to 1–12 before the
    recorder is invoked.
    """
    target = req.url
    if not (target and target.startswith(("http://", "https://"))):
        raise HTTPException(400, "invalid url")

    dwell = max(1.0, min(req.dwell_s, 10.0))
    link_limit = max(1, min(req.max_links, 12))
    return await record_audit_walk(
        target, dwell_s=dwell, max_links=link_limit,
    )
@router.get("/audit-walks/{walk_id}/video.webm")
async def serve_walk_video(walk_id: str):
    """Serve the recorded ``video.webm`` of one walk.

    Responds with HTTP 400 for a walk id that is not purely alphanumeric or
    is longer than 32 characters, and with HTTP 404 when no video exists.
    """
    # Path-traversal guard: alphanumeric ids cannot contain separators or dots.
    id_is_valid = walk_id.isalnum() and len(walk_id) <= 32
    if not id_is_valid:
        raise HTTPException(400, "invalid walk_id")

    video_file = Path(WALK_ROOT).joinpath(walk_id, "video.webm")
    if video_file.exists():
        return FileResponse(str(video_file), media_type="video/webm")
    raise HTTPException(404, "walk video not found")
@router.get("/audit-walks/{walk_id}/walk.json")
async def serve_walk_meta(walk_id: str):
    """Serve the ``walk.json`` action index of one walk.

    Responds with HTTP 400 for a walk id that is not purely alphanumeric or
    is longer than 32 characters, and with HTTP 404 when no index exists.
    """
    # Same guard as for the video route: no separators or dots can pass.
    id_is_valid = walk_id.isalnum() and len(walk_id) <= 32
    if not id_is_valid:
        raise HTTPException(400, "invalid walk_id")

    meta_file = Path(WALK_ROOT).joinpath(walk_id, "walk.json")
    if meta_file.exists():
        return FileResponse(str(meta_file), media_type="application/json")
    raise HTTPException(404, "walk.json not found")
@@ -0,0 +1,275 @@
"""Playwright Audit-Walk-Recorder.
Nimmt einen vollständigen Site-Walk per WebKit-Browser auf:
1. Goto homepage + Banner-Akzeptieren (Best-Effort)
2. Footer-Links sammeln (DSE, Impressum, AGB, Cookie, Widerruf, ...)
3. Pro Link: navigate + Lese-Verweildauer (`dwell_s`, Default 5 s)
4. Video aufzeichnen (Playwright `record_video_dir`)
5. JSON-Action-Index mit Timestamps + SHA-256 für
Manipulation-Schutz
Output landet unter `/data/audit-walks/{walk_id}/`:
- `video.webm` — Playwright-Recording
- `walk.json` — Action-Index mit Timestamps + Hash
Dauer pro Walk: ~30-60 Sekunden bei 6-8 Footer-Links.
Stufe-1 dieser Suite. Stufe-2 (Akkordeon-Expansion) und
Stufe-3 (DSMS-CID-Anchor) folgen separat.
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from uuid import uuid4
logger = logging.getLogger(__name__)

# Root directory for walk output, overridable via AUDIT_WALK_DIR
# (volume mount: /data is defined in docker-compose).
WALK_ROOT = os.getenv("AUDIT_WALK_DIR", "/data/audit-walks")

# Footer link text hints — what we recognise as relevant compliance anchors.
# We do NOT load every footer link (that would produce huge videos), only the
# compliance-relevant ones. Entries are lower-case and compared as substrings
# of the lower-cased anchor text.
_LINK_HINTS_LC = (
    "impressum", "imprint", "legal",
    "datenschutz", "privacy",
    "cookie", "cookies",
    "agb", "geschäftsbedingung", "geschaeftsbedingung",
    "nutzungsbedingung", "terms",
    "widerruf", "withdrawal", "cancellation",
    "einwilligung", "consent",
)

# Banner accept buttons — best-effort list, tried in this order.
# NOTE(review): _try_accept_banner looks these up with exact=False, which
# presumably matches case-insensitive substrings of the button name; if so,
# "ok" would also hit names such as "Cookie-Einstellungen" — confirm this is
# intended.
_ACCEPT_PHRASES = (
    "alle akzeptieren", "alle zulassen", "akzeptieren",
    "alles akzeptieren", "zustimmen", "einverstanden",
    "accept all", "accept", "agree", "allow all",
    "ok", "verstanden",
)
def _ts() -> str:
return datetime.now(timezone.utc).isoformat()
def _sha256_file(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()
async def _try_accept_banner(page) -> dict:
    """Try to click a consent-banner accept button (best effort).

    Button names from ``_ACCEPT_PHRASES`` are tried first, then a fixed list
    of CMP selectors. The first candidate that exists and can be clicked
    wins; any error on a candidate simply moves on to the next one.

    Returns an ``accept_banner`` action event whose ``result`` is
    ``"clicked"`` (with the matching ``phrase`` or ``selector``) or
    ``"no_button_found"``. The timestamp is taken before the first attempt.
    """
    started = _ts()

    # CMP-fallback selectors
    cmp_selectors = (
        "#usercentrics-cmp button",
        ".ot-sdk-container button.banner-actions-container .accept-btn",
        ".cmp-modal button[aria-label*=accept i]",
        "[data-testid=cookie-accept]",
        "[aria-label*=akzeptieren i]",
        "[aria-label*=accept i]",
    )

    # (event key, lookup value, click timeout in ms), in attempt order.
    candidates = [("phrase", phrase, 3000) for phrase in _ACCEPT_PHRASES]
    candidates += [("selector", sel, 2000) for sel in cmp_selectors]

    for kind, value, click_timeout in candidates:
        try:
            if kind == "phrase":
                found = page.get_by_role("button", name=value, exact=False)
            else:
                found = page.locator(value)
            target = found.first
            if await target.count() > 0:
                await target.click(timeout=click_timeout)
                # Give the banner time to disappear before moving on.
                await page.wait_for_timeout(1500)
                return {
                    "timestamp": started, "action": "accept_banner",
                    "result": "clicked", kind: value,
                }
        except Exception:
            continue

    return {"timestamp": started, "action": "accept_banner",
            "result": "no_button_found"}
async def _collect_footer_links(page) -> list[dict]:
"""Find compliance-relevant anchors inside the page footer."""
try:
anchors = await page.eval_on_selector_all(
"footer a[href]",
"(els) => els.map(a => ({text: (a.innerText||'').trim(), "
"href: a.href}))",
)
except Exception as e:
logger.warning("footer-anchor query failed: %s", e)
return []
seen: set[str] = set()
out: list[dict] = []
for a in anchors:
href = (a.get("href") or "").strip()
text = (a.get("text") or "").strip()
if not href or not text:
continue
tl = text.lower()
if not any(h in tl for h in _LINK_HINTS_LC):
continue
key = href.split("#")[0]
if key in seen:
continue
seen.add(key)
out.append({"text": text[:80], "href": href})
if len(out) >= 10:
break
return out
async def _visit_link(page, link: dict, dwell_s: float = 5.0) -> dict:
    """Open ``link["href"]``, stay for ``dwell_s`` seconds, report the visit.

    Returns a ``navigate`` action event with the HTTP status (0 when no
    response was obtained), the page title cut to 120 characters, the total
    elapsed time in seconds under ``dwell_s``, and the error text cut to 200
    characters (``None`` when the visit succeeded). Never raises.
    """
    started = _ts()
    clock_start = time.monotonic()
    status, title, err = 0, "", ""

    try:
        response = await page.goto(
            link["href"], wait_until="domcontentloaded", timeout=20000,
        )
        if response is not None:
            status = response.status
        await page.wait_for_timeout(int(dwell_s * 1000))
        # A missing title is not worth failing the visit for.
        try:
            full_title = await page.title()
            title = full_title[:120]
        except Exception:
            pass
    except Exception as exc:
        err = str(exc)[:200]

    event = {
        "timestamp": started, "action": "navigate",
        "url": link["href"], "anchor_text": link["text"],
        "status": status, "title": title,
        "dwell_s": round(time.monotonic() - clock_start, 2),
        "error": err or None,
    }
    return event
async def record_audit_walk(
    url: str, dwell_s: float = 5.0, max_links: int = 8,
) -> dict[str, Any]:
    """Run a full audit walk with video recording and return its metadata.

    Steps: open ``url`` in headless WebKit (1280x800, locale de-DE), try to
    accept the consent banner, collect compliance links from the footer and
    visit up to ``max_links`` of them, staying ``dwell_s`` seconds on each.

    Output is written to ``WALK_ROOT/<walk_id>/``: the recording as
    ``video.webm`` and the returned document as ``walk.json``.

    Returns the walk document (``walk_id``, ``url``, ``started_at``,
    ``completed_at``, ``error``, ``engine``, ``viewport``, ``actions``,
    ``video``). ``error`` is ``None`` unless the walk failed; ``video`` is
    empty when no recording was produced. If Playwright cannot be imported,
    only ``{"error": ...}`` is returned and nothing is written.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception as e:
        return {"error": f"playwright missing: {e}"}

    walk_id = uuid4().hex[:12]
    out_dir = Path(WALK_ROOT) / walk_id
    out_dir.mkdir(parents=True, exist_ok=True)
    actions: list[dict] = []
    started_at = _ts()
    err = None

    async with async_playwright() as p:
        browser = None
        context = None
        try:
            browser = await p.webkit.launch(headless=True)
            context = await browser.new_context(
                viewport={"width": 1280, "height": 800},
                record_video_dir=str(out_dir),
                record_video_size={"width": 1280, "height": 800},
                locale="de-DE",
            )
            page = await context.new_page()

            # Log the attempt first so a failed load still leaves an event.
            actions.append({
                "timestamp": _ts(), "action": "goto",
                "url": url,
            })
            try:
                resp = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=30000)
                actions[-1]["status"] = (resp.status if resp else 0)
            except Exception as e:
                actions[-1]["error"] = str(e)[:200]
            await page.wait_for_timeout(2000)

            accept_event = await _try_accept_banner(page)
            actions.append(accept_event)

            links = await _collect_footer_links(page)
            actions.append({
                "timestamp": _ts(), "action": "discover_footer_links",
                "count": len(links), "links": links[:max_links],
            })
            for link in links[:max_links]:
                ev = await _visit_link(page, link, dwell_s=dwell_s)
                actions.append(ev)
        except Exception as e:
            err = f"walk failed: {str(e)[:200]}"
            logger.exception("walk failed")
        finally:
            # The recording is only finalised when the context is closed, so
            # close it (then the browser) after a failed walk as well;
            # otherwise the partial video is lost and the browser is left to
            # the driver teardown.
            for label, resource in (("context", context), ("browser", browser)):
                if resource is None:
                    continue
                try:
                    await resource.close()
                except Exception as e:
                    logger.warning("closing %s failed: %s", label, e)
                    if err is None:
                        err = f"walk failed: {str(e)[:200]}"
    completed_at = _ts()

    # Find produced video file. Playwright writes the .webm with a
    # random name when the context closes; rename it for stability.
    video_meta: dict[str, Any] = {}
    try:
        candidates = sorted(out_dir.glob("*.webm"))
        if candidates:
            src = candidates[0]
            dest = out_dir / "video.webm"
            if src != dest:
                src.rename(dest)
            video_meta = {
                "filename": "video.webm",
                "size_bytes": dest.stat().st_size,
                "sha256": _sha256_file(dest),
            }
    except Exception as e:
        logger.warning("video rename failed: %s", e)

    walk_doc = {
        "walk_id": walk_id,
        "url": url,
        "started_at": started_at,
        "completed_at": completed_at,
        "error": err,
        "engine": "playwright/webkit",
        "viewport": "1280x800",
        "actions": actions,
        "video": video_meta,
    }
    try:
        # ensure_ascii=False keeps umlauts from page titles as-is, so the file
        # must be written as UTF-8 regardless of the process locale.
        (out_dir / "walk.json").write_text(
            json.dumps(walk_doc, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning("walk.json write failed: %s", e)
    return walk_doc
if __name__ == "__main__":
    # Manual smoke run: record one walk for the URL given as first argument
    # (or the default site) and print the resulting walk document.
    import sys

    cli_args = sys.argv[1:]
    url = cli_args[0] if cli_args else "https://www.elli.eco/de/startseite"
    out = asyncio.run(record_audit_walk(url))
    print(json.dumps(out, indent=2, ensure_ascii=False))