feat(b17): Playwright Audit-Walk-Video (Stufe 1, #7)

Nimmt einen kompletten Site-Walk als WebKit-Browser-Session
inkl. Video auf. Reviewer kann nachträglich exakt nachvollziehen,
wie die Engine zum Befund kam.

consent-tester:
  - services/audit_walk_recorder.py: Playwright record_video_dir,
    Desktop-Viewport 1280×800 (kein iPhone-Viewport). Goto homepage →
    Banner-Accept (Best-Effort: 12 Text-Phrasen + 6 CMP-Fallback-
    Selektoren) → Footer-Links sammeln (compliance-relevant gefiltert) →
    pro Link navigate + Dwell-Time → JSON-Action-Index mit
    UTC-Timestamps + SHA-256 vom Video als Manipulationsschutz.
  - routes_audit_walk.py: POST /scan-audit-walk; statische
    Serves für /audit-walks/{walk_id}/video.webm + walk.json.
  - main.py: Router registriert.

backend:
  - _b17_wiring.py: Triggert /scan-audit-walk, speichert
    Walk-Metadata in state["audit_walk"]. Render-Block mit
    HTML-Tabelle aller Actions (HH:MM:SS + Aktion + Detail) +
    Links zu Video und walk.json.
  - _orchestrator.py: run_b17 nach run_b16, async-aufgerufen.
  - mail_render_v2/_compose.py: audit_walk_html im V2-Layout.
  - test_b17_audit_walk.py: 8 Tests (Render-Pfade + Wiring).

Stufe-2 (Akkordeon-Expansion) und Stufe-3 (DSMS-CID-Anchor)
folgen separat.

Real-World-Smoke gegen Elli:
  - 581 KB Video, SHA-256 verifizierbar
  - 3 Footer-Links besucht (Impressum, Datenschutzerkl., Nutzungs-)
  - 6 Actions im JSON-Index

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-07 17:20:13 +02:00
parent 529c032641
commit cb4b352846
7 changed files with 562 additions and 0 deletions
+2
View File
@@ -63,9 +63,11 @@ class ScanResponse(BaseModel):
from routes_matrix import router as matrix_router
from routes_mobile import router as mobile_router
from routes_cookie_matrix import router as cookie_matrix_router
from routes_audit_walk import router as audit_walk_router
# Register the feature routers on the application object. The audit-walk
# router (imported from routes_audit_walk above) is registered last.
app.include_router(matrix_router)
app.include_router(mobile_router)
app.include_router(cookie_matrix_router)
app.include_router(audit_walk_router)
@app.get("/health")
+53
View File
@@ -0,0 +1,53 @@
"""Routes für Audit-Walk-Recorder (POST /scan-audit-walk + Video-Serve)."""
from __future__ import annotations
import os
from pathlib import Path
from fastapi import APIRouter, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel
from services.audit_walk_recorder import WALK_ROOT, record_audit_walk
# Router that carries the audit-walk endpoints defined in this module.
router = APIRouter()
class AuditWalkReq(BaseModel):
    """Request body for POST /scan-audit-walk."""

    # Start page of the walk; the route rejects values that do not start
    # with "http://" or "https://".
    url: str
    # Requested dwell time per visited link, in seconds; the route clamps
    # it to the range 1.0-10.0 before recording.
    dwell_s: float = 5.0
    # Requested number of footer links to visit; the route clamps it to
    # the range 1-12 before recording.
    max_links: int = 8
@router.post("/scan-audit-walk")
async def scan_audit_walk(req: AuditWalkReq) -> dict:
    """Record an audit walk for ``req.url`` and return its metadata.

    Responds with HTTP 400 when the URL is empty or is not an http(s) URL.
    The requested dwell time and link count are clamped to 1.0-10.0 seconds
    and 1-12 links before the recorder is invoked.
    """
    target = req.url
    if not (target and target.startswith(("http://", "https://"))):
        raise HTTPException(400, "invalid url")

    # Clamp caller-supplied limits so a single request cannot produce an
    # arbitrarily long recording.
    clamped_dwell = max(1.0, min(req.dwell_s, 10.0))
    clamped_links = max(1, min(req.max_links, 12))

    return await record_audit_walk(
        target,
        dwell_s=clamped_dwell,
        max_links=clamped_links,
    )
def _walk_file_response(
    walk_id: str, filename: str, media_type: str, not_found_detail: str,
) -> FileResponse:
    """Validate ``walk_id`` and return the requested walk artifact.

    Shared by both serve endpoints so the path-traversal guard lives in
    exactly one place.

    Raises:
        HTTPException: 400 when ``walk_id`` is not a short ASCII-alphanumeric
            token, 404 when the artifact is not a regular file.
    """
    # Basic path-traversal guard: ASCII letters/digits only, so the id can
    # contain neither path separators nor dots.
    if not (walk_id.isascii() and walk_id.isalnum()) or len(walk_id) > 32:
        raise HTTPException(400, "invalid walk_id")
    path = Path(WALK_ROOT) / walk_id / filename
    # is_file() rather than exists(): a directory of that name cannot be
    # served and should be reported as "not found".
    if not path.is_file():
        raise HTTPException(404, not_found_detail)
    return FileResponse(str(path), media_type=media_type)


@router.get("/audit-walks/{walk_id}/video.webm")
async def serve_walk_video(walk_id: str):
    """Serve the recorded video of the walk identified by ``walk_id``."""
    return _walk_file_response(
        walk_id, "video.webm", "video/webm", "walk video not found",
    )


@router.get("/audit-walks/{walk_id}/walk.json")
async def serve_walk_meta(walk_id: str):
    """Serve the JSON action index of the walk identified by ``walk_id``."""
    return _walk_file_response(
        walk_id, "walk.json", "application/json", "walk.json not found",
    )
@@ -0,0 +1,275 @@
"""Playwright Audit-Walk-Recorder.
Nimmt einen vollständigen Site-Walk per WebKit-Browser auf:
1. Goto homepage + Banner-Akzeptieren (Best-Effort)
2. Footer-Links sammeln (DSE, Impressum, AGB, Cookie, Widerruf, ...)
3. Pro Link: navigate + 5s Lese-Verweildauer
4. Video aufzeichnen (Playwright `record_video_dir`)
5. JSON-Action-Index mit Timestamps + SHA-256 für
Manipulation-Schutz
Output landet unter `/data/audit-walks/{walk_id}/`:
- `video.webm` — Playwright-Recording
- `walk.json` — Action-Index mit Timestamps + Hash
Dauer pro Walk: ~30-60 Sekunden bei 6-8 Footer-Links.
Stufe-1 dieser Suite. Stufe-2 (Akkordeon-Expansion) und
Stufe-3 (DSMS-CID-Anchor) folgen separat.
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from uuid import uuid4
logger = logging.getLogger(__name__)

# Output root for walk artifacts (volume mount: /data is defined in
# docker-compose). Can be overridden via the AUDIT_WALK_DIR env variable.
WALK_ROOT = os.getenv("AUDIT_WALK_DIR", "/data/audit-walks")

# Footer-link text hints: what we recognise as relevant compliance anchors.
# We do NOT load every footer link (that would produce huge videos), only
# the compliance-relevant ones. Compared against the lower-cased link text.
_LINK_HINTS_LC = (
    "impressum", "imprint", "legal",
    "datenschutz", "privacy",
    "cookie", "cookies",
    "agb", "geschäftsbedingung", "geschaeftsbedingung",
    "nutzungsbedingung", "terms",
    "widerruf", "withdrawal", "cancellation",
    "einwilligung", "consent",
)

# Banner accept buttons: best-effort list of button labels, tried in order.
_ACCEPT_PHRASES = (
    "alle akzeptieren", "alle zulassen", "akzeptieren",
    "alles akzeptieren", "zustimmen", "einverstanden",
    "accept all", "accept", "agree", "allow all",
    "ok", "verstanden",
)
def _ts() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 64 KiB pieces so large recordings are never held
    in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        while True:
            piece = fh.read(65536)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
async def _try_accept_banner(page) -> dict:
    """Best-effort: click a consent-banner accept button.

    Tries the phrases in ``_ACCEPT_PHRASES`` against button names first,
    then a list of common CMP selectors as fallback.

    Args:
        page: the Playwright page the banner is expected on.

    Returns:
        An action-event dict with ``timestamp``, ``action`` and ``result``
        (``"clicked"`` or ``"no_button_found"``), plus either ``phrase`` or
        ``selector`` naming what was clicked.
    """
    import re  # local import: keeps this block independent of the file header

    started = _ts()
    for phrase in _ACCEPT_PHRASES:
        # Match the phrase as whole word(s), case-insensitively. A plain
        # substring match lets short phrases hit unrelated buttons: "ok" is
        # contained in "Cookie-Einstellungen", which would then be clicked
        # and logged as an accept.
        name_re = re.compile(rf"\b{re.escape(phrase)}\b", re.IGNORECASE)
        try:
            btn = page.get_by_role("button", name=name_re).first
            if await btn.count() > 0:
                await btn.click(timeout=3000)
                # Give the banner time to disappear before the walk continues.
                await page.wait_for_timeout(1500)
                return {
                    "timestamp": started, "action": "accept_banner",
                    "result": "clicked", "phrase": phrase,
                }
        except Exception as exc:
            # Best-effort: a failed click must not abort the walk.
            logger.debug("accept phrase %r failed: %s", phrase, exc)
            continue
    # CMP-fallback selectors
    cmp_selectors = (
        "#usercentrics-cmp button",
        ".ot-sdk-container button.banner-actions-container .accept-btn",
        ".cmp-modal button[aria-label*=accept i]",
        "[data-testid=cookie-accept]",
        "[aria-label*=akzeptieren i]",
        "[aria-label*=accept i]",
    )
    for sel in cmp_selectors:
        try:
            el = page.locator(sel).first
            if await el.count() > 0:
                await el.click(timeout=2000)
                await page.wait_for_timeout(1500)
                return {
                    "timestamp": started, "action": "accept_banner",
                    "result": "clicked", "selector": sel,
                }
        except Exception as exc:
            logger.debug("accept selector %r failed: %s", sel, exc)
            continue
    return {"timestamp": started, "action": "accept_banner",
            "result": "no_button_found"}
async def _collect_footer_links(page) -> list[dict]:
    """Find compliance-relevant anchors inside the page footer.

    Args:
        page: the Playwright page whose ``footer a[href]`` anchors are read.

    Returns:
        Up to 10 dicts ``{"text": ..., "href": ...}`` in document order.
        Only http(s) links whose text contains one of ``_LINK_HINTS_LC`` are
        kept, de-duplicated by URL without fragment. Returns an empty list
        when the footer query fails.
    """
    try:
        anchors = await page.eval_on_selector_all(
            "footer a[href]",
            "(els) => els.map(a => ({text: (a.innerText||'').trim(), "
            "href: a.href}))",
        )
    except Exception as e:
        logger.warning("footer-anchor query failed: %s", e)
        return []
    seen: set[str] = set()
    out: list[dict] = []
    for a in anchors or []:
        href = (a.get("href") or "").strip()
        text = (a.get("text") or "").strip()
        if not href or not text:
            continue
        # `a.href` is already absolute. Skip mailto:/tel:/javascript: links:
        # they cannot be navigated to and would waste a visit slot (e.g. a
        # "privacy@example.com" mailto link matches the "privacy" hint).
        if not href.lower().startswith(("http://", "https://")):
            continue
        tl = text.lower()
        if not any(h in tl for h in _LINK_HINTS_LC):
            continue
        # Same document with a different fragment counts as one page.
        key = href.split("#")[0]
        if key in seen:
            continue
        seen.add(key)
        out.append({"text": text[:80], "href": href})
        if len(out) >= 10:
            break
    return out
async def _visit_link(page, link: dict, dwell_s: float = 5.0) -> dict:
    """Open ``link["href"]``, stay on the page, and describe the visit.

    Args:
        page: the Playwright page used for navigation.
        link: dict with the keys ``href`` and ``text``.
        dwell_s: time to wait on the page after it loaded, in seconds.

    Returns:
        A ``navigate`` action event with the HTTP status (0 when unknown),
        the page title (at most 120 characters), the elapsed time of the
        whole visit under ``dwell_s``, and ``error`` (``None`` on success).
    """
    event_ts = _ts()
    t0 = time.monotonic()
    http_status, page_title, failure = 0, "", ""
    try:
        response = await page.goto(
            link["href"], wait_until="domcontentloaded", timeout=20000,
        )
        http_status = response.status if response is not None else 0
        await page.wait_for_timeout(int(dwell_s * 1000))
        try:
            page_title = (await page.title())[:120]
        except Exception:
            # The title is optional; keep the empty default.
            pass
    except Exception as exc:
        failure = str(exc)[:200]

    event = {
        "timestamp": event_ts,
        "action": "navigate",
        "url": link["href"],
        "anchor_text": link["text"],
        "status": http_status,
        "title": page_title,
        "dwell_s": round(time.monotonic() - t0, 2),
        "error": failure or None,
    }
    return event
async def record_audit_walk(
    url: str, dwell_s: float = 5.0, max_links: int = 8,
) -> dict[str, Any]:
    """Run a full audit walk and record it on video.

    Opens ``url`` in headless WebKit, tries to accept the consent banner,
    collects the compliance-relevant footer links and visits up to
    ``max_links`` of them. Video and ``walk.json`` are written to
    ``WALK_ROOT/<walk_id>/``.

    Args:
        url: start page of the walk.
        dwell_s: time to stay on each visited link, in seconds.
        max_links: maximum number of footer links to visit.

    Returns:
        The walk document (also written to ``walk.json``): walk id, URL,
        start/end timestamps, ``error`` (``None`` on success), the action
        list and video metadata (file name, size, SHA-256). When Playwright
        cannot be imported, only ``{"error": ...}`` is returned.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception as e:
        return {"error": f"playwright missing: {e}"}

    walk_id = uuid4().hex[:12]
    out_dir = Path(WALK_ROOT) / walk_id
    out_dir.mkdir(parents=True, exist_ok=True)

    actions: list[dict] = []
    started_at = _ts()
    err = None

    async with async_playwright() as p:
        browser = None
        context = None
        try:
            browser = await p.webkit.launch(headless=True)
            context = await browser.new_context(
                viewport={"width": 1280, "height": 800},
                record_video_dir=str(out_dir),
                record_video_size={"width": 1280, "height": 800},
                locale="de-DE",
            )
            page = await context.new_page()

            actions.append({
                "timestamp": _ts(), "action": "goto",
                "url": url,
            })
            try:
                resp = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=30000)
                actions[-1]["status"] = (resp.status if resp else 0)
            except Exception as e:
                actions[-1]["error"] = str(e)[:200]

            await page.wait_for_timeout(2000)
            accept_event = await _try_accept_banner(page)
            actions.append(accept_event)

            links = await _collect_footer_links(page)
            # Guard against a negative limit, which would slice from the end.
            selected = links[:max(0, max_links)]
            actions.append({
                "timestamp": _ts(), "action": "discover_footer_links",
                "count": len(links), "links": selected,
            })

            for link in selected:
                ev = await _visit_link(page, link, dwell_s=dwell_s)
                actions.append(ev)
        except Exception as e:
            err = f"walk failed: {str(e)[:200]}"
            logger.exception("walk failed")
        finally:
            # Always close context and browser: Playwright finalises the
            # video when the context closes, so a walk that failed half-way
            # must still close it to keep the evidence recorded so far.
            for resource in (context, browser):
                if resource is not None:
                    try:
                        await resource.close()
                    except Exception as e:
                        if err is None:
                            err = f"walk failed: {str(e)[:200]}"
                        logger.warning("closing %s failed: %s",
                                       type(resource).__name__, e)

    completed_at = _ts()

    # Find produced video file. Playwright writes the .webm with a
    # random name when the context closes; rename it for stability.
    video_meta: dict[str, Any] = {}
    try:
        candidates = sorted(out_dir.glob("*.webm"))
        if candidates:
            src = candidates[0]
            dest = out_dir / "video.webm"
            if src != dest:
                src.rename(dest)
            video_meta = {
                "filename": "video.webm",
                "size_bytes": dest.stat().st_size,
                "sha256": _sha256_file(dest),
            }
    except Exception as e:
        logger.warning("video rename failed: %s", e)

    walk_doc = {
        "walk_id": walk_id,
        "url": url,
        "started_at": started_at,
        "completed_at": completed_at,
        "error": err,
        "engine": "playwright/webkit",
        "viewport": "1280x800",
        "actions": actions,
        "video": video_meta,
    }
    try:
        # ensure_ascii=False emits raw non-ASCII characters (page titles,
        # anchor texts), so the file encoding must be pinned to UTF-8
        # instead of relying on the platform default.
        (out_dir / "walk.json").write_text(
            json.dumps(walk_doc, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning("walk.json write failed: %s", e)
    return walk_doc
if __name__ == "__main__":
    # Manual smoke test: walk the URL given as first CLI argument (or the
    # built-in default) and print the resulting walk document.
    import sys

    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = "https://www.elli.eco/de/startseite"
    out = asyncio.run(record_audit_walk(url))
    print(json.dumps(out, indent=2, ensure_ascii=False))