cb4b352846
Nimmt einen kompletten Site-Walk als WebKit-Browser-Session
inkl. Video auf. Reviewer kann nachträglich exakt nachvollziehen,
wie die Engine zum Befund kam.
consent-tester:
- services/audit_walk_recorder.py: Playwright record_video_dir,
Desktop-Viewport 1280×800 (kein iPhone-Geräteprofil). Goto homepage →
Banner-Accept (Best-Effort: 12 Text-Phrasen + 6 CMP-Fallback-Selektoren) →
Footer-Links sammeln (compliance-relevant gefiltert) →
pro Link navigate + Dwell-Time → JSON-Action-Index mit
UTC-Timestamps + SHA-256 vom Video als Manipulationsschutz.
- routes_audit_walk.py: POST /scan-audit-walk; statische
Serves für /audit-walks/{walk_id}/video.webm + walk.json.
- main.py: Router registriert.
backend:
- _b17_wiring.py: Triggert /scan-audit-walk, speichert
Walk-Metadata in state["audit_walk"]. Render-Block mit
HTML-Tabelle aller Actions (HH:MM:SS + Aktion + Detail) +
Links zu Video und walk.json.
- _orchestrator.py: run_b17 nach run_b16, async-aufgerufen.
- mail_render_v2/_compose.py: audit_walk_html im V2-Layout.
- test_b17_audit_walk.py: 8 Tests (Render-Pfade + Wiring).
Stufe-2 (Akkordeon-Expansion) und Stufe-3 (DSMS-CID-Anchor)
folgen separat.
Real-World-Smoke gegen Elli:
- 581 KB Video, SHA-256 verifizierbar
- 3 Footer-Links besucht (Impressum, Datenschutzerkl., Nutzungs-)
- 6 Actions im JSON-Index
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
276 lines
8.8 KiB
Python
276 lines
8.8 KiB
Python
"""Playwright Audit-Walk-Recorder.

Records a complete site walk in a WebKit browser:

1. Go to the homepage and accept the consent banner (best effort)
2. Collect footer links (privacy policy, imprint, terms, cookies,
   withdrawal, ...)
3. Per link: navigate, then dwell 5 s (default) as reading time
4. Record video (Playwright `record_video_dir`)
5. Write a JSON action index with timestamps plus a SHA-256 of the
   video as tamper protection

Output lands under `/data/audit-walks/{walk_id}/`:

- `video.webm` — Playwright recording
- `walk.json` — action index with timestamps and hash

Duration per walk: ~30-60 seconds for 6-8 footer links.

Stage 1 of this suite. Stage 2 (accordion expansion) and
Stage 3 (DSMS CID anchor) follow separately.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from uuid import uuid4
|
|
|
|
logger = logging.getLogger(__name__)

# Root directory for walk output. Overridable through AUDIT_WALK_DIR; the
# default lives under /data (per the original note, a volume mount defined
# in docker-compose).
WALK_ROOT = os.environ.get("AUDIT_WALK_DIR", "/data/audit-walks")

# Lower-case text hints that mark a footer link as compliance-relevant.
# Only links whose text contains one of these are visited; loading every
# footer link would produce huge videos.
_LINK_HINTS_LC = (
    "impressum",
    "imprint",
    "legal",
    "datenschutz",
    "privacy",
    "cookie",
    "cookies",
    "agb",
    "geschäftsbedingung",
    "geschaeftsbedingung",
    "nutzungsbedingung",
    "terms",
    "widerruf",
    "withdrawal",
    "cancellation",
    "einwilligung",
    "consent",
)

# Button labels tried, in this order, when accepting a consent banner.
# Best-effort list, not exhaustive.
_ACCEPT_PHRASES = (
    "alle akzeptieren",
    "alle zulassen",
    "akzeptieren",
    "alles akzeptieren",
    "zustimmen",
    "einverstanden",
    "accept all",
    "accept",
    "agree",
    "allow all",
    "ok",
    "verstanden",
)
|
|
|
|
|
|
def _ts() -> str:
    """Return the current time in UTC as an ISO-8601 string (with offset)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
|
|
|
|
|
|
def _sha256_file(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks, so it is never loaded into memory
    as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block_bytes = stream.read(65536)
            if not block_bytes:
                break
            digest.update(block_bytes)
    return digest.hexdigest()
|
|
|
|
|
|
async def _try_accept_banner(page) -> dict:
    """Best-effort click on a consent-banner accept control.

    Each entry of ``_ACCEPT_PHRASES`` is tried first as the accessible name
    of a button; after that a fixed list of CMP-specific CSS selectors is
    tried as a fallback.  The first control found is clicked, followed by a
    1.5 s pause.

    Phrases are matched case-insensitively as whole words.  Playwright's
    default for a string name is a substring match, which let "ok" hit
    "Cookie-Einstellungen" and "agree" hit "Disagree" — clicking the wrong
    button while reporting an accept.

    Args:
        page: Playwright page to operate on (presumably an async-API page
            object; the type is not annotated in this module).

    Returns:
        An action-event dict with ``timestamp`` (taken when the search
        started), ``action`` (``"accept_banner"``) and ``result``.  On
        success ``result`` is ``"clicked"`` and either ``phrase`` or
        ``selector`` names what matched; otherwise ``result`` is
        ``"no_button_found"``.
    """
    import re  # local import keeps this block self-contained

    started = _ts()
    for phrase in _ACCEPT_PHRASES:
        # \b...\b = whole-word match; words are joined with \s+ so that
        # repeated or non-breaking whitespace in a label still matches.
        words = r"\s+".join(re.escape(word) for word in phrase.split())
        name_re = re.compile(rf"\b{words}\b", re.IGNORECASE)
        try:
            btn = page.get_by_role("button", name=name_re).first
            if await btn.count() > 0:
                await btn.click(timeout=3000)
                await page.wait_for_timeout(1500)
                return {
                    "timestamp": started, "action": "accept_banner",
                    "result": "clicked", "phrase": phrase,
                }
        except Exception as exc:
            # Deliberately best-effort: a failing candidate must not abort
            # the walk, but leave a trace for debugging.
            logger.debug("accept phrase %r not clickable: %s", phrase, exc)
            continue
    # CMP-fallback selectors
    cmp_selectors = (
        "#usercentrics-cmp button",
        ".ot-sdk-container button.banner-actions-container .accept-btn",
        ".cmp-modal button[aria-label*=accept i]",
        "[data-testid=cookie-accept]",
        "[aria-label*=akzeptieren i]",
        "[aria-label*=accept i]",
    )
    for sel in cmp_selectors:
        try:
            el = page.locator(sel).first
            if await el.count() > 0:
                await el.click(timeout=2000)
                await page.wait_for_timeout(1500)
                return {
                    "timestamp": started, "action": "accept_banner",
                    "result": "clicked", "selector": sel,
                }
        except Exception as exc:
            logger.debug("accept selector %r not clickable: %s", sel, exc)
            continue
    return {"timestamp": started, "action": "accept_banner",
            "result": "no_button_found"}
|
|
|
|
|
|
async def _collect_footer_links(page, limit: int = 10) -> list[dict]:
    """Find compliance-relevant anchors inside the page footer.

    Queries every ``footer a[href]`` element and keeps those whose visible
    text contains one of ``_LINK_HINTS_LC`` (case-insensitive).  Links are
    de-duplicated by URL without fragment, and only ``http``/``https``
    targets are kept: ``mailto:``, ``tel:`` or ``javascript:`` hrefs cannot
    be navigated to and would only waste visit slots.

    Args:
        page: Playwright page whose footer is inspected.
        limit: Maximum number of links to return (default 10).

    Returns:
        A list of ``{"text": ..., "href": ...}`` dicts in document order,
        with ``text`` truncated to 80 characters.  Empty if the query fails
        or ``limit`` is not positive.
    """
    if limit <= 0:
        return []
    try:
        anchors = await page.eval_on_selector_all(
            "footer a[href]",
            "(els) => els.map(a => ({text: (a.innerText||'').trim(), "
            "href: a.href}))",
        )
    except Exception as e:
        logger.warning("footer-anchor query failed: %s", e)
        return []
    seen: set[str] = set()
    out: list[dict] = []
    for a in anchors:
        href = (a.get("href") or "").strip()
        text = (a.get("text") or "").strip()
        if not href or not text:
            continue
        # Only real web pages can be visited by the walk.
        if not href.lower().startswith(("http://", "https://")):
            continue
        tl = text.lower()
        if not any(h in tl for h in _LINK_HINTS_LC):
            continue
        # Same page with a different #fragment counts as one link.
        key = href.split("#")[0]
        if key in seen:
            continue
        seen.add(key)
        out.append({"text": text[:80], "href": href})
        if len(out) >= limit:
            break
    return out
|
|
|
|
|
|
async def _visit_link(page, link: dict, dwell_s: float = 5.0) -> dict:
    """Open ``link["href"]``, wait *dwell_s* seconds, and report the visit.

    Navigation errors are not raised; they are recorded (truncated to 200
    characters) in the returned event's ``error`` field.

    Returns:
        A ``navigate`` action-event dict with the start timestamp, URL,
        anchor text, HTTP status (0 if unknown), page title (up to 120
        characters), and ``dwell_s`` — the wall time of the whole visit
        including navigation, rounded to two decimals.
    """
    event_ts = _ts()
    t0 = time.monotonic()
    http_status, page_title, failure = 0, "", ""
    try:
        response = await page.goto(
            link["href"], wait_until="domcontentloaded", timeout=20000,
        )
        if response is not None:
            http_status = response.status
        dwell_ms = int(dwell_s * 1000)
        await page.wait_for_timeout(dwell_ms)
        try:
            raw_title = await page.title()
            page_title = raw_title[:120]
        except Exception:
            # A missing title is not worth failing the visit for.
            pass
    except Exception as exc:
        failure = str(exc)[:200]
    event = {
        "timestamp": event_ts,
        "action": "navigate",
        "url": link["href"],
        "anchor_text": link["text"],
        "status": http_status,
        "title": page_title,
        "dwell_s": round(time.monotonic() - t0, 2),
        "error": failure or None,
    }
    return event
|
|
|
|
|
|
async def record_audit_walk(
    url: str, dwell_s: float = 5.0, max_links: int = 8,
) -> dict[str, Any]:
    """Run a full audit walk of *url* and record it as video.

    Opens *url* in headless WebKit (1280x800, locale ``de-DE``) with video
    recording enabled, tries to accept a consent banner, collects the
    compliance-relevant footer links and visits up to *max_links* of them.
    Afterwards the recording is renamed to ``video.webm`` and hashed with
    SHA-256, and the action index is written to ``walk.json`` next to it
    (both under ``WALK_ROOT/<walk_id>``).

    Args:
        url: Start page of the walk.
        dwell_s: Seconds to stay on each visited footer link.
        max_links: Maximum number of footer links to visit; values below
            zero are treated as zero.

    Returns:
        The walk document (also written to ``walk.json``) with ``walk_id``,
        ``url``, ``started_at``, ``completed_at``, ``error``, ``engine``,
        ``viewport``, ``actions`` and ``video``.  Failures during the walk
        are reported in ``error`` rather than raised.  If Playwright cannot
        be imported, only ``{"error": ...}`` is returned and nothing is
        written.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception as e:
        return {"error": f"playwright missing: {e}"}

    walk_id = uuid4().hex[:12]
    out_dir = Path(WALK_ROOT) / walk_id
    out_dir.mkdir(parents=True, exist_ok=True)
    actions: list[dict] = []
    started_at = _ts()
    err = None

    async with async_playwright() as p:
        browser = None
        context = None
        try:
            browser = await p.webkit.launch(headless=True)
            context = await browser.new_context(
                viewport={"width": 1280, "height": 800},
                record_video_dir=str(out_dir),
                record_video_size={"width": 1280, "height": 800},
                locale="de-DE",
            )
            page = await context.new_page()
            actions.append({
                "timestamp": _ts(), "action": "goto",
                "url": url,
            })
            try:
                resp = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=30000)
                actions[-1]["status"] = (resp.status if resp else 0)
            except Exception as e:
                # A failed homepage load is recorded; the walk continues.
                actions[-1]["error"] = str(e)[:200]
            await page.wait_for_timeout(2000)

            accept_event = await _try_accept_banner(page)
            actions.append(accept_event)

            links = await _collect_footer_links(page)
            # Clamp: a negative max_links would otherwise slice from the end.
            selected = links[:max(0, max_links)]
            actions.append({
                "timestamp": _ts(), "action": "discover_footer_links",
                "count": len(links), "links": selected,
            })

            for link in selected:
                ev = await _visit_link(page, link, dwell_s=dwell_s)
                actions.append(ev)
        except Exception as e:
            err = f"walk failed: {str(e)[:200]}"
            logger.exception("walk failed")
        finally:
            # Playwright finalizes the recording only when the context is
            # closed, so close on the failure path too — a failed walk is
            # exactly when the video is needed.
            for label, handle in (("context", context), ("browser", browser)):
                if handle is None:
                    continue
                try:
                    await handle.close()
                except Exception as close_exc:
                    logger.warning("%s close failed: %s", label, close_exc)
                    if err is None:
                        err = f"walk failed: {str(close_exc)[:200]}"

    completed_at = _ts()

    # Find the produced video file. Playwright writes the .webm under a
    # random name when the context closes; rename it for a stable path.
    video_meta: dict[str, Any] = {}
    try:
        candidates = sorted(out_dir.glob("*.webm"))
        if candidates:
            src = candidates[0]
            dest = out_dir / "video.webm"
            if src != dest:
                src.rename(dest)
            video_meta = {
                "filename": "video.webm",
                "size_bytes": dest.stat().st_size,
                "sha256": _sha256_file(dest),
            }
    except Exception as e:
        logger.warning("video rename failed: %s", e)

    walk_doc = {
        "walk_id": walk_id,
        "url": url,
        "started_at": started_at,
        "completed_at": completed_at,
        "error": err,
        "engine": "playwright/webkit",
        "viewport": "1280x800",
        "actions": actions,
        "video": video_meta,
    }
    try:
        # ensure_ascii=False emits raw non-ASCII (e.g. umlauts in titles);
        # pin UTF-8 so the file does not depend on the process locale.
        (out_dir / "walk.json").write_text(
            json.dumps(walk_doc, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning("walk.json write failed: %s", e)
    return walk_doc
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: walk the URL given as first CLI argument (or the
    # built-in default) and print the resulting walk document as JSON.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        url = cli_args[0]
    else:
        url = "https://www.elli.eco/de/startseite"
    out = asyncio.run(record_audit_walk(url))
    rendered = json.dumps(out, indent=2, ensure_ascii=False)
    print(rendered)
|