Files
breakpilot-compliance/backend-compliance/compliance/api/agent_check/_b17_wiring.py
T
Benjamin Admin d6b8bf87c2
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / test-python-backend (push) Successful in 29s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 13s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
fix: 4 Bugs gemeinsam — B22 PDF + B17 Walk-Fallback + company_name + Plausibility-Fallback
(1) B22 Cross-Domain (fix #59):
  Elli-Test fand AGB auf logpay.de NICHT obwohl URL in doc_entries
  korrekt. Vermutete Ursache: Discovery-Phase A drops/überschreibt
  Original-URL bei PDF-Fetch-Fail (word_count=0).
  Fix: _collect_audit_urls() iteriert über state.doc_entries +
  rejected_url + req.documents — Cross-Domain-Hosting ist
  unabhängig vom Text-Inhalt. Plus Trace-Logging für künftige
  Diagnose. Dedup per (doc_type, host_sld).

(2) B17 Audit-Walk-Fail-Fallback (fix #60):
  BMW v5 hatte audit_walk=None ohne Mail-Hinweis. Vermutlich
  180s-Timeout bei OneTrust-CMP-Banner-Tour.
  Fix: Timeout 180s → 300s. Plus: Bei Fail wird ein Hinweis-
  Stub mit error-Grund in state["audit_walk"] + HTML-Block
  geschrieben — Reviewer sieht den Fail statt silent-skip.

(3) company_name + origin_domain im Backend (fix #61):
  Frontend sendet seit ec03317 die zwei Felder — Backend ignorierte
  sie.
  Fix: ComplianceCheckRequest-Schema um company_name +
  origin_domain erweitert. phase_e_email priorisiert User-Input
  vor URL-Heuristik für site_name. Bei origin_domain ohne
  ableitbare doc_entries-domain wird der User-Input als domain
  übernommen.

(4) Plausibility-LLM Fallback-Modell (fix #62):
  qwen3:30b-a3b liefert auf großen DSEs (BMW 122 FAIL) gehäuft
  leere format='json'-Responses — der Circuit-Breaker griff zwar,
  aber die Phase blieb nutzlos.
  Fix: Default-Modell auf qwen2.5:7b umgestellt (4× kleiner,
  zuverlässiger bei format=json, ausreichendes Reasoning für
  PASS/MODIFY/DROP-Klassifikation). Plus Strategy-C eingeführt
  — Fallback-Modell (llama3.2:3b) wenn primary leer bleibt.
  BATCH_SIZE 4 → 3. ENV-Switches PLAUSIBILITY_LLM_MODEL +
  PLAUSIBILITY_FALLBACK_MODEL für Tuning.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-08 16:39:33 +02:00

253 lines
10 KiB
Python

"""B17 wiring — audit-walk recorder.
Triggers a complete Playwright site walk with video recording on the
consent-tester. Result: video + JSON action index with timestamps +
SHA-256 hash as tamper protection.
Only the walk metadata + video URL are stored in state. The actual
file body stays in the consent-tester volume (stage 1). Stage 3 will
upload the video to DSMS-IPFS and embed the CID here.
"""
from __future__ import annotations
import html
import logging
import os
from urllib.parse import urlparse
import httpx
from ._constants import CONSENT_TESTER_URL
logger = logging.getLogger(__name__)
# Optional override for the public IPFS gateway URL. DSMS internally
# returns http://dsms-node:8080/ipfs/{cid}, but reviewers reading the
# mail need a URL that is reachable from outside.
_DEFAULT_DSMS_GATEWAY = "https://dsms-dev.breakpilot.ai"
DSMS_PUBLIC_GATEWAY = os.getenv("DSMS_PUBLIC_GATEWAY", _DEFAULT_DSMS_GATEWAY)
def _publicize_gateway_url(internal_url: str) -> str:
    """Swap the cluster-internal DSMS host for the public gateway.

    Both internal base URLs (``http://dsms-node:8080`` and
    ``http://bp-compliance-dsms-node:8080``) are replaced by
    ``DSMS_PUBLIC_GATEWAY``. URLs containing neither are returned
    unchanged; a falsy input yields an empty string.
    """
    if internal_url:
        public_url = internal_url
        # Same replacement order as the two known internal host names.
        for internal_base in (
            "http://dsms-node:8080",
            "http://bp-compliance-dsms-node:8080",
        ):
            public_url = public_url.replace(internal_base, DSMS_PUBLIC_GATEWAY)
        return public_url
    return ""
def _first_homepage(req) -> str:
    """Return ``scheme://host/`` of the first absolute document URL, else ``""``."""
    for doc in (getattr(req, "documents", None) or []):
        url = getattr(doc, "url", None)
        if not url:
            continue
        try:
            parsed = urlparse(url)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): try the next one
            # instead of aborting the whole phase.
            continue
        if parsed.scheme and parsed.netloc:
            return f"{parsed.scheme}://{parsed.netloc}/"
    return ""


def _walk_failure_html(homepage: str, reason: str) -> str:
    """Render the warning block shown when no audit walk could be recorded."""
    # Both values are untrusted text (user-supplied URL, exception message),
    # so they are HTML-escaped before being embedded in the report.
    return (
        "<div style='margin:24px 0;padding:16px;border-left:4px solid #f59e0b;"
        "background:#fef3c7;border-radius:4px;'>"
        "<h2 style='margin:0 0 8px;color:#92400e;font-size:16px;'>"
        "⚠️ Audit-Walk konnte nicht aufgezeichnet werden"
        "</h2>"
        "<p style='margin:0;font-size:13px;color:#92400e;'>"
        f"Site: <code>{html.escape(homepage)}</code> · Ursache: "
        f"<code>{html.escape(reason)}</code>. Mögliche "
        "Gründe: komplexes CMP-Banner (lange Tour-Zeit), Anti-Bot-"
        "Protection, oder consent-tester überlastet.</p>"
        "</div>"
    )


def _findings_for_annotation(state: dict) -> list:
    """Collect the findings for which annotated screenshots are requested."""
    selected = []
    reachability = state.get("reachability_finding")
    if reachability and not reachability.get("passed", True):
        selected.append({
            "check_id": "COOKIE-CONSENT-UX-001",
            "mobile_playwright": reachability.get("mobile_playwright") or {},
        })
    for finding in (state.get("extra_findings") or []):
        check_id = (finding.get("check_id") or "").upper()
        if check_id in ("URL-SLUG-DRIFT-001", "WIDERRUF-REACH-001"):
            selected.append(finding)
    return selected


async def run_b17(state: dict) -> None:
    """Trigger the walk recording and store its metadata in ``state``.

    The homepage is derived from the first document in ``state["req"]``
    that has an absolute URL; without a request or such a URL the function
    returns without touching ``state``.

    On success ``state["audit_walk"]`` holds the consent-tester response
    (plus an ``annotations`` list) and ``state["audit_walk_html"]`` the
    rendered report block. On failure both keys are still written: a stub
    dict carrying an ``error`` reason and a warning block, so the failure
    is visible in the report instead of being skipped silently.

    Args:
        state: Mutable pipeline state; reads ``req``,
            ``reachability_finding`` and ``extra_findings``.
    """
    req = state.get("req")
    if req is None:
        return
    homepage = _first_homepage(req)
    if not homepage:
        return

    walk: dict = {}
    walk_error = None
    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/scan-audit-walk",
                json={"url": homepage, "dwell_s": 4.0, "max_links": 8},
                timeout=300.0,
            )
        if resp.status_code == 200:
            payload = resp.json()
            if isinstance(payload, dict):
                walk = payload
            else:
                # A list/scalar body would otherwise crash on .get() below.
                walk_error = "consent-tester returned non-object JSON"
        else:
            walk_error = f"consent-tester HTTP {resp.status_code}"
    except Exception as e:  # best-effort phase: any failure becomes a stub
        walk_error = f"{type(e).__name__}: {str(e)[:120]}"

    if not walk.get("walk_id"):
        # Fallback stub so the audit report gets a hint instead of
        # "audit_walk: None"; the reviewer sees the failure.
        logger.warning(
            "B17 audit-walk request failed: %s",
            walk_error or "unknown (no walk_id returned)",
        )
        state["audit_walk"] = {
            "walk_id": "",
            "url": homepage,
            "video": {},
            "actions": [],
            "annotations": [],
            "error": walk_error or "unknown (no walk_id returned)",
        }
        state["audit_walk_html"] = _walk_failure_html(
            homepage, walk_error or "unknown",
        )
        return

    # Stage 5: annotated screenshots per finding. Sends the collected
    # findings (B1 mobile + B16 slug drift + B13 withdrawal) to the
    # consent-tester, which produces one PNG per finding.
    annotations: list = []
    try:
        findings_for_annot = _findings_for_annotation(state)
        if findings_for_annot:
            async with httpx.AsyncClient(timeout=120.0) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/annotate-findings",
                    json={"findings": findings_for_annot,
                          "home_url": homepage},
                    timeout=120.0,
                )
            if resp.status_code == 200:
                annotations = (resp.json() or {}).get("annotations") or []
                logger.info(
                    "B17 annotations: %d Screenshots erzeugt",
                    len(annotations),
                )
            else:
                logger.warning(
                    "annotate-findings request failed: HTTP %s",
                    resp.status_code,
                )
    except Exception as e:  # annotations are optional; never fail the walk
        logger.warning("annotate-findings request failed: %s", e)

    walk["annotations"] = annotations
    state["audit_walk"] = walk
    state["audit_walk_html"] = _render(walk)
    video = walk.get("video") or {}
    logger.info(
        "B17 audit-walk: %s · %d actions · video %d bytes · sha256 %s",
        walk.get("walk_id"),
        len(walk.get("actions") or []),
        video.get("size_bytes") or 0,  # None would break the %d format
        (video.get("sha256") or "")[:12],
    )
def _video_link(walk_id: str) -> str:
    """Build the URL of the recorded walk video on the consent-tester.

    The link only resolves for a reviewer who can reach the
    consent-tester host.
    """
    return "{}/audit-walks/{}/video.webm".format(CONSENT_TESTER_URL, walk_id)
def _render(walk: dict) -> str:
    """Render the HTML report block for a recorded audit walk.

    The block consists of a header with links to the video and
    ``walk.json``, an optional DSMS anchor line (only when at least one
    content ID is present) and a table with one row per recorded action.
    All interpolated text values are HTML-escaped.
    """
    esc = html.escape
    walk_id = walk.get("walk_id") or ""
    video_meta = walk.get("video") or {}
    steps = walk.get("actions") or []

    visited_pages = len([s for s in steps if s.get("action") == "navigate"])
    sha_prefix = (video_meta.get("sha256") or "")[:12]
    kilobytes = round((video_meta.get("size_bytes") or 0) / 1024, 1)
    video_href = _video_link(walk_id)
    json_href = f"{CONSENT_TESTER_URL}/audit-walks/{walk_id}/walk.json"

    def cid_markup(cid: str, gateway: str) -> str:
        # With a gateway URL the (shortened) CID becomes a link,
        # otherwise the full CID is shown as plain code.
        if gateway:
            return (
                f"<a href='{esc(gateway)}' style='color:#0369a1;'>"
                f"<code>{esc(cid[:20])}…</code></a>"
            )
        return f"<code>{esc(cid)}</code>"

    # Stage 3: DSMS anchor
    video_anchor = video_meta.get("dsms") or {}
    json_anchor = walk.get("walk_json_dsms") or {}
    anchors = (
        (
            "Video-CID: ",
            video_anchor.get("cid") or "",
            _publicize_gateway_url(video_anchor.get("gateway_url") or ""),
        ),
        (
            "walk.json-CID: ",
            json_anchor.get("cid") or "",
            _publicize_gateway_url(json_anchor.get("gateway_url") or ""),
        ),
    )
    anchor_parts = [
        label + cid_markup(cid, gateway)
        for label, cid, gateway in anchors
        if cid
    ]
    dsms_block = ""
    if anchor_parts:
        dsms_block = "".join([
            "<p style='margin:0 0 8px;padding:6px 10px;background:#fef3c7;",
            "border-radius:4px;font-size:12px;color:#78350f;'>",
            "<strong>🔒 DSMS-Anchor (manipulationssicher):</strong> ",
            " · ".join(anchor_parts),
            "</p>",
        ])

    def describe(step: dict) -> str:
        # Human-readable detail text per action type; unknown types get "".
        kind = step.get("action") or ""
        if kind in ("goto", "navigate"):
            text = (step.get("url") or "")[:120]
            if step.get("status"):
                text += f" → HTTP {step['status']}"
            return text
        if kind == "accept_banner":
            if (step.get("result") or "") != "clicked":
                return "Kein Accept-Button gefunden"
            return f"Banner akzeptiert ({step.get('phrase') or step.get('selector') or ''})"
        if kind == "discover_footer_links":
            return f"{step.get('count', 0)} Compliance-Links im Footer"
        if kind == "expand_accordions":
            expanded = step.get("expanded", 0)
            if not expanded:
                return "Keine Akkordeons gefunden"
            return f"{expanded} Akkordeon/Details-Sektion(en) entfaltet"
        if kind == "tour_cookie_banner":
            clicks = step.get("clicks", 0)
            if step.get("settings_opened"):
                settings = "Settings geöffnet"
            else:
                settings = "kein Settings-Trigger gefunden"
            return f"Cookie-Banner-Tour: {clicks} Klicks ({settings})"
        return ""

    def table_row(step: dict) -> str:
        clock = (step.get("timestamp") or "")[11:19]  # HH:MM:SS
        kind = step.get("action") or ""
        detail = describe(step)
        return (
            "<tr><td style='padding:4px 8px;font-family:monospace;"
            f"color:#475569;'>{esc(clock)}</td>"
            f"<td style='padding:4px 8px;'>{esc(kind)}</td>"
            "<td style='padding:4px 8px;color:#475569;'>"
            f"{esc(detail)}</td></tr>"
        )

    table_body = "".join([table_row(step) for step in steps])

    fragments = [
        "<div style='margin:24px 0;padding:16px;border-left:4px solid #0ea5e9;",
        "background:#f0f9ff;border-radius:4px;'>",
        "<h2 style='margin:0 0 8px;color:#0c4a6e;font-size:16px;'>",
        "🎥 Audit-Walk-Video (Beweis-Aufzeichnung)",
        "</h2>",
        "<p style='margin:0 0 8px;font-size:13px;color:#475569;'>",
        "<strong>Video:</strong> ",
        f"<a href='{esc(video_href)}' style='color:#0369a1;'>video.webm</a> ",
        f"({kilobytes} KB, SHA-256 <code>{esc(sha_prefix)}…</code>) · ",
        "<strong>Metadata:</strong> ",
        f"<a href='{esc(json_href)}' style='color:#0369a1;'>walk.json</a>",
        "</p>",
        "<p style='margin:0 0 8px;font-size:13px;color:#475569;'>",
        f"{visited_pages} Compliance-Seiten besucht, jede 4 Sek ",
        "verweilt — Reviewer kann den Audit-Walk nachverfolgen.",
        "</p>",
        dsms_block,
        "<table style='font-size:12px;width:100%;border-collapse:collapse;",
        "background:#fff;border-radius:4px;'>",
        "<thead><tr style='background:#e0f2fe;'>",
        "<th style='padding:6px 8px;text-align:left;'>Zeit (UTC)</th>",
        "<th style='padding:6px 8px;text-align:left;'>Aktion</th>",
        "<th style='padding:6px 8px;text-align:left;'>Detail</th>",
        "</tr></thead><tbody>",
        table_body,
        "</tbody></table>",
        "</div>",
    ]
    return "".join(fragments)