cb4b352846
Nimmt einen kompletten Site-Walk als WebKit-Browser-Session
inkl. Video auf. Reviewer kann nachträglich exakt nachvollziehen,
wie die Engine zum Befund kam.
consent-tester:
- services/audit_walk_recorder.py: Playwright record_video_dir,
iPhone-Viewport-free 1280×800. Goto homepage → Banner-Accept
(Best-Effort: 12 Text-Phrasen + 5 CMP-Fallback-Selektoren) →
Footer-Links sammeln (compliance-relevant gefiltert) →
pro Link navigate + Dwell-Time → JSON-Action-Index mit
UTC-Timestamps + SHA-256 vom Video als Manipulation-Schutz.
- routes_audit_walk.py: POST /scan-audit-walk; statische
Serves für /audit-walks/{walk_id}/video.webm + walk.json.
- main.py: Router registriert.
backend:
- _b17_wiring.py: Triggert /scan-audit-walk, speichert
Walk-Metadata in state["audit_walk"]. Render-Block mit
HTML-Tabelle aller Actions (HH:MM:SS + Aktion + Detail) +
Links zu Video und walk.json.
- _orchestrator.py: run_b17 nach run_b16, async-aufgerufen.
- mail_render_v2/_compose.py: audit_walk_html im V2-Layout.
- test_b17_audit_walk.py: 8 Tests (Render-Pfade + Wiring).
Stufe-2 (Akkordeon-Expansion) und Stufe-3 (DSMS-CID-Anchor)
folgen separat.
Real-World-Smoke gegen Elli:
- 581 KB Video, SHA-256 verifizierbar
- 3 Footer-Links besucht (Impressum, Datenschutzerkl., Nutzungs-)
- 6 Actions im JSON-Index
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
491 lines
17 KiB
Python
491 lines
17 KiB
Python
"""
Consent Tester Service — Playwright-based 3-phase cookie consent test.

Tests what scripts/cookies load BEFORE consent, AFTER rejection, and AFTER acceptance.
Runs as an independent microservice on port 8094.
"""
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
from fastapi import FastAPI
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel
|
|
|
|
from services.consent_scanner import run_consent_test, ConsentTestResult
|
|
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
|
|
from services.playwright_scanner import scan_website_playwright
|
|
from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult
|
|
from services.page_screenshot import (
|
|
capture_page_evidence,
|
|
capture_page_overlapping_slices,
|
|
)
|
|
from checks.banner_runner import map_scan_to_checks
|
|
|
|
# Process-wide logging: INFO and above, rendered as "LEVEL:logger-name: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s:%(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# The ASGI application every route in this module is attached to.
app = FastAPI(title="BreakPilot Consent Tester", version="1.0.0")

# NOTE(review): CORS is completely open here (any origin, method and header).
# Confirm that this is intended for every environment the service runs in.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
class ScanRequest(BaseModel):
    # Request body of POST /scan.

    # Address of the site to test.
    url: str
    # Seconds to wait after page load.
    timeout_per_phase: int = 10
    # Categories to test; an empty list means all categories are tested.
    categories: list[str] = []
|
|
|
|
|
|
class ScanResponse(BaseModel):
    # Response body of POST /scan.

    url: str
    banner_detected: bool
    banner_provider: str
    # Observations keyed by phase name (before_consent / after_reject / after_accept).
    phases: dict
    # Aggregated violation counters.
    summary: dict
    # ISO-8601 timestamp (UTC) taken when the response is assembled.
    scanned_at: str
    category_tests: list = []
    banner_checks: dict = {}
    structured_checks: list = []
    completeness_pct: int = 0
    correctness_pct: int = 0
    # Resolved TCF vendor list from GVL.
    tcf_vendors: list = []
    # P48: raw CMP JSON payloads (Usercentrics/OneTrust/...) captured during scan.
    cmp_payloads: list[dict] = []
    # P50: per-vendor extracts from the detail modal
    # (description / cookies / opt-out / privacy).
    vendor_details: list[dict] = []
    # P59b: full cookie details for behavior validation
    # (name, value, domain, expires, phase, declared_category).
    cookies_detailed: list[dict] = []
    # P85: base64-encoded PNG of the banner (initial view).
    banner_screenshot_b64: str = ""
|
|
|
|
|
|
# Route modules are imported here, mid-file, rather than in the top import block.
# NOTE(review): presumably this ordering avoids a circular import with the
# route modules — confirm before moving these lines to the top of the file.
from routes_matrix import router as matrix_router
from routes_mobile import router as mobile_router
from routes_cookie_matrix import router as cookie_matrix_router
from routes_audit_walk import router as audit_walk_router

# Attach the sub-routers to the application, in the order they were imported.
app.include_router(matrix_router)
app.include_router(mobile_router)
app.include_router(cookie_matrix_router)
app.include_router(audit_walk_router)
|
|
|
|
|
|
@app.get("/health")
async def health():
    # Liveness probe: always answers with a static payload naming this service.
    payload = {"status": "healthy", "service": "consent-tester"}
    return payload
|
|
|
|
|
|
@app.post("/scan", response_model=ScanResponse)
async def scan_consent(req: ScanRequest):
    """Run 3-phase consent test on a URL."""
    logger.info("Starting consent test for %s", req.url)
    result = await run_consent_test(req.url, req.timeout_per_phase, req.categories)

    def _dump(violations):
        # Expose each violation object as its plain attribute dict.
        return [item.__dict__ for item in violations]

    # Raw per-phase observations; reused for the structured check mapping and
    # returned unchanged in the response.
    before_phase = {
        "scripts": result.before_scripts,
        "cookies": result.before_cookies,
        "tracking_services": result.before_tracking,
        "violations": _dump(result.before_violations),
    }
    reject_phase = {
        "scripts": result.reject_scripts,
        "cookies": result.reject_cookies,
        "new_tracking": result.reject_new_tracking,
        "violations": _dump(result.reject_violations),
    }
    accept_phase = {
        "scripts": result.accept_scripts,
        "cookies": result.accept_cookies,
        "new_tracking": result.accept_new_tracking,
        "undocumented": result.accept_undocumented,
    }
    phases = {
        "before_consent": before_phase,
        "after_reject": reject_phase,
        "after_accept": accept_phase,
    }

    banner_checks_data = {
        "has_impressum_link": result.banner_has_impressum_link,
        "has_dse_link": result.banner_has_dse_link,
        "violations": _dump(result.banner_text_violations),
    }

    # Map to L1/L2 hierarchy.
    check_result = map_scan_to_checks(
        {
            "banner_detected": result.banner_detected,
            "banner_provider": result.banner_provider,
            "phases": phases,
            "banner_checks": banner_checks_data,
        }
    )

    # NOTE(review): "high" counts every before-consent violation regardless of
    # its severity, while "critical" only inspects reject-phase violations —
    # confirm that this weighting is intended.
    before_total = len(result.before_violations)
    reject_total = len(result.reject_violations)
    banner_text_total = len(result.banner_text_violations)
    critical_rejects = [v for v in result.reject_violations if v.severity == "CRITICAL"]
    high_banner_texts = [v for v in result.banner_text_violations if v.severity == "HIGH"]
    summary = {
        "critical": len(critical_rejects),
        "high": before_total + len(high_banner_texts),
        "undocumented": len(result.accept_undocumented),
        "total_violations": before_total + reject_total + banner_text_total,
        "category_violations": sum(len(ct.violations) for ct in result.category_tests),
        "categories_tested": len(result.category_tests),
        "banner_text_issues": banner_text_total,
    }

    # One flat dict per tested category; a falsy category_tests yields [].
    category_rows = [
        {
            "category": ct.category,
            "category_label": ct.category_label,
            "tracking_services": ct.tracking_services,
            "violations": ct.violations,
            # Attribute may be absent on the result object; default to False.
            "provider_details_visible": getattr(ct, "provider_details_visible", False),
            "cookies_set": ct.cookies_set,
        }
        for ct in (result.category_tests or [])
    ]

    return ScanResponse(
        url=req.url,
        banner_detected=result.banner_detected,
        banner_provider=result.banner_provider,
        tcf_vendors=result.tcf_vendors,
        phases=phases,
        summary=summary,
        banner_checks=banner_checks_data,
        structured_checks=check_result["structured_checks"],
        completeness_pct=check_result["completeness_pct"],
        correctness_pct=check_result["correctness_pct"],
        scanned_at=datetime.now(timezone.utc).isoformat(),
        category_tests=category_rows,
        cmp_payloads=result.cmp_payloads,  # P48
        vendor_details=result.vendor_details,  # P50
        cookies_detailed=result.cookies_detailed,  # P59b
        banner_screenshot_b64=result.banner_screenshot_b64,  # P85
    )
|
|
|
|
|
|
class AuthScanRequest(BaseModel):
    # Request body of POST /authenticated-scan.

    url: str
    # Login credentials, passed straight through to the authenticated test.
    username: str
    password: str
    # Selectors for the login form elements.
    # NOTE(review): presumably an empty string lets the scanner auto-detect the
    # element — confirm against run_authenticated_test.
    username_selector: str = ""
    password_selector: str = ""
    submit_selector: str = ""
|
|
|
|
|
|
class AuthCheckInfo(BaseModel):
    # Outcome of a single post-login check.

    # Whether the checked feature was found.
    found: bool = False
    # Text reported by the scanner for this check.
    text: str = ""
    # Legal provision the check relates to (taken from LEGAL_REFS).
    legal_ref: str = ""
|
|
|
|
|
|
class AuthScanResponse(BaseModel):
    # Response body of POST /authenticated-scan.

    url: str
    authenticated: bool
    login_error: str = ""
    # Check name -> outcome.
    checks: dict[str, AuthCheckInfo]
    # Number of checks whose feature was not found.
    findings_count: int
    # ISO-8601 timestamp (UTC) taken when the response is assembled.
    scanned_at: str
|
|
|
|
|
|
# Legal provision cited for each post-login check, keyed by check name.
LEGAL_REFS = dict(
    cancel_subscription="§312k BGB (Kuendigungsbutton)",
    delete_account="Art. 17 DSGVO (Recht auf Loeschung)",
    export_data="Art. 20 DSGVO (Datenportabilitaet)",
    consent_settings="Art. 7 Abs. 3 DSGVO (Widerruf der Einwilligung)",
    profile_visible="Art. 15 DSGVO (Auskunftsrecht)",
)
|
|
|
|
|
|
@app.post("/authenticated-scan", response_model=AuthScanResponse)
async def authenticated_scan(req: AuthScanRequest):
    """Test post-login functionality. Credentials are destroyed after test."""
    logger.info("Starting authenticated test for %s", req.url)

    result = await run_authenticated_test(
        url=req.url,
        username=req.username,
        password=req.password,
        username_selector=req.username_selector,
        password_selector=req.password_selector,
        submit_selector=req.submit_selector,
    )

    # Each name is both an attribute of the scanner result and a LEGAL_REFS key;
    # the tuple order fixes the key order of the response's "checks" dict.
    check_names = (
        "cancel_subscription",
        "delete_account",
        "export_data",
        "consent_settings",
        "profile_visible",
    )
    checks = {}
    for name in check_names:
        outcome = getattr(result, name)
        checks[name] = AuthCheckInfo(
            found=outcome.found,
            text=outcome.text,
            legal_ref=LEGAL_REFS[name],
        )

    # A finding is any check whose feature was not found.
    missing = len([info for info in checks.values() if not info.found])

    return AuthScanResponse(
        url=req.url,
        authenticated=result.authenticated,
        login_error=result.login_error,
        checks=checks,
        findings_count=missing,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# PLAYWRIGHT WEBSITE SCAN (Phase 10 — replaces httpx scanner)
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
class WebsiteScanRequest(BaseModel):
    # Request body of POST /website-scan.

    url: str
    # Page limit handed to the Playwright scanner.
    max_pages: int = 15
    # Forwarded to the scanner as its click_nav argument.
    click_nav: bool = True
|
|
|
|
|
|
class PageInfo(BaseModel):
    # Per-page entry of a website scan (the rendered HTML is carried separately).

    url: str
    # Status code the scanner recorded for the page.
    status: int
    title: str = ""
    error: str = ""
|
|
|
|
|
|
class WebsiteScanResponse(BaseModel):
    # Response body of POST /website-scan.

    url: str
    pages: list[PageInfo]
    pages_count: int
    external_scripts: list[str]
    cookies: list[str]
    # url -> rendered HTML (for backend analysis).
    page_htmls: dict[str, str]
    # ISO-8601 timestamp (UTC) taken when the response is assembled.
    scanned_at: str
|
|
|
|
|
|
@app.post("/website-scan", response_model=WebsiteScanResponse)
async def website_scan(req: WebsiteScanRequest):
    """Scan website using Playwright — discovers pages via JS navigation + menu clicks."""
    logger.info("Starting Playwright website scan for %s (max %d pages)", req.url, req.max_pages)

    scan = await scan_website_playwright(req.url, req.max_pages, req.click_nav)

    # Rendered HTML per URL, only for pages that have HTML and a status below
    # 400. Each entry is cut to the first 50,000 characters (a character count,
    # not bytes).
    html_by_url = {
        page.url: page.html[:50000]
        for page in scan.pages
        if page.html and page.status < 400
    }

    page_infos = [
        PageInfo(url=page.url, status=page.status, title=page.title, error=page.error)
        for page in scan.pages
    ]

    return WebsiteScanResponse(
        url=req.url,
        pages=page_infos,
        pages_count=len(scan.pages),
        # Only the first 50 external scripts are returned.
        external_scripts=scan.external_scripts[:50],
        cookies=scan.all_cookies,
        page_htmls=html_by_url,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════
|
|
# DSI DISCOVERY (finds all privacy + legal documents on a website)
|
|
# ═══════════════════════════════════════════════════════════════
|
|
|
|
class DSIDiscoveryRequest(BaseModel):
    # Request body of POST /dsi-discovery.

    url: str
    # Document limit handed to the discovery routine.
    max_documents: int = 30
|
|
|
|
|
|
class DSIDocumentInfo(BaseModel):
    # One discovered privacy/legal document.

    title: str
    url: str
    source_url: str
    language: str = ""
    doc_type: str = ""
    word_count: int = 0
    # Leading excerpt of the document text.
    text_preview: str = ""
    full_text: str = ""
    # D — tab-separated HTML tables taken from the DOM (e.g. cookie tables).
    # One array of rows per table; the cells of each row are tab-separated.
    # The backend uses this for deterministic cookie-table parsing.
    tables: list[list[str]] = []
|
|
|
|
|
|
class DSIDiscoveryResponse(BaseModel):
    # Response body of POST /dsi-discovery.

    url: str
    documents: list[DSIDocumentInfo]
    total_found: int
    languages_detected: list[str]
    errors: list[str]
    # ISO-8601 timestamp (UTC) taken when the response is assembled.
    scanned_at: str
    # Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
    # Backend uses these to build the per-vendor compliance table.
    cmp_payloads: list[dict] = []
|
|
|
|
|
|
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
async def dsi_discovery(req: DSIDiscoveryRequest):
    """Discover all privacy/data protection documents on a website.

    Generically finds DSI, AGB, Nutzungsbedingungen, Widerrufsbelehrung,
    Cookie-Richtlinien etc. regardless of website technology or language.
    Supports HTML pages, accordions, sidebars, PDFs, cross-domain links.
    """
    logger.info("Starting DSI discovery for %s (max %d docs)", req.url, req.max_documents)

    # Imported lazily, as in the rest of this module's optional dependencies.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        # Nested try/finally: the browser is closed even if creating the
        # context or the page fails, and even if closing the context raises.
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            try:
                page = await context.new_page()
                result = await discover_dsi_documents(page, req.url, req.max_documents)
            finally:
                await context.close()
        finally:
            await browser.close()

    documents = [
        DSIDocumentInfo(
            title=d.title,
            url=d.url,
            source_url=d.source_url,
            language=d.language,
            doc_type=d.doc_type,
            word_count=d.word_count,
            # Preview: first 500 characters; full text capped at 200,000 characters.
            text_preview=d.text[:500] if d.text else "",
            full_text=d.text[:200000] if d.text else "",
            # The attribute may be missing or None on a document; fall back to [].
            tables=getattr(d, "tables", []) or [],
        )
        for d in result.documents
    ]

    return DSIDiscoveryResponse(
        url=req.url,
        documents=documents,
        total_found=result.total_found,
        languages_detected=result.languages_detected,
        errors=result.errors,
        scanned_at=datetime.now(timezone.utc).isoformat(),
        cmp_payloads=result.cmp_payloads,
    )
|
|
|
|
|
|
# ── Evidence screenshot (full-page + timestamp) ─────────────────────
|
|
|
|
class EvidenceRequest(BaseModel):
    # Request body of POST /capture-evidence.

    url: str
    # Forwarded to the capture routine as its check_id argument.
    check_id: str = ""
|
|
|
|
|
|
class EvidenceResponse(BaseModel):
    # Response body of POST /capture-evidence.

    # Final URL after redirects.
    url: str
    captured_at: str
    width_px: int
    height_px: int
    accepted_banner: bool
    expanded: int
    # Base64 (ASCII) encoding of the PNG; empty when no image was captured.
    png_b64: str
    # Size of the raw PNG in bytes; 0 when no image was captured.
    png_size: int
|
|
|
|
|
|
@app.post("/capture-evidence", response_model=EvidenceResponse)
async def capture_evidence(req: EvidenceRequest):
    """Full-page screenshot with timestamp banner — for legal evidence.

    Used by backend to capture the Cookie-Richtlinie + DSE pages so the
    audit-mail ZIP-attachment contains the exact rendered DOM at scan time.
    """
    import base64 as _b64

    logger.info("Capturing evidence screenshot for %s", req.url)
    data = await capture_page_evidence(req.url, check_id=req.check_id)

    # The PNG may be empty or missing; report an empty string and size 0 then.
    png = data["png_bytes"]
    if png:
        encoded = _b64.b64encode(png).decode("ascii")
        size = len(png)
    else:
        encoded = ""
        size = 0

    return EvidenceResponse(
        url=data["url"],
        captured_at=data["captured_at"],
        width_px=data["width_px"],
        height_px=data["height_px"],
        accepted_banner=data["accepted_banner"],
        expanded=data["expanded"],
        png_b64=encoded,
        png_size=size,
    )
|
|
|
|
|
|
# ── Evidence slices (overlapping scrolling screenshots) ─────────────
|
|
|
|
class EvidenceSlicesRequest(BaseModel):
    # Request body of POST /capture-evidence-slices.

    url: str
    check_id: str = ""
    # Viewport height handed to the slice capture routine.
    viewport_h: int = 1024
    # Overlap between consecutive slices, in pixels.
    overlap_px: int = 200
    # Slice limit handed to the capture routine.
    max_slices: int = 40
|
|
|
|
|
|
class EvidenceSliceItem(BaseModel):
    # One screenshot slice, built directly from the capture routine's slice dicts.

    idx: int
    ts: str
    top_y: int
    bot_y: int
    sha256: str
    png_b64: str
    png_size: int
|
|
|
|
|
|
class EvidenceSlicesResponse(BaseModel):
    # Response body of POST /capture-evidence-slices.

    url: str
    total_height_px: int
    width_px: int
    accepted_banner: bool
    expanded: int
    slices: list[EvidenceSliceItem]
|
|
|
|
|
|
@app.post("/capture-evidence-slices", response_model=EvidenceSlicesResponse)
async def capture_evidence_slices(req: EvidenceSlicesRequest):
    """Overlapping viewport screenshots for a gapless chain of evidence.

    Each slice overlaps the previous one by overlap_px pixels — every cookie
    appears in at least one image, and at slice boundaries even in two.
    Deduplication by cookie name removes the duplicates from the final result.
    """
    logger.info("Capturing overlapping evidence slices for %s", req.url)

    data = await capture_page_overlapping_slices(
        req.url,
        check_id=req.check_id,
        viewport_h=req.viewport_h,
        overlap_px=req.overlap_px,
        max_slices=req.max_slices,
    )

    # Each slice dict is expanded straight into an EvidenceSliceItem.
    slice_items = [EvidenceSliceItem(**entry) for entry in data["slices"]]

    return EvidenceSlicesResponse(
        url=data["url"],
        total_height_px=data["total_height_px"],
        width_px=data["width_px"],
        accepted_banner=data["accepted_banner"],
        expanded=data["expanded"],
        slices=slice_items,
    )
|
|
|
|
|
|
# ── Admin: CMP discoveries (Phase E) ────────────────────────────────
|
|
|
|
@app.get("/cmp-discoveries")
async def cmp_discoveries(limit: int = 200):
    """List LLM-discovered CMP patterns (Phase E auto-promote log)."""
    from services.cmp_discovery_log import list_discoveries

    # The limit query parameter is forwarded to the log reader unchanged.
    entries = list_discoveries(limit=limit)
    return {"discoveries": entries}
|
|
|
|
|
|
@app.delete("/cmp-discoveries/{disc_id}")
async def cmp_discovery_delete(disc_id: int):
    """Delete a discovery + its auto-promoted module (rollback)."""
    from services.cmp_discovery_log import delete_discovery

    # The helper's return value is reported as-is under "deleted"; the
    # response echoes the requested id either way.
    was_deleted = delete_discovery(disc_id)
    return {"deleted": was_deleted, "id": disc_id}
|