feat: Phase 10 — Playwright website scanner replaces httpx

New /website-scan endpoint in consent-tester service:
- Real browser renders JavaScript (finds dynamic content)
- Clicks navigation menus (discovers hidden sub-pages like IHK DSB page)
- Follows links within DSE to find regional privacy policies
- Collects rendered HTML for each page (after JS execution)

Backend integration:
- agent_scan_routes tries Playwright first, falls back to httpx
- DSE text and HTML extracted from Playwright-rendered pages
- Service detection runs on rendered HTML (catches JS-loaded scripts)

Also fixes:
- GA regex: G-[A-Z0-9]{8,12} prevents CSS class false positives
- etracker added to service registry
- External page scanning blocked (same-domain only)
- CSS/JS/image files excluded from page list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-29 19:16:50 +02:00
parent 5eeef3a9c3
commit cedc5de15d
3 changed files with 367 additions and 6 deletions
+52
View File
@@ -14,6 +14,7 @@ from pydantic import BaseModel
from services.consent_scanner import run_consent_test, ConsentTestResult
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
from services.playwright_scanner import scan_website_playwright
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
logger = logging.getLogger(__name__)
@@ -172,3 +173,54 @@ async def authenticated_scan(req: AuthScanRequest):
findings_count=missing,
scanned_at=datetime.now(timezone.utc).isoformat(),
)
# ═══════════════════════════════════════════════════════════════
# PLAYWRIGHT WEBSITE SCAN (Phase 10 — replaces httpx scanner)
# ═══════════════════════════════════════════════════════════════
class WebsiteScanRequest(BaseModel):
    """Request body for POST /website-scan (Playwright-based crawl)."""
    url: str                # Entry URL where the crawl starts
    max_pages: int = 15     # Upper bound on the number of pages visited
    click_nav: bool = True  # If True, click navigation menus to discover sub-pages not linked directly
class PageInfo(BaseModel):
    """Per-page summary returned to the client (HTML is delivered separately in page_htmls)."""
    url: str         # Final URL of the visited page
    status: int      # HTTP status code observed for the page
    title: str = ""  # Rendered document title, empty when unavailable
    error: str = ""  # Error description when the page failed to load, empty on success
class WebsiteScanResponse(BaseModel):
    """Response body for POST /website-scan."""
    url: str                     # Echo of the requested entry URL
    pages: list[PageInfo]        # Summary of every page visited during the crawl
    pages_count: int             # len(pages), provided for client convenience
    external_scripts: list[str]  # External script URLs found in rendered pages (capped by the handler)
    cookies: list[str]           # Cookie names collected across the whole scan
    page_htmls: dict[str, str]   # url -> rendered HTML (for backend analysis)
    scanned_at: str              # ISO-8601 UTC timestamp of when the scan finished
@app.post("/website-scan", response_model=WebsiteScanResponse)
async def website_scan(req: WebsiteScanRequest):
    """Run a Playwright-driven crawl of *req.url*.

    Pages are discovered through rendered links and (optionally) by clicking
    navigation menus; the response carries per-page summaries plus the rendered
    HTML of each successfully loaded page for downstream analysis.
    """
    logger.info("Starting Playwright website scan for %s (max %d pages)", req.url, req.max_pages)
    result = await scan_website_playwright(req.url, req.max_pages, req.click_nav)

    # Keep rendered HTML only for pages that actually loaded (status < 400),
    # truncated to 50KB each so the response payload stays bounded.
    rendered_html = {
        page.url: page.html[:50000]
        for page in result.pages
        if page.html and page.status < 400
    }

    summaries = [
        PageInfo(url=page.url, status=page.status, title=page.title, error=page.error)
        for page in result.pages
    ]

    return WebsiteScanResponse(
        url=req.url,
        pages=summaries,
        pages_count=len(summaries),
        external_scripts=result.external_scripts[:50],  # cap script list defensively
        cookies=result.all_cookies,
        page_htmls=rendered_html,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )