feat: Phase 10 — Playwright website scanner replaces httpx
New /website-scan endpoint in consent-tester service:
- Real browser renders JavaScript (finds dynamic content)
- Clicks navigation menus (discovers hidden sub-pages like IHK DSB page)
- Follows links within the DSE (Datenschutzerklärung / privacy policy) to find regional privacy policies
- Collects rendered HTML for each page (after JS execution)
Backend integration:
- agent_scan_routes tries Playwright first, falls back to httpx
- DSE text and HTML extracted from Playwright-rendered pages
- Service detection runs on rendered HTML (catches JS-loaded scripts)
Also fixes:
- GA measurement-ID regex tightened to G-[A-Z0-9]{8,12}, which prevents false positives on CSS class names
- etracker added to service registry
- External page scanning blocked (same-domain only)
- CSS/JS/image files excluded from page list
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -98,19 +98,73 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
"""Deep website scan: multi-page crawl + SOLL/IST service comparison."""
|
||||
is_live = req.mode == "post_launch"
|
||||
|
||||
# Step 1: Scan website (5-10 pages)
|
||||
scan = await scan_website(req.url)
|
||||
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
||||
playwright_htmls: dict[str, str] = {}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as pw_client:
|
||||
pw_resp = await pw_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/website-scan",
|
||||
json={"url": req.url, "max_pages": 15, "click_nav": True},
|
||||
)
|
||||
if pw_resp.status_code == 200:
|
||||
pw_data = pw_resp.json()
|
||||
playwright_htmls = pw_data.get("page_htmls", {})
|
||||
logger.info("Playwright scan: %d pages, %d scripts",
|
||||
pw_data.get("pages_count", 0), len(pw_data.get("external_scripts", [])))
|
||||
except Exception as e:
|
||||
logger.warning("Playwright scanner unavailable, falling back to httpx: %s", e)
|
||||
|
||||
# Use Playwright results if available, otherwise fall back to httpx scanner
|
||||
if playwright_htmls:
|
||||
# Build ScanResult from Playwright data
|
||||
from compliance.services.website_scanner import ScanResult, DetectedService, _detect_services, _detect_ai_mentions
|
||||
from compliance.services.service_registry import SERVICE_REGISTRY
|
||||
scan = ScanResult()
|
||||
scan.pages_scanned = list(playwright_htmls.keys())
|
||||
for page_url, html in playwright_htmls.items():
|
||||
_detect_services(html, page_url, scan)
|
||||
_detect_ai_mentions(html, page_url, scan)
|
||||
# Deduplicate
|
||||
seen = set()
|
||||
unique = []
|
||||
for svc in scan.detected_services:
|
||||
if svc.id not in seen:
|
||||
seen.add(svc.id)
|
||||
unique.append(svc)
|
||||
scan.detected_services = unique
|
||||
scan.chatbot_detected = any(s.category == "chatbot" for s in scan.detected_services)
|
||||
if scan.chatbot_detected:
|
||||
scan.chatbot_provider = next(s.name for s in scan.detected_services if s.category == "chatbot")
|
||||
else:
|
||||
scan = await scan_website(req.url)
|
||||
|
||||
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
|
||||
|
||||
# Step 2: Fetch privacy policy text for SOLL extraction
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
# Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
|
||||
dse_text = ""
|
||||
for page_url, html in playwright_htmls.items():
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||
import re as _re
|
||||
clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
|
||||
clean = _re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = _re.sub(r"\s+", " ", clean).strip()
|
||||
dse_text = clean[:4000]
|
||||
break
|
||||
if not dse_text:
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
|
||||
# Step 3: Extract services mentioned in DSE via LLM
|
||||
dse_services = await extract_dse_services(dse_text) if dse_text else []
|
||||
logger.info("DSE mentions %d services", len(dse_services))
|
||||
|
||||
# Step 4: Parse DSE into structured sections
|
||||
dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
|
||||
# Step 4: Parse DSE into structured sections (prefer Playwright HTML)
|
||||
dse_html = ""
|
||||
for page_url, html in playwright_htmls.items():
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||
dse_html = html
|
||||
break
|
||||
if not dse_html:
|
||||
dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
|
||||
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
|
||||
logger.info("Parsed %d DSE sections", len(dse_sections))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user