fix: Playwright user permission + etracker DSE matching + CMP skip

1. Dockerfile: install Playwright as appuser (not root) so the Chromium
   binary is accessible at runtime. The inaccessible binary was causing a
   500 error.
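2. DSE service matching: text-search fallback when LLM extraction fails
   or returns no services. If the name of a detected service (e.g.
   "etracker"; names of 3 characters or fewer are ignored) appears in the
   DSE text, mark it as documented even without the LLM parsing the
   service list.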
3. CMP skip: consent managers in category "cmp" are now skipped too
   (previously only category "other" with id "cmp" was skipped).

NOT DEPLOYED — RAG pipeline is running.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-29 19:36:46 +02:00
parent cedc5de15d
commit 58957a4aaa
3 changed files with 25 additions and 4 deletions
@@ -153,9 +153,21 @@ async def scan_website_endpoint(req: ScanRequest):
if not dse_text:
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
# Step 3: Extract services mentioned in DSE via LLM
# Step 3: Extract services mentioned in DSE via LLM + text fallback
dse_services = await extract_dse_services(dse_text) if dse_text else []
logger.info("DSE mentions %d services", len(dse_services))
logger.info("DSE mentions %d services (LLM)", len(dse_services))
# Fallback: if LLM extraction failed, search DSE text directly for service names
if not dse_services and dse_text:
dse_lower = dse_text.lower()
detected_dicts_for_check = [_service_to_dict(s) for s in scan.detected_services]
for svc in detected_dicts_for_check:
name = svc.get("name", "").lower()
# Check if service name appears in DSE text
if name and len(name) > 3 and name in dse_lower:
dse_services.append({"name": svc["name"], "purpose": "", "country": svc.get("country", ""), "legal_basis": ""})
if dse_services:
logger.info("DSE text fallback found %d services", len(dse_services))
# Step 4: Parse DSE into structured sections (prefer Playwright HTML)
dse_html = ""
@@ -87,9 +87,10 @@ def compare_services(
for key, svc in detected_names.items():
# Skip CMP — consent managers don't need DSE mention
if svc.get("category") == "other" and svc.get("id") == "cmp":
if svc.get("category") == "cmp" or (svc.get("category") == "other" and svc.get("id") == "cmp"):
continue
matched = False
# Method 1: Match against LLM-extracted service list
for dse_key, dse_svc in dse_names.items():
if key == dse_key or _fuzzy_match(svc["name"], dse_svc["name"]):
documented.append({"detected": svc, "dse": dse_svc, "status": "ok"})