fix: Playwright user permission + etracker DSE matching + CMP skip
1. Dockerfile: install Playwright as appuser (not root) so the Chromium binary is accessible at runtime. Installing it as root was causing a 500 error.
2. DSE service matching: add a text-search fallback for when LLM extraction fails. If a detected service name (e.g. "etracker") appears in the DSE text, the service is marked as documented even if the LLM did not parse the service list.
3. CMP skip: consent managers in category "cmp" are now skipped as well (previously only category "other" with id "cmp" was skipped).
NOT DEPLOYED — the RAG pipeline is running.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -153,9 +153,21 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
if not dse_text:
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
|
||||
# Step 3: Extract services mentioned in DSE via LLM
|
||||
# Step 3: Extract services mentioned in DSE via LLM + text fallback
|
||||
dse_services = await extract_dse_services(dse_text) if dse_text else []
|
||||
logger.info("DSE mentions %d services", len(dse_services))
|
||||
logger.info("DSE mentions %d services (LLM)", len(dse_services))
|
||||
|
||||
# Fallback: if LLM extraction failed, search DSE text directly for service names
|
||||
if not dse_services and dse_text:
|
||||
dse_lower = dse_text.lower()
|
||||
detected_dicts_for_check = [_service_to_dict(s) for s in scan.detected_services]
|
||||
for svc in detected_dicts_for_check:
|
||||
name = svc.get("name", "").lower()
|
||||
# Check if service name appears in DSE text
|
||||
if name and len(name) > 3 and name in dse_lower:
|
||||
dse_services.append({"name": svc["name"], "purpose": "", "country": svc.get("country", ""), "legal_basis": ""})
|
||||
if dse_services:
|
||||
logger.info("DSE text fallback found %d services", len(dse_services))
|
||||
|
||||
# Step 4: Parse DSE into structured sections (prefer Playwright HTML)
|
||||
dse_html = ""
|
||||
|
||||
@@ -87,9 +87,10 @@ def compare_services(
|
||||
|
||||
for key, svc in detected_names.items():
|
||||
# Skip CMP — consent managers don't need DSE mention
|
||||
if svc.get("category") == "other" and svc.get("id") == "cmp":
|
||||
if svc.get("category") == "cmp" or (svc.get("category") == "other" and svc.get("id") == "cmp"):
|
||||
continue
|
||||
matched = False
|
||||
# Method 1: Match against LLM-extracted service list
|
||||
for dse_key, dse_svc in dse_names.items():
|
||||
if key == dse_key or _fuzzy_match(svc["name"], dse_svc["name"]):
|
||||
documented.append({"detected": svc, "dse": dse_svc, "status": "ok"})
|
||||
|
||||
Reference in New Issue
Block a user