Merge feat/zeroclaw-compliance-agent into main

Brings all compliance doc-check features:
- 162 regex checks + 1874 Master Controls
- LLM-agnostic agent with tool calling
- Banner check (46 checks, 30 CMPs, stealth, Shadow DOM)
- Impressum check (24 checks)
- Deep consent verification (DataLayer, GCM, TCF)
- CMP E2E tests (39 tests)
- HTML email reports, FAQ, persistent history

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-11 11:44:20 +02:00
parent e80bbe000f 2f0f76e365
commit 36c6101b91
175 changed files with 20063 additions and 1283 deletions
@@ -23,9 +23,13 @@ from compliance.services.mandatory_content_checker import (
    check_mandatory_documents, check_dse_mandatory_content, MandatoryFinding,
 )
 from compliance.services.legal_basis_validator import validate_legal_bases
+<<<<<<< HEAD
 from compliance.api.agent_scan_helpers import (
    add_corrections, build_scan_summary, fetch_dse_text, fetch_dse_html,
 )
+=======
+from compliance.api.agent_scan_helpers import add_corrections, build_scan_summary
+>>>>>>> feat/zeroclaw-compliance-agent

 logger = logging.getLogger(__name__)

@@ -79,7 +83,10 @@ class ScanFinding(BaseModel):
    severity: str
    text: str
    correction: str = ""
+<<<<<<< HEAD
    doc_title: str = ""
+=======
+>>>>>>> feat/zeroclaw-compliance-agent
    text_reference: TextReferenceModel | None = None


@@ -219,17 +226,69 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
    else:
        scan = await scan_website(req.url)

+<<<<<<< HEAD
    logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))

    _progress(f"Schritt 2/7: Rechtliche Dokumente suchen... ({len(scan.pages_scanned)} Seiten gescannt)")
+=======
+    # Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
+    playwright_htmls: dict[str, str] = {}
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as pw_client:
+            pw_resp = await pw_client.post(
+                "http://bp-compliance-consent-tester:8094/website-scan",
+                json={"url": req.url, "max_pages": 15, "click_nav": True},
+            )
+            if pw_resp.status_code == 200:
+                pw_data = pw_resp.json()
+                playwright_htmls = pw_data.get("page_htmls", {})
+                logger.info("Playwright scan: %d pages, %d scripts",
+                            pw_data.get("pages_count", 0), len(pw_data.get("external_scripts", [])))
+    except Exception as e:
+        logger.warning("Playwright scanner unavailable, falling back to httpx: %s", e)
+
+    # Use Playwright results if available, otherwise fall back to httpx scanner
+    if playwright_htmls:
+        # Build ScanResult from Playwright data
+        from compliance.services.website_scanner import ScanResult, DetectedService, _detect_services, _detect_ai_mentions
+        from compliance.services.service_registry import SERVICE_REGISTRY
+        scan = ScanResult()
+        scan.pages_scanned = list(playwright_htmls.keys())
+        for page_url, html in playwright_htmls.items():
+            _detect_services(html, page_url, scan)
+            _detect_ai_mentions(html, page_url, scan)
+        # Deduplicate
+        seen = set()
+        unique = []
+        for svc in scan.detected_services:
+            if svc.id not in seen:
+                seen.add(svc.id)
+                unique.append(svc)
+        scan.detected_services = unique
+        scan.chatbot_detected = any(s.category == "chatbot" for s in scan.detected_services)
+        if scan.chatbot_detected:
+            scan.chatbot_provider = next(s.name for s in scan.detected_services if s.category == "chatbot")
+    else:
+        scan = await scan_website(req.url)
+
+    logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
+
+>>>>>>> feat/zeroclaw-compliance-agent
    # Step 1b: DSI Discovery — find all legal documents on the website
    discovered_docs: list[DiscoveredDocument] = []
    dsi_findings: list[ScanFinding] = []
    try:
+<<<<<<< HEAD
        async with httpx.AsyncClient(timeout=300.0) as dsi_client:
            dsi_resp = await dsi_client.post(
                "http://bp-compliance-consent-tester:8094/dsi-discovery",
                json={"url": req.url, "max_documents": 30},
+=======
+        async with httpx.AsyncClient(timeout=180.0) as dsi_client:
+            dsi_resp = await dsi_client.post(
+                "http://bp-compliance-consent-tester:8094/dsi-discovery",
+                json={"url": req.url, "max_documents": 20},
+>>>>>>> feat/zeroclaw-compliance-agent
            )
            if dsi_resp.status_code == 200:
                dsi_data = dsi_resp.json()
@@ -241,12 +300,17 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
                )
                for doc in dsi_data.get("documents", []):
                    doc_type = classify_document_type(doc["title"], doc["url"])
+<<<<<<< HEAD
                    doc_text = doc.get("full_text", "") or doc.get("text_preview", "")
                    logger.info("DSI check: '%s' type=%s text_len=%d full_text_len=%d preview_len=%d",
                                doc["title"][:50], doc_type, len(doc_text),
                                len(doc.get("full_text", "")), len(doc.get("text_preview", "")))
                    doc_findings = check_document_completeness(
                        doc_text, doc_type, doc["title"], doc["url"],
+=======
+                    doc_findings = check_document_completeness(
+                        doc.get("text_preview", ""), doc_type, doc["title"], doc["url"],
+>>>>>>> feat/zeroclaw-compliance-agent
                    )
                    # Count completeness
                    score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
@@ -268,6 +332,7 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
                        if "SCORE" not in df.get("code", ""):
                            dsi_findings.append(ScanFinding(
                                code=df["code"], severity=df["severity"], text=df["text"],
+<<<<<<< HEAD
                                doc_title=doc["title"],
                            ))
    except Exception as e:
@@ -296,6 +361,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
        pass
    if not dse_text:
        dse_text = await fetch_dse_text(req.url, scan.pages_scanned)
+=======
+                            ))
+    except Exception as e:
+        logger.warning("DSI discovery failed: %s", e)
+
+    # Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
+    dse_text = ""
+    for page_url, html in playwright_htmls.items():
+        if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
+            import re as _re
+            clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
+            clean = _re.sub(r"<[^>]+>", " ", clean)
+            clean = _re.sub(r"\s+", " ", clean).strip()
+            dse_text = clean[:4000]
+            break
+    if not dse_text:
+        dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
+>>>>>>> feat/zeroclaw-compliance-agent

    # Step 3: Extract services mentioned in DSE via LLM + text fallback
    dse_services = await extract_dse_services(dse_text) if dse_text else []
@@ -320,11 +403,18 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
            dse_html = html
            break
    if not dse_html:
+<<<<<<< HEAD
        dse_html = await fetch_dse_html(req.url, scan.pages_scanned)
    dse_sections = parse_dse(dse_html, req.url) if dse_html else []
    logger.info("Parsed %d DSE sections", len(dse_sections))

    _progress("Schritt 4/7: SOLL/IST Vergleich...")
+=======
+        dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
+    dse_sections = parse_dse(dse_html, req.url) if dse_html else []
+    logger.info("Parsed %d DSE sections", len(dse_sections))
+
+>>>>>>> feat/zeroclaw-compliance-agent
    # Step 5: SOLL/IST comparison
    detected_dicts = [_service_to_dict(s) for s in scan.detected_services]
    comparison = compare_services(detected_dicts, dse_services)
@@ -363,7 +453,10 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
    # Step 8c: Add DSI document findings
    findings.extend(dsi_findings)

+<<<<<<< HEAD
    _progress(f"Schritt 5/7: Korrekturen generieren... ({len(findings)} Findings)")
+=======
+>>>>>>> feat/zeroclaw-compliance-agent
    # Step 9: Generate corrections for pre-launch mode
    if not is_live and findings:
        await add_corrections(findings, dse_text)
@@ -400,6 +493,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:



+async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
+    """Fetch the raw HTML of the privacy policy page (for structured parsing).
+
+    Picks the first scanned page whose URL matches datenschutz/privacy/dsgvo,
+    else ``url``. Returns "" when the request raises (the failure is logged).
+    """
+    import re
+    pattern = re.compile(r"datenschutz|privacy|dsgvo", re.IGNORECASE)
+    dse_url = next((page for page in scanned_pages if pattern.search(page)), url)
+    try:
+        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
+            resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
+            return resp.text
+    except Exception as e:
+        logger.warning("Fetching DSE HTML from %s failed: %s", dse_url, e)
+        return ""
+
+
 def _service_to_dict(svc: DetectedService) -> dict:
    return {
        "id": svc.id, "name": svc.name, "category": svc.category,