Merge feat/zeroclaw-compliance-agent into main
Brings all compliance doc-check features:
- 162 regex checks + 1874 Master Controls
- LLM-agnostic agent with tool calling
- Banner check (46 checks, 30 CMPs, stealth, Shadow DOM)
- Impressum check (24 checks)
- Deep consent verification (DataLayer, GCM, TCF)
- CMP E2E tests (39 tests)
- HTML email reports, FAQ, persistent history

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,9 +23,13 @@ from compliance.services.mandatory_content_checker import (
|
||||
check_mandatory_documents, check_dse_mandatory_content, MandatoryFinding,
|
||||
)
|
||||
from compliance.services.legal_basis_validator import validate_legal_bases
|
||||
<<<<<<< HEAD
|
||||
from compliance.api.agent_scan_helpers import (
|
||||
add_corrections, build_scan_summary, fetch_dse_text, fetch_dse_html,
|
||||
)
|
||||
=======
|
||||
from compliance.api.agent_scan_helpers import add_corrections, build_scan_summary
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -79,7 +83,10 @@ class ScanFinding(BaseModel):
|
||||
severity: str
|
||||
text: str
|
||||
correction: str = ""
|
||||
<<<<<<< HEAD
|
||||
doc_title: str = ""
|
||||
=======
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
text_reference: TextReferenceModel | None = None
|
||||
|
||||
|
||||
@@ -219,17 +226,69 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
else:
|
||||
scan = await scan_website(req.url)
|
||||
|
||||
<<<<<<< HEAD
|
||||
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
|
||||
|
||||
_progress(f"Schritt 2/7: Rechtliche Dokumente suchen... ({len(scan.pages_scanned)} Seiten gescannt)")
|
||||
=======
|
||||
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
||||
playwright_htmls: dict[str, str] = {}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as pw_client:
|
||||
pw_resp = await pw_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/website-scan",
|
||||
json={"url": req.url, "max_pages": 15, "click_nav": True},
|
||||
)
|
||||
if pw_resp.status_code == 200:
|
||||
pw_data = pw_resp.json()
|
||||
playwright_htmls = pw_data.get("page_htmls", {})
|
||||
logger.info("Playwright scan: %d pages, %d scripts",
|
||||
pw_data.get("pages_count", 0), len(pw_data.get("external_scripts", [])))
|
||||
except Exception as e:
|
||||
logger.warning("Playwright scanner unavailable, falling back to httpx: %s", e)
|
||||
|
||||
# Use Playwright results if available, otherwise fall back to httpx scanner
|
||||
if playwright_htmls:
|
||||
# Build ScanResult from Playwright data
|
||||
from compliance.services.website_scanner import ScanResult, DetectedService, _detect_services, _detect_ai_mentions
|
||||
from compliance.services.service_registry import SERVICE_REGISTRY
|
||||
scan = ScanResult()
|
||||
scan.pages_scanned = list(playwright_htmls.keys())
|
||||
for page_url, html in playwright_htmls.items():
|
||||
_detect_services(html, page_url, scan)
|
||||
_detect_ai_mentions(html, page_url, scan)
|
||||
# Deduplicate
|
||||
seen = set()
|
||||
unique = []
|
||||
for svc in scan.detected_services:
|
||||
if svc.id not in seen:
|
||||
seen.add(svc.id)
|
||||
unique.append(svc)
|
||||
scan.detected_services = unique
|
||||
scan.chatbot_detected = any(s.category == "chatbot" for s in scan.detected_services)
|
||||
if scan.chatbot_detected:
|
||||
scan.chatbot_provider = next(s.name for s in scan.detected_services if s.category == "chatbot")
|
||||
else:
|
||||
scan = await scan_website(req.url)
|
||||
|
||||
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
|
||||
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
# Step 1b: DSI Discovery — find all legal documents on the website
|
||||
discovered_docs: list[DiscoveredDocument] = []
|
||||
dsi_findings: list[ScanFinding] = []
|
||||
try:
|
||||
<<<<<<< HEAD
|
||||
async with httpx.AsyncClient(timeout=300.0) as dsi_client:
|
||||
dsi_resp = await dsi_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/dsi-discovery",
|
||||
json={"url": req.url, "max_documents": 30},
|
||||
=======
|
||||
async with httpx.AsyncClient(timeout=180.0) as dsi_client:
|
||||
dsi_resp = await dsi_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/dsi-discovery",
|
||||
json={"url": req.url, "max_documents": 20},
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
)
|
||||
if dsi_resp.status_code == 200:
|
||||
dsi_data = dsi_resp.json()
|
||||
@@ -241,12 +300,17 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
)
|
||||
for doc in dsi_data.get("documents", []):
|
||||
doc_type = classify_document_type(doc["title"], doc["url"])
|
||||
<<<<<<< HEAD
|
||||
doc_text = doc.get("full_text", "") or doc.get("text_preview", "")
|
||||
logger.info("DSI check: '%s' type=%s text_len=%d full_text_len=%d preview_len=%d",
|
||||
doc["title"][:50], doc_type, len(doc_text),
|
||||
len(doc.get("full_text", "")), len(doc.get("text_preview", "")))
|
||||
doc_findings = check_document_completeness(
|
||||
doc_text, doc_type, doc["title"], doc["url"],
|
||||
=======
|
||||
doc_findings = check_document_completeness(
|
||||
doc.get("text_preview", ""), doc_type, doc["title"], doc["url"],
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
)
|
||||
# Count completeness
|
||||
score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
|
||||
@@ -268,6 +332,7 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
if "SCORE" not in df.get("code", ""):
|
||||
dsi_findings.append(ScanFinding(
|
||||
code=df["code"], severity=df["severity"], text=df["text"],
|
||||
<<<<<<< HEAD
|
||||
doc_title=doc["title"],
|
||||
))
|
||||
except Exception as e:
|
||||
@@ -296,6 +361,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
pass
|
||||
if not dse_text:
|
||||
dse_text = await fetch_dse_text(req.url, scan.pages_scanned)
|
||||
=======
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning("DSI discovery failed: %s", e)
|
||||
|
||||
# Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
|
||||
dse_text = ""
|
||||
for page_url, html in playwright_htmls.items():
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||
import re as _re
|
||||
clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
|
||||
clean = _re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = _re.sub(r"\s+", " ", clean).strip()
|
||||
dse_text = clean[:4000]
|
||||
break
|
||||
if not dse_text:
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
|
||||
# Step 3: Extract services mentioned in DSE via LLM + text fallback
|
||||
dse_services = await extract_dse_services(dse_text) if dse_text else []
|
||||
@@ -320,11 +403,18 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
dse_html = html
|
||||
break
|
||||
if not dse_html:
|
||||
<<<<<<< HEAD
|
||||
dse_html = await fetch_dse_html(req.url, scan.pages_scanned)
|
||||
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
|
||||
logger.info("Parsed %d DSE sections", len(dse_sections))
|
||||
|
||||
_progress("Schritt 4/7: SOLL/IST Vergleich...")
|
||||
=======
|
||||
dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
|
||||
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
|
||||
logger.info("Parsed %d DSE sections", len(dse_sections))
|
||||
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
# Step 5: SOLL/IST comparison
|
||||
detected_dicts = [_service_to_dict(s) for s in scan.detected_services]
|
||||
comparison = compare_services(detected_dicts, dse_services)
|
||||
@@ -363,7 +453,10 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
# Step 8c: Add DSI document findings
|
||||
findings.extend(dsi_findings)
|
||||
|
||||
<<<<<<< HEAD
|
||||
_progress(f"Schritt 5/7: Korrekturen generieren... ({len(findings)} Findings)")
|
||||
=======
|
||||
>>>>>>> feat/zeroclaw-compliance-agent
|
||||
# Step 9: Generate corrections for pre-launch mode
|
||||
if not is_live and findings:
|
||||
await add_corrections(findings, dse_text)
|
||||
@@ -400,6 +493,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
|
||||
|
||||
|
||||
async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
    """Fetch the raw HTML of the privacy policy page (for structured parsing).

    The first entry of ``scanned_pages`` whose URL contains "datenschutz",
    "privacy" or "dsgvo" (case-insensitive) is requested. If no scanned page
    matches, ``url`` itself is requested instead.

    Args:
        url: Fallback URL, used when no scanned page looks like a privacy policy.
        scanned_pages: URLs of the pages that were already scanned.

    Returns:
        The response body as text, or "" when the request fails or the server
        answers with an HTTP error status (>= 400).
    """
    import re

    # First scanned page that looks like a privacy policy, else the site URL.
    dse_url = next(
        (
            page for page in scanned_pages
            if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE)
        ),
        url,
    )
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
            if resp.status_code >= 400:
                # An error page is not a privacy policy; handing it to the
                # parser would yield bogus sections.
                logger.warning("DSE HTML fetch for %s returned HTTP %d", dse_url, resp.status_code)
                return ""
            return resp.text
    except Exception as e:
        # Best effort: callers treat "" as "no privacy policy HTML available",
        # but the reason is logged so failures are diagnosable.
        logger.warning("DSE HTML fetch for %s failed: %s", dse_url, e)
        return ""
|
||||
|
||||
|
||||
def _service_to_dict(svc: DetectedService) -> dict:
|
||||
return {
|
||||
"id": svc.id, "name": svc.name, "category": svc.category,
|
||||
|
||||
Reference in New Issue
Block a user