Files
breakpilot-compliance/backend-compliance/compliance/api/agent_analyze_routes.py
T
Benjamin Admin c5b22e0c99 fix: derive intake flags from DETECTED SERVICES, not from text content
Fundamental architecture fix: data processing happens through APIs/scripts/
cookies — NOT through visible page text. A news site about healthcare does
NOT process health data.

Before: Qwen reads website text → guesses "health_data: true" (WRONG)
After: Google Analytics detected → tracking: true (CORRECT, deterministic)

New flow: detect services from HTML → map service categories to flags →
feed flags into UCCA assessment. No LLM needed for flag extraction.

SERVICE_TO_FLAGS maps categories: tracking→tracking, marketing→marketing+
third_party_sharing, payment→payment_data, heatmap→profiling, etc.
SPECIFIC_SERVICE_FLAGS for Klarna (Art.22), Stripe (US transfer), etc.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:37:51 +02:00

318 lines
13 KiB
Python

"""
Agent Analyze Routes — combined endpoint that fetches a URL, classifies it,
assesses DSGVO compliance, and sends a notification email.
POST /api/compliance/agent/analyze
"""
import logging
import re
import os
from datetime import datetime, timezone
import httpx
from fastapi import APIRouter
from pydantic import BaseModel
from compliance.services.smtp_sender import send_email
from compliance.services.intake_extractor import extract_intake_flags_from_services, flags_to_ucca_intake
from compliance.services.relevance_filter import filter_controls
from compliance.services.website_compliance_checks import (
check_website_compliance as _check_website_compliance,
FollowUpQuestion,
to_string_list as _to_string_list,
risk_to_escalation as _risk_to_escalation,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Every route declared in this module is mounted under /compliance/agent.
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# Base URL of the AI SDK service; can be overridden with the AI_SDK_URL
# environment variable.
SDK_URL = os.environ.get("AI_SDK_URL", "http://bp-compliance-ai-sdk:8090")
# Tenant and user identifiers sent with every SDK request (see SDK_HEADERS).
# NOTE(review): both values are hard-coded here — confirm whether they should
# come from configuration or from the authenticated request instead.
TENANT_ID = "9282a473-5c95-4b3a-bf78-0ecc0ec71d3e"
USER_ID = "00000000-0000-0000-0000-000000000001"
# Maps an escalation level code to the role reported as responsible in the
# API response and in the notification email.
ESCALATION_ROLES = {
    "E0": "Kein Handlungsbedarf",
    "E1": "Teamleitung Datenschutz",
    "E2": "Datenschutzbeauftragter (DSB)",
    "E3": "DSB + Rechtsabteilung",
}
# Headers attached to every call made to the SDK service.
SDK_HEADERS = {
    "Content-Type": "application/json",
    "X-Tenant-ID": TENANT_ID,
    "X-User-ID": USER_ID,
}
class AnalyzeRequest(BaseModel):
    """Request body of the analyze endpoint."""

    # URL of the page to fetch and analyze.
    url: str
    # Address the notification email is sent to.
    recipient: str = "dsb@breakpilot.local"
    # Review mode. The field is a plain str, so values other than the two
    # listed below are not rejected by validation.
    mode: str = "post_launch"  # "pre_launch" or "post_launch"
# NOTE(review): the name FollowUpQuestion is also imported from
# compliance.services.website_compliance_checks at the top of this file; this
# local definition re-binds (shadows) that import. Confirm which of the two is
# meant to be used by AnalyzeResponse.follow_up_questions.
class FollowUpQuestion(BaseModel):
    """A follow-up question returned to the caller alongside the analysis."""

    # Identifier of the question.
    id: str
    # Question text.
    question: str
    # Legal basis the question refers to.
    legal_basis: str
    severity: str  # "high", "medium", "low"
    finding_if_no: str  # Finding text if user answers "no"
class AnalyzeResponse(BaseModel):
    """Response body of the analyze endpoint."""

    # The analyzed URL, echoed from the request.
    url: str
    # Document category produced by the classification step.
    classification: str
    # Risk level and score taken from the assessment ("unknown" / 0 if absent).
    risk_level: str
    risk_score: float
    # Escalation level code (a key of ESCALATION_ROLES) and the role mapped to it.
    escalation_level: str
    responsible_role: str
    # Triggered assessment rules followed by website-check findings, as strings.
    findings: list[str]
    # Required controls after relevance filtering, as strings.
    required_controls: list[str]
    # HTML summary; the same markup is sent as the notification email body.
    summary: str
    # Status reported by the email sender ("failed" if it reported none).
    email_status: str
    # UTC timestamp of the analysis in ISO 8601 format.
    analyzed_at: str
    # Follow-up questions produced by the website compliance checks.
    follow_up_questions: list[FollowUpQuestion] = []
@router.post("/analyze", response_model=AnalyzeResponse)
async def analyze_url(req: AnalyzeRequest):
    """Fetch a URL, classify it, assess compliance, and notify the responsible role.

    Pipeline:
      1. Fetch the page and reduce it to plain text.
      2. Classify the document type via the SDK LLM.
      3. Detect third-party services in the raw HTML (regex lookup, no LLM).
      4. Derive intake flags from the detected services.
      5. Run the UCCA assessment with those flags.
      6. Resolve the responsible role from the escalation level.
      7. Run the website compliance checks.
      8. Merge findings, filter controls, and build the HTML summary.
      9. Send the notification email.

    Args:
        req: URL to analyze, notification recipient, and review mode.

    Returns:
        AnalyzeResponse carrying the classification, risk data, escalation,
        findings, controls, HTML summary, email status, and follow-up questions.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        # Step 1: Fetch and clean
        text, raw_html = await _fetch_and_clean(client, req.url)

        # Step 2: Classify via SDK LLM
        classification = await _classify(client, text)

        # Step 3: Detect services from HTML (deterministic, no LLM needed).
        # The registry is imported at call time, as in the original code.
        from compliance.services.service_registry import SERVICE_REGISTRY

        html_lower = raw_html.lower()
        detected_services = [
            meta
            for pattern, meta in SERVICE_REGISTRY.items()
            if re.search(pattern, html_lower)
        ]

        # Step 4: Derive intake flags from DETECTED SERVICES (not from text!)
        intake_flags = extract_intake_flags_from_services(detected_services)

        # Step 5: Assess via UCCA with service-derived flags
        assessment = await _assess(client, text, classification, intake_flags)

        # Step 6: Determine role
        esc_level = assessment.get("escalation_level", "E0")
        role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"])

        # Step 7: Website compliance checks (§312k BGB etc.)
        site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html)

    # Step 8: Merge and filter findings/controls
    findings = assessment.get("triggered_rules", [])
    controls = assessment.get("required_controls", [])
    findings_str = _to_string_list(findings) + site_findings
    controls_str = filter_controls(_to_string_list(controls), text, intake_flags)

    # Website findings always require at least a team-lead review.
    if site_findings and esc_level == "E0":
        esc_level = "E1"
        role = ESCALATION_ROLES["E1"]

    summary = _build_summary(
        req.url, classification, assessment, role, findings_str, controls_str, req.mode,
    )

    # Step 9: Send notification.
    # The label uses the same test as the summary banner (live only when the
    # mode is exactly "post_launch") so subject and body can never disagree.
    mode_label = "LIVE-WEBSITE" if req.mode == "post_launch" else "INTERNE PRUEFUNG"
    # NOTE(review): send_email is invoked synchronously inside this coroutine;
    # if it performs blocking network I/O it will stall the event loop — confirm.
    email_result = send_email(
        recipient=req.recipient,
        # Separator added: classification and URL were previously glued together.
        subject=f"[{mode_label}] Compliance-Finding: {classification} - {req.url[:60]}",
        body_html=summary,
    )

    return AnalyzeResponse(
        url=req.url,
        classification=classification,
        risk_level=assessment.get("risk_level", "unknown"),
        risk_score=assessment.get("risk_score", 0),
        escalation_level=esc_level,
        responsible_role=role,
        findings=findings_str,
        required_controls=controls_str,
        summary=summary,
        email_status=email_result.get("status", "failed"),
        analyzed_at=datetime.now(timezone.utc).isoformat(),
        follow_up_questions=follow_ups,
    )
async def _fetch_and_clean(client: httpx.AsyncClient, url: str) -> tuple[str, str]:
    """Fetch a URL and reduce the response body to plain text.

    Args:
        client: HTTP client used for the GET request.
        url: Address of the page to fetch; redirects are followed.

    Returns:
        A ``(clean_text, raw_html)`` tuple. ``clean_text`` has script/style
        blocks and all tags removed, HTML entities decoded, whitespace
        collapsed, and is truncated to 4000 characters. ``raw_html`` is the
        unmodified response body.

    NOTE(review): the URL is fetched server-side as given; if it originates
    from untrusted input, consider restricting schemes/hosts (SSRF).
    """
    # Local import keeps this block self-contained; `html` is not imported at
    # module level in this file.
    from html import unescape

    resp = await client.get(
        url,
        follow_redirects=True,
        headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"},
    )
    raw_html = resp.text

    # Strip script/style blocks first so their contents never reach the text.
    text = re.sub(
        r"<(script|style)[^>]*>.*?</\1>", "", raw_html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only AFTER tag removal, so an escaped "&lt;p&gt;" stays
    # literal text. This covers &nbsp; as well as &amp;, &auml;, &#8211; etc.
    text = unescape(text)
    # \s also matches the non-breaking space produced by &nbsp;.
    text = re.sub(r"\s+", " ", text).strip()
    return text[:4000], raw_html
async def _classify(client: httpx.AsyncClient, text: str) -> str:
    """Ask the SDK LLM chat endpoint for the document category of ``text``.

    Only the first 2000 characters of ``text`` are sent. The answer is
    lower-cased, stripped of ``<think>`` blocks, and searched for a known
    category name, then for a few German keywords.

    Returns:
        One of ``privacy_policy``, ``cookie_banner``, ``terms_of_service``,
        ``imprint``, ``dpa`` or ``other``. Any failure is logged and yields
        ``other``.
    """
    known_categories = ("privacy_policy", "cookie_banner", "terms_of_service", "imprint", "dpa")
    # German fallback keywords, checked in this order after the category names.
    german_hints = (
        ("datenschutz", "privacy_policy"),
        ("cookie", "cookie_banner"),
        ("impressum", "imprint"),
    )
    try:
        system_prompt = (
            "/no_think\n"
            "Klassifiziere das Dokument in GENAU EINE Kategorie: "
            "privacy_policy, cookie_banner, terms_of_service, imprint, dpa, other. "
            "Antworte NUR mit dem Kategorienamen, nichts anderes. Kein Denken, keine Erklaerung."
        )
        payload = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text[:2000]},
            ],
        }
        resp = await client.post(f"{SDK_URL}/sdk/v1/llm/chat", headers=SDK_HEADERS, json=payload)
        data = resp.json()

        # The answer may arrive under "response", "content" or "message.content";
        # take the first non-empty one.
        answer = data.get("response", "")
        if not answer:
            answer = data.get("content", "")
        if not answer:
            answer = (data.get("message", {}) or {}).get("content", "")
        if not answer:
            answer = ""
        answer = answer.strip().lower()

        # Drop any <think>...</think> block the model may have emitted.
        answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()
        logger.info("Classification raw response: %s", answer[:200])

        for category in known_categories:
            if category in answer:
                return category
        for keyword, category in german_hints:
            if keyword in answer:
                return category
        return "other"
    except Exception as e:
        logger.warning("Classification failed: %s", e)
        return "other"
async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict:
    """Run the UCCA assessment via the SDK and return a flattened result dict.

    Args:
        client: HTTP client used for the SDK call.
        text: Page text; only the first 3000 characters are sent.
        classification: Document category, sent as the assessment ``domain``.
        intake_flags: Flags to convert into the UCCA intake payload. When empty
            or ``None``, a minimal default intake is used instead.

    Returns:
        Dict with ``risk_level``, ``risk_score``, ``escalation_level``,
        ``triggered_rules``, ``required_controls``, ``summary``,
        ``recommendation`` and ``dsfa_recommended``. Any failure is logged and
        yields the same keys with neutral values (``unknown`` / ``E0``).
    """
    try:
        # Use the supplied (service-derived) flags if available, otherwise
        # minimal defaults.
        if intake_flags:
            ucca_intake = flags_to_ucca_intake(intake_flags)
        else:
            ucca_intake = {
                "data_types": {"personal_data": True},
                "purpose": {},
                "automation": "manual",
                "outputs": {},
            }
        resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={
            "use_case_text": text[:3000],
            "domain": classification,
            **ucca_intake,
        })
        data = resp.json()

        # Flatten: the payload may sit under "assessment", under "result", or at
        # the top level. `or` also tolerates sections that are JSON null.
        result = data.get("result") or {}
        assessment = data.get("assessment") or result or data

        # Resolve the risk level ONCE so that the reported level and the
        # escalation derived from it can never disagree (previously the
        # escalation ignored the "result" fallback).
        risk_level = assessment.get("risk_level", result.get("risk_level"))

        return {
            "risk_level": "unknown" if risk_level is None else risk_level,
            "risk_score": assessment.get("risk_score", result.get("risk_score", 0)),
            "escalation_level": _risk_to_escalation(risk_level or ""),
            "triggered_rules": assessment.get("triggered_rules", result.get("triggered_rules", [])),
            "required_controls": assessment.get("required_controls", result.get("required_controls", [])),
            "summary": result.get("summary", assessment.get("summary", "")),
            "recommendation": result.get("recommendation", assessment.get("recommendation", "")),
            "dsfa_recommended": assessment.get("dsfa_recommended", result.get("dsfa_recommended", False)),
        }
    except Exception as e:
        logger.warning("Assessment failed: %s", e)
        # Same key set as the success path, so callers see a stable shape.
        return {
            "risk_level": "unknown",
            "risk_score": 0,
            "escalation_level": "E0",
            "triggered_rules": [],
            "required_controls": [],
            "summary": "",
            "recommendation": "",
            "dsfa_recommended": False,
        }
# _check_website_compliance, _to_string_list, _risk_to_escalation
# → extracted to compliance/services/website_compliance_checks.py
# Display label for each classification category, used in the HTML summary.
DOC_TYPE_LABELS = {
    "privacy_policy": "Datenschutzerklaerung",
    "cookie_banner": "Cookie-Banner",
    "terms_of_service": "AGB",
    "imprint": "Impressum",
    "dpa": "Auftragsverarbeitung (AVV)",
    "other": "Sonstiges",
}
# Maps a risk level to a (badge background colour, display label) pair for the
# HTML summary.
RISK_COLORS = {
    "MINIMAL": ("#16a34a", "Niedrig"),
    "LOW": ("#ca8a04", "Gering"),
    "LIMITED": ("#ea580c", "Mittel"),
    "HIGH": ("#dc2626", "Hoch"),
    "UNACCEPTABLE": ("#991b1b", "Kritisch"),
}
def _build_summary(
    url: str, classification: str, assessment: dict, role: str,
    findings_str: list[str], controls_str: list[str],
    mode: str = "post_launch",
) -> str:
    """Build the HTML summary used for the notification email and the frontend.

    All dynamic values (URL, labels, role, findings, controls, recommendation)
    are HTML-escaped before interpolation, because they originate from the
    request, from the analyzed website, or from SDK/LLM responses.

    Args:
        url: Analyzed URL; rendered as a link.
        classification: Document category; mapped through DOC_TYPE_LABELS.
        assessment: Flattened assessment dict (``risk_level``, ``risk_score``,
            ``recommendation``, ``dsfa_recommended`` are read).
        role: Responsible role to display.
        findings_str: Findings; at most the first 8 are rendered.
        controls_str: Required controls; at most the first 8 are rendered.
        mode: ``"post_launch"`` renders the live-website banner; any other
            value renders the internal-review banner.

    Returns:
        HTML fragment as a string.
    """
    # Local import keeps this block self-contained; `html` is not imported at
    # module level in this file.
    from html import escape

    def _items(entries: list[str]) -> str:
        """Render up to 8 entries as escaped <li> elements, or a placeholder."""
        if not entries:
            return '<li style="color:#6b7280;">Keine</li>'
        return "".join(
            f'<li style="margin-bottom:4px;">{escape(str(entry))}</li>'
            for entry in entries[:8]
        )

    risk = assessment.get("risk_level", "unbekannt")
    score = assessment.get("risk_score", 0)
    recommendation = assessment.get("recommendation", "")
    dsfa = assessment.get("dsfa_recommended", False)
    is_live = mode == "post_launch"
    risk_color, risk_label = RISK_COLORS.get(risk, ("#6b7280", risk))
    doc_label = DOC_TYPE_LABELS.get(classification, classification)

    # Escaped forms of every value that ends up inside markup.
    safe_url = escape(str(url), quote=True)
    safe_doc_label = escape(str(doc_label))
    safe_risk_label = escape(str(risk_label))
    safe_score = escape(str(score))
    safe_role = escape(str(role))
    dsfa_label = "Ja" if dsfa else "Nein"

    mode_banner = (
        '<div style="background:#fef2f2;border-left:4px solid #dc2626;padding:12px 16px;margin-bottom:16px;">'
        '<strong style="color:#991b1b;">LIVE-WEBSITE</strong> — Das Dokument ist bereits oeffentlich zugaenglich.</div>'
        if is_live else
        '<div style="background:#eff6ff;border-left:4px solid #3b82f6;padding:12px 16px;margin-bottom:16px;">'
        '<strong style="color:#1e40af;">INTERNE PRUEFUNG</strong> — Dokument noch nicht veroeffentlicht.</div>'
    )
    findings_html = _items(findings_str)
    controls_html = _items(controls_str)

    warning = ""
    if is_live and findings_str:
        warning = (
            '<div style="background:#fef2f2;border:1px solid #fecaca;border-radius:8px;padding:12px 16px;margin-top:16px;">'
            '<strong style="color:#dc2626;">⚠ ACHTUNG:</strong> Diese Maengel sind bereits oeffentlich sichtbar. '
            'Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.</div>'
        )
    elif not is_live and controls_str:
        warning = (
            '<div style="background:#f0fdf4;border:1px solid #bbf7d0;border-radius:8px;padding:12px 16px;margin-top:16px;">'
            'Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.</div>'
        )

    rec_html = (
        f'<p style="color:#475569;margin-top:12px;"><em>{escape(str(recommendation))}</em></p>'
        if recommendation else ""
    )
    return f"""
{mode_banner}
<table style="width:100%;border-collapse:collapse;margin-bottom:16px;">
<tr><td style="padding:6px 0;color:#64748b;width:180px;">Dokumenttyp</td><td style="padding:6px 0;font-weight:600;">{safe_doc_label}</td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Quelle</td><td style="padding:6px 0;"><a href="{safe_url}" style="color:#6366f1;">{safe_url}</a></td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Risikobewertung</td><td style="padding:6px 0;"><span style="background:{risk_color};color:white;padding:2px 8px;border-radius:4px;font-size:13px;">{safe_risk_label} ({safe_score}/100)</span></td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Zustaendig</td><td style="padding:6px 0;font-weight:600;">{safe_role}</td></tr>
<tr><td style="padding:6px 0;color:#64748b;">DSFA empfohlen</td><td style="padding:6px 0;">{dsfa_label}</td></tr>
</table>
<h3 style="color:#1e293b;font-size:15px;margin:16px 0 8px;">Findings</h3>
<ul style="margin:0;padding-left:20px;color:#334155;">{findings_html}</ul>
<h3 style="color:#1e293b;font-size:15px;margin:16px 0 8px;">Erforderliche Massnahmen</h3>
<ul style="margin:0;padding-left:20px;color:#334155;">{controls_html}</ul>
{warning}
{rec_html}
"""