""" Agent Analyze Routes — combined endpoint that fetches a URL, classifies it, assesses DSGVO compliance, and sends a notification email. POST /api/compliance/agent/analyze """ import logging import re import os from datetime import datetime, timezone import httpx from fastapi import APIRouter from pydantic import BaseModel from compliance.services.smtp_sender import send_email from compliance.services.intake_extractor import extract_intake_flags_from_services, flags_to_ucca_intake from compliance.services.relevance_filter import filter_controls from compliance.services.website_compliance_checks import ( check_website_compliance as _check_website_compliance, FollowUpQuestion, to_string_list as _to_string_list, risk_to_escalation as _risk_to_escalation, ) logger = logging.getLogger(__name__) router = APIRouter(prefix="/compliance/agent", tags=["agent"]) SDK_URL = os.environ.get("AI_SDK_URL", "http://bp-compliance-ai-sdk:8090") TENANT_ID = "9282a473-5c95-4b3a-bf78-0ecc0ec71d3e" USER_ID = "00000000-0000-0000-0000-000000000001" ESCALATION_ROLES = { "E0": "Kein Handlungsbedarf", "E1": "Teamleitung Datenschutz", "E2": "Datenschutzbeauftragter (DSB)", "E3": "DSB + Rechtsabteilung", } SDK_HEADERS = { "Content-Type": "application/json", "X-Tenant-ID": TENANT_ID, "X-User-ID": USER_ID, } class AnalyzeRequest(BaseModel): url: str recipient: str = "dsb@breakpilot.local" mode: str = "post_launch" # "pre_launch" or "post_launch" class FollowUpQuestion(BaseModel): id: str question: str legal_basis: str severity: str # "high", "medium", "low" finding_if_no: str # Finding text if user answers "no" class AnalyzeResponse(BaseModel): url: str classification: str risk_level: str risk_score: float escalation_level: str responsible_role: str findings: list[str] required_controls: list[str] summary: str email_status: str analyzed_at: str follow_up_questions: list[FollowUpQuestion] = [] @router.post("/analyze", response_model=AnalyzeResponse) async def analyze_url(req: AnalyzeRequest): """Fetch URL, classify, assess compliance, and notify responsible role.""" async with httpx.AsyncClient(timeout=60.0) as client: # Step 1: Fetch and clean text, raw_html = await _fetch_and_clean(client, req.url) # Step 2: Classify via SDK LLM classification = await _classify(client, text) # Step 3: Detect services from HTML (deterministic, no LLM needed) from compliance.services.service_registry import SERVICE_REGISTRY detected_services = [] html_lower = raw_html.lower() for pattern, meta in SERVICE_REGISTRY.items(): if re.search(pattern, html_lower): detected_services.append(meta) # Step 4: Derive intake flags from DETECTED SERVICES (not from text!) intake_flags = extract_intake_flags_from_services(detected_services) # Step 5: Assess via UCCA with service-derived flags assessment = await _assess(client, text, classification, intake_flags) # Step 5: Determine role esc_level = assessment.get("escalation_level", "E0") role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"]) # Step 6: Website compliance checks (§312k BGB etc.) site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html) # Step 7: Merge and filter findings/controls findings = assessment.get("triggered_rules", []) controls = assessment.get("required_controls", []) findings_str = _to_string_list(findings) + site_findings controls_str = filter_controls(_to_string_list(controls), text, intake_flags) # Escalate if website checks found issues if site_findings and esc_level == "E0": esc_level = "E1" role = ESCALATION_ROLES["E1"] summary = _build_summary(req.url, classification, assessment, role, findings_str, controls_str, req.mode) # Step 7: Send notification mode_label = "INTERNE PRUEFUNG" if req.mode == "pre_launch" else "LIVE-WEBSITE" email_result = send_email( recipient=req.recipient, subject=f"[{mode_label}] Compliance-Finding: {classification} — {req.url[:60]}", body_html=summary, ) return AnalyzeResponse( url=req.url, classification=classification, risk_level=assessment.get("risk_level", "unknown"), risk_score=assessment.get("risk_score", 0), escalation_level=esc_level, responsible_role=role, findings=findings_str, required_controls=controls_str, summary=summary, email_status=email_result.get("status", "failed"), analyzed_at=datetime.now(timezone.utc).isoformat(), follow_up_questions=follow_ups, ) async def _fetch_and_clean(client: httpx.AsyncClient, url: str) -> tuple[str, str]: """Fetch URL. Returns (clean_text, raw_html).""" resp = await client.get(url, follow_redirects=True, headers={ "User-Agent": "BreakPilot-Compliance-Agent/1.0", }) html = resp.text # Strip script/style blocks, then all tags clean = re.sub(r"<(script|style)[^>]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) clean = re.sub(r"<[^>]+>", " ", clean) clean = re.sub(r" ", " ", clean) clean = re.sub(r"\s+", " ", clean).strip() return clean[:4000], html async def _classify(client: httpx.AsyncClient, text: str) -> str: """Classify document type via SDK LLM chat.""" try: resp = await client.post(f"{SDK_URL}/sdk/v1/llm/chat", headers=SDK_HEADERS, json={ "messages": [ {"role": "system", "content": ( "/no_think\n" "Klassifiziere das Dokument in GENAU EINE Kategorie: " "privacy_policy, cookie_banner, terms_of_service, imprint, dpa, other. " "Antworte NUR mit dem Kategorienamen, nichts anderes. Kein Denken, keine Erklaerung." )}, {"role": "user", "content": text[:2000]}, ], }) data = resp.json() # Qwen 3.5 may use think mode — content can be in message.content or response raw = ( data.get("response", "") or data.get("content", "") or (data.get("message", {}) or {}).get("content", "") or "" ).strip().lower() # Strip Qwen think tags if present raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() logger.info("Classification raw response: %s", raw[:200]) for cat in ["privacy_policy", "cookie_banner", "terms_of_service", "imprint", "dpa"]: if cat in raw: return cat # Also check German terms if "datenschutz" in raw: return "privacy_policy" if "cookie" in raw: return "cookie_banner" if "impressum" in raw: return "imprint" return "other" except Exception as e: logger.warning("Classification failed: %s", e) return "other" async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict: """Run UCCA assessment via SDK. Returns flattened result dict.""" try: # Use LLM-extracted flags if available, otherwise minimal defaults if intake_flags: ucca_intake = flags_to_ucca_intake(intake_flags) else: ucca_intake = { "data_types": {"personal_data": True}, "purpose": {}, "automation": "manual", "outputs": {}, } resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={ "use_case_text": text[:3000], "domain": classification, **ucca_intake, }) data = resp.json() # Flatten: UCCA wraps result under "assessment" and "result" assessment = data.get("assessment", data.get("result", data)) result = data.get("result", {}) return { "risk_level": assessment.get("risk_level", result.get("risk_level", "unknown")), "risk_score": assessment.get("risk_score", result.get("risk_score", 0)), "escalation_level": _risk_to_escalation(assessment.get("risk_level", "")), "triggered_rules": assessment.get("triggered_rules", result.get("triggered_rules", [])), "required_controls": assessment.get("required_controls", result.get("required_controls", [])), "summary": result.get("summary", ""), "recommendation": result.get("recommendation", ""), "dsfa_recommended": assessment.get("dsfa_recommended", False), } except Exception as e: logger.warning("Assessment failed: %s", e) return {"risk_level": "unknown", "risk_score": 0, "escalation_level": "E0"} # _check_website_compliance, _to_string_list, _risk_to_escalation # → extracted to compliance/services/website_compliance_checks.py DOC_TYPE_LABELS = { "privacy_policy": "Datenschutzerklaerung", "cookie_banner": "Cookie-Banner", "terms_of_service": "AGB", "imprint": "Impressum", "dpa": "Auftragsverarbeitung (AVV)", "other": "Sonstiges", } RISK_COLORS = { "MINIMAL": ("#16a34a", "Niedrig"), "LOW": ("#ca8a04", "Gering"), "LIMITED": ("#ea580c", "Mittel"), "HIGH": ("#dc2626", "Hoch"), "UNACCEPTABLE": ("#991b1b", "Kritisch"), } def _build_summary( url: str, classification: str, assessment: dict, role: str, findings_str: list[str], controls_str: list[str], mode: str = "post_launch", ) -> str: """Build HTML summary for email and frontend.""" risk = assessment.get("risk_level", "unbekannt") score = assessment.get("risk_score", 0) recommendation = assessment.get("recommendation", "") dsfa = assessment.get("dsfa_recommended", False) is_live = mode == "post_launch" risk_color, risk_label = RISK_COLORS.get(risk, ("#6b7280", risk)) doc_label = DOC_TYPE_LABELS.get(classification, classification) mode_banner = ( '
' 'LIVE-WEBSITE — Das Dokument ist bereits oeffentlich zugaenglich.
' if is_live else '
' 'INTERNE PRUEFUNG — Dokument noch nicht veroeffentlicht.
' ) findings_html = "".join(f'
  • {f}
  • ' for f in findings_str[:8]) if findings_str else '
  • Keine
  • ' controls_html = "".join(f'
  • {c}
  • ' for c in controls_str[:8]) if controls_str else '
  • Keine
  • ' warning = "" if is_live and findings_str: warning = ( '
    ' '⚠ ACHTUNG: Diese Maengel sind bereits oeffentlich sichtbar. ' 'Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.
    ' ) elif not is_live and controls_str: warning = ( '
    ' 'Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.
    ' ) rec_html = f'

    {recommendation}

    ' if recommendation else "" return f""" {mode_banner}
    Dokumenttyp{doc_label}
    Quelle{url}
    Risikobewertung{risk_label} ({score}/100)
    Zustaendig{role}
    DSFA empfohlen{'Ja' if dsfa else 'Nein'}

    Findings

    Erforderliche Massnahmen

    {warning} {rec_html} """