feat: Phase 0+1 — LLM intake extraction + control relevance filter

Phase 0: Qwen extracts 14 structured intake flags (personal_data, marketing, profiling, ai_usage, etc.) instead of relying on keyword matching, and falls back to keyword matching if the LLM is unavailable. The flags feed into UCCA for accurate scoring. Phase 1: A control relevance filter removes false positives. C_TRANSPARENCY is only recommended if AI/ML keywords are found in the text. There are 7 control rules with keyword lists and an intake-flag fallback. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 11:36:24 +02:00
parent 0266dfd011
commit 4298ae17ab
3 changed files with 301 additions and 29 deletions
@@ -15,6 +15,8 @@ from fastapi import APIRouter
 from pydantic import BaseModel

 from compliance.services.smtp_sender import send_email
+from compliance.services.intake_extractor import extract_intake_flags, flags_to_ucca_intake
+from compliance.services.relevance_filter import filter_controls

 logger = logging.getLogger(__name__)

@@ -77,21 +79,24 @@ async def analyze_url(req: AnalyzeRequest):
        # Step 2: Classify via SDK LLM
        classification = await _classify(client, text)

-        # Step 3: Assess via UCCA
-        assessment = await _assess(client, text, classification)
+        # Step 3: Extract intake flags via LLM (better than keyword matching)
+        intake_flags = await extract_intake_flags(text)

-        # Step 4: Determine role
+        # Step 4: Assess via UCCA with LLM-extracted flags
+        assessment = await _assess(client, text, classification, intake_flags)
+
+        # Step 5: Determine role
        esc_level = assessment.get("escalation_level", "E0")
        role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"])

-        # Step 5: Website compliance checks (§312k BGB etc.)
+        # Step 6: Website compliance checks (§312k BGB etc.)
        site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html)

-        # Step 6: Merge findings
+        # Step 7: Merge and filter findings/controls
        findings = assessment.get("triggered_rules", [])
        controls = assessment.get("required_controls", [])
        findings_str = _to_string_list(findings) + site_findings
-        controls_str = _to_string_list(controls)
+        controls_str = filter_controls(_to_string_list(controls), text, intake_flags)

        # Escalate if website checks found issues
        if site_findings and esc_level == "E0":
@@ -179,34 +184,24 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str:
        return "other"


-async def _assess(client: httpx.AsyncClient, text: str, classification: str) -> dict:
+async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict:
    """Run UCCA assessment via SDK. Returns flattened result dict."""
    try:
-        # UCCA expects boolean intake flags, not string categories
+        # Use LLM-extracted flags if available, otherwise minimal defaults
+        if intake_flags:
+            ucca_intake = flags_to_ucca_intake(intake_flags)
+        else:
+            ucca_intake = {
+                "data_types": {"personal_data": True},
+                "purpose": {},
+                "automation": "manual",
+                "outputs": {},
+            }
+
        resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={
            "use_case_text": text[:3000],
            "domain": classification,
-            "data_types": {
-                "personal_data": True,
-                "customer_data": True,
-                "location_data": "tracking" in text.lower() or "standort" in text.lower(),
-                "images": False,
-                "biometric_data": "biometrisch" in text.lower(),
-                "minor_data": "kinder" in text.lower() or "minderjährig" in text.lower(),
-            },
-            "purpose": {
-                "marketing": "werbung" in text.lower() or "marketing" in text.lower(),
-                "analytics": "analyse" in text.lower() or "analytics" in text.lower(),
-                "profiling": "profil" in text.lower() or "personalis" in text.lower(),
-                "automation": False,
-                "customer_support": False,
-            },
-            "automation": "partially_automated",
-            "outputs": {
-                "content_generation": False,
-                "recommendations_to_users": "empfehl" in text.lower(),
-                "data_export": "export" in text.lower() or "uebertrag" in text.lower(),
-            },
+            **ucca_intake,
        })
        data = resp.json()
        # Flatten: UCCA wraps result under "assessment" and "result"