feat: Phase 0+1 — LLM intake extraction + control relevance filter

Phase 0: Qwen extracts 14 structured intake flags (personal_data,
marketing, profiling, ai_usage, etc.), replacing keyword matching.
Falls back to keyword matching if the LLM is unavailable. The flags
feed into UCCA for more accurate scoring.

Phase 1: Control relevance filter removes false positives.
C_TRANSPARENCY is only recommended if AI/ML keywords are found in the
text. 7 control rules with keyword lists and an intake-flag fallback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-29 11:36:24 +02:00
parent 0266dfd011
commit 4298ae17ab
3 changed files with 301 additions and 29 deletions
@@ -15,6 +15,8 @@ from fastapi import APIRouter
from pydantic import BaseModel
from compliance.services.smtp_sender import send_email
from compliance.services.intake_extractor import extract_intake_flags, flags_to_ucca_intake
from compliance.services.relevance_filter import filter_controls
logger = logging.getLogger(__name__)
@@ -77,21 +79,24 @@ async def analyze_url(req: AnalyzeRequest):
# Step 2: Classify via SDK LLM
classification = await _classify(client, text)
# Step 3: Assess via UCCA
assessment = await _assess(client, text, classification)
# Step 3: Extract intake flags via LLM (better than keyword matching)
intake_flags = await extract_intake_flags(text)
# Step 4: Determine role
# Step 4: Assess via UCCA with LLM-extracted flags
assessment = await _assess(client, text, classification, intake_flags)
# Step 5: Determine role
esc_level = assessment.get("escalation_level", "E0")
role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"])
# Step 5: Website compliance checks (§312k BGB etc.)
# Step 6: Website compliance checks (§312k BGB etc.)
site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html)
# Step 6: Merge findings
# Step 7: Merge and filter findings/controls
findings = assessment.get("triggered_rules", [])
controls = assessment.get("required_controls", [])
findings_str = _to_string_list(findings) + site_findings
controls_str = _to_string_list(controls)
controls_str = filter_controls(_to_string_list(controls), text, intake_flags)
# Escalate if website checks found issues
if site_findings and esc_level == "E0":
@@ -179,34 +184,24 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str:
return "other"
async def _assess(client: httpx.AsyncClient, text: str, classification: str) -> dict:
async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict:
"""Run UCCA assessment via SDK. Returns flattened result dict."""
try:
# UCCA expects boolean intake flags, not string categories
# Use LLM-extracted flags if available, otherwise minimal defaults
if intake_flags:
ucca_intake = flags_to_ucca_intake(intake_flags)
else:
ucca_intake = {
"data_types": {"personal_data": True},
"purpose": {},
"automation": "manual",
"outputs": {},
}
resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={
"use_case_text": text[:3000],
"domain": classification,
"data_types": {
"personal_data": True,
"customer_data": True,
"location_data": "tracking" in text.lower() or "standort" in text.lower(),
"images": False,
"biometric_data": "biometrisch" in text.lower(),
"minor_data": "kinder" in text.lower() or "minderjährig" in text.lower(),
},
"purpose": {
"marketing": "werbung" in text.lower() or "marketing" in text.lower(),
"analytics": "analyse" in text.lower() or "analytics" in text.lower(),
"profiling": "profil" in text.lower() or "personalis" in text.lower(),
"automation": False,
"customer_support": False,
},
"automation": "partially_automated",
"outputs": {
"content_generation": False,
"recommendations_to_users": "empfehl" in text.lower(),
"data_export": "export" in text.lower() or "uebertrag" in text.lower(),
},
**ucca_intake,
})
data = resp.json()
# Flatten: UCCA wraps result under "assessment" and "result"