feat: Phase 0+1 — LLM intake extraction + control relevance filter
Phase 0: Qwen extracts 14 structured intake flags (personal_data, marketing, profiling, ai_usage, etc.) instead of relying on keyword matching. If the LLM is unavailable, extraction falls back to keyword matching. The flags feed into UCCA for accurate scoring.
Phase 1: A control relevance filter removes false positives. C_TRANSPARENCY is only recommended if AI/ML keywords are found in the text. There are 7 control rules, each with a keyword list plus an intake-flag fallback.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,8 @@ from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
|
||||
from compliance.services.smtp_sender import send_email
|
||||
from compliance.services.intake_extractor import extract_intake_flags, flags_to_ucca_intake
|
||||
from compliance.services.relevance_filter import filter_controls
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -77,21 +79,24 @@ async def analyze_url(req: AnalyzeRequest):
|
||||
# Step 2: Classify via SDK LLM
|
||||
classification = await _classify(client, text)
|
||||
|
||||
# Step 3: Assess via UCCA
|
||||
assessment = await _assess(client, text, classification)
|
||||
# Step 3: Extract intake flags via LLM (better than keyword matching)
|
||||
intake_flags = await extract_intake_flags(text)
|
||||
|
||||
# Step 4: Determine role
|
||||
# Step 4: Assess via UCCA with LLM-extracted flags
|
||||
assessment = await _assess(client, text, classification, intake_flags)
|
||||
|
||||
# Step 5: Determine role
|
||||
esc_level = assessment.get("escalation_level", "E0")
|
||||
role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"])
|
||||
|
||||
# Step 5: Website compliance checks (§312k BGB etc.)
|
||||
# Step 6: Website compliance checks (§312k BGB etc.)
|
||||
site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html)
|
||||
|
||||
# Step 6: Merge findings
|
||||
# Step 7: Merge and filter findings/controls
|
||||
findings = assessment.get("triggered_rules", [])
|
||||
controls = assessment.get("required_controls", [])
|
||||
findings_str = _to_string_list(findings) + site_findings
|
||||
controls_str = _to_string_list(controls)
|
||||
controls_str = filter_controls(_to_string_list(controls), text, intake_flags)
|
||||
|
||||
# Escalate if website checks found issues
|
||||
if site_findings and esc_level == "E0":
|
||||
@@ -179,34 +184,24 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str:
|
||||
return "other"
|
||||
|
||||
|
||||
async def _assess(client: httpx.AsyncClient, text: str, classification: str) -> dict:
|
||||
async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict:
|
||||
"""Run UCCA assessment via SDK. Returns flattened result dict."""
|
||||
try:
|
||||
# UCCA expects boolean intake flags, not string categories
|
||||
# Use LLM-extracted flags if available, otherwise minimal defaults
|
||||
if intake_flags:
|
||||
ucca_intake = flags_to_ucca_intake(intake_flags)
|
||||
else:
|
||||
ucca_intake = {
|
||||
"data_types": {"personal_data": True},
|
||||
"purpose": {},
|
||||
"automation": "manual",
|
||||
"outputs": {},
|
||||
}
|
||||
|
||||
resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={
|
||||
"use_case_text": text[:3000],
|
||||
"domain": classification,
|
||||
"data_types": {
|
||||
"personal_data": True,
|
||||
"customer_data": True,
|
||||
"location_data": "tracking" in text.lower() or "standort" in text.lower(),
|
||||
"images": False,
|
||||
"biometric_data": "biometrisch" in text.lower(),
|
||||
"minor_data": "kinder" in text.lower() or "minderjährig" in text.lower(),
|
||||
},
|
||||
"purpose": {
|
||||
"marketing": "werbung" in text.lower() or "marketing" in text.lower(),
|
||||
"analytics": "analyse" in text.lower() or "analytics" in text.lower(),
|
||||
"profiling": "profil" in text.lower() or "personalis" in text.lower(),
|
||||
"automation": False,
|
||||
"customer_support": False,
|
||||
},
|
||||
"automation": "partially_automated",
|
||||
"outputs": {
|
||||
"content_generation": False,
|
||||
"recommendations_to_users": "empfehl" in text.lower(),
|
||||
"data_export": "export" in text.lower() or "uebertrag" in text.lower(),
|
||||
},
|
||||
**ucca_intake,
|
||||
})
|
||||
data = resp.json()
|
||||
# Flatten: UCCA wraps result under "assessment" and "result"
|
||||
|
||||
Reference in New Issue
Block a user