c5b22e0c99
Fundamental architecture fix: data processing happens through APIs/scripts/cookies — NOT through visible page text. A news site about healthcare does NOT process health data. Before: Qwen reads website text → guesses "health_data: true" (WRONG). After: Google Analytics detected → tracking: true (CORRECT, deterministic). New flow: detect services from HTML → map service categories to flags → feed flags into UCCA assessment. No LLM needed for flag extraction. SERVICE_TO_FLAGS maps categories: tracking→tracking, marketing→marketing+third_party_sharing, payment→payment_data, heatmap→profiling, etc. SPECIFIC_SERVICE_FLAGS for Klarna (Art.22), Stripe (US transfer), etc. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
180 lines
6.4 KiB
Python
180 lines
6.4 KiB
Python
"""
|
|
Intake Extractor — derives UCCA intake flags from DETECTED SERVICES,
|
|
not from website text content.
|
|
|
|
The actual data processing happens through APIs, scripts, and cookies —
|
|
NOT through visible text on the page. A news website reporting about
|
|
healthcare does NOT process health data.
|
|
|
|
Flags are derived deterministically from:
|
|
1. Which third-party services are embedded (Google Analytics → tracking)
|
|
2. Which payment providers are used (Stripe → payment_data)
|
|
3. Which CDN/fonts are loaded (Google Fonts → cross_border_transfer)
|
|
"""
|
|
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Category-level mapping: service category -> intake flags that every service
# in that category switches on. Per-service additions live in
# SPECIFIC_SERVICE_FLAGS; extract_intake_flags_from_services() applies both.
SERVICE_TO_FLAGS: dict[str, dict[str, bool]] = {
    # Tracking & analytics.
    "tracking": dict.fromkeys(("personal_data", "tracking"), True),
    # Marketing: tracking plus sharing with third parties.
    "marketing": dict.fromkeys(
        ("personal_data", "tracking", "marketing", "third_party_sharing"),
        True,
    ),
    # Heatmaps / session recording: tracking plus profiling.
    "heatmap": dict.fromkeys(("personal_data", "tracking", "profiling"), True),
    # Payment providers.
    "payment": dict.fromkeys(("personal_data", "payment_data"), True),
    # Chatbots (the user sends messages).
    "chatbot": dict.fromkeys(("personal_data", "customer_data"), True),
    # CRM: customer data plus profiling.
    "crm": dict.fromkeys(("personal_data", "customer_data", "profiling"), True),
    # CDN: only personal_data is set here. cross_border_transfer is not part
    # of this entry; extract_intake_flags_from_services() sets it when a
    # service record has a falsy "eu_adequate" value.
    "cdn": dict.fromkeys(("personal_data",), True),
}
|
|
|
|
# Per-service flags, keyed by the service's "id". They are applied in
# addition to the category flags from SERVICE_TO_FLAGS by
# extract_intake_flags_from_services().
SPECIFIC_SERVICE_FLAGS: dict[str, dict[str, bool]] = {
    "klarna": {
        "automated_decisions": True,
        "payment_data": True,
    },
    "paypal": {
        "cross_border_transfer": True,
        "payment_data": True,
    },
    "stripe": {
        "cross_border_transfer": True,
        "payment_data": True,
    },
    "google_analytics": {
        "cross_border_transfer": True,
        "tracking": True,
    },
    "facebook_pixel": {
        "cross_border_transfer": True,
        "marketing": True,
        "profiling": True,
    },
    "hotjar": {
        "profiling": True,
        "tracking": True,
    },
    "ms_clarity": {
        "cross_border_transfer": True,
        "profiling": True,
    },
    "tiktok_pixel": {
        "cross_border_transfer": True,
        "marketing": True,
    },
    "intercom": {
        "cross_border_transfer": True,
        "ai_usage": True,
    },
}
|
|
|
|
|
|
def extract_intake_flags_from_services(detected_services: list[dict]) -> dict:
    """Compute intake flags from the list of detected third-party services.

    The result is a pure function of the input: no text analysis or model
    call is involved, only lookups in SERVICE_TO_FLAGS and
    SPECIFIC_SERVICE_FLAGS.

    Args:
        detected_services: Service records. Each record is read through
            ``.get`` for three keys: ``"category"`` (default ``"other"``),
            ``"id"`` (default ``""``) and ``"eu_adequate"`` (default
            ``True``).

    Returns:
        A dict containing every known flag name mapped to a bool. A flag is
        True when any service implies it; all flags are False for an empty
        input.
    """
    flag_names = (
        "personal_data",
        "customer_data",
        "payment_data",
        "location_data",
        "biometric_data",
        "minor_data",
        "health_data",
        "marketing",
        "profiling",
        "automated_decisions",
        "third_party_sharing",
        "cross_border_transfer",
        "tracking",
        "ai_usage",
    )
    # Start with everything switched off; services can only switch flags on.
    flags = dict.fromkeys(flag_names, False)

    for service in detected_services:
        by_category = SERVICE_TO_FLAGS.get(service.get("category", "other"), {})
        by_service = SPECIFIC_SERVICE_FLAGS.get(service.get("id", ""), {})

        # Category-level flags first, then the service-specific additions.
        for implied in (by_category, by_service):
            for name, enabled in implied.items():
                if enabled:
                    flags[name] = True

        # A record whose "eu_adequate" is falsy implies a cross-border
        # transfer and sharing with a third party.
        if not service.get("eu_adequate", True):
            flags.update(cross_border_transfer=True, third_party_sharing=True)

    # Any detected service at all means personal data is processed
    # (the visitor's IP address at minimum).
    if detected_services:
        flags["personal_data"] = True

    active = {name: state for name, state in flags.items() if state}
    logger.info("Intake flags from %d services: %s", len(detected_services), active)
    return flags
|
|
|
|
|
|
# Keep backward compatibility
|
|
async def extract_intake_flags(text: str) -> dict:
    """Deprecated shim; prefer extract_intake_flags_from_services().

    Earlier versions used an LLM to guess flags from page text, which does
    not reflect actual data processing. The ``text`` argument is now
    ignored: the call logs a deprecation warning and returns a fixed,
    minimal flag set.

    Returns:
        ``{"personal_data": True, "tracking": False}`` — a reachable website
        processes at least the visitor's IP address.
    """
    deprecation_notice = (
        "extract_intake_flags(text) called — DEPRECATED. "
        "Use extract_intake_flags_from_services(detected_services) instead."
    )
    logger.warning(deprecation_notice)

    minimal_flags = dict(personal_data=True, tracking=False)
    return minimal_flags
|
|
|
|
|
|
def flags_to_ucca_intake(flags: dict) -> dict:
    """Translate a flat flag dict into the nested UCCA intake structure.

    Args:
        flags: Mapping of flag name to value, as produced by
            extract_intake_flags_from_services(). Missing flags are treated
            as False.

    Returns:
        A dict with the sections ``data_types``, ``purpose``, ``automation``,
        ``outputs`` and ``hosting``. Fields that no flag feeds (``images``,
        ``audio``, ``employee_data``, ``customer_support``) are always False.
    """

    def flag(name: str):
        # Absent flags count as "off".
        return flags.get(name, False)

    decisions = flag("automated_decisions")
    biometric = flag("biometric_data")
    profiling = flag("profiling")
    ai_usage = flag("ai_usage")
    cross_border = flag("cross_border_transfer")

    # Automated decisions take precedence over mere AI usage.
    if decisions:
        automation_level = "fully_automated"
    elif ai_usage:
        automation_level = "partially_automated"
    else:
        automation_level = "manual"

    data_types = {
        "personal_data": flag("personal_data"),
        "customer_data": flag("customer_data"),
        "location_data": flag("location_data"),
        "biometric_data": biometric,
        "minor_data": flag("minor_data"),
        "images": False,
        "audio": False,
        "financial_data": flag("payment_data"),
        "employee_data": False,
        # Health or biometric data marks the intake as Article 9 data.
        "article_9_data": flag("health_data") or biometric,
    }

    purpose = {
        "marketing": flag("marketing"),
        "analytics": flag("tracking"),
        "profiling": profiling,
        "automation": ai_usage,
        "customer_support": False,
        "evaluation_scoring": decisions,
        "decision_making": decisions,
    }

    outputs = {
        "recommendations_to_users": profiling,
        "data_export": cross_border,
        "legal_effects": decisions,
    }

    hosting = {"region": "non_eu" if cross_border else "eu"}

    return {
        "data_types": data_types,
        "purpose": purpose,
        "automation": automation_level,
        "outputs": outputs,
        "hosting": hosting,
    }
|