Files
breakpilot-compliance/backend-compliance/compliance/services/intake_extractor.py
T
Benjamin Admin c5b22e0c99 fix: derive intake flags from DETECTED SERVICES, not from text content
Fundamental architecture fix: data processing happens through APIs/scripts/
cookies — NOT through visible page text. A news site about healthcare does
NOT process health data.

Before: Qwen reads website text → guesses "health_data: true" (WRONG)
After: Google Analytics detected → tracking: true (CORRECT, deterministic)

New flow: detect services from HTML → map service categories to flags →
feed flags into UCCA assessment. No LLM needed for flag extraction.

SERVICE_TO_FLAGS maps categories: tracking→tracking, marketing→marketing+
third_party_sharing, payment→payment_data, heatmap→profiling, etc.
SPECIFIC_SERVICE_FLAGS for Klarna (Art.22), Stripe (US transfer), etc.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:37:51 +02:00

180 lines
6.4 KiB
Python

"""
Intake Extractor — derives UCCA intake flags from DETECTED SERVICES,
not from website text content.
The actual data processing happens through APIs, scripts, and cookies —
NOT through visible text on the page. A news website reporting about
healthcare does NOT process health data.
Flags are derived deterministically from:
1. Which third-party services are embedded (Google Analytics → tracking)
2. Which payment providers are used (Stripe → payment_data)
3. Which CDN/fonts are loaded (e.g. Google Fonts from a provider that is
not EU-adequate → cross_border_transfer)
"""
import logging
logger = logging.getLogger(__name__)
# Category-level mapping: detected service category -> intake flags it implies.
# Per-service additions are kept in SPECIFIC_SERVICE_FLAGS; a category that is
# not listed here contributes no category-level flags.
SERVICE_TO_FLAGS: dict[str, dict[str, bool]] = {
    # Analytics / tracking scripts.
    "tracking": {"personal_data": True, "tracking": True},
    # Marketing tags also count as tracking and as sharing with third parties.
    "marketing": {
        "personal_data": True,
        "tracking": True,
        "marketing": True,
        "third_party_sharing": True,
    },
    # Heatmaps / session recording: tracking plus profiling.
    "heatmap": {"personal_data": True, "tracking": True, "profiling": True},
    # Payment providers.
    "payment": {"personal_data": True, "payment_data": True},
    # Chatbots: users type messages, so customer data is involved.
    "chatbot": {"personal_data": True, "customer_data": True},
    # CRM systems: customer data plus profiling.
    "crm": {"personal_data": True, "customer_data": True, "profiling": True},
    # CDNs: only personal_data at category level. cross_border_transfer is
    # not set here; it is decided per service (eu_adequate / service id).
    "cdn": {"personal_data": True},
}
# Per-service flags, keyed by service id. These are applied in addition to
# whatever the service's category already implies.
SPECIFIC_SERVICE_FLAGS: dict[str, dict[str, bool]] = {
    "klarna": {
        "automated_decisions": True,
        "payment_data": True,
    },
    "paypal": {
        "cross_border_transfer": True,
        "payment_data": True,
    },
    "stripe": {
        "cross_border_transfer": True,
        "payment_data": True,
    },
    "google_analytics": {
        "cross_border_transfer": True,
        "tracking": True,
    },
    "facebook_pixel": {
        "cross_border_transfer": True,
        "marketing": True,
        "profiling": True,
    },
    "hotjar": {
        "profiling": True,
        "tracking": True,
    },
    "ms_clarity": {
        "cross_border_transfer": True,
        "profiling": True,
    },
    "tiktok_pixel": {
        "cross_border_transfer": True,
        "marketing": True,
    },
    "intercom": {
        "cross_border_transfer": True,
        "ai_usage": True,
    },
}
def extract_intake_flags_from_services(detected_services: list[dict]) -> dict:
    """Build the intake flag dict implied by the detected third-party services.

    No text analysis is involved: the result depends only on the service
    entries passed in. Each entry is read for three optional keys:

    * ``category`` (default ``"other"``) — looked up in SERVICE_TO_FLAGS.
    * ``id`` (default ``""``) — looked up in SPECIFIC_SERVICE_FLAGS.
    * ``eu_adequate`` (default ``True``) — a falsy value switches on
      ``cross_border_transfer`` and ``third_party_sharing``.

    Args:
        detected_services: Service entries produced by service detection.

    Returns:
        A dict containing every known flag name mapped to True or False.
        ``personal_data`` is True whenever at least one service was passed.
    """
    flag_names = (
        "personal_data",
        "customer_data",
        "payment_data",
        "location_data",
        "biometric_data",
        "minor_data",
        "health_data",
        "marketing",
        "profiling",
        "automated_decisions",
        "third_party_sharing",
        "cross_border_transfer",
        "tracking",
        "ai_usage",
    )
    # Everything starts switched off; services can only switch flags on.
    flags = dict.fromkeys(flag_names, False)

    for service in detected_services:
        implied_by_category = SERVICE_TO_FLAGS.get(service.get("category", "other"), {})
        implied_by_service = SPECIFIC_SERVICE_FLAGS.get(service.get("id", ""), {})

        # Category flags first, then the service-specific ones on top.
        for mapping in (implied_by_category, implied_by_service):
            flags.update({name: True for name, enabled in mapping.items() if enabled})

        # A service not marked EU-adequate means data leaves to a third party
        # outside the EU.
        if not service.get("eu_adequate", True):
            flags["cross_border_transfer"] = True
            flags["third_party_sharing"] = True

    # At least one embedded service implies personal data (the IP at minimum).
    if detected_services:
        flags["personal_data"] = True

    active = {name: enabled for name, enabled in flags.items() if enabled}
    logger.info("Intake flags from %d services: %s", len(detected_services), active)
    return flags
# Backward-compatibility shim for callers of the old text-based API
async def extract_intake_flags(text: str) -> dict:
    """DEPRECATED: superseded by extract_intake_flags_from_services().

    The ``text`` argument is ignored, because page text does not represent
    actual data processing. A deprecation warning is logged and a fixed,
    minimal result is returned.

    Returns:
        ``{"personal_data": True, "tracking": False}`` on every call.
    """
    deprecation_notice = (
        "extract_intake_flags(text) called — DEPRECATED. "
        "Use extract_intake_flags_from_services(detected_services) instead."
    )
    logger.warning(deprecation_notice)

    # A reachable website implies personal data (the visitor's IP), nothing more.
    minimal_flags = {"personal_data": True, "tracking": False}
    return minimal_flags
def flags_to_ucca_intake(flags: dict) -> dict:
    """Translate a flat intake flag dict into the nested UCCA intake structure.

    Flags missing from ``flags`` are treated as False. Fields with no
    corresponding flag (images, audio, employee_data, customer_support) are
    always False.

    Args:
        flags: Mapping of flag name to value, e.g. the result of
            extract_intake_flags_from_services().

    Returns:
        A dict with the sections ``data_types``, ``purpose``, ``automation``,
        ``outputs`` and ``hosting``.
    """

    def flag(name: str):
        # A missing flag counts as "off".
        return flags.get(name, False)

    # Automated decisions outrank plain AI usage when choosing the level.
    if flags.get("automated_decisions"):
        automation_level = "fully_automated"
    elif flags.get("ai_usage"):
        automation_level = "partially_automated"
    else:
        automation_level = "manual"

    hosting_region = "non_eu" if flags.get("cross_border_transfer") else "eu"

    data_types = {
        "personal_data": flag("personal_data"),
        "customer_data": flag("customer_data"),
        "location_data": flag("location_data"),
        "biometric_data": flag("biometric_data"),
        "minor_data": flag("minor_data"),
        "images": False,
        "audio": False,
        "financial_data": flag("payment_data"),
        "employee_data": False,
        # Health or biometric data both mark the article_9_data field.
        "article_9_data": flag("health_data") or flag("biometric_data"),
    }
    purpose = {
        "marketing": flag("marketing"),
        "analytics": flag("tracking"),
        "profiling": flag("profiling"),
        "automation": flag("ai_usage"),
        "customer_support": False,
        "evaluation_scoring": flag("automated_decisions"),
        "decision_making": flag("automated_decisions"),
    }
    outputs = {
        "recommendations_to_users": flag("profiling"),
        "data_export": flag("cross_border_transfer"),
        "legal_effects": flag("automated_decisions"),
    }

    return {
        "data_types": data_types,
        "purpose": purpose,
        "automation": automation_level,
        "outputs": outputs,
        "hosting": {"region": hosting_region},
    }