"""
Intake Extractor — derives UCCA intake flags from DETECTED SERVICES,
not from website text content.

The actual data processing happens through APIs, scripts, and cookies —
NOT through visible text on the page. A news website reporting about
healthcare does NOT process health data.

Flags are derived deterministically from:
1. Which third-party services are embedded (Google Analytics → tracking)
2. Which payment providers are used (Stripe → payment_data)
3. Which CDN/fonts are loaded (e.g. Google Fonts → cross_border_transfer
   when the detected service is marked ``eu_adequate=False``)
"""

import logging
from typing import Optional

logger = logging.getLogger(__name__)

# Service category → intake flags mapping.
# This is the ONLY source of truth for what a service category implies.
SERVICE_TO_FLAGS: dict[str, dict[str, bool]] = {
    # Tracking & Analytics → personal_data + tracking
    "tracking": {
        "personal_data": True,
        "tracking": True,
    },
    # Marketing → personal_data + tracking + marketing + third_party_sharing
    "marketing": {
        "personal_data": True,
        "tracking": True,
        "marketing": True,
        "third_party_sharing": True,
    },
    # Heatmap/Session Recording → personal_data + tracking + profiling
    "heatmap": {
        "personal_data": True,
        "tracking": True,
        "profiling": True,
    },
    # Payment → personal_data + payment_data
    "payment": {
        "personal_data": True,
        "payment_data": True,
    },
    # Chatbot → personal_data + customer_data (user sends messages)
    "chatbot": {
        "personal_data": True,
        "customer_data": True,
    },
    # CRM → personal_data + customer_data + profiling
    "crm": {
        "personal_data": True,
        "customer_data": True,
        "profiling": True,
    },
    # CDN → personal_data only (the visitor's IP reaches the CDN).
    # cross_border_transfer is NOT implied by the category itself; it is
    # added per service when the entry carries eu_adequate=False.
    "cdn": {
        "personal_data": True,
    },
}

# Specific services with special flags, keyed by service id. These are applied
# in addition to the flags implied by the service's category.
SPECIFIC_SERVICE_FLAGS: dict[str, dict[str, bool]] = {
    "klarna": {"automated_decisions": True, "payment_data": True},
    "paypal": {"cross_border_transfer": True, "payment_data": True},
    "stripe": {"cross_border_transfer": True, "payment_data": True},
    "google_analytics": {"cross_border_transfer": True, "tracking": True},
    "facebook_pixel": {
        "cross_border_transfer": True,
        "marketing": True,
        "profiling": True,
    },
    "hotjar": {"profiling": True, "tracking": True},
    "ms_clarity": {"cross_border_transfer": True, "profiling": True},
    "tiktok_pixel": {"cross_border_transfer": True, "marketing": True},
    "intercom": {"cross_border_transfer": True, "ai_usage": True},
}

# Every flag the extractor reports, in output order. All start out False.
_FLAG_NAMES: tuple[str, ...] = (
    "personal_data",
    "customer_data",
    "payment_data",
    "location_data",
    "biometric_data",
    "minor_data",
    "health_data",
    "marketing",
    "profiling",
    "automated_decisions",
    "third_party_sharing",
    "cross_border_transfer",
    "tracking",
    "ai_usage",
)


def extract_intake_flags_from_services(
    detected_services: Optional[list[dict]],
) -> dict:
    """Derive intake flags from detected third-party services.

    The result is a pure function of the input: each service contributes the
    flags of its category (``SERVICE_TO_FLAGS``), the flags registered for its
    id (``SPECIFIC_SERVICE_FLAGS``), and — when ``eu_adequate`` is falsy —
    ``cross_border_transfer`` and ``third_party_sharing``.

    Args:
        detected_services: Service entries. Each entry is read through
            ``.get`` for the keys ``"category"`` (default ``"other"``),
            ``"id"`` (default ``""``) and ``"eu_adequate"`` (default
            ``True``). ``None`` is treated as "no services detected".
            Entries that do not support ``.get`` are skipped with a warning
            instead of aborting the whole extraction.

    Returns:
        A dict containing every name in ``_FLAG_NAMES`` mapped to a bool.
        ``personal_data`` is True as soon as at least one usable service
        entry was seen (the visitor's IP address is transmitted at minimum).
    """
    services = [] if detected_services is None else detected_services
    flags: dict[str, bool] = dict.fromkeys(_FLAG_NAMES, False)
    usable_entries = 0

    for svc in services:
        try:
            category = svc.get("category", "other")
            service_id = svc.get("id", "")
            eu_adequate = svc.get("eu_adequate", True)
        except AttributeError:
            # One malformed entry must not discard the flags of all others.
            logger.warning("Skipping malformed service entry: %r", svc)
            continue
        usable_entries += 1

        # Category-level flags first, then service-specific ones. The tables
        # are applied separately so that a False in one can never mask a True
        # coming from the other.
        for implied in (
            SERVICE_TO_FLAGS.get(category, {}),
            SPECIFIC_SERVICE_FLAGS.get(service_id, {}),
        ):
            flags.update(
                (name, True) for name, enabled in implied.items() if enabled
            )

        # Non-EU service → data leaves the EU and is shared with a third party
        if not eu_adequate:
            flags["cross_border_transfer"] = True
            flags["third_party_sharing"] = True

    # Any website with detected services processes personal data (IP at minimum)
    if usable_entries:
        flags["personal_data"] = True

    active = {name: state for name, state in flags.items() if state}
    logger.info("Intake flags from %d services: %s", len(services), active)
    return flags


# Keep backward compatibility
async def extract_intake_flags(text: str) -> dict:
    """DEPRECATED — use extract_intake_flags_from_services() instead.

    The previous implementation used an LLM to guess flags from text content.
    Text content does NOT represent actual data processing, so ``text`` is now
    ignored: the call only logs a deprecation warning.

    Args:
        text: Ignored; kept so existing callers keep working.

    Returns:
        The fixed dict ``{"personal_data": True, "tracking": False}``.
    """
    logger.warning(
        "extract_intake_flags(text) called — DEPRECATED. "
        "Use extract_intake_flags_from_services(detected_services) instead."
    )
    # Return minimal flags — website exists = personal_data (IP)
    return {"personal_data": True, "tracking": False}


def flags_to_ucca_intake(flags: dict) -> dict:
    """Convert extracted flags to UCCA intake format.

    Args:
        flags: Flag dict as produced by the extractors above. Missing keys
            are read as False.

    Returns:
        A nested dict with the sections ``data_types``, ``purpose``,
        ``automation``, ``outputs`` and ``hosting``. Fields that have no
        corresponding flag (``images``, ``audio``, ``employee_data``,
        ``customer_support``) are always False.
    """
    automated_decisions = flags.get("automated_decisions", False)
    ai_usage = flags.get("ai_usage", False)
    profiling = flags.get("profiling", False)
    biometric = flags.get("biometric_data", False)
    cross_border = flags.get("cross_border_transfer", False)

    # Automated decisions take precedence over plain AI usage.
    if automated_decisions:
        automation = "fully_automated"
    elif ai_usage:
        automation = "partially_automated"
    else:
        automation = "manual"

    return {
        "data_types": {
            "personal_data": flags.get("personal_data", False),
            "customer_data": flags.get("customer_data", False),
            "location_data": flags.get("location_data", False),
            "biometric_data": biometric,
            "minor_data": flags.get("minor_data", False),
            "images": False,
            "audio": False,
            "financial_data": flags.get("payment_data", False),
            "employee_data": False,
            # Health and biometric data both count as article_9_data here.
            "article_9_data": flags.get("health_data", False) or biometric,
        },
        "purpose": {
            "marketing": flags.get("marketing", False),
            "analytics": flags.get("tracking", False),
            "profiling": profiling,
            "automation": ai_usage,
            "customer_support": False,
            "evaluation_scoring": automated_decisions,
            "decision_making": automated_decisions,
        },
        "automation": automation,
        "outputs": {
            "recommendations_to_users": profiling,
            "data_export": cross_border,
            "legal_effects": automated_decisions,
        },
        "hosting": {
            "region": "non_eu" if cross_border else "eu",
        },
    }