""" Recipient-type classifier for vendor records (Art. 30(1)(d) DSGVO). Tags each extracted vendor entry with one of the canonical RecipientCategoryType values used by the VVT module: - INTERNAL — owner's own department / own system (BMW AG processing for itself, e.g. 'BMW AG — Form Validation') - GROUP_COMPANY — parent/subsidiary/sister of the owner (BMW Bank, BMW Motorrad, BMW Financial Services) - PROCESSOR — external Auftragsverarbeiter under AVV (Adobe, Akamai, AWS, Salesforce — they process on behalf) - CONTROLLER — independent / joint controller (Meta Pixel, Google YouTube — they run their own profiles) - AUTHORITY — government bodies (rare in cookie contexts) - OTHER — fallback Heuristic only — does not query Vault or external sources. A site-owner name is derived from the user-submitted URL (e.g. bmw.de -> 'BMW AG' or 'BMW'). Classification compares the vendor name to that owner name. """ from __future__ import annotations import re from urllib.parse import urlparse # Known tracking/advertising platforms that typically act as INDEPENDENT # or JOINT CONTROLLERS rather than processors. They build their own user # profiles across many sites; the site owner has limited control over # what they do with the data once collected. _JOINT_CONTROLLER_HINTS = { "meta", # Meta Pixel (Facebook/Instagram) "facebook", "instagram", "google adverti", # Google Advertising "google ads", "youtube", "doubleclick", "linkedin insight", "linkedin", "tiktok", "pinterest", "twitter", "x.com", "snapchat", "taboola", "outbrain", "criteo", "amazon adverti", # Amazon Advertising (vs AWS) "microsoft adverti", "yandex", "reddit", "quora", "spotify", } def owner_from_url(url: str) -> str: """Derive a short owner name from a URL. bmw.de -> 'BMW', mercedes-benz.de -> 'Mercedes-Benz', deutsche-bahn.de -> 'Deutsche-Bahn'. Used to detect the INTERNAL case when a vendor record's provider name starts with or contains this token. """ if not url or "://" not in url: return "" netloc = urlparse(url).netloc.lower() if netloc.startswith("www."): netloc = netloc[4:] parts = netloc.split(".") if len(parts) < 2: return "" sld = parts[-2] if len(parts) <= 2 else parts[-2] # bmw # Acronym (<=4 chars, no hyphen) -> uppercase (BMW, ARD, ZDF) if len(sld) <= 4 and "-" not in sld: return sld.upper() return "-".join(p.capitalize() for p in sld.split("-")) def classify( vendor_name: str, category: str, owner_name: str, ) -> str: """Return one of INTERNAL / GROUP_COMPANY / PROCESSOR / CONTROLLER / OTHER. Args: vendor_name: the provider/processing name as it appears in the cookie policy (e.g. 'BMW AG — Form Validation' or 'Adobe Systems Software Ireland Limited — Adobe Analytics'). category: canonical category ('marketing', 'necessary', 'statistics', 'functional'). Used to distinguish controller vs processor for ad platforms. owner_name: short token derived from the site URL ('BMW', 'Mercedes-Benz'). Empty string disables INTERNAL detection. """ name = (vendor_name or "").strip() if not name: return "OTHER" lower = name.lower() # 1. INTERNAL — owner processing for itself. # Strict: provider must BE the owner's main legal entity: # ' AG', ' SE', ' GmbH', '' alone, or # ' AG — ' / ' SE — '. if owner_name: ow = owner_name.lower() first_token = lower.split(" — ", 1)[0].strip() # text before ' — ' if (first_token == ow or first_token == f"{ow} ag" or first_token == f"{ow} se" or first_token == f"{ow} gmbh" or first_token == f"{ow} ag & co. kg"): return "INTERNAL" # 2. GROUP_COMPANY — provider is in the owner's brand family but a # different legal entity (BMW Bank GmbH, BMW Motorrad GmbH, # BMW Financial Services). if owner_name: ow = owner_name.lower() first_token = lower.split(" — ", 1)[0].strip() if first_token.startswith(f"{ow} ") and first_token != f"{ow} ag": return "GROUP_COMPANY" # 3. CONTROLLER — known tracking/ad platforms if any(hint in lower for hint in _JOINT_CONTROLLER_HINTS): return "CONTROLLER" # 4. PROCESSOR — everything else with a corporate name is most likely # an Auftragsverarbeiter (hosting/CDN/analytics/chat/captcha/CRM) if any(suffix in lower for suffix in ( "gmbh", "ag ", " ag", "ag—", "ag ", "se ", "kg", "ohg", "inc.", "inc ", "ltd", "limited", "llc", "corp", "b.v.", "a/s", "s.a.", "s.l.", "s.r.l", "oy ", "ab ", "as ", )): return "PROCESSOR" return "OTHER" # Section ordering + display labels for the VVT email table RECIPIENT_TYPE_SECTIONS = [ ("INTERNAL", "Eigene Verarbeitungstaetigkeiten — fuer das VVT (Art. 30 DSGVO)"), ("GROUP_COMPANY", "Konzernunternehmen (Mutter/Tochter) — VVT + ggf. JC/AVV pruefen"), ("PROCESSOR", "Auftragsverarbeiter — AVV erforderlich (Art. 28 DSGVO)"), ("CONTROLLER", "Eigenverantwortliche Dritte / Joint Controller — Vereinbarung pruefen (Art. 26 DSGVO)"), ("AUTHORITY", "Behoerden"), ("OTHER", "Sonstige Empfaenger"), ]