""" Cookie-Function-Classifier — pro Cookie eine inhaltliche Funktionsbestimmung. Heute haben wir pro Vendor eine Kategorie (analytics/advertising/...). Aber: ein Vendor hat oft 10-50 verschiedene Cookies. Nicht jeder Cookie einer Marketing-Plattform macht Werbung — viele sind Session-Mgmt, Sprachpraeferenz, ScrollPosition etc. Dieses Modul klassifiziert pro Cookie: - functional_role : was der Cookie technisch tut (session_id, csrf_token, ab_test, user_id, ad_id, …) - data_collected : welche Daten dahinter stehen (visitor_id, page_view, click, conversion_event, …) - blocking_impact : was passiert wenn der Cookie geblockt wird (none, no_personalization, no_tracking, site_breaks) Damit kann der Vendor-Redundanz-Analyzer praezise sagen: "Adobe Analytics setzt 55 Cookies, davon 12 fuer Tracking, 8 fuer A/B-Test und 35 fuer interne Performance. Matomo deckt 12 Tracking + 8 A/B Tests ab — 55 Adobe-Cookies werden zu 20 Matomo-Cookies." """ from __future__ import annotations import re from typing import Iterable # Pattern → (functional_role, blocking_impact) # Reihenfolge entscheidet: spezifischer zuerst. 
# Ordered classification rules: (regex, functional_role, blocking_impact).
# Rules are tried top to bottom against the lower-cased, stripped cookie name
# and the first hit wins, so narrow / vendor-specific rules must come before
# generic keyword rules.
_PATTERNS: list[tuple[str, str, str]] = [
    # Session / authentication (exact names only).
    (r"^(jsessionid|phpsessid|sessionid|sid|connect\.sid|asp\.net_sessionid)$",
     "session_id", "site_breaks"),
    # CSRF has to precede the auth rule: names such as "csrftoken" or
    # "xsrf-token" contain "token" and would otherwise be reported as auth.
    (r"csrf|xsrf|antiforgery|requestverificationtoken",
     "csrf_token", "site_breaks"),
    # "(?<!un)auth" keeps "_pin_unauth" (handled by the Pinterest rule
    # further down) from being mistaken for an auth token.
    (r"sso|signon|(?<!un)auth|login|token|jwt|bearer",
     "auth_token", "site_breaks"),
    # Language / region
    (r"lang|locale|culture|region", "preference", "no_personalization"),
    # User preferences (theme, view, bookmark)
    (r"theme|dark|mode|view|sort|filter", "ui_preference", "no_personalization"),
    (r"bookmark|favorite|favorit", "user_data", "no_personalization"),
    # The consent cookie itself
    (r"consent|gdpr|tcf|euconsent", "consent_state", "site_breaks"),
    # Tracking IDs (most analytics). "^_ga" also covers "_gat..."; "_gid" is
    # anchored so that arbitrary names containing "gid"/"gat" are not caught.
    (r"^_ga|^_gid|google_analytic", "tracking_id", "no_tracking"),
    (r"^_pk_|matomo|piwik", "tracking_id", "no_tracking"),
    (r"^s_|s\.cc|adobesite|aam", "tracking_id", "no_tracking"),  # Adobe
    (r"hjid|hjsession|hotjar", "session_recording", "no_tracking"),
    (r"_uetsid|_uetvid|microsoft", "tracking_id", "no_tracking"),
    # Personalisation is checked before the A/B rule because the generic
    # "target" keyword there would otherwise shadow "adobe_target".
    (r"personalization|personalisation|adobe_target",
     "personalisation", "no_personalization"),
    # A/B test
    (r"ab_test|abtest|variant|experiment|target|target_qa",
     "ab_test", "no_personalization"),
    # Advertising / retargeting. The short names "fr", "ide" and "nid" are
    # matched as whole names only; as substrings they hit unrelated cookies
    # ("sidebar", "video", "..._sessionid").
    (r"fbp|fbc|fb_id|facebook|meta_pixel|^fr$", "ad_pixel", "no_tracking"),
    (r"adform|criteo|outbrain|taboola|tapad|adsrvr", "ad_pixel", "no_tracking"),
    (r"doubleclick|test_cookie|^ide$|^nid$|exchange_uid",
     "ad_pixel", "no_tracking"),
    (r"google_ad|gads|gcl", "ad_pixel", "no_tracking"),
    (r"^li_|linkedin|bcookie|bscookie", "ad_pixel", "no_tracking"),
    (r"pinterest|_pinterest_|_pin_unauth", "ad_pixel", "no_tracking"),
    # Generic visitor identification. Placed after the vendor-specific ad
    # rules so that the bare "uid" keyword does not shadow "exchange_uid".
    (r"visitor|uid|user_id|customer_id", "visitor_id", "no_personalization"),
    # Affiliate / conversion
    (r"conversion|orderid|order_id|transaction|purchase",
     "conversion_event", "no_tracking"),
    (r"campaign|utm|source|medium|term", "campaign_attribution", "no_tracking"),
    # Scroll position / form helper
    (r"scroll|position|form_|form_state", "ui_state", "no_personalization"),
    # Load balancer / sticky sessions. "aws-?alb" and "awselb" cover the
    # hyphen-less AWS cookie names (AWSALB, AWSALBCORS, AWSELB, ...).
    (r"affinity|sticky|lb_|alb-|aws-?alb|awselb", "load_balancer", "site_breaks"),
    # Chat / support
    (r"chat|widget|genesys|livechat", "chat_session", "no_personalization"),
    # Captcha / bot protection
    (r"hcaptcha|recaptcha|cf_|cloudflare", "bot_protection", "site_breaks"),
]

# Same rules as _PATTERNS, compiled once at import time. _PATTERNS is kept as
# plain strings in case other code reads it.
_COMPILED_PATTERNS: list[tuple[re.Pattern[str], str, str]] = [
    (re.compile(pattern), role, impact) for pattern, role, impact in _PATTERNS
]

# Display label (German UI text) for every functional role.
_FUNCTIONAL_LABEL = {
    "session_id": "Sitzungs-ID",
    "auth_token": "Auth-Token",
    "csrf_token": "CSRF-Schutz",
    "preference": "Sprache / Region",
    "ui_preference": "UI-Praeferenz",
    "user_data": "Nutzer-Daten",
    "consent_state": "Consent-Speicher",
    "tracking_id": "Tracking-ID",
    "session_recording": "Session-Recording",
    "visitor_id": "Besucher-ID",
    "ab_test": "A/B-Test",
    "personalisation": "Personalisierung",
    "ad_pixel": "Werbe-Pixel",
    "conversion_event": "Konversions-Tracking",
    "campaign_attribution": "Kampagnen-Attribution",
    "ui_state": "UI-Zustand (ScrollPos etc.)",
    "load_balancer": "Load-Balancer",
    "chat_session": "Chat-Session",
    "bot_protection": "Bot-Schutz",
    "unknown": "Unbekannt",
}

# Which functional roles overlap functionally. Used by
# vendor_redundancy.analyze() to detect real consolidation opportunities
# instead of merely counting duplicate providers.
OVERLAPPING_ROLES = {
    "tracking_id": "tracking",
    "session_recording": "tracking",
    "ab_test": "personalisation",
    "personalisation": "personalisation",
    "ad_pixel": "advertising",
    "conversion_event": "advertising",
    "campaign_attribution": "advertising",
}


def classify_cookie(cookie_name: str) -> tuple[str, str]:
    """Return ``(functional_role, blocking_impact)`` for a cookie name.

    The name is lower-cased and stripped, then tested against the ordered
    rule table; the first matching rule decides. ``None`` or an empty name
    is treated as an empty string.

    Returns:
        The role/impact pair of the first matching rule, or
        ``("unknown", "no_tracking")`` when no rule matches.
    """
    name = (cookie_name or "").lower().strip()
    for regex, role, impact in _COMPILED_PATTERNS:
        if regex.search(name):
            return role, impact
    return "unknown", "no_tracking"


def annotate_vendor_cookies(vendor: dict) -> dict:
    """Enrich a vendor record with functional_role + KB knowledge per cookie.

    Args:
        vendor: Vendor record; its ``"cookies"`` entry (a list of dicts with
            a ``"name"`` key) is read, a missing/empty entry means no cookies.

    Returns:
        A shallow copy of ``vendor`` in which ``"cookies"`` holds the
        annotated cookie dicts, plus ``"role_distribution"`` (count per
        role), ``"role_labels"`` (display label per role) and
        ``"compliance_risk"`` (result of ``summarize_compliance_risk``).
        The input record and its cookie dicts are not modified.
    """
    # Imported at call time rather than module level — presumably to avoid a
    # circular import with the knowledge-base module; verify before moving.
    from compliance.services.cookie_knowledge import (
        lookup_cookie,
        summarize_compliance_risk,
    )

    annotated: list[dict] = []
    role_counts: dict[str, int] = {}
    for cookie in vendor.get("cookies") or []:
        name = cookie.get("name", "")
        role, impact = classify_cookie(name)
        knowledge = lookup_cookie(name)
        entry = {**cookie, "functional_role": role, "blocking_impact": impact}
        if knowledge:
            # Only attach knowledge-base data when the lookup found something.
            entry["knowledge"] = knowledge
        annotated.append(entry)
        role_counts[role] = role_counts.get(role, 0) + 1

    out = {
        **vendor,
        "cookies": annotated,
        "role_distribution": role_counts,
        "role_labels": {r: _FUNCTIONAL_LABEL.get(r, r) for r in role_counts},
    }
    # The risk summary receives the already annotated record.
    out["compliance_risk"] = summarize_compliance_risk(out)
    return out


def aggregate_cookie_purposes(vendors: Iterable[dict]) -> dict:
    """Tenant-wide distribution: which functional roles occur how often?

    Vendors that carry no ``"role_distribution"`` but do have ``"cookies"``
    are annotated on the fly via ``annotate_vendor_cookies``. Records that
    share the same ``"name"`` (or lack one) are merged, so the per-role
    vendor lists stay consistent with the summed totals.

    Returns:
        A dict with ``"total_per_role"`` (summed counts), ``"labels"``
        (display label per role) and ``"vendors_per_role"`` (vendor names
        having at least one cookie of that role, in first-seen order).
    """
    total: dict[str, int] = {}
    by_vendor: dict[str, dict[str, int]] = {}
    for vendor in vendors:
        roles = vendor.get("role_distribution") or {}
        if not roles and vendor.get("cookies"):
            vendor = annotate_vendor_cookies(vendor)
            roles = vendor["role_distribution"]
        # Accumulate instead of assigning: a later record with the same name
        # must not wipe out the roles contributed by an earlier one.
        merged = by_vendor.setdefault(vendor.get("name", ""), {})
        for role, count in roles.items():
            total[role] = total.get(role, 0) + count
            merged[role] = merged.get(role, 0) + count
    return {
        "total_per_role": total,
        "labels": {r: _FUNCTIONAL_LABEL.get(r, r) for r in total},
        "vendors_per_role": {
            r: [name for name, dist in by_vendor.items() if dist.get(r, 0) > 0]
            for r in total
        },
    }