Files
breakpilot-compliance/backend-compliance/compliance/services/cookie_function_classifier.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf/AGB als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

177 lines
7.5 KiB
Python

"""
Cookie function classifier — determines the functional purpose of each cookie.

Today each vendor has a single category (analytics/advertising/...).
However, a vendor often sets 10-50 different cookies, and not every cookie
of a marketing platform serves advertising — many handle session
management, language preference, scroll position, etc.

This module classifies each cookie by:
  - functional_role : what the cookie does technically (session_id,
                      csrf_token, ab_test, user_id, ad_id, ...)
  - data_collected  : which data it carries (visitor_id, page_view, click,
                      conversion_event, ...) — note: classify_cookie()
                      currently returns only the other two dimensions
  - blocking_impact : what happens when the cookie is blocked
                      (none, no_personalization, no_tracking, site_breaks)

This lets the vendor redundancy analyzer state precisely:
"Adobe Analytics sets 55 cookies, 12 of them for tracking, 8 for A/B tests
and 35 for internal performance. Matomo covers the 12 tracking + 8 A/B-test
cookies — 55 Adobe cookies become 20 Matomo cookies."
"""
from __future__ import annotations
import re
from typing import Iterable
# Pattern table: (regex, functional_role, blocking_impact).
# classify_cookie() runs re.search() with each regex against the
# lower-cased, stripped cookie name and returns the first hit, so ORDER
# MATTERS: more specific patterns must come before broader ones.
_PATTERNS: list[tuple[str, str, str]] = [
    # Session / authentication
    (r"^(jsessionid|phpsessid|sessionid|sid|connect\.sid)$", "session_id", "site_breaks"),
    # CSRF must precede the auth pattern: names such as "csrf_token" or
    # "xsrf-token" also contain "token" and would otherwise be reported as
    # auth_token.
    (r"^csrf|xsrf|antiforgery", "csrf_token", "site_breaks"),
    (r"sso|signon|auth|login|token|jwt|bearer", "auth_token", "site_breaks"),
    # Language / region setting
    (r"lang|locale|culture|region", "preference", "no_personalization"),
    # User preferences (theme, view, bookmark)
    (r"theme|dark|mode|view|sort|filter", "ui_preference", "no_personalization"),
    (r"bookmark|favorite|favorit", "user_data", "no_personalization"),
    # The consent cookie itself
    (r"consent|gdpr|tcf|euconsent", "consent_state", "site_breaks"),
    # Tracking IDs (most analytics).
    # "^_ga" already covers "_gat..."; "_gid" is anchored so that unrelated
    # names merely containing "gid"/"gat" (e.g. "navigation") do not match.
    (r"^_ga|^_gid|google_analytic", "tracking_id", "no_tracking"),
    (r"^_pk_|matomo|piwik", "tracking_id", "no_tracking"),
    (r"^s_|s\.cc|adobesite|aam", "tracking_id", "no_tracking"),  # Adobe
    (r"hjid|hjsession|hotjar", "session_recording", "no_tracking"),
    (r"_uetsid|_uetvid|microsoft", "tracking_id", "no_tracking"),
    # Visitor identification
    (r"visitor|uid|user_id|customer_id", "visitor_id", "no_personalization"),
    # A/B test / personalisation
    (r"ab_test|abtest|variant|experiment|target|target_qa", "ab_test", "no_personalization"),
    # NOTE(review): "adobe_target" is shadowed by "target" in the row above
    # and therefore always resolves to ab_test — confirm which is intended.
    (r"personalization|personalisation|adobe_target", "personalisation", "no_personalization"),
    # Advertising / retargeting.
    # "fr", "ide" and "nid" are complete cookie names; they are anchored so
    # that names like "sidebar" or "video" are not flagged as ad pixels.
    (r"fbp|fbc|fb_id|facebook|meta_pixel|^fr$", "ad_pixel", "no_tracking"),
    (r"adform|criteo|outbrain|taboola|tapad|adsrvr", "ad_pixel", "no_tracking"),
    # NOTE(review): "exchange_uid" is shadowed by the earlier visitor "uid"
    # pattern and resolves to visitor_id — confirm which is intended.
    (r"doubleclick|test_cookie|^(ide|nid)$|exchange_uid", "ad_pixel", "no_tracking"),
    (r"google_ad|gads|gcl", "ad_pixel", "no_tracking"),
    (r"^li_|linkedin|bcookie|bscookie", "ad_pixel", "no_tracking"),
    (r"pinterest|_pinterest_|_pin_unauth", "ad_pixel", "no_tracking"),
    # Affiliate / conversion
    (r"conversion|orderid|order_id|transaction|purchase", "conversion_event", "no_tracking"),
    (r"campaign|utm|source|medium|term", "campaign_attribution", "no_tracking"),
    # Scroll position / form helper
    (r"scroll|position|form_|form_state", "ui_state", "no_personalization"),
    # Load balancer / sticky sessions.
    # "awsalb"/"awselb" cover the AWS stickiness cookies (AWSALB, AWSALBCORS,
    # AWSELB, ...), which contain neither "alb-" nor "aws-alb".
    (r"affinity|sticky|lb_|alb-|aws-alb|awsalb|awselb", "load_balancer", "site_breaks"),
    # Chat / support
    (r"chat|widget|genesys|livechat", "chat_session", "no_personalization"),
    # Captcha
    (r"hcaptcha|recaptcha|cf_|cloudflare", "bot_protection", "site_breaks"),
]

# Display label for every functional_role that classify_cookie() can return.
_FUNCTIONAL_LABEL = {
    "session_id": "Sitzungs-ID",
    "auth_token": "Auth-Token",
    "csrf_token": "CSRF-Schutz",
    "preference": "Sprache / Region",
    "ui_preference": "UI-Praeferenz",
    "user_data": "Nutzer-Daten",
    "consent_state": "Consent-Speicher",
    "tracking_id": "Tracking-ID",
    "session_recording": "Session-Recording",
    "visitor_id": "Besucher-ID",
    "ab_test": "A/B-Test",
    "personalisation": "Personalisierung",
    "ad_pixel": "Werbe-Pixel",
    "conversion_event": "Konversions-Tracking",
    "campaign_attribution": "Kampagnen-Attribution",
    "ui_state": "UI-Zustand (ScrollPos etc.)",
    "load_balancer": "Load-Balancer",
    "chat_session": "Chat-Session",
    "bot_protection": "Bot-Schutz",
    "unknown": "Unbekannt",
}

# Which functional_roles overlap functionally — used by
# vendor_redundancy.analyze() to detect real consolidation opportunities
# instead of merely counting duplicate providers.
OVERLAPPING_ROLES = {
    "tracking_id": "tracking",
    "session_recording": "tracking",
    "ab_test": "personalisation",
    "personalisation": "personalisation",
    "ad_pixel": "advertising",
    "conversion_event": "advertising",
    "campaign_attribution": "advertising",
}


def classify_cookie(cookie_name: str) -> tuple[str, str]:
    """Return ``(functional_role, blocking_impact)`` for a cookie name.

    The name is lower-cased and stripped of surrounding whitespace, then
    tested against ``_PATTERNS`` in order; the first pattern that matches
    anywhere in the name (``re.search``) decides the result.

    Args:
        cookie_name: Raw cookie name. ``None`` or an empty string is treated
            as an empty name.

    Returns:
        The role and blocking impact of the first matching pattern, or
        ``("unknown", "no_tracking")`` when no pattern matches.
    """
    normalized = (cookie_name or "").lower().strip()
    for pattern, role, impact in _PATTERNS:
        if re.search(pattern, normalized):
            return role, impact
    return "unknown", "no_tracking"
def annotate_vendor_cookies(vendor: dict) -> dict:
    """Return a copy of *vendor* with per-cookie classification attached.

    Every entry of ``vendor["cookies"]`` is copied and extended with
    ``functional_role`` and ``blocking_impact`` (from ``classify_cookie``)
    and, when ``lookup_cookie`` returns something truthy for the cookie
    name, a ``knowledge`` key holding that value.

    The returned dict contains all keys of *vendor* plus:
        cookies: the enriched cookie entries.
        role_distribution: number of cookies per functional role.
        role_labels: display label per occurring role (falls back to the
            role key itself when no label is defined).
        compliance_risk: result of ``summarize_compliance_risk`` for the
            enriched record.

    A missing or empty ``cookies`` value yields empty lists/dicts.
    """
    # Imported at call time rather than at module import.
    from compliance.services.cookie_knowledge import (
        lookup_cookie,
        summarize_compliance_risk,
    )

    enriched_cookies = []
    distribution: dict[str, int] = {}
    for cookie in vendor.get("cookies") or []:
        cookie_name = cookie.get("name", "")
        functional_role, impact = classify_cookie(cookie_name)
        kb_entry = lookup_cookie(cookie_name)

        record = dict(cookie)
        record["functional_role"] = functional_role
        record["blocking_impact"] = impact
        if kb_entry:
            record["knowledge"] = kb_entry
        enriched_cookies.append(record)

        distribution[functional_role] = distribution.get(functional_role, 0) + 1

    result = dict(vendor)
    result["cookies"] = enriched_cookies
    result["role_distribution"] = distribution
    result["role_labels"] = {
        role: _FUNCTIONAL_LABEL.get(role, role) for role in distribution
    }
    # The risk summary is computed on the already enriched record.
    result["compliance_risk"] = summarize_compliance_risk(result)
    return result
def aggregate_cookie_purposes(vendors: Iterable[dict]) -> dict:
    """Aggregate the functional-role distribution across all given vendors.

    A vendor's ``role_distribution`` is used when present and non-empty;
    otherwise, if the vendor has cookies, it is annotated on the fly via
    ``annotate_vendor_cookies``. Vendors with neither contribute nothing.

    Records that share the same ``name`` (including records without a name,
    which are grouped under ``""``) are merged, so a vendor is listed for a
    role whenever any of its records contains that role.

    Args:
        vendors: Vendor records (dicts); iterated exactly once.

    Returns:
        A dict with three keys:
            total_per_role: cookie count per functional role.
            labels: display label per role (falls back to the role key).
            vendors_per_role: per role, the names of the vendors that have
                at least one cookie with that role.
    """
    total: dict[str, int] = {}
    by_vendor: dict[str, dict[str, int]] = {}
    for vendor in vendors:
        roles = vendor.get("role_distribution") or {}
        if not roles and vendor.get("cookies"):
            vendor = annotate_vendor_cookies(vendor)
            roles = vendor["role_distribution"]
        # Merge into the per-name counts instead of overwriting them, so
        # that duplicate or unnamed vendor records stay consistent with
        # the totals.
        vendor_roles = by_vendor.setdefault(vendor.get("name", ""), {})
        for role, count in roles.items():
            total[role] = total.get(role, 0) + count
            vendor_roles[role] = vendor_roles.get(role, 0) + count
    return {
        "total_per_role": total,
        "labels": {role: _FUNCTIONAL_LABEL.get(role, role) for role in total},
        "vendors_per_role": {
            role: [
                name
                for name, distribution in by_vendor.items()
                if distribution.get(role, 0) > 0
            ]
            for role in total
        },
    }