""" Vendor-Cost-Estimator — leitet pro Vendor ein Pricing-Tier aus Cookie-Signalen ab und gibt eine intensitaets-basierte Jahres- kostenschaetzung zurueck. Cookie-Signale die wir auswerten: - Anzahl Cookies pro Vendor (proxy fuer Modul-Tiefe) - Premium-Feature-Cookies (z.B. 's_target_qa', '_ab_test' → Enterprise-Add-on) - Edge/Region-Cookies (Multi-Region → Premier-Tier CDN) - Cookie-Persistenz (Multi-Jahr → Heavy-Tracking-Lizenz) Plus business_profile fuer Company-Tier-Inferenz. Output pro Vendor: - inferred_tier: 'starter' | 'professional' | 'enterprise' | 'premier' - tier_signals : Liste der Indikatoren die zum Tier gefuehrt haben - cost_year_eur_range: (low, high) basierend auf Tier × Vendor-Pricing - confidence: 'low' | 'medium' | 'high' Dieses Modul ergaenzt vendor_redundancy.py — die einfachen low/high Pauschalen dort werden hier durch dynamische, signal-basierte Werte ersetzt. """ from __future__ import annotations import logging import re from typing import Iterable logger = logging.getLogger(__name__) # ─── Premium-Feature-Cookies: Indikator fuer Enterprise-Add-ons ───── # # Wenn ein Vendor diese Cookies setzt, ist der Kunde mit hoher # Wahrscheinlichkeit auf einem Enterprise-Plan. 
# Premium-feature cookies: when a vendor sets one of these, the customer is
# very likely on an enterprise plan / has bought the named add-on.
_PREMIUM_FEATURE_PATTERNS: list[tuple[str, str, str]] = [
    # (regex, vendor_key, premium_feature_label)
    (r"^s_target_qa$", "adobe analytics", "Adobe Target Add-on"),
    (r"adobe.*target", "adobe target", "Personalization Enterprise"),
    (r"^aam_uuid", "adobe analytics", "Audience Manager Enterprise"),
    (r"^s_ecid", "adobe analytics", "Experience Cloud ID Service"),
    (r"^_pcid_", "adobe analytics", "People-Based Destinations"),
    (r"^_gat_gtag_UA", "google analytics", "GA360 Multi-Tracker"),
    (r"^_ga_[A-Z0-9]+_[A-Z0-9]+", "google analytics", "GA4 Enterprise Stream"),
    (r"^_uetmsdns", "microsoft advertising", "Custom Conversion Tracking"),
    (r"^_fbp.*test", "meta pixel", "Conversions API Premium"),
    (r"^_pin_unauth_premium", "pinterest", "Pinterest Premium-API"),
    (r"^afm", "adform", "Affinity-Module"),
    (r"^cto_dna", "criteo", "Dynamic Retargeting Premium"),
    # CDN / infra premium
    (r"^aws-alb-[a-z0-9]+", "amazon web services", "ALB + Multi-Region"),
    (r"^aws-waf", "amazon web services", "WAF Enterprise"),
    (r"^cf_clearance", "cloudflare", "Bot-Management Pro"),
    (r"^akm_[a-z]+", "akamai", "Adaptive Media Delivery Enterprise"),
    # Salesforce Customer-360
    (r"^bid_n_", "salesforce", "Marketing Cloud Personalization"),
    (r"^_cs_", "salesforce", "CDP Premium"),
]

# Compiled once at import time. IGNORECASE is required: cookie names are
# lower-cased before matching, while some patterns above contain upper-case
# characters (e.g. "_gat_gtag_UA") and could otherwise never match.
_PREMIUM_FEATURE_REGEXES: list[tuple[re.Pattern[str], str, str]] = [
    (re.compile(_pattern, re.IGNORECASE), _key, _label)
    for _pattern, _key, _label in _PREMIUM_FEATURE_PATTERNS
]

# Pricing tiers in ascending order; the index is used for "+1 tier" arithmetic.
_TIER_ORDER: tuple[str, ...] = ("starter", "professional", "enterprise", "premier")

# Business-profile features that place a company in the top tier.
_PREMIER_PROFILE_FEATURES = frozenset({
    "multi_country", "konzern", "enterprise", "international",
    "automotive", "banking", "luxury", "premium",
})

# Tier pricing per vendor (yearly, EUR) as (low, high).
# Four tiers: starter, professional, enterprise, premier.
_TIER_PRICING: dict[str, dict[str, tuple[int, int]]] = {
    "adobe analytics": {
        "starter": (10_000, 30_000),
        "professional": (60_000, 150_000),
        "enterprise": (200_000, 500_000),
        "premier": (500_000, 900_000),
    },
    "adobe target": {
        "starter": (8_000, 25_000),
        "professional": (40_000, 100_000),
        "enterprise": (120_000, 300_000),
        "premier": (300_000, 600_000),
    },
    "adobe campaign": {
        "starter": (10_000, 30_000),
        "professional": (40_000, 100_000),
        "enterprise": (120_000, 280_000),
        "premier": (280_000, 500_000),
    },
    "google analytics": {
        "starter": (0, 0),  # GA4 free
        "professional": (0, 0),
        "enterprise": (80_000, 150_000),  # GA360
        "premier": (150_000, 300_000),
    },
    "matomo": {
        "starter": (0, 3_000),  # On-prem free / Cloud Starter
        "professional": (6_000, 20_000),
        "enterprise": (20_000, 80_000),
        "premier": (60_000, 150_000),
    },
    "content square": {
        "starter": (12_000, 40_000),
        "professional": (60_000, 150_000),
        "enterprise": (150_000, 350_000),
        "premier": (350_000, 700_000),
    },
    "contentsquare": {
        "starter": (12_000, 40_000),
        "professional": (60_000, 150_000),
        "enterprise": (150_000, 350_000),
        "premier": (350_000, 700_000),
    },
    "dynatrace": {
        "starter": (5_000, 15_000),
        "professional": (30_000, 80_000),
        "enterprise": (100_000, 300_000),
        "premier": (300_000, 800_000),
    },
    "qualtrics": {
        "starter": (6_000, 20_000),
        "professional": (30_000, 80_000),
        "enterprise": (80_000, 200_000),
        "premier": (200_000, 500_000),
    },
    # Advertising / retargeting (licence + self-service; media spend is SEPARATE)
    "criteo": {
        "starter": (6_000, 20_000),
        "professional": (30_000, 80_000),
        "enterprise": (80_000, 250_000),
        "premier": (250_000, 600_000),
    },
    "adform": {
        "starter": (12_000, 40_000),
        "professional": (60_000, 150_000),
        "enterprise": (150_000, 400_000),
        "premier": (400_000, 800_000),
    },
    "outbrain": {
        "starter": (6_000, 20_000),
        "professional": (30_000, 80_000),
        "enterprise": (80_000, 200_000),
        "premier": (200_000, 500_000),
    },
    "taboola": {
        "starter": (6_000, 20_000),
        "professional": (30_000, 80_000),
        "enterprise": (80_000, 200_000),
        "premier": (200_000, 500_000),
    },
    "teads": {
        "starter": (6_000, 18_000),
        "professional": (20_000, 60_000),
        "enterprise": (60_000, 150_000),
        "premier": (150_000, 350_000),
    },
    "pinterest": {
        "starter": (3_000, 15_000),
        "professional": (15_000, 50_000),
        "enterprise": (50_000, 150_000),
        "premier": (150_000, 400_000),
    },
    "linkedin insight": {
        "starter": (3_000, 12_000),
        "professional": (12_000, 40_000),
        "enterprise": (40_000, 120_000),
        "premier": (120_000, 300_000),
    },
    # CDN / cloud
    "akamai": {
        "starter": (20_000, 60_000),
        "professional": (80_000, 200_000),
        "enterprise": (200_000, 500_000),
        "premier": (500_000, 1_500_000),
    },
    "amazon web services": {
        "starter": (12_000, 60_000),
        "professional": (60_000, 300_000),
        "enterprise": (300_000, 1_500_000),
        "premier": (1_500_000, 8_000_000),
    },
    "baqend": {
        "starter": (3_000, 12_000),
        "professional": (12_000, 40_000),
        "enterprise": (40_000, 120_000),
        "premier": (120_000, 300_000),
    },
    "speedkit": {
        "starter": (3_000, 12_000),
        "professional": (12_000, 40_000),
        "enterprise": (40_000, 120_000),
        "premier": (120_000, 300_000),
    },
    "speedcurve": {
        "starter": (1_200, 4_800),
        "professional": (6_000, 18_000),
        "enterprise": (18_000, 60_000),
        "premier": (60_000, 120_000),
    },
    # CRM / marketing
    "salesforce": {
        "starter": (20_000, 60_000),
        "professional": (80_000, 250_000),
        "enterprise": (250_000, 800_000),
        "premier": (800_000, 2_500_000),
    },
    "genesys": {
        "starter": (24_000, 80_000),
        "professional": (80_000, 250_000),
        "enterprise": (250_000, 800_000),
        "premier": (800_000, 2_000_000),
    },
    # Captcha
    "hcaptcha": {
        "starter": (0, 2_400),
        "professional": (2_400, 12_000),
        "enterprise": (12_000, 40_000),
        "premier": (40_000, 100_000),
    },
    # Lead tracking
    "salesviewer": {
        "starter": (1_200, 3_600),
        "professional": (3_600, 12_000),
        "enterprise": (12_000, 40_000),
        "premier": (40_000, 100_000),
    },
}

# Expiry parsing: (regex, units per year), tried from the largest unit down.
# The number may carry a decimal part written with "." or ",".
_EXPIRY_NUMBER = r"(\d+(?:[.,]\d+)?)\s*"
_EXPIRY_UNITS: tuple[tuple[re.Pattern[str], float], ...] = (
    (re.compile(_EXPIRY_NUMBER + r"(?:jahr|year)"), 1.0),
    (re.compile(_EXPIRY_NUMBER + r"(?:monat|month)"), 12.0),
    (re.compile(_EXPIRY_NUMBER + r"(?:woche|week)"), 52.0),
    (re.compile(_EXPIRY_NUMBER + r"(?:tag|day)"), 365.0),
)


def _vendor_key(vendor_name: str) -> str | None:
    """Map a vendor name to a known pricing-table key.

    Returns the first key of ``_TIER_PRICING`` (in table order) that occurs
    as a case-insensitive substring of ``vendor_name``, or ``None`` when no
    key matches or the name is empty/``None``.
    """
    name = (vendor_name or "").lower()
    return next((key for key in _TIER_PRICING if key in name), None)


def infer_company_tier(business_profile: dict | None) -> str:
    """Derive a coarse company tier from the business profile.

    Used as the baseline when vendor-specific signals are weak.

    Args:
        business_profile: Optional dict; only the keys ``"features"``
            (iterable of strings) and ``"type"`` are read.

    Returns:
        ``"premier"`` if any premier-only feature is present, ``"enterprise"``
        for shops / configurators / B2C sites, otherwise ``"professional"``
        (also the result for an empty or missing profile).
    """
    if not business_profile:
        return "professional"

    features = {str(f).lower() for f in (business_profile.get("features") or [])}
    btype = str(business_profile.get("type") or "").lower()

    # Heavy enterprise-only signals.
    if features & _PREMIER_PROFILE_FEATURES:
        return "premier"
    # Large, but possibly single-country.
    if "shop" in features or "konfigurator" in features or btype == "b2c":
        return "enterprise"
    return "professional"


def infer_vendor_tier(vendor: dict, company_tier: str) -> tuple[str, list[str]]:
    """Infer the pricing tier for one vendor from its cookie footprint.

    The tier starts at ``company_tier`` (falling back to "professional" for
    an unknown value) and is raised, capped at "premier", by:

    - 60 or more cookies                      -> +1 tier
    - each matching premium-feature pattern   -> +1 tier

    The following are recorded as signals only (they influence the
    confidence computed by ``estimate_vendor_cost`` but not the tier):

    - 30 to 59 cookies
    - at least 60% third-party cookies with 10 or more cookies
    - three or more cookies with an expiry of two years or longer

    NOTE(review): an earlier docstring described tier bumps for the
    signal-only items as well; confirm which behaviour is intended.

    Args:
        vendor: Dict with optional ``"name"`` and ``"cookies"`` (list of
            dicts with optional ``"name"``, ``"is_third_party"``, ``"expiry"``).
        company_tier: Baseline tier name.

    Returns:
        ``(tier, signals)``: the inferred tier name and the human-readable
        indicators that were detected.
    """
    cookies = vendor.get("cookies") or []
    n_cookies = len(cookies)
    # ``or ""`` guards against cookies whose name is explicitly None.
    cookie_names = [str(c.get("name") or "").lower() for c in cookies]
    vendor_name = str(vendor.get("name") or "").lower()

    signals: list[str] = []
    top = len(_TIER_ORDER) - 1
    idx = _TIER_ORDER.index(company_tier) if company_tier in _TIER_ORDER else 1

    if n_cookies >= 60:
        idx = min(top, idx + 1)
        signals.append(f"{n_cookies} Cookies (sehr hohe Modul-Tiefe)")
    elif n_cookies >= 30:
        signals.append(f"{n_cookies} Cookies (hohe Modul-Tiefe)")

    # Premium-feature detection. A pattern is skipped when the vendor maps to
    # a different pricing key and the pattern's vendor is not part of the name.
    # NOTE(review): for vendors without a pricing key every pattern is tried.
    vk = _vendor_key(vendor_name)
    for regex, expected_key, feature_label in _PREMIUM_FEATURE_REGEXES:
        if vk and vk != expected_key and expected_key not in vendor_name:
            continue
        if any(regex.search(name) for name in cookie_names):
            idx = min(top, idx + 1)
            signals.append(f"Premium-Feature-Cookie: {feature_label}")

    # Heavy third-party tracking.
    third_party = sum(1 for c in cookies if c.get("is_third_party"))
    third_party_ratio = third_party / max(n_cookies, 1)
    if third_party_ratio >= 0.6 and n_cookies >= 10:
        signals.append(
            f"{int(third_party_ratio * 100)}% Drittanbieter-Cookies — Tracking-Heavy"
        )

    # Long-lived cookies.
    long_lived = sum(1 for c in cookies if _expiry_years(c.get("expiry", "")) >= 2)
    if long_lived >= 3:
        signals.append(f"{long_lived} Cookies mit ≥2 Jahre Speicherdauer")

    return _TIER_ORDER[idx], signals


def _expiry_years(expiry_str: str) -> float:
    """Roughly convert a textual cookie lifetime into years.

    Examples: '2 Jahre' -> 2.0, '24 Monate' -> 2.0, '1.5 years' -> 1.5,
    '104 weeks' -> 2.0, '90 Tage' -> ~0.25. Only the largest unit present is
    used. Unparseable, empty or ``None`` input yields 0.0.
    """
    text = str(expiry_str or "").lower()
    for regex, per_year in _EXPIRY_UNITS:
        match = regex.search(text)
        if match:
            return float(match.group(1).replace(",", ".")) / per_year
    return 0.0


def estimate_vendor_cost(vendor: dict, business_profile: dict | None = None) -> dict:
    """Return the cost estimate for one vendor, including tier and signals.

    Args:
        vendor: Vendor dict (``"name"``, ``"cookies"``).
        business_profile: Optional profile used for the company-tier baseline.

    Returns:
        Dict with ``vendor``, ``matched_pricing_key``, ``inferred_tier``,
        ``tier_signals``, ``company_tier_baseline``, ``cost_year_eur_range``
        and ``confidence``. When the vendor has no pricing entry, the range is
        ``(0, 0)``, confidence is ``"none"`` and a ``note`` key is added.
    """
    name = vendor.get("name", "")
    vk = _vendor_key(name)
    company_tier = infer_company_tier(business_profile)

    if not vk:
        return {
            "vendor": name,
            "matched_pricing_key": None,
            "inferred_tier": None,
            "tier_signals": [],
            "company_tier_baseline": company_tier,
            "cost_year_eur_range": (0, 0),
            "confidence": "none",
            "note": "Kein Pricing-Eintrag fuer diesen Anbieter — Saving-Schaetzung uebergangen.",
        }

    tier, signals = infer_vendor_tier(vendor, company_tier)
    pricing = _TIER_PRICING[vk].get(tier) or (0, 0)
    # Two or more independent signals -> high; one -> medium; none -> low.
    if len(signals) >= 2:
        confidence = "high"
    elif signals:
        confidence = "medium"
    else:
        confidence = "low"

    return {
        "vendor": name,
        "matched_pricing_key": vk,
        "inferred_tier": tier,
        "tier_signals": signals,
        "company_tier_baseline": company_tier,
        "cost_year_eur_range": pricing,
        "confidence": confidence,
    }


def estimate_total_stack_cost(
    vendors: Iterable[dict],
    business_profile: dict | None = None,
) -> dict:
    """Aggregate the cost estimate over all vendors.

    Entries with ``recipient_type == "INTERNAL"`` that map to the same pricing
    key are treated as one master contract: only the first such entry is
    counted, later ones are flagged ``bundled_into_master_contract``.
    Entries of any other recipient type are always counted.

    NOTE(review): the master contract uses the range of the first-seen
    internal entry, so the total depends on input order; consider whether the
    highest tier among bundled entries should be used instead.

    Returns:
        Dict with
        - ``per_vendor``: one estimate per input vendor,
        - ``total_year_eur_range``: ``(low, high)`` over counted entries,
        - ``per_recipient_type``: ``{recipient_type: (low, high)}`` over
          counted entries (``"UNKNOWN"`` when the type is missing),
        - ``master_contracts_counted``: number of distinct
          (pricing key, INTERNAL/EXT) pairs seen,
        - ``disclaimer``: user-facing caveat text.
    """
    per_vendor: list[dict] = []
    per_recipient_type: dict[str, tuple[int, int]] = {}
    seen_master_keys: set[tuple[str, str]] = set()
    total_low = 0
    total_high = 0

    for v in vendors or ():
        est = estimate_vendor_cost(v, business_profile)
        per_vendor.append(est)
        pricing_key = est["matched_pricing_key"]
        if not pricing_key:
            continue

        rtype = (v.get("recipient_type") or "").upper()
        is_internal = rtype == "INTERNAL"
        master_key = (pricing_key, "INTERNAL" if is_internal else "EXT")
        if is_internal and master_key in seen_master_keys:
            # The same contract serves many "<owner> — <tool>" lines; count
            # its cost only once per (key, internal).
            est["bundled_into_master_contract"] = True
            continue
        seen_master_keys.add(master_key)

        lo, hi = est["cost_year_eur_range"]
        total_low += lo
        total_high += hi
        bucket = rtype or "UNKNOWN"
        prev_lo, prev_hi = per_recipient_type.get(bucket, (0, 0))
        per_recipient_type[bucket] = (prev_lo + lo, prev_hi + hi)

    return {
        "per_vendor": per_vendor,
        "total_year_eur_range": (total_low, total_high),
        "per_recipient_type": per_recipient_type,
        "master_contracts_counted": len(seen_master_keys),
        "disclaimer": (
            "Schaetzung basiert auf Cookie-Signalen (Anzahl, Premium-Feature-Detection, "
            "Drittanbieter-Quote, Lebensdauer) + Listpreisen pro Tier. Konzern-Konditionen "
            "koennen 30-50% darunter liegen. Eintraege derselben Eigenmarke werden zu EINEM "
            "Master-Vertrag aggregiert. Media-Spend ist NICHT enthalten."
        ),
    }