""" P107 — Branchen-Benchmark-KPIs pro Snapshot. Extrahiert aus einem compliance_check_snapshot 18 KPIs die fuer den Multi-Site-Vergleich relevant sind. Wird vom /admin/benchmark Endpoint genutzt um Vergleichstabellen zu rendern. USP: keine andere Compliance-Software gibt einen Wirtschaftspruefer einen so granularen Branchen-Querschnitt. Bei DAX-Konzernen ist das ein echtes Verkaufs-Asset (Big 4 koennen es ihren Kunden als 'wir sehen die ganze Branche' verkaufen). """ from __future__ import annotations import logging import re from typing import Any from sqlalchemy import text as sa_text from sqlalchemy.orm import Session logger = logging.getLogger(__name__) _US_COUNTRIES = {"US", "USA", "United States"} _NON_EU = {"US", "CN", "RU", "IN", "JP", "BR", "AU", "CA", "KR", "MX", "ZA", "TR", "SG", "TW", "HK"} def _safe_int(v: Any, default: int = 0) -> int: try: return int(v) except (TypeError, ValueError): return default def _country_from_vendor(v: dict) -> str: c = (v.get("country") or "").strip().upper() if c: return c # Aus vendor_country wenn vorhanden (TCF-Authority Eintraege) return "" def extract_kpis(snapshot: dict) -> dict: """Liefert 18 KPIs aus einem snapshot-row. Snapshot-row keys: id, check_id, site_label, site_domain, created_at, banner_result, cmp_vendors, doc_entries, scan_context. """ br = snapshot.get("banner_result") or {} cv = snapshot.get("cmp_vendors") or [] de = snapshot.get("doc_entries") or [] sc = snapshot.get("scan_context") or {} # Banner-Phase Cookies phases = br.get("phases") or {} after_accept = (phases.get("after_accept") or {}) cookies_in_browser = len(after_accept.get("cookies") or []) cd = br.get("cookies_detailed") or [] # Doc-Text Lengths doc_text_total = sum(len((d.get("text") or "")) for d in de) cookie_doc_len = next( (len(d.get("text") or "") for d in de if d.get("doc_type") == "cookie"), 0, ) # Vendor breakdown n_vendors = len(cv) countries = [_country_from_vendor(v) for v in cv] countries = [c for c in countries if c] n_us = sum(1 for c in countries if c in _US_COUNTRIES) n_non_eu = sum(1 for c in countries if c in _NON_EU) us_pct = round(n_us / max(1, n_vendors) * 100, 1) non_eu_pct = round(n_non_eu / max(1, n_vendors) * 100, 1) # Vendor-Source-Mix by_src: dict[str, int] = {} for v in cv: for s in (v.get("source") or "?").split(";"): s = s.strip() or "?" by_src[s] = by_src.get(s, 0) + 1 # Cookies pro Vendor (Konzentration) cookie_counts = [len(v.get("cookies") or []) for v in cv] max_cookies_per_vendor = max(cookie_counts) if cookie_counts else 0 avg_cookies_per_vendor = ( round(sum(cookie_counts) / max(1, len(cookie_counts)), 1) if cookie_counts else 0 ) # Banner-Checks bc = br.get("banner_checks") or {} n_banner_violations = len(bc.get("violations") or []) banner_detected = bool(br.get("banner_detected")) # Compliance-Score (best effort) score = br.get("compliance_score") or br.get("completeness_pct") # Estimated Saving (Lizenz-Konsolidierung, Heuristik) # Pro 5 Vendor ueber Median (10) rechnen wir ~5k EUR/Jahr Einsparung median_vendors = 10 saving_low = max(0, (n_vendors - median_vendors)) * 1000 saving_high = max(0, (n_vendors - median_vendors)) * 5000 return { # Header "check_id": snapshot.get("check_id"), "site_label": snapshot.get("site_label"), "site_domain": snapshot.get("site_domain"), "captured_at": (snapshot.get("created_at").isoformat() if snapshot.get("created_at") else None), "industry": (sc or {}).get("industry") or "", # Vendor-KPIs "vendors_total": n_vendors, "vendors_us": n_us, "vendors_non_eu": n_non_eu, "us_pct": us_pct, "non_eu_pct": non_eu_pct, "source_breakdown": by_src, "max_cookies_per_vendor": max_cookies_per_vendor, "avg_cookies_per_vendor": avg_cookies_per_vendor, # Cookie-KPIs "cookies_in_browser": cookies_in_browser, "cookies_detailed_count": len(cd), "cookie_doc_chars": cookie_doc_len, "doc_text_chars_total": doc_text_total, # Banner "banner_detected": banner_detected, "banner_provider": br.get("banner_provider") or "", "banner_violations": n_banner_violations, # Compliance / Score "compliance_score": score, # Saving (Heuristik) "saving_low_eur": saving_low, "saving_high_eur": saving_high, # Capture-Quality (wie viele unserer 10+ Audit-Quellen liefern Daten) "data_quality_pct": _quality_pct(snapshot), } def _quality_pct(snapshot: dict) -> int: """Wieviel Prozent der erwarteten Datenquellen haben Inhalt?""" br = snapshot.get("banner_result") or {} cv = snapshot.get("cmp_vendors") or [] de = snapshot.get("doc_entries") or [] cd = br.get("cookies_detailed") or [] aa = (br.get("phases") or {}).get("after_accept") or {} checks = [ br.get("banner_detected") is True, len(cv) > 0, len(de) > 0, len(cd) > 0, len(aa.get("cookies") or []) > 0, any((d.get("text") or "") for d in de), br.get("compliance_score") is not None or br.get("completeness_pct") is not None, ] return round(sum(1 for x in checks if x) / len(checks) * 100) def load_snapshots_for_benchmark( db: Session, industry: str | None = None, sites: list[str] | None = None, limit: int = 50, ) -> list[dict]: """Liefert dicts mit Snapshot-Daten + extracted KPIs.""" where = [] params: dict[str, Any] = {} if industry: where.append("(scan_context->>'industry') = :ind") params["ind"] = industry if sites: where.append("site_label = ANY(:sites)") params["sites"] = sites where_sql = " AND ".join(where) if where else "TRUE" sql = ( "SELECT id::text, check_id, site_label, site_domain, created_at, " " banner_result, cmp_vendors, doc_entries, scan_context " "FROM compliance.compliance_check_snapshots " f"WHERE {where_sql} " "ORDER BY created_at DESC LIMIT :lim" ) params["lim"] = limit rows = db.execute(sa_text(sql), params).fetchall() out: list[dict] = [] for r in rows: import json as _j def _parse(v): if isinstance(v, (dict, list)) or v is None: return v try: return _j.loads(v) except Exception: return v snap = { "id": r[0], "check_id": r[1], "site_label": r[2], "site_domain": r[3], "created_at": r[4], "banner_result": _parse(r[5]), "cmp_vendors": _parse(r[6]) or [], "doc_entries": _parse(r[7]) or [], "scan_context": _parse(r[8]) or {}, } out.append(extract_kpis(snap)) return out def anonymize_kpis(kpis: list[dict], industry: str = "") -> list[dict]: """Ersetzt site_label durch 'OEM 1', 'OEM 2' etc. Industry-Prefix waehlbar (Automotive→OEM, Banking→Bank, Chemie→Chem). """ prefix_map = { "automotive": "OEM", "banking": "Bank", "chemistry": "Chem", "luftfahrt": "Airline", "saas": "SaaS", "ecommerce": "Shop", } pfx = prefix_map.get(industry.lower(), "Site") # Stable alphabetical numbering for determinism seen: dict[str, str] = {} next_idx = 1 out = [] for k in sorted(kpis, key=lambda x: (x.get("site_label") or "")): sl = k.get("site_label") or "" if sl not in seen: seen[sl] = f"{pfx} {next_idx}" next_idx += 1 anon_k = dict(k) anon_k["site_label"] = seen[sl] anon_k["site_domain"] = f"site-{next_idx-1}.example" out.append(anon_k) return out def build_benchmark_summary(kpis: list[dict]) -> dict: """Aggregate-Stats fuer den ganzen Branchen-Cut.""" if not kpis: return {} def avg(field: str) -> float: vals = [k.get(field) for k in kpis if isinstance(k.get(field), (int, float))] return round(sum(vals) / max(1, len(vals)), 1) if vals else 0 def maxv(field: str): vals = [k.get(field) for k in kpis if isinstance(k.get(field), (int, float))] return max(vals) if vals else 0 return { "n_sites": len(kpis), "avg_vendors": avg("vendors_total"), "avg_us_pct": avg("us_pct"), "avg_non_eu_pct": avg("non_eu_pct"), "avg_cookies_browser": avg("cookies_in_browser"), "avg_score": avg("compliance_score"), "max_vendors": maxv("vendors_total"), "max_saving_high": maxv("saving_high_eur"), "total_saving_low": sum(k.get("saving_low_eur") or 0 for k in kpis), "total_saving_high": sum(k.get("saving_high_eur") or 0 for k in kpis), }