From b663e2508f0c4ccf02d8e4cd52fe08034e40d229 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 22 May 2026 09:23:37 +0200 Subject: [PATCH] feat(audit): P107 Branchen-Benchmark-Cockpit fuer Big-4-Demos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_extractor.py — extract_kpis() liefert 18 KPIs pro Snapshot: * vendors_total, vendors_us, vendors_non_eu (mit % je Vendor-Land) * source_breakdown (llm/library/flat_pattern/table_paste/html_table_dom) * max/avg cookies_per_vendor (Konzentrations-Mass) * cookies_in_browser, cookies_detailed_count, cookie_doc_chars * banner_detected, banner_provider, banner_violations * compliance_score, data_quality_pct (wie viele unserer Datenquellen haben Inhalt) * saving_low/high_eur (Heuristik: (vendors - 10) × 1k-5k) anonymize_kpis() ersetzt site_label durch 'OEM 1/2/3' (Industry-Prefix Map: automotive→OEM, banking→Bank, chemistry→Chem, luftfahrt→Airline). GET /api/compliance/agent/admin/benchmark?industry=automotive&sites= VW,BMW,Mercedes&anonymized=true — liefert kpis + summary (n_sites, avg_vendors, total_saving_high). 
Admin-Page /sdk/benchmark: * Filter-Leiste: Industry-Dropdown, Sites-Input + 5 Preset-Gruppen (Automotive OEMs / Zulieferer, Chemie DAX, Luftfahrt, Banking DAX) * Anonymize-Toggle prominent * 5 Summary-KPI-Karten oben * Vergleichstabelle 13 Spalten (Score, Vendors, US%, Drittland%, Cookies-Browser, Cookie-Doc-kB, Banner ✓/✗, Provider, Verstoesse, Saving €/Jahr, Daten-Qualitaet, Captured-Time) * Red-/Amber-/Green-Indikatoren bei US%/Score/Drittland * Big-4-Hinweis-Footer Co-Authored-By: Claude Opus 4.7 (1M context) --- admin-compliance/app/sdk/benchmark/page.tsx | 266 ++++++++++++++++++ .../api/agent_compliance_check_routes.py | 36 +++ .../services/benchmark_extractor.py | 265 +++++++++++++++++ 3 files changed, 567 insertions(+) create mode 100644 admin-compliance/app/sdk/benchmark/page.tsx create mode 100644 backend-compliance/compliance/services/benchmark_extractor.py diff --git a/admin-compliance/app/sdk/benchmark/page.tsx b/admin-compliance/app/sdk/benchmark/page.tsx new file mode 100644 index 00000000..a9dae3bd --- /dev/null +++ b/admin-compliance/app/sdk/benchmark/page.tsx @@ -0,0 +1,266 @@ +'use client' + +/** + * P107 — Branchen-Benchmark-Cockpit. + * + * Multi-Site-Vergleich auf einen Blick. Anonymize-Toggle für Big-4- + * Wirtschaftspruefer-Demos. 
+ * + * URL: /sdk/benchmark + */ + +import React, { useState, useEffect } from 'react' + +interface Kpi { + check_id: string + site_label: string + site_domain: string + captured_at: string + industry: string + vendors_total: number + vendors_us: number + vendors_non_eu: number + us_pct: number + non_eu_pct: number + source_breakdown: Record + max_cookies_per_vendor: number + avg_cookies_per_vendor: number + cookies_in_browser: number + cookies_detailed_count: number + cookie_doc_chars: number + banner_detected: boolean + banner_provider: string + banner_violations: number + compliance_score: number | null + saving_low_eur: number + saving_high_eur: number + data_quality_pct: number +} + +interface Summary { + n_sites: number + avg_vendors: number + avg_us_pct: number + avg_non_eu_pct: number + avg_cookies_browser: number + avg_score: number + max_vendors: number + max_saving_high: number + total_saving_low: number + total_saving_high: number +} + +const INDUSTRIES = [ + { id: '', label: 'Alle Branchen' }, + { id: 'automotive', label: 'Automotive (OEM)' }, + { id: 'banking', label: 'Banking / Finance' }, + { id: 'chemistry', label: 'Chemie / Pharma' }, + { id: 'luftfahrt', label: 'Luftfahrt' }, + { id: 'ecommerce', label: 'E-Commerce' }, + { id: 'saas', label: 'SaaS / Software' }, +] + +const PRESET_GROUPS = [ + { id: 'automotive_oem', label: 'Automotive OEMs', sites: 'Volkswagen,BMW,Mercedes-Benz,SEAT,AUDI' }, + { id: 'automotive_supl', label: 'Automotive Zulieferer', sites: 'ZF Friedrichshafen,Robert Bosch,Continental' }, + { id: 'chemie', label: 'Chemie (DAX)', sites: 'BASF,Bayer,Henkel,Linde' }, + { id: 'luftfahrt', label: 'Luftfahrt', sites: 'Lufthansa,Eurowings,Condor' }, + { id: 'banking', label: 'Banking (DAX)', sites: 'Deutsche Bank,Commerzbank,DZ Bank,KfW' }, +] + +export default function BenchmarkPage() { + const [industry, setIndustry] = useState('') + const [sites, setSites] = useState('') + const [anonymized, setAnonymized] = useState(false) + const 
[data, setData] = useState<{kpis: Kpi[]; summary: Summary} | null>(null) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + + const fetchData = async () => { + setLoading(true); setError(null) + try { + const url = new URL('/api/compliance/admin/benchmark', window.location.origin) + if (industry) url.searchParams.set('industry', industry) + if (sites) url.searchParams.set('sites', sites) + if (anonymized) url.searchParams.set('anonymized', 'true') + const r = await fetch(url.toString()) + if (!r.ok) throw new Error(`HTTP ${r.status}`) + setData(await r.json()) + } catch (e: any) { + setError(e.message || String(e)) + } finally { + setLoading(false) + } + } + + useEffect(() => { fetchData() }, []) + + return ( +
+
+

+ Branchen-Benchmark-Cockpit +

+

+ DAX-Konzern-Vergleich auf Basis aller bisher gepruefter Sites. + Mit Anonymize-Toggle fuer Wirtschaftspruefer-Demos. +

+
+ + {/* Filter-Leiste */} +
+
+ + +
+
+ + setSites(e.target.value)} + placeholder="Volkswagen,BMW,Mercedes-Benz" + className="w-full px-3 py-2 border rounded text-sm font-mono" /> +
+ {PRESET_GROUPS.map(p => ( + + ))} +
+
+ + +
+ + {error && ( +
+ Fehler: {error} +
+ )} + + {/* Summary-KPIs */} + {data?.summary && ( +
+ + + 60 ? 'warn' : 'ok'} /> + + +
+ )} + + {/* Vergleichstabelle */} + {data?.kpis && data.kpis.length > 0 ? ( +
+ + + + + + + + + + + + + + + + + + + + {data.kpis.map((k, i) => ( + + + + + + + + + + + + + + + + ))} + +
SiteScoreVendorsUS%Drittland%Cookies BrowserCookie-Doc kBBannerProviderBanner-VerstößeSaving € JahrDaten-QualitätCaptured
+ {k.site_label} +
{k.check_id}
+
= 80 ? 'text-green-700' : + k.compliance_score >= 60 ? 'text-amber-700' : 'text-red-700' + }`}> + {k.compliance_score ?? '—'} + {k.vendors_total} 60 ? 'text-red-700 font-semibold' : ''}`}> + {k.us_pct}% + 70 ? 'text-red-700' : ''}`}> + {k.non_eu_pct}% + {k.cookies_in_browser} + {Math.round(k.cookie_doc_chars / 1000)}k + {k.banner_detected ? '✓' : '✗'}{k.banner_provider || '—'} + {k.banner_violations || 0} + + {k.saving_high_eur ? `${(k.saving_high_eur/1000).toFixed(0)}k` : '—'} + = 70 ? 'text-green-700' : + k.data_quality_pct >= 40 ? 'text-amber-700' : 'text-red-700' + }`}> + {k.data_quality_pct}% + + {k.captured_at?.substring(0, 16).replace('T', ' ')} +
+
+ ) : !loading && ( +
+ Keine Snapshots gefunden — Filter anpassen oder einen Audit-Lauf starten. +
+ )} + +
+ Big-4-Hinweis: Mit Anonymize-Toggle koennen wir den + kompletten Branchen-Cut zeigen ohne Hersteller-Namen zu nennen + (z.B. "OEM 3 hat 78% US-Vendor-Anteil"). Damit ist die Daten- + Hoheit bei BreakPilot und Big 4 sieht den Mehrwert ohne dass + Wettbewerber-Vergleiche extern werden. +
+
+ ) +} + +function Kpi({ label, value, tone = 'neutral' }: { + label: string; value: any; tone?: 'ok' | 'warn' | 'bad' | 'neutral' +}) { + const colors: Record = { + ok: 'text-green-700 bg-green-50 border-green-200', + warn: 'text-amber-700 bg-amber-50 border-amber-200', + bad: 'text-red-700 bg-red-50 border-red-200', + neutral: 'text-gray-700 bg-white border-gray-200', + } + return ( +
+
{label}
+
{value}
+
+ ) +} diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 6f928309..b70093a6 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -207,6 +207,42 @@ async def get_snapshot(snapshot_id: str): db.close() +@router.get("/admin/benchmark") +async def benchmark( + industry: str = "", + sites: str = "", + anonymized: bool = False, + limit: int = 50, +): + """P107 — Branchen-Benchmark-Cockpit Endpoint. + industry: 'automotive' / 'banking' / etc (optional) + sites: comma-separated site_label list (optional) + anonymized: bool — wenn true, Hersteller-Namen → 'OEM 1/2/3' + """ + from database import SessionLocal + from compliance.services.benchmark_extractor import ( + load_snapshots_for_benchmark, anonymize_kpis, + build_benchmark_summary, + ) + site_list = [s.strip() for s in sites.split(",") if s.strip()] if sites else None + db = SessionLocal() + try: + kpis = load_snapshots_for_benchmark( + db, industry=industry or None, sites=site_list, limit=limit, + ) + finally: + db.close() + if anonymized: + kpis = anonymize_kpis(kpis, industry=industry) + return { + "industry": industry or "all", + "anonymized": anonymized, + "sites": [k.get("site_label") for k in kpis], + "kpis": kpis, + "summary": build_benchmark_summary(kpis), + } + + @router.post("/admin/tcf-ingest") async def tcf_ingest(): """P105 — IAB TCF Vendor-Liste ingestieren / refreshen. diff --git a/backend-compliance/compliance/services/benchmark_extractor.py b/backend-compliance/compliance/services/benchmark_extractor.py new file mode 100644 index 00000000..bd70f24b --- /dev/null +++ b/backend-compliance/compliance/services/benchmark_extractor.py @@ -0,0 +1,265 @@ +""" +P107 — Branchen-Benchmark-KPIs pro Snapshot. 
# P107 — industry benchmark KPIs per snapshot (module body).
#
# Extracts, from a single compliance_check_snapshot row, the KPIs that matter
# for the multi-site comparison. Used by the /admin/benchmark endpoint to
# render comparison tables.
#
# Product rationale (from the original module docstring): a granular
# cross-section of a whole industry is meant as a selling point for
# auditor-facing demos.

from __future__ import annotations

import json
import logging
import re  # noqa: F401  (kept: was imported by the original module)
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only needed for annotations; keeps the pure KPI helpers importable
    # without SQLAlchemy installed.
    from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


# Country values are upper-cased before lookup (see _country_from_vendor), so
# every alias in these sets must be upper-case too.
_US_COUNTRIES = {"US", "USA", "UNITED STATES", "UNITED STATES OF AMERICA"}
# Every US alias is also a non-EU country; without the union a vendor tagged
# "USA" would count as US but not as non-EU.
_NON_EU = {"US", "CN", "RU", "IN", "JP", "BR", "AU", "CA", "KR",
           "MX", "ZA", "TR", "SG", "TW", "HK"} | _US_COUNTRIES

# Saving heuristic: every vendor above this baseline is assumed to be worth
# between LOW and HIGH EUR per year in licence consolidation.
_VENDOR_BASELINE = 10
_SAVING_PER_VENDOR_LOW_EUR = 1000
_SAVING_PER_VENDOR_HIGH_EUR = 5000


def _safe_int(v: Any, default: int = 0) -> int:
    """Return ``int(v)``, or ``default`` when ``v`` cannot be converted."""
    try:
        return int(v)
    except (TypeError, ValueError):
        return default


def _as_dict(v: Any) -> dict:
    """Return ``v`` when it is a dict, otherwise an empty dict."""
    return v if isinstance(v, dict) else {}


def _dict_items(v: Any) -> list[dict]:
    """Return the dict elements of ``v`` when it is a list, otherwise ``[]``."""
    if not isinstance(v, list):
        return []
    return [item for item in v if isinstance(item, dict)]


def _country_from_vendor(v: dict) -> str:
    """Return the vendor's ``country`` value stripped and upper-cased ('' if unset)."""
    return str(v.get("country") or "").strip().upper()


def _parse_json_column(v: Any) -> Any:
    """Decode a JSON column value.

    Values that are already a dict/list (or None) are returned unchanged.
    Anything else is passed to ``json.loads``; when decoding fails the raw
    value is returned as-is (best effort), and the KPI extraction treats such
    a value as "no data".
    """
    if v is None or isinstance(v, (dict, list)):
        return v
    try:
        return json.loads(v)
    except Exception:
        return v


def extract_kpis(snapshot: dict) -> dict:
    """Build the flat KPI dict for one snapshot row.

    Args:
        snapshot: dict with the keys id, check_id, site_label, site_domain,
            created_at, banner_result, cmp_vendors, doc_entries,
            scan_context. Missing or malformed (non-dict / non-list) JSON
            parts are treated as empty.

    Returns:
        A dict with header fields, vendor KPIs, cookie KPIs, banner KPIs,
        the compliance score, the saving heuristic and ``data_quality_pct``.
    """
    br = _as_dict(snapshot.get("banner_result"))
    cv = _dict_items(snapshot.get("cmp_vendors"))
    de = _dict_items(snapshot.get("doc_entries"))
    sc = _as_dict(snapshot.get("scan_context"))

    # Cookies present in the banner phase "after_accept".
    after_accept = _as_dict(_as_dict(br.get("phases")).get("after_accept"))
    cookies_in_browser = len(after_accept.get("cookies") or [])
    cd = br.get("cookies_detailed") or []

    # Document text lengths; the cookie doc is the first entry typed "cookie".
    doc_text_total = sum(len(d.get("text") or "") for d in de)
    cookie_doc_len = next(
        (len(d.get("text") or "") for d in de if d.get("doc_type") == "cookie"), 0,
    )

    # Vendor breakdown. Percentages are relative to ALL vendors, including
    # those without a country value.
    n_vendors = len(cv)
    countries = [c for c in (_country_from_vendor(v) for v in cv) if c]
    n_us = sum(1 for c in countries if c in _US_COUNTRIES)
    n_non_eu = sum(1 for c in countries if c in _NON_EU)
    us_pct = round(n_us / max(1, n_vendors) * 100, 1)
    non_eu_pct = round(n_non_eu / max(1, n_vendors) * 100, 1)

    # Vendor source mix; a vendor may list several ';'-separated sources and
    # is counted once per source. Missing/empty sources are counted as "?".
    by_src: dict[str, int] = {}
    for v in cv:
        for s in str(v.get("source") or "?").split(";"):
            s = s.strip() or "?"
            by_src[s] = by_src.get(s, 0) + 1

    # Cookies per vendor (concentration).
    cookie_counts = [len(v.get("cookies") or []) for v in cv]
    max_cookies_per_vendor = max(cookie_counts) if cookie_counts else 0
    avg_cookies_per_vendor = (
        round(sum(cookie_counts) / len(cookie_counts), 1) if cookie_counts else 0
    )

    # Banner checks.
    bc = _as_dict(br.get("banner_checks"))
    n_banner_violations = len(bc.get("violations") or [])
    banner_detected = bool(br.get("banner_detected"))

    # Compliance score (best effort). Explicit None check: a real score of 0
    # must not be replaced by completeness_pct.
    score = br.get("compliance_score")
    if score is None:
        score = br.get("completeness_pct")

    # Estimated saving (licence consolidation, heuristic): each vendor above
    # the baseline of 10 counts 1k EUR (low) to 5k EUR (high) per year.
    vendors_over_baseline = max(0, n_vendors - _VENDOR_BASELINE)
    saving_low = vendors_over_baseline * _SAVING_PER_VENDOR_LOW_EUR
    saving_high = vendors_over_baseline * _SAVING_PER_VENDOR_HIGH_EUR

    # created_at is a datetime when loaded through
    # load_snapshots_for_benchmark; tolerate pre-serialised values as well.
    created = snapshot.get("created_at")
    if not created:
        captured_at = None
    elif hasattr(created, "isoformat"):
        captured_at = created.isoformat()
    else:
        captured_at = str(created)

    return {
        # Header
        "check_id": snapshot.get("check_id"),
        "site_label": snapshot.get("site_label"),
        "site_domain": snapshot.get("site_domain"),
        "captured_at": captured_at,
        "industry": sc.get("industry") or "",
        # Vendor KPIs
        "vendors_total": n_vendors,
        "vendors_us": n_us,
        "vendors_non_eu": n_non_eu,
        "us_pct": us_pct,
        "non_eu_pct": non_eu_pct,
        "source_breakdown": by_src,
        "max_cookies_per_vendor": max_cookies_per_vendor,
        "avg_cookies_per_vendor": avg_cookies_per_vendor,
        # Cookie KPIs
        "cookies_in_browser": cookies_in_browser,
        "cookies_detailed_count": len(cd),
        "cookie_doc_chars": cookie_doc_len,
        "doc_text_chars_total": doc_text_total,
        # Banner
        "banner_detected": banner_detected,
        "banner_provider": br.get("banner_provider") or "",
        "banner_violations": n_banner_violations,
        # Compliance / score
        "compliance_score": score,
        # Saving (heuristic)
        "saving_low_eur": saving_low,
        "saving_high_eur": saving_high,
        # Capture quality: share of the expected data sources that have content
        "data_quality_pct": _quality_pct(snapshot),
    }


def _quality_pct(snapshot: dict) -> int:
    """Return the percentage of expected data sources that have content.

    Seven boolean checks are evaluated (banner detected, vendors, doc
    entries, detailed cookies, after-accept cookies, any doc text, any
    score); the result is the rounded share of checks that hold.
    """
    br = _as_dict(snapshot.get("banner_result"))
    cv = _dict_items(snapshot.get("cmp_vendors"))
    de = _dict_items(snapshot.get("doc_entries"))
    cd = br.get("cookies_detailed") or []
    aa = _as_dict(_as_dict(br.get("phases")).get("after_accept"))

    checks = [
        br.get("banner_detected") is True,
        len(cv) > 0,
        len(de) > 0,
        len(cd) > 0,
        len(aa.get("cookies") or []) > 0,
        any((d.get("text") or "") for d in de),
        br.get("compliance_score") is not None or br.get("completeness_pct") is not None,
    ]
    return round(sum(1 for x in checks if x) / len(checks) * 100)


def load_snapshots_for_benchmark(
    db: Session,
    industry: str | None = None,
    sites: list[str] | None = None,
    limit: int = 50,
) -> list[dict]:
    """Load the newest snapshots and return their extracted KPI dicts.

    Args:
        db: SQLAlchemy session used to run the query.
        industry: when set, only rows whose scan_context 'industry' equals it.
        sites: when set (non-empty), only rows whose site_label is in the list.
        limit: maximum number of rows (newest first); negative values are
            treated as 0.

    Returns:
        One ``extract_kpis`` dict per row, ordered by created_at descending.
    """
    # Local import: only this function needs SQLAlchemy.
    from sqlalchemy import text as sa_text

    where = []
    params: dict[str, Any] = {}
    if industry:
        where.append("(scan_context->>'industry') = :ind")
        params["ind"] = industry
    if sites:
        where.append("site_label = ANY(:sites)")
        params["sites"] = sites
    # Only the fixed fragments above are interpolated into the statement;
    # all user-supplied values travel as bound parameters.
    where_sql = " AND ".join(where) if where else "TRUE"

    sql = (
        "SELECT id::text, check_id, site_label, site_domain, created_at, "
        "  banner_result, cmp_vendors, doc_entries, scan_context "
        "FROM compliance.compliance_check_snapshots "
        f"WHERE {where_sql} "
        "ORDER BY created_at DESC LIMIT :lim"
    )
    params["lim"] = max(0, _safe_int(limit, 50))

    rows = db.execute(sa_text(sql), params).fetchall()
    out: list[dict] = []
    for r in rows:
        snap = {
            "id": r[0],
            "check_id": r[1],
            "site_label": r[2],
            "site_domain": r[3],
            "created_at": r[4],
            "banner_result": _parse_json_column(r[5]),
            "cmp_vendors": _parse_json_column(r[6]) or [],
            "doc_entries": _parse_json_column(r[7]) or [],
            "scan_context": _parse_json_column(r[8]) or {},
        }
        out.append(extract_kpis(snap))
    return out


def anonymize_kpis(kpis: list[dict], industry: str = "") -> list[dict]:
    """Replace site_label / site_domain with neutral placeholders.

    Labels become '<prefix> 1', '<prefix> 2', ... where the prefix depends on
    the industry (automotive -> OEM, banking -> Bank, chemistry -> Chem, ...;
    'Site' for anything else). Numbering follows the alphabetical order of
    the original labels, so it is deterministic; rows sharing a label share
    one placeholder. The input dicts are not mutated.

    NOTE(review): ``check_id`` is passed through unchanged — confirm it
    carries no site-identifying information.

    Returns:
        Shallow copies of the rows, sorted by original site_label.
    """
    prefix_map = {
        "automotive": "OEM",
        "banking": "Bank",
        "chemistry": "Chem",
        "luftfahrt": "Airline",
        "saas": "SaaS",
        "ecommerce": "Shop",
    }
    pfx = prefix_map.get((industry or "").lower(), "Site")

    # original label -> 1-based index, assigned in alphabetical order
    index_by_label: dict[str, int] = {}
    out = []
    for k in sorted(kpis, key=lambda x: (x.get("site_label") or "")):
        sl = k.get("site_label") or ""
        idx = index_by_label.setdefault(sl, len(index_by_label) + 1)
        anon_k = dict(k)
        anon_k["site_label"] = f"{pfx} {idx}"
        # Use the label's own index, independent of iteration order.
        anon_k["site_domain"] = f"site-{idx}.example"
        out.append(anon_k)
    return out


def build_benchmark_summary(kpis: list[dict]) -> dict:
    """Aggregate statistics over all KPI rows.

    Returns an empty dict for empty input. Averages and maxima consider only
    rows where the field is a real number (None and bool are skipped) and are
    0 when no row qualifies.

    NOTE(review): ``n_sites`` is the number of rows; if several snapshots of
    the same site are loaded they are counted individually — confirm intent.
    """
    if not kpis:
        return {}

    def numbers(field: str) -> list:
        # bool is a subclass of int, so exclude it explicitly.
        return [
            k.get(field) for k in kpis
            if isinstance(k.get(field), (int, float))
            and not isinstance(k.get(field), bool)
        ]

    def avg(field: str) -> float:
        vals = numbers(field)
        return round(sum(vals) / len(vals), 1) if vals else 0

    def maxv(field: str):
        vals = numbers(field)
        return max(vals) if vals else 0

    return {
        "n_sites": len(kpis),
        "avg_vendors": avg("vendors_total"),
        "avg_us_pct": avg("us_pct"),
        "avg_non_eu_pct": avg("non_eu_pct"),
        "avg_cookies_browser": avg("cookies_in_browser"),
        "avg_score": avg("compliance_score"),
        "max_vendors": maxv("vendors_total"),
        "max_saving_high": maxv("saving_high_eur"),
        "total_saving_low": sum(k.get("saving_low_eur") or 0 for k in kpis),
        "total_saving_high": sum(k.get("saving_high_eur") or 0 for k in kpis),
    }