b663e2508f
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m5s
CI / test-go (push) Failing after 54s
CI / iace-gt-coverage (push) Successful in 27s
CI / test-python-backend (push) Successful in 47s
CI / detect-changes (push) Successful in 13s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
benchmark_extractor.py — extract_kpis() liefert 18 KPIs pro Snapshot: * vendors_total, vendors_us, vendors_non_eu (mit % je Vendor-Land) * source_breakdown (llm/library/flat_pattern/table_paste/html_table_dom) * max/avg cookies_per_vendor (Konzentrations-Mass) * cookies_in_browser, cookies_detailed_count, cookie_doc_chars * banner_detected, banner_provider, banner_violations * compliance_score, data_quality_pct (wie viele unserer Datenquellen haben Inhalt) * saving_low/high_eur (Heuristik: (vendors - 10) × 1k-5k) anonymize_kpis() ersetzt site_label durch 'OEM 1/2/3' (Industry-Prefix Map: automotive→OEM, banking→Bank, chemistry→Chem, luftfahrt→Airline). GET /api/compliance/agent/admin/benchmark?industry=automotive&sites= VW,BMW,Mercedes&anonymized=true — liefert kpis + summary (n_sites, avg_vendors, total_saving_high). Admin-Page /sdk/benchmark: * Filter-Leiste: Industry-Dropdown, Sites-Input + 5 Preset-Gruppen (Automotive OEMs / Zulieferer, Chemie DAX, Luftfahrt, Banking DAX) * Anonymize-Toggle prominent * 5 Summary-KPI-Karten oben * Vergleichstabelle 13 Spalten (Score, Vendors, US%, Drittland%, Cookies-Browser, Cookie-Doc-kB, Banner ✓/✗, Provider, Verstoesse, Saving €/Jahr, Daten-Qualitaet, Captured-Time) * Red-/Amber-/Green-Indikatoren bei US%/Score/Drittland * Big-4-Hinweis-Footer Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
266 lines
9.1 KiB
Python
266 lines
9.1 KiB
Python
"""
|
|
P107 — Branchen-Benchmark-KPIs pro Snapshot.
|
|
|
|
Extrahiert aus einem compliance_check_snapshot 18 KPIs die fuer den
|
|
Multi-Site-Vergleich relevant sind. Wird vom /admin/benchmark Endpoint
|
|
genutzt um Vergleichstabellen zu rendern.
|
|
|
|
USP: keine andere Compliance-Software gibt einem Wirtschaftspruefer
|
|
einen so granularen Branchen-Querschnitt. Bei DAX-Konzernen ist das
|
|
ein echtes Verkaufs-Asset (Big 4 koennen es ihren Kunden als
|
|
'wir sehen die ganze Branche' verkaufen).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
|
|
from sqlalchemy import text as sa_text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Markers that identify a vendor as US-based. They are stored upper-case
# because _country_from_vendor() upper-cases the country before the lookup;
# a mixed-case entry such as "United States" could never match.
_US_COUNTRIES = {"US", "USA", "UNITED STATES"}

# Countries counted as non-EU (third countries). Every US marker is included
# so that a vendor counted as US is always counted as non-EU as well.
_NON_EU = _US_COUNTRIES | {
    "CN", "RU", "IN", "JP", "BR", "AU", "CA", "KR",
    "MX", "ZA", "TR", "SG", "TW", "HK",
}
|
|
|
|
|
|
def _safe_int(v: Any, default: int = 0) -> int:
    """Convert *v* to ``int``, returning *default* when that is not possible.

    Args:
        v: Value to convert (number, numeric string, ``None``, ...).
        default: Value returned when the conversion fails.

    Returns:
        ``int(v)`` on success, otherwise *default*.
    """
    try:
        return int(v)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: non-numeric text or
        # NaN; OverflowError: float infinity.
        return default
|
|
|
|
|
|
def _country_from_vendor(v: dict) -> str:
    """Return the vendor's country marker, stripped and upper-cased.

    The ``country`` field is preferred. When it is missing or blank the
    ``vendor_country`` field is used instead (the original comment refers to
    TCF authority entries carrying it there).

    Args:
        v: A single vendor entry.

    Returns:
        The normalised country string, or ``""`` when neither field has one.
    """
    for key in ("country", "vendor_country"):
        # str() guards against non-string values that would break .strip().
        code = str(v.get(key) or "").strip().upper()
        if code:
            return code
    return ""
|
|
|
|
|
|
def extract_kpis(snapshot: dict) -> dict:
    """Derive the benchmark KPIs for a single snapshot row.

    Args:
        snapshot: Mapping with the keys ``id``, ``check_id``, ``site_label``,
            ``site_domain``, ``created_at``, ``banner_result``,
            ``cmp_vendors``, ``doc_entries`` and ``scan_context``. Missing or
            ``None`` values are treated as empty.

    Returns:
        A flat dict with header fields plus vendor, cookie, banner, score,
        saving and data-quality KPIs.
    """
    banner = snapshot.get("banner_result") or {}
    vendors = snapshot.get("cmp_vendors") or []
    docs = snapshot.get("doc_entries") or []
    scan_ctx = snapshot.get("scan_context") or {}

    # Cookies seen in the "after_accept" banner phase.
    phases = banner.get("phases") or {}
    after_accept = phases.get("after_accept") or {}
    cookies_in_browser = len(after_accept.get("cookies") or [])
    cookies_detailed = banner.get("cookies_detailed") or []

    # Document text lengths: all documents, and the first "cookie" document.
    doc_text_total = sum(len(d.get("text") or "") for d in docs)
    cookie_doc_len = next(
        (len(d.get("text") or "") for d in docs if d.get("doc_type") == "cookie"),
        0,
    )

    # Vendor breakdown by country. _country_from_vendor() returns upper-case
    # values, so the marker sets are upper-cased before matching; US markers
    # always count as non-EU too.
    n_vendors = len(vendors)
    countries = [c for c in map(_country_from_vendor, vendors) if c]
    us_markers = {c.upper() for c in _US_COUNTRIES}
    non_eu_markers = {c.upper() for c in _NON_EU} | us_markers
    n_us = sum(1 for c in countries if c in us_markers)
    n_non_eu = sum(1 for c in countries if c in non_eu_markers)
    # Percentages are relative to all vendors, including those without a
    # country; max(1, ...) avoids a division by zero for empty vendor lists.
    us_pct = round(n_us / max(1, n_vendors) * 100, 1)
    non_eu_pct = round(n_non_eu / max(1, n_vendors) * 100, 1)

    # Vendor source mix: "source" may hold several ';'-separated values,
    # each of which is counted; blank or missing sources are counted as "?".
    by_src: dict[str, int] = {}
    for vendor in vendors:
        for src in (vendor.get("source") or "?").split(";"):
            src = src.strip() or "?"
            by_src[src] = by_src.get(src, 0) + 1

    # Cookies per vendor (concentration measure).
    cookie_counts = [len(vendor.get("cookies") or []) for vendor in vendors]
    max_cookies_per_vendor = max(cookie_counts, default=0)
    avg_cookies_per_vendor = (
        round(sum(cookie_counts) / len(cookie_counts), 1) if cookie_counts else 0
    )

    # Banner checks.
    banner_checks = banner.get("banner_checks") or {}
    n_banner_violations = len(banner_checks.get("violations") or [])
    banner_detected = bool(banner.get("banner_detected"))

    # Compliance score (best effort). Test against None explicitly so that a
    # genuine score of 0 is not replaced by the completeness fallback.
    score = banner.get("compliance_score")
    if score is None:
        score = banner.get("completeness_pct")

    # Estimated saving (licence consolidation heuristic): every vendor above
    # the assumed median of 10 is valued at 1,000 to 5,000 EUR per year.
    median_vendors = 10
    excess_vendors = max(0, n_vendors - median_vendors)
    saving_low = excess_vendors * 1000
    saving_high = excess_vendors * 5000

    # Timestamps normally offer isoformat(); anything else is passed through
    # as text instead of raising.
    created_at = snapshot.get("created_at")
    if not created_at:
        captured_at = None
    elif hasattr(created_at, "isoformat"):
        captured_at = created_at.isoformat()
    else:
        captured_at = str(created_at)

    return {
        # Header
        "check_id": snapshot.get("check_id"),
        "site_label": snapshot.get("site_label"),
        "site_domain": snapshot.get("site_domain"),
        "captured_at": captured_at,
        "industry": scan_ctx.get("industry") or "",
        # Vendor KPIs
        "vendors_total": n_vendors,
        "vendors_us": n_us,
        "vendors_non_eu": n_non_eu,
        "us_pct": us_pct,
        "non_eu_pct": non_eu_pct,
        "source_breakdown": by_src,
        "max_cookies_per_vendor": max_cookies_per_vendor,
        "avg_cookies_per_vendor": avg_cookies_per_vendor,
        # Cookie KPIs
        "cookies_in_browser": cookies_in_browser,
        "cookies_detailed_count": len(cookies_detailed),
        "cookie_doc_chars": cookie_doc_len,
        "doc_text_chars_total": doc_text_total,
        # Banner
        "banner_detected": banner_detected,
        "banner_provider": banner.get("banner_provider") or "",
        "banner_violations": n_banner_violations,
        # Compliance / score
        "compliance_score": score,
        # Saving (heuristic)
        "saving_low_eur": saving_low,
        "saving_high_eur": saving_high,
        # Capture quality: share of the expected data sources with content.
        "data_quality_pct": _quality_pct(snapshot),
    }
|
|
|
|
|
|
def _quality_pct(snapshot: dict) -> int:
    """Return the percentage of expected data sources that carry content.

    Seven signals are checked: banner detected, vendors present, documents
    present, detailed cookies present, cookies in the ``after_accept`` phase,
    at least one document with text, and a score value being available.
    """
    banner = snapshot.get("banner_result") or {}
    vendors = snapshot.get("cmp_vendors") or []
    docs = snapshot.get("doc_entries") or []
    detailed = banner.get("cookies_detailed") or []
    accepted_phase = (banner.get("phases") or {}).get("after_accept") or {}

    # A score counts as present as soon as either field is not None (0 counts).
    score_missing = (
        banner.get("compliance_score") is None
        and banner.get("completeness_pct") is None
    )

    signals = (
        # Only the literal True counts; other truthy values do not.
        banner.get("banner_detected") is True,
        len(vendors) > 0,
        len(docs) > 0,
        len(detailed) > 0,
        len(accepted_phase.get("cookies") or []) > 0,
        any((entry.get("text") or "") for entry in docs),
        not score_missing,
    )
    hits = 0
    for signal in signals:
        if signal:
            hits += 1
    return round(hits / len(signals) * 100)
|
|
|
|
|
|
def _decode_json_column(value: Any, expected: type) -> Any:
    """Return *value* as an instance of *expected*, or ``None`` if impossible.

    Values that already have the expected type (or are ``None``) pass through
    unchanged; anything else is decoded as JSON text.
    """
    import json

    if value is None or isinstance(value, expected):
        return value
    try:
        decoded = json.loads(value)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError. Returning None lets the caller
        # fall back to an empty default instead of passing raw text on.
        logger.warning(
            "benchmark: undecodable JSON column of type %s", type(value).__name__
        )
        return None
    return decoded if isinstance(decoded, expected) else None


def load_snapshots_for_benchmark(
    db: Session,
    industry: str | None = None,
    sites: list[str] | None = None,
    limit: int = 50,
) -> list[dict]:
    """Load snapshot rows and return the extracted KPIs for each of them.

    Args:
        db: Database session used to run the query.
        industry: When given, only rows whose ``scan_context`` industry equals
            this value are returned.
        sites: When given, only rows whose ``site_label`` is in this list are
            returned.
        limit: Maximum number of rows, newest first.

    Returns:
        One ``extract_kpis`` result per row, ordered by ``created_at``
        descending.
    """
    conditions: list[str] = []
    params: dict[str, Any] = {"lim": limit}
    if industry:
        conditions.append("(scan_context->>'industry') = :ind")
        params["ind"] = industry
    if sites:
        conditions.append("site_label = ANY(:sites)")
        params["sites"] = sites
    # Only the fixed fragments above are interpolated into the statement;
    # all caller-supplied values travel as bound parameters.
    where_sql = " AND ".join(conditions) if conditions else "TRUE"

    sql = (
        "SELECT id::text, check_id, site_label, site_domain, created_at, "
        " banner_result, cmp_vendors, doc_entries, scan_context "
        "FROM compliance.compliance_check_snapshots "
        f"WHERE {where_sql} "
        "ORDER BY created_at DESC LIMIT :lim"
    )

    rows = db.execute(sa_text(sql), params).fetchall()
    out: list[dict] = []
    for r in rows:
        snap = {
            "id": r[0],
            "check_id": r[1],
            "site_label": r[2],
            "site_domain": r[3],
            "created_at": r[4],
            "banner_result": _decode_json_column(r[5], dict),
            "cmp_vendors": _decode_json_column(r[6], list) or [],
            "doc_entries": _decode_json_column(r[7], list) or [],
            "scan_context": _decode_json_column(r[8], dict) or {},
        }
        out.append(extract_kpis(snap))
    return out
|
|
|
|
|
|
def anonymize_kpis(kpis: list[dict], industry: str = "") -> list[dict]:
    """Replace site labels and domains with numbered placeholders.

    Labels become ``"<prefix> <n>"`` (e.g. ``"OEM 1"``) and domains become
    ``"site-<n>.example"``. The prefix depends on *industry*; unknown, empty
    or ``None`` industries use ``"Site"``. Numbers are assigned in
    alphabetical order of the original label, so the result is deterministic
    and rows with the same label share one number.

    Args:
        kpis: KPI dicts as produced by ``extract_kpis``; they are not mutated.
        industry: Industry key used to choose the label prefix.

    Returns:
        Shallow copies of the input dicts, sorted by original site label,
        with ``site_label`` and ``site_domain`` replaced.
    """
    prefix_map = {
        "automotive": "OEM",
        "banking": "Bank",
        "chemistry": "Chem",
        "luftfahrt": "Airline",
        "saas": "SaaS",
        "ecommerce": "Shop",
    }
    # Tolerate None and surrounding whitespace instead of raising.
    pfx = prefix_map.get((industry or "").strip().lower(), "Site")

    # Original label -> placeholder number. Label and domain both use this
    # number, so they stay paired regardless of iteration order.
    index_by_label: dict[str, int] = {}
    out = []
    for k in sorted(kpis, key=lambda x: (x.get("site_label") or "")):
        label = k.get("site_label") or ""
        if label not in index_by_label:
            index_by_label[label] = len(index_by_label) + 1
        idx = index_by_label[label]
        anon_k = dict(k)
        anon_k["site_label"] = f"{pfx} {idx}"
        anon_k["site_domain"] = f"site-{idx}.example"
        out.append(anon_k)
    return out
|
|
|
|
|
|
def build_benchmark_summary(kpis: list[dict]) -> dict:
    """Aggregate statistics over all KPI rows of one industry cut.

    Returns an empty dict for empty input. Averages and maxima only consider
    values that are ``int`` or ``float``; a field without any such value
    yields 0. Saving totals treat missing values as 0.
    """
    if not kpis:
        return {}

    def _numbers(field: str) -> list:
        # Numeric values of *field* across all rows, in row order.
        collected = []
        for row in kpis:
            value = row.get(field)
            if isinstance(value, (int, float)):
                collected.append(value)
        return collected

    def _mean(field: str) -> float:
        nums = _numbers(field)
        if not nums:
            return 0
        return round(sum(nums) / max(1, len(nums)), 1)

    def _peak(field: str):
        nums = _numbers(field)
        if not nums:
            return 0
        return max(nums)

    def _total(field: str):
        return sum(row.get(field) or 0 for row in kpis)

    summary: dict = {"n_sites": len(kpis)}
    summary["avg_vendors"] = _mean("vendors_total")
    summary["avg_us_pct"] = _mean("us_pct")
    summary["avg_non_eu_pct"] = _mean("non_eu_pct")
    summary["avg_cookies_browser"] = _mean("cookies_in_browser")
    summary["avg_score"] = _mean("compliance_score")
    summary["max_vendors"] = _peak("vendors_total")
    summary["max_saving_high"] = _peak("saving_high_eur")
    summary["total_saving_low"] = _total("saving_low_eur")
    summary["total_saving_high"] = _total("saving_high_eur")
    return summary
|