feat(audit): P107 Branchen-Benchmark-Cockpit fuer Big-4-Demos
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m5s
CI / test-go (push) Failing after 54s
CI / iace-gt-coverage (push) Successful in 27s
CI / test-python-backend (push) Successful in 47s
CI / detect-changes (push) Successful in 13s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m5s
CI / test-go (push) Failing after 54s
CI / iace-gt-coverage (push) Successful in 27s
CI / test-python-backend (push) Successful in 47s
CI / detect-changes (push) Successful in 13s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
benchmark_extractor.py — extract_kpis() liefert 18 KPIs pro Snapshot: * vendors_total, vendors_us, vendors_non_eu (mit % je Vendor-Land) * source_breakdown (llm/library/flat_pattern/table_paste/html_table_dom) * max/avg cookies_per_vendor (Konzentrations-Mass) * cookies_in_browser, cookies_detailed_count, cookie_doc_chars * banner_detected, banner_provider, banner_violations * compliance_score, data_quality_pct (wie viele unserer Datenquellen haben Inhalt) * saving_low/high_eur (Heuristik: (vendors - 10) × 1k-5k) anonymize_kpis() ersetzt site_label durch 'OEM 1/2/3' (Industry-Prefix Map: automotive→OEM, banking→Bank, chemistry→Chem, luftfahrt→Airline). GET /api/compliance/agent/admin/benchmark?industry=automotive&sites= VW,BMW,Mercedes&anonymized=true — liefert kpis + summary (n_sites, avg_vendors, total_saving_high). Admin-Page /sdk/benchmark: * Filter-Leiste: Industry-Dropdown, Sites-Input + 5 Preset-Gruppen (Automotive OEMs / Zulieferer, Chemie DAX, Luftfahrt, Banking DAX) * Anonymize-Toggle prominent * 5 Summary-KPI-Karten oben * Vergleichstabelle 13 Spalten (Score, Vendors, US%, Drittland%, Cookies-Browser, Cookie-Doc-kB, Banner ✓/✗, Provider, Verstoesse, Saving €/Jahr, Daten-Qualitaet, Captured-Time) * Red-/Amber-/Green-Indikatoren bei US%/Score/Drittland * Big-4-Hinweis-Footer Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -207,6 +207,42 @@ async def get_snapshot(snapshot_id: str):
|
||||
db.close()
|
||||
|
||||
|
||||
@router.get("/admin/benchmark")
async def benchmark(
    industry: str = "",
    sites: str = "",
    anonymized: bool = False,
    limit: int = 50,
):
    """P107 industry benchmark cockpit endpoint.

    Args:
        industry: Optional industry key such as 'automotive' or 'banking'.
            An empty string applies no industry filter and is reported as
            "all" in the response.
        sites: Optional comma-separated list of site_label values; blank
            items are dropped.
        anonymized: When true, the KPI rows are passed through
            anonymize_kpis() so that site names become e.g. 'OEM 1/2/3'.
        limit: Row limit handed to load_snapshots_for_benchmark().

    Returns:
        A dict with the keys industry, anonymized, sites, kpis and summary.
    """
    from database import SessionLocal
    from compliance.services.benchmark_extractor import (
        anonymize_kpis,
        build_benchmark_summary,
        load_snapshots_for_benchmark,
    )

    # An empty query value means "no site filter" (None); otherwise keep the
    # non-blank labels in the order they were given.
    site_list = None
    if sites:
        trimmed = (part.strip() for part in sites.split(","))
        site_list = [label for label in trimmed if label]

    session = SessionLocal()
    try:
        rows = load_snapshots_for_benchmark(
            session,
            industry=industry or None,
            sites=site_list,
            limit=limit,
        )
    finally:
        # Release the session even when the query raises.
        session.close()

    if anonymized:
        rows = anonymize_kpis(rows, industry=industry)

    # Labels are collected after anonymisation so they never leak real names.
    labels = [row.get("site_label") for row in rows]
    return {
        "industry": industry or "all",
        "anonymized": anonymized,
        "sites": labels,
        "kpis": rows,
        "summary": build_benchmark_summary(rows),
    }
|
||||
|
||||
|
||||
@router.post("/admin/tcf-ingest")
|
||||
async def tcf_ingest():
|
||||
"""P105 — IAB TCF Vendor-Liste ingestieren / refreshen.
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
P107 — Branchen-Benchmark-KPIs pro Snapshot.
|
||||
|
||||
Extrahiert aus einem compliance_check_snapshot 18 KPIs die fuer den
|
||||
Multi-Site-Vergleich relevant sind. Wird vom /admin/benchmark Endpoint
|
||||
genutzt um Vergleichstabellen zu rendern.
|
||||
|
||||
USP: keine andere Compliance-Software gibt einem Wirtschaftspruefer
|
||||
einen so granularen Branchen-Querschnitt. Bei DAX-Konzernen ist das
|
||||
ein echtes Verkaufs-Asset (Big 4 koennen es ihren Kunden als
|
||||
'wir sehen die ganze Branche' verkaufen).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import text as sa_text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Country values that identify a US vendor. Lookups are made with upper-cased
# strings (_country_from_vendor upper-cases its result), so every entry here
# must be upper case; the former "United States" entry could never match.
_US_COUNTRIES = {"US", "USA", "UNITED STATES"}
# Third-country (non-EU) values. All US spellings are included so that a
# vendor counted as US is always counted as non-EU as well.
_NON_EU = {"US", "USA", "UNITED STATES", "CN", "RU", "IN", "JP", "BR", "AU",
           "CA", "KR", "MX", "ZA", "TR", "SG", "TW", "HK"}
|
||||
|
||||
|
||||
def _safe_int(v: Any, default: int = 0) -> int:
    """Convert *v* to an int, falling back to *default* on failure.

    Args:
        v: Any value accepted (or rejected) by the built-in ``int()``.
        default: Value returned when *v* cannot be converted.

    Returns:
        ``int(v)`` when the conversion succeeds, otherwise *default*.
    """
    try:
        return int(v)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / containers; ValueError: non-numeric strings and
        # NaN; OverflowError: float infinities (int(float("inf"))).
        return default
|
||||
|
||||
|
||||
def _country_from_vendor(v: dict) -> str:
    """Return the vendor's country as a stripped, upper-cased string.

    The ``country`` key is preferred; ``vendor_country`` is used as a
    fallback (the original comment names it as the field carried by
    TCF-authority entries, but the fallback was never implemented).

    Args:
        v: Vendor dict.

    Returns:
        The normalised country value, or "" when neither key holds a
        non-blank value.
    """
    for key in ("country", "vendor_country"):
        raw = v.get(key)
        if not raw:
            continue
        # str() guards against non-string values sneaking in from JSON.
        code = str(raw).strip().upper()
        if code:
            return code
    return ""
|
||||
|
||||
|
||||
def extract_kpis(snapshot: dict) -> dict:
    """Build the flat benchmark KPI record for one snapshot row.

    Args:
        snapshot: Snapshot row as a dict with the keys id, check_id,
            site_label, site_domain, created_at, banner_result, cmp_vendors,
            doc_entries and scan_context. Missing or empty values are
            treated as empty containers.

    Returns:
        A dict holding header fields (check_id, site_label, site_domain,
        captured_at, industry), vendor KPIs, cookie KPIs, banner KPIs, the
        compliance score, the saving heuristic and data_quality_pct.
    """
    br = snapshot.get("banner_result") or {}
    cv = snapshot.get("cmp_vendors") or []
    de = snapshot.get("doc_entries") or []
    sc = snapshot.get("scan_context") or {}

    # Cookies recorded for the "after_accept" banner phase.
    phases = br.get("phases") or {}
    after_accept = phases.get("after_accept") or {}
    cookies_in_browser = len(after_accept.get("cookies") or [])
    cd = br.get("cookies_detailed") or []

    # Document text lengths: the total, and the first entry typed "cookie".
    doc_lengths = [(d.get("doc_type"), len(d.get("text") or "")) for d in de]
    doc_text_total = sum(length for _, length in doc_lengths)
    cookie_doc_len = next(
        (length for doc_type, length in doc_lengths if doc_type == "cookie"), 0,
    )

    # Vendor breakdown. Country values arrive upper-cased, so the US set is
    # upper-cased before comparing, and every US spelling also counts as
    # non-EU (otherwise "USA" would be US but not third-country).
    n_vendors = len(cv)
    us_codes = {c.upper() for c in _US_COUNTRIES}
    countries = [c for c in (_country_from_vendor(v) for v in cv) if c]
    n_us = sum(1 for c in countries if c in us_codes)
    n_non_eu = sum(1 for c in countries if c in _NON_EU or c in us_codes)
    # Percentages are relative to all vendors, including those without a
    # known country; max(1, ...) avoids division by zero.
    us_pct = round(n_us / max(1, n_vendors) * 100, 1)
    non_eu_pct = round(n_non_eu / max(1, n_vendors) * 100, 1)

    # Vendor source mix: "source" may list several sources separated by ";".
    by_src: dict[str, int] = {}
    for v in cv:
        for s in (v.get("source") or "?").split(";"):
            s = s.strip() or "?"
            by_src[s] = by_src.get(s, 0) + 1

    # Cookies per vendor (concentration measure).
    cookie_counts = [len(v.get("cookies") or []) for v in cv]
    max_cookies_per_vendor = max(cookie_counts, default=0)
    avg_cookies_per_vendor = (
        round(sum(cookie_counts) / len(cookie_counts), 1) if cookie_counts else 0
    )

    # Banner checks.
    bc = br.get("banner_checks") or {}
    n_banner_violations = len(bc.get("violations") or [])
    banner_detected = bool(br.get("banner_detected"))

    # Compliance score (best effort). Test against None explicitly so that a
    # legitimate score of 0 is not replaced by completeness_pct.
    score = br.get("compliance_score")
    if score is None:
        score = br.get("completeness_pct")

    # Estimated saving (licence consolidation heuristic): every vendor above
    # the assumed median of 10 counts as 1,000 (low) to 5,000 (high) EUR/year.
    median_vendors = 10
    excess_vendors = max(0, n_vendors - median_vendors)
    saving_low = excess_vendors * 1000
    saving_high = excess_vendors * 5000

    # created_at is normally a datetime; tolerate an already-serialised value.
    created_at = snapshot.get("created_at")
    if not created_at:
        captured_at = None
    elif hasattr(created_at, "isoformat"):
        captured_at = created_at.isoformat()
    else:
        captured_at = str(created_at)

    return {
        # Header
        "check_id": snapshot.get("check_id"),
        "site_label": snapshot.get("site_label"),
        "site_domain": snapshot.get("site_domain"),
        "captured_at": captured_at,
        "industry": sc.get("industry") or "",
        # Vendor KPIs
        "vendors_total": n_vendors,
        "vendors_us": n_us,
        "vendors_non_eu": n_non_eu,
        "us_pct": us_pct,
        "non_eu_pct": non_eu_pct,
        "source_breakdown": by_src,
        "max_cookies_per_vendor": max_cookies_per_vendor,
        "avg_cookies_per_vendor": avg_cookies_per_vendor,
        # Cookie KPIs
        "cookies_in_browser": cookies_in_browser,
        "cookies_detailed_count": len(cd),
        "cookie_doc_chars": cookie_doc_len,
        "doc_text_chars_total": doc_text_total,
        # Banner
        "banner_detected": banner_detected,
        "banner_provider": br.get("banner_provider") or "",
        "banner_violations": n_banner_violations,
        # Compliance / score
        "compliance_score": score,
        # Saving (heuristic)
        "saving_low_eur": saving_low,
        "saving_high_eur": saving_high,
        # Capture quality: share of expected data sources that have content.
        "data_quality_pct": _quality_pct(snapshot),
    }
|
||||
|
||||
|
||||
def _quality_pct(snapshot: dict) -> int:
    """Return the percentage of expected data sources that carry content.

    Seven signals are checked: banner detected (strictly ``True``), vendors
    present, document entries present, detailed cookies present, cookies in
    the "after_accept" phase, at least one document with text, and a score
    (compliance_score or completeness_pct) that is not None.

    Args:
        snapshot: Snapshot row dict (banner_result, cmp_vendors, doc_entries).

    Returns:
        The share of satisfied signals, rounded to a whole percent.
    """
    banner = snapshot.get("banner_result") or {}
    vendors = snapshot.get("cmp_vendors") or []
    docs = snapshot.get("doc_entries") or []
    detailed = banner.get("cookies_detailed") or []
    accepted = (banner.get("phases") or {}).get("after_accept") or {}
    accepted_cookies = accepted.get("cookies") or []

    score_missing = (
        banner.get("compliance_score") is None
        and banner.get("completeness_pct") is None
    )

    signals = (
        banner.get("banner_detected") is True,
        len(vendors) > 0,
        len(docs) > 0,
        len(detailed) > 0,
        len(accepted_cookies) > 0,
        any((doc.get("text") or "") for doc in docs),
        not score_missing,
    )
    satisfied = 0
    for signal in signals:
        if signal:
            satisfied += 1
    return round(satisfied / len(signals) * 100)
|
||||
|
||||
|
||||
def _parse_json_column(value: Any, expected: type, column: str) -> Any:
    """Decode one JSON column value and coerce it to the expected container.

    Args:
        value: Raw column value: already decoded (dict/list), a JSON string
            or bytes, or None.
        expected: Container type the caller relies on (``dict`` or ``list``).
        column: Column name, used only for the warning message.

    Returns:
        The decoded value when it is an instance of *expected*, otherwise an
        empty instance of *expected*.
    """
    import json

    if isinstance(value, (str, bytes, bytearray)):
        try:
            value = json.loads(value)
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.warning("benchmark: column %s holds invalid JSON", column)
            return expected()
    # Anything of the wrong shape (None, scalars, the other container type)
    # would crash extract_kpis, so fall back to an empty container.
    return value if isinstance(value, expected) else expected()


def load_snapshots_for_benchmark(
    db: Session,
    industry: str | None = None,
    sites: list[str] | None = None,
    limit: int = 50,
) -> list[dict]:
    """Load the newest snapshots and return their extracted KPI dicts.

    Args:
        db: Open SQLAlchemy session.
        industry: When given, only snapshots whose scan_context 'industry'
            equals this value are returned.
        sites: When given and non-empty, only snapshots whose site_label is
            in this list are returned.
        limit: Maximum number of rows; negative values are treated as 0.

    Returns:
        One extract_kpis() dict per snapshot row, newest first.
    """
    where = []
    params: dict[str, Any] = {}
    if industry:
        where.append("(scan_context->>'industry') = :ind")
        params["ind"] = industry
    if sites:
        where.append("site_label = ANY(:sites)")
        params["sites"] = sites
    where_sql = " AND ".join(where) if where else "TRUE"

    # where_sql is assembled from the fixed fragments above only; every
    # caller-supplied value travels as a bound parameter.
    sql = (
        "SELECT id::text, check_id, site_label, site_domain, created_at, "
        "       banner_result, cmp_vendors, doc_entries, scan_context "
        "FROM compliance.compliance_check_snapshots "
        f"WHERE {where_sql} "
        "ORDER BY created_at DESC LIMIT :lim"
    )
    # Clamp so a negative limit yields no rows instead of a SQL error.
    params["lim"] = max(0, limit)

    rows = db.execute(sa_text(sql), params).fetchall()
    out: list[dict] = []
    for r in rows:
        snap = {
            "id": r[0],
            "check_id": r[1],
            "site_label": r[2],
            "site_domain": r[3],
            "created_at": r[4],
            "banner_result": _parse_json_column(r[5], dict, "banner_result"),
            "cmp_vendors": _parse_json_column(r[6], list, "cmp_vendors"),
            "doc_entries": _parse_json_column(r[7], list, "doc_entries"),
            "scan_context": _parse_json_column(r[8], dict, "scan_context"),
        }
        out.append(extract_kpis(snap))
    return out
|
||||
|
||||
|
||||
def anonymize_kpis(kpis: list[dict], industry: str = "") -> list[dict]:
    """Return copies of the KPI rows with site_label and site_domain masked.

    Rows are processed in alphabetical order of their original site_label,
    and each distinct label receives the next running number, so the
    numbering is deterministic. The label becomes '<prefix> <n>' and the
    domain 'site-<n>.example'. The prefix depends on the industry
    (automotive→OEM, banking→Bank, chemistry→Chem, luftfahrt→Airline,
    saas→SaaS, ecommerce→Shop); any other value yields 'Site'.

    Args:
        kpis: KPI dicts; the input dicts are not modified.
        industry: Industry key, matched case-insensitively.

    Returns:
        New list of shallow-copied rows, sorted by original site_label.
    """
    label_prefix = {
        "automotive": "OEM",
        "banking": "Bank",
        "chemistry": "Chem",
        "luftfahrt": "Airline",
        "saas": "SaaS",
        "ecommerce": "Shop",
    }.get(industry.lower(), "Site")

    # original label -> (masked label, masked domain)
    aliases: dict[str, tuple[str, str]] = {}
    masked_rows = []
    ordered = sorted(kpis, key=lambda row: (row.get("site_label") or ""))
    for row in ordered:
        original = row.get("site_label") or ""
        if original not in aliases:
            number = len(aliases) + 1
            aliases[original] = (
                f"{label_prefix} {number}",
                f"site-{number}.example",
            )
        masked_label, masked_domain = aliases[original]
        masked_rows.append(
            {**row, "site_label": masked_label, "site_domain": masked_domain}
        )
    return masked_rows
|
||||
|
||||
|
||||
def build_benchmark_summary(kpis: list[dict]) -> dict:
    """Aggregate statistics across all KPI rows of the benchmark cut.

    Averages and maxima only consider rows whose field value is an int or
    float; when no row qualifies the result is 0. The saving totals treat
    missing or falsy values as 0.

    Args:
        kpis: KPI dicts as produced by extract_kpis().

    Returns:
        An empty dict for empty input, otherwise a dict with n_sites, the
        avg_* values (rounded to one decimal), max_vendors, max_saving_high,
        total_saving_low and total_saving_high.
    """
    if not kpis:
        return {}

    def numeric_values(field: str) -> list:
        return [
            row.get(field)
            for row in kpis
            if isinstance(row.get(field), (int, float))
        ]

    def mean_of(field: str) -> float:
        found = numeric_values(field)
        if not found:
            return 0
        return round(sum(found) / max(1, len(found)), 1)

    def peak_of(field: str):
        found = numeric_values(field)
        if not found:
            return 0
        return max(found)

    total_low = 0
    total_high = 0
    for row in kpis:
        total_low += row.get("saving_low_eur") or 0
        total_high += row.get("saving_high_eur") or 0

    summary = {"n_sites": len(kpis)}
    summary["avg_vendors"] = mean_of("vendors_total")
    summary["avg_us_pct"] = mean_of("us_pct")
    summary["avg_non_eu_pct"] = mean_of("non_eu_pct")
    summary["avg_cookies_browser"] = mean_of("cookies_in_browser")
    summary["avg_score"] = mean_of("compliance_score")
    summary["max_vendors"] = peak_of("vendors_total")
    summary["max_saving_high"] = peak_of("saving_high_eur")
    summary["total_saving_low"] = total_low
    summary["total_saving_high"] = total_high
    return summary
|
||||
Reference in New Issue
Block a user