This commit is contained in:
Benjamin Admin
2026-05-22 11:51:27 +02:00
26 changed files with 4885 additions and 8 deletions
@@ -207,6 +207,42 @@ async def get_snapshot(snapshot_id: str):
db.close()
@router.get("/admin/benchmark")
async def benchmark(
    industry: str = "",
    sites: str = "",
    anonymized: bool = False,
    limit: int = 50,
):
    """P107 — industry benchmark cockpit endpoint.

    Args:
        industry: Optional industry filter such as 'automotive' or 'banking';
            an empty string disables the industry filter.
        sites: Optional comma-separated list of site_label values.
        anonymized: When true, the KPIs are passed through ``anonymize_kpis``
            before being returned (site labels become e.g. 'OEM 1').
        limit: Maximum number of snapshots to load. Clamped to 1..500 so a
            negative or oversized query parameter cannot reach the database.

    Returns:
        Dict with the keys ``industry``, ``anonymized``, ``sites``, ``kpis``
        and ``summary``.
    """
    from database import SessionLocal
    from compliance.services.benchmark_extractor import (
        anonymize_kpis,
        build_benchmark_summary,
        load_snapshots_for_benchmark,
    )

    # ``limit`` is caller-controlled: a negative value would end up in the
    # SQL LIMIT clause, and an unbounded one would pull every snapshot
    # (including its JSON payloads) into memory.
    max_limit = 500
    limit = max(1, min(limit, max_limit))

    # Blank entries ("a,,b" or ", ,") are dropped; no usable entry means
    # "do not filter by site".
    site_list = [s.strip() for s in sites.split(",") if s.strip()] or None

    # NOTE(review): the loader is called without ``await``, so the query runs
    # synchronously inside this coroutine — confirm this is acceptable or move
    # the call to a worker thread.
    db = SessionLocal()
    try:
        kpis = load_snapshots_for_benchmark(
            db, industry=industry or None, sites=site_list, limit=limit,
        )
    finally:
        db.close()

    if anonymized:
        kpis = anonymize_kpis(kpis, industry=industry)

    return {
        "industry": industry or "all",
        "anonymized": anonymized,
        # Derived after anonymisation so real labels are not echoed back.
        "sites": [k.get("site_label") for k in kpis],
        "kpis": kpis,
        "summary": build_benchmark_summary(kpis),
    }
@router.post("/admin/tcf-ingest")
async def tcf_ingest():
"""P105 — IAB TCF Vendor-Liste ingestieren / refreshen.
@@ -0,0 +1,265 @@
"""
P107 industry benchmark KPIs per snapshot.
Extracts 18 KPIs from a compliance_check_snapshot that are relevant for
the multi-site comparison. Used by the /admin/benchmark endpoint to
render comparison tables.
USP: no other compliance software gives an auditor such a granular
industry cross-section. For DAX corporations this is a real sales asset
(the Big 4 can sell it to their clients as
'we see the whole industry').
"""
from __future__ import annotations
import logging
import re
from typing import Any
from sqlalchemy import text as sa_text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
_US_COUNTRIES = {"US", "USA", "United States"}
_NON_EU = {"US", "CN", "RU", "IN", "JP", "BR", "AU", "CA", "KR",
"MX", "ZA", "TR", "SG", "TW", "HK"}
def _safe_int(v: Any, default: int = 0) -> int:
try:
return int(v)
except (TypeError, ValueError):
return default
def _country_from_vendor(v: dict) -> str:
c = (v.get("country") or "").strip().upper()
if c:
return c
# Aus vendor_country wenn vorhanden (TCF-Authority Eintraege)
return ""
def extract_kpis(snapshot: dict) -> dict:
    """Build the benchmark KPI dict for one snapshot row.

    Expected snapshot keys: id, check_id, site_label, site_domain,
    created_at, banner_result, cmp_vendors, doc_entries, scan_context.
    Missing, empty or wrongly typed values are treated as "no data"
    instead of raising.

    Args:
        snapshot: Snapshot row as a dict.

    Returns:
        Flat dict with header fields (check_id, site_label, site_domain,
        captured_at, industry) followed by vendor, cookie, banner, score,
        saving and data-quality KPIs.
    """

    def _as_dict(value: object) -> dict:
        # A column that could not be decoded may arrive as a raw string.
        return value if isinstance(value, dict) else {}

    def _dict_items(value: object) -> list:
        # Only dict entries can be inspected with .get() below.
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    br = _as_dict(snapshot.get("banner_result"))
    sc = _as_dict(snapshot.get("scan_context"))
    cv = _dict_items(snapshot.get("cmp_vendors"))
    de = _dict_items(snapshot.get("doc_entries"))

    # Cookies recorded in the "after_accept" banner phase.
    after_accept = _as_dict(_as_dict(br.get("phases")).get("after_accept"))
    cookies_in_browser = len(after_accept.get("cookies") or [])
    cd = br.get("cookies_detailed") or []

    # Document text lengths: total, and the first document of type "cookie".
    doc_text_total = sum(len(d.get("text") or "") for d in de)
    cookie_doc_len = next(
        (len(d.get("text") or "") for d in de if d.get("doc_type") == "cookie"),
        0,
    )

    # Vendor breakdown. Percentages are relative to all vendors, including
    # those without a known country.
    n_vendors = len(cv)
    countries = [c for c in map(_country_from_vendor, cv) if c]
    n_us = sum(1 for c in countries if c in _US_COUNTRIES)
    n_non_eu = sum(1 for c in countries if c in _NON_EU)
    us_pct = round(n_us / max(1, n_vendors) * 100, 1)
    non_eu_pct = round(n_non_eu / max(1, n_vendors) * 100, 1)

    # Vendor source mix: "source" may hold several ';'-separated labels;
    # a missing or blank label is counted under "?".
    by_src: dict[str, int] = {}
    for v in cv:
        for s in str(v.get("source") or "?").split(";"):
            s = s.strip() or "?"
            by_src[s] = by_src.get(s, 0) + 1

    # Cookies per vendor (concentration).
    cookie_counts = [len(v.get("cookies") or []) for v in cv]
    max_cookies_per_vendor = max(cookie_counts) if cookie_counts else 0
    avg_cookies_per_vendor = (
        round(sum(cookie_counts) / len(cookie_counts), 1)
        if cookie_counts else 0
    )

    # Banner checks.
    bc = _as_dict(br.get("banner_checks"))
    n_banner_violations = len(bc.get("violations") or [])
    banner_detected = bool(br.get("banner_detected"))

    # Compliance score (best effort). Test for None explicitly: a score of 0
    # is a valid result and must not fall through to completeness_pct.
    score = br.get("compliance_score")
    if score is None:
        score = br.get("completeness_pct")

    # Heuristic licence-consolidation saving: each vendor above the assumed
    # baseline of 10 counts for 1,000 EUR (low) to 5,000 EUR (high).
    median_vendors = 10
    vendors_over_baseline = max(0, n_vendors - median_vendors)
    saving_low = vendors_over_baseline * 1000
    saving_high = vendors_over_baseline * 5000

    # created_at is normally a datetime; tolerate values that are already
    # serialised (e.g. an ISO string).
    created_at = snapshot.get("created_at")
    if not created_at:
        captured_at = None
    elif hasattr(created_at, "isoformat"):
        captured_at = created_at.isoformat()
    else:
        captured_at = str(created_at)

    return {
        # Header
        "check_id": snapshot.get("check_id"),
        "site_label": snapshot.get("site_label"),
        "site_domain": snapshot.get("site_domain"),
        "captured_at": captured_at,
        "industry": sc.get("industry") or "",
        # Vendor KPIs
        "vendors_total": n_vendors,
        "vendors_us": n_us,
        "vendors_non_eu": n_non_eu,
        "us_pct": us_pct,
        "non_eu_pct": non_eu_pct,
        "source_breakdown": by_src,
        "max_cookies_per_vendor": max_cookies_per_vendor,
        "avg_cookies_per_vendor": avg_cookies_per_vendor,
        # Cookie KPIs
        "cookies_in_browser": cookies_in_browser,
        "cookies_detailed_count": len(cd),
        "cookie_doc_chars": cookie_doc_len,
        "doc_text_chars_total": doc_text_total,
        # Banner
        "banner_detected": banner_detected,
        "banner_provider": br.get("banner_provider") or "",
        "banner_violations": n_banner_violations,
        # Compliance / score
        "compliance_score": score,
        # Saving (heuristic)
        "saving_low_eur": saving_low,
        "saving_high_eur": saving_high,
        # Capture quality: share of expected data sources that carry data.
        "data_quality_pct": _quality_pct(snapshot),
    }
def _quality_pct(snapshot: dict) -> int:
"""Wieviel Prozent der erwarteten Datenquellen haben Inhalt?"""
br = snapshot.get("banner_result") or {}
cv = snapshot.get("cmp_vendors") or []
de = snapshot.get("doc_entries") or []
cd = br.get("cookies_detailed") or []
aa = (br.get("phases") or {}).get("after_accept") or {}
checks = [
br.get("banner_detected") is True,
len(cv) > 0,
len(de) > 0,
len(cd) > 0,
len(aa.get("cookies") or []) > 0,
any((d.get("text") or "") for d in de),
br.get("compliance_score") is not None or br.get("completeness_pct") is not None,
]
return round(sum(1 for x in checks if x) / len(checks) * 100)
def load_snapshots_for_benchmark(
    db: Session,
    industry: str | None = None,
    sites: list[str] | None = None,
    limit: int = 50,
) -> list[dict]:
    """Load the newest matching snapshots and return their extracted KPIs.

    Args:
        db: Open SQLAlchemy session; it is not closed here.
        industry: If given, only snapshots whose scan_context 'industry'
            equals this value are returned.
        sites: If given and non-empty, only snapshots whose site_label is in
            this list are returned.
        limit: Maximum number of rows, newest first. Negative values are
            treated as 0.

    Returns:
        One KPI dict per snapshot row, as produced by ``extract_kpis``.
    """
    import json

    def _decode(value: Any, expected: type) -> Any:
        """Return *value* as an instance of *expected*, else None.

        JSON columns may arrive already decoded or as text, depending on
        the driver. Text that cannot be parsed, or that decodes to a
        different type, yields None so the caller falls back to an empty
        default instead of handing a raw string to ``extract_kpis``.
        """
        if isinstance(value, (str, bytes, bytearray)):
            try:
                value = json.loads(value)
            except (TypeError, ValueError):
                return None
        return value if isinstance(value, expected) else None

    # Only fixed SQL fragments are concatenated; every caller-supplied value
    # is passed as a bound parameter.
    where: list[str] = []
    params: dict[str, Any] = {}
    if industry:
        where.append("(scan_context->>'industry') = :ind")
        params["ind"] = industry
    if sites:
        where.append("site_label = ANY(:sites)")
        params["sites"] = sites
    where_sql = " AND ".join(where) if where else "TRUE"
    sql = (
        "SELECT id::text, check_id, site_label, site_domain, created_at, "
        " banner_result, cmp_vendors, doc_entries, scan_context "
        "FROM compliance.compliance_check_snapshots "
        f"WHERE {where_sql} "
        "ORDER BY created_at DESC LIMIT :lim"
    )
    # Guard against a negative LIMIT reaching the database.
    params["lim"] = max(0, limit)

    rows = db.execute(sa_text(sql), params).fetchall()

    out: list[dict] = []
    for r in rows:
        snap = {
            "id": r[0],
            "check_id": r[1],
            "site_label": r[2],
            "site_domain": r[3],
            "created_at": r[4],
            "banner_result": _decode(r[5], dict),
            "cmp_vendors": _decode(r[6], list) or [],
            "doc_entries": _decode(r[7], list) or [],
            "scan_context": _decode(r[8], dict) or {},
        }
        out.append(extract_kpis(snap))
    return out
def anonymize_kpis(kpis: list[dict], industry: str = "") -> list[dict]:
    """Return copies of *kpis* with site_label and site_domain anonymised.

    Labels become '<prefix> <n>' and domains 'site-<n>.example'. The prefix
    depends on the industry (automotive -> OEM, banking -> Bank,
    chemistry -> Chem, ...) and falls back to 'Site'. Entries are returned
    sorted by their original site_label, and entries sharing a label share
    a number. The input dicts are not modified.
    """
    industry_prefixes = {
        "automotive": "OEM",
        "banking": "Bank",
        "chemistry": "Chem",
        "luftfahrt": "Airline",
        "saas": "SaaS",
        "ecommerce": "Shop",
    }
    prefix = industry_prefixes.get(industry.lower(), "Site")

    def _label_of(entry: dict):
        return entry.get("site_label") or ""

    # Numbers are handed out in alphabetical label order so the mapping is
    # deterministic for a given set of labels.
    number_by_label: dict[str, int] = {}
    anonymised = []
    for entry in sorted(kpis, key=_label_of):
        number = number_by_label.setdefault(
            _label_of(entry), len(number_by_label) + 1,
        )
        anonymised.append({
            **entry,
            "site_label": f"{prefix} {number}",
            "site_domain": f"site-{number}.example",
        })
    return anonymised
def build_benchmark_summary(kpis: list[dict]) -> dict:
    """Aggregate statistics over all KPI dicts of one benchmark cut.

    Returns an empty dict for empty input. Averages and maxima only
    consider int/float values; a field with no numeric value yields 0.
    """
    if not kpis:
        return {}

    def _numbers(field: str) -> list:
        # Non-numeric entries (None, missing, strings) are skipped.
        return [
            row.get(field)
            for row in kpis
            if isinstance(row.get(field), (int, float))
        ]

    def _mean(field: str) -> float:
        numbers = _numbers(field)
        if not numbers:
            return 0
        return round(sum(numbers) / len(numbers), 1)

    def _peak(field: str):
        return max(_numbers(field), default=0)

    def _total(field: str):
        return sum(row.get(field) or 0 for row in kpis)

    summary = {"n_sites": len(kpis)}
    summary["avg_vendors"] = _mean("vendors_total")
    summary["avg_us_pct"] = _mean("us_pct")
    summary["avg_non_eu_pct"] = _mean("non_eu_pct")
    summary["avg_cookies_browser"] = _mean("cookies_in_browser")
    summary["avg_score"] = _mean("compliance_score")
    summary["max_vendors"] = _peak("vendors_total")
    summary["max_saving_high"] = _peak("saving_high_eur")
    summary["total_saving_low"] = _total("saving_low_eur")
    summary["total_saving_high"] = _total("saving_high_eur")
    return summary
@@ -172,6 +172,21 @@ async def generate_solution(
"Liefere die Loesung als JSON."
)
# P31: tiered Cascade (Qwen → OVH → Anthropic) mit Valkey-Cache.
try:
from compliance.services.llm_cascade import call_with_cascade
res = await call_with_cascade(
system=_SYSTEM_PROMPT, user=prompt,
min_confidence=0.5, max_tokens=600,
)
parsed = _parse(res.get("text", ""))
if parsed:
_cache_put(cache_key, parsed)
return parsed
except Exception:
# fall through to legacy direct calls
pass
content = await _call_ollama(prompt)
parsed = _parse(content)
if not parsed:
@@ -63,19 +63,34 @@ async def extract_vendors_via_llm(
excerpt = cookie_text[:max_text_chars]
user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
# Stage 1: local Qwen
# P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic).
# Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und
# gehen in ~50ms statt 4-6min durch. Erstaufruf bleibt 4-6min lokal
# bzw ~2min auf OVH.
try:
from compliance.services.llm_cascade import call_with_cascade
res = await call_with_cascade(
system=_SYSTEM_PROMPT, user=user_prompt,
min_confidence=0.6, max_tokens=16000,
)
vendors = _parse_vendor_list(res.get("text", ""))
if vendors:
logger.info(
"LLM vendor extraction (cascade %s, conf=%.2f, cached=%s): %d vendors",
res.get("source"), res.get("confidence", 0),
res.get("cached"), len(vendors),
)
return vendors
except Exception as e:
logger.warning("Cascade extract failed, fallback to direct Qwen: %s", e)
# Fallback: alte direkte Logik
content = await _call_ollama(user_prompt)
vendors = _parse_vendor_list(content)
if vendors:
logger.info("LLM vendor extraction (Qwen): %d vendors", len(vendors))
return vendors
# Stage 2: OVH backup
content = await _call_ovh(user_prompt)
vendors = _parse_vendor_list(content)
if vendors:
logger.info("LLM vendor extraction (OVH): %d vendors", len(vendors))
return vendors
return _parse_vendor_list(content)
async def _call_ollama(user_prompt: str) -> str: