feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf/
     Nutzungsbedingungen als "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-18 23:48:34 +02:00
parent a616b64273
commit 6c223c7c9b
23 changed files with 2685 additions and 29 deletions
@@ -0,0 +1,28 @@
/**
* Proxy: GET /api/sdk/v1/agent/findings/<checkId>
* -> backend GET /api/compliance/agent/findings/<checkId>
*
* Forwards all query params (source, severity, doc_type, status, q, limit).
*/
import { NextRequest, NextResponse } from 'next/server'
const BACKEND_URL = process.env.BACKEND_API_URL || 'http://backend-compliance:8002'
/**
 * Proxies a findings query for one check run to the compliance backend.
 *
 * All query parameters of the incoming request are forwarded unchanged; the
 * backend's JSON body and HTTP status are passed straight through.
 *
 * @param request - Incoming request; only its query string is used.
 * @param params - Dynamic route params; `checkId` identifies the check run.
 * @returns The backend JSON with the backend status, `400` for an unusable
 *   `checkId`, or `503` when the backend cannot be reached, times out
 *   (20 s), or does not answer with JSON.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: { checkId: string } },
) {
  const rawId = params.checkId
  // "." and ".." survive encodeURIComponent and would be collapsed by URL
  // normalisation, pointing the request at a different backend path.
  if (!rawId || rawId === '.' || rawId === '..') {
    return NextResponse.json({ error: 'Ungueltige check_id' }, { status: 400 })
  }
  // Encode the segment so characters like "/", "?" or "#" in the route param
  // cannot change the backend path or inject query parameters.
  // NOTE(review): if the framework hands the param over still percent-encoded,
  // this double-encodes exotic ids — plain alphanumeric ids are unaffected.
  const checkId = encodeURIComponent(rawId)
  const qs = request.nextUrl.searchParams.toString()
  const url = `${BACKEND_URL}/api/compliance/agent/findings/${checkId}${qs ? `?${qs}` : ''}`
  try {
    const resp = await fetch(url, { signal: AbortSignal.timeout(20000) })
    const data: unknown = await resp.json()
    return NextResponse.json(data, { status: resp.status })
  } catch (err: unknown) {
    // Keep the cause in the server log; the client only gets a generic message.
    console.error('Findings proxy request failed', err)
    return NextResponse.json(
      { error: 'Findings-Abfrage fehlgeschlagen' },
      { status: 503 },
    )
  }
}
@@ -119,11 +119,9 @@ export function ComplianceCheckTab() {
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
return
}
if (data.status === 'failed' || data.status === 'not_found') {
if (data.status === 'failed') setError(data.error || 'Pruefung fehlgeschlagen')
setProgress(''); setProgressPct(0); setLoading(false)
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
return
if (['failed', 'not_found', 'skipped_tdm'].includes(data.status)) {
if (data.status !== 'not_found') setError(data.error || (data.status === 'skipped_tdm' ? 'TDM-Vorbehalt erkannt — Crawl uebersprungen' : 'Pruefung fehlgeschlagen'))
setProgress(''); setProgressPct(0); setLoading(false); localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId(''); return
}
} catch { /* retry */ }
}
@@ -236,9 +234,9 @@ export function ComplianceCheckTab() {
localStorage.setItem(STORAGE_KEY_HISTORY, JSON.stringify(updated))
break
}
if (pollData.status === 'failed') {
if (['failed', 'skipped_tdm'].includes(pollData.status)) {
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
throw new Error(pollData.error || 'Pruefung fehlgeschlagen')
throw new Error(pollData.error || (pollData.status === 'skipped_tdm' ? 'TDM-Vorbehalt' : 'Pruefung fehlgeschlagen'))
}
attempts++
}
@@ -0,0 +1,274 @@
'use client'
import React, { useEffect, useMemo, useState } from 'react'
/**
 * One row of the unified findings list as rendered by the findings table.
 * NOTE(review): field meanings below are taken from how this file renders
 * them; confirm against the backend store schema.
 */
type Finding = {
id: number
source_type: string
// '-' is treated as "no document": rendered as an em dash and hidden from the doc-type filter.
doc_type: string
severity: string
status: string
regulation: string
label: string
hint: string
// The detail row reads the keys `fix_text` and `where`.
action_recipe: Record<string, string>
anchor_excerpt: string
// Multiplied by 100 for display — presumably a 0..1 confidence; TODO confirm.
anchor_conf: number
vendor_name: string
category: string
// Free-form extras; the table reads `risk_label` from it for the risk badge.
payload: Record<string, unknown>
}
/** Counters per facet, used for the summary cards and the counts in the filter dropdowns. */
type Summary = {
total: number
by_source: Record<string, number>
by_severity: Record<string, number>
by_status: Record<string, number>
by_doc_type: Record<string, number>
}
/** Response body of the findings request; `found === false` shows the "no findings stored" notice. */
type Resp = {
found: boolean
summary: Summary
count: number
findings: Finding[]
}
// Display names per finding source; the 'all' entry is excluded from the summary cards.
const SOURCE_LABEL: Record<string, string> = {
all: 'Alle Quellen',
mc: 'Master-Controls',
pflichtangabe: 'Pflichtangaben',
vendor: 'Vendor-Findings',
redundanz: 'Redundanzen',
}
// Tailwind badge classes per severity; unknown severities fall back to 'bg-gray-100' at the call site.
const SEVERITY_COLOR: Record<string, string> = {
CRITICAL: 'bg-red-600 text-white',
HIGH: 'bg-red-100 text-red-800',
MEDIUM: 'bg-amber-100 text-amber-800',
LOW: 'bg-blue-100 text-blue-800',
INFO: 'bg-gray-100 text-gray-600',
}
// Short display labels per status; unknown statuses are shown verbatim.
const STATUS_LABEL: Record<string, string> = {
failed: 'Fail',
passed: 'Pass',
skipped: 'Skip',
na: 'N/A',
info: 'Info',
}
// Option values of the severity / status dropdowns; 'all' is sent as-is and labelled "Alle …".
const SEVERITY_OPTS = ['all', 'CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'INFO']
const STATUS_OPTS = ['all', 'failed', 'passed', 'skipped', 'na', 'info']
/** Tailwind classes for the vendor risk badge, keyed by the backend risk label. */
const RISK_BADGE_CLASS: Record<string, string> = {
  kritisch: 'bg-red-600 text-white',
  hoch: 'bg-red-100 text-red-800',
  mittel: 'bg-amber-100 text-amber-800',
  gering: 'bg-green-50 text-green-700',
}

/**
 * Extracts a human-readable error from a JSON body.
 * Looks at `error` (proxy / backend fallback body) and `detail` (HTTP error body).
 */
function readErrorMessage(body: unknown): string | null {
  if (typeof body !== 'object' || body === null) return null
  const rec = body as Record<string, unknown>
  for (const key of ['error', 'detail']) {
    const value = rec[key]
    if (typeof value === 'string' && value) return value
  }
  return null
}

/**
 * Formats one CSV cell: flattens line breaks, doubles quotes and neutralises
 * spreadsheet formulas. Labels and vendor names originate from crawled
 * third-party sites, so a leading `=`, `+`, `-`, `@` or tab must not be
 * evaluated when the export is opened in Excel/LibreOffice.
 */
function csvCell(value: unknown): string {
  let text = String(value ?? '').replace(/\r?\n|\r/g, ' ')
  // A lone "-" is the "no document" placeholder, not a formula — leave it alone.
  if (text.length > 1 && /^[=+\-@\t]/.test(text)) text = `'${text}`
  return `"${text.replace(/"/g, '""')}"`
}

/**
 * Voll-Audit tab: loads the unified findings of one check run through the
 * SDK proxy and renders summary cards, filters, a CSV export and an
 * expandable findings table.
 *
 * Every filter change triggers a new request; a superseded request is
 * aborted and its result ignored.
 *
 * @param checkId - Identifier of the check run whose findings are shown.
 */
export default function FindingsTab({ checkId }: { checkId: string }) {
  const [data, setData] = useState<Resp | null>(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)
  const [source, setSource] = useState('all')
  const [severity, setSeverity] = useState('all')
  const [docType, setDocType] = useState('all')
  const [status, setStatus] = useState('failed')
  const [q, setQ] = useState('')
  const [expanded, setExpanded] = useState<number | null>(null)

  useEffect(() => {
    let cancelled = false
    const ctrl = new AbortController()
    setLoading(true)
    // Clear a previous failure, otherwise one failed request hides the tab forever.
    setError(null)
    const qs = new URLSearchParams({
      source, severity, doc_type: docType, status, q, limit: '1500',
    }).toString()
    fetch(`/api/sdk/v1/agent/findings/${encodeURIComponent(checkId)}?${qs}`, { signal: ctrl.signal })
      .then(async r => {
        const body: unknown = await r.json().catch(() => null)
        const msg = readErrorMessage(body)
        // Non-2xx answers and bodies carrying an error are failures, not "no findings".
        if (!r.ok || msg !== null) throw new Error(msg ?? `HTTP ${r.status}`)
        if (typeof body !== 'object' || body === null) {
          throw new Error('Ungueltige Antwort vom Server')
        }
        return body as Resp
      })
      .then(d => { if (!cancelled) setData(d) })
      .catch((e: unknown) => {
        if (!cancelled) setError(e instanceof Error ? e.message : String(e))
      })
      .finally(() => { if (!cancelled) setLoading(false) })
    return () => {
      cancelled = true
      ctrl.abort()
    }
  }, [checkId, source, severity, docType, status, q])

  const docTypes = useMemo(
    () => Object.keys(data?.summary?.by_doc_type ?? {}).filter(d => d !== '-').sort(),
    [data],
  )

  /** Downloads the currently loaded (already filtered) findings as a CSV file. */
  const csvExport = () => {
    const rows = data?.findings ?? []
    const head = ['Quelle', 'Doc', 'Severity', 'Status', 'Regulation', 'Label', 'Vendor', 'Hint']
    const lines = [head.join(',')]
    for (const r of rows) {
      const cells = [
        r.source_type, r.doc_type, r.severity, r.status,
        r.regulation, r.label, r.vendor_name, r.hint,
      ].map(csvCell)
      lines.push(cells.join(','))
    }
    const blob = new Blob([lines.join('\n')], { type: 'text/csv;charset=utf-8' })
    const url = URL.createObjectURL(blob)
    const a = document.createElement('a')
    a.href = url
    a.download = `findings-${checkId}.csv`
    a.click()
    // Revoke on the next tick so the browser has picked up the download first.
    setTimeout(() => URL.revokeObjectURL(url), 0)
  }

  if (loading && !data) return <div className="p-6 text-sm text-gray-500">Lade Voll-Audit</div>
  if (error) return <div className="p-6 text-sm text-red-600">Fehler: {error}</div>
  if (!data?.found) {
    return (
      <div className="p-6 text-sm text-gray-500">
        Keine unified findings für diesen Run gespeichert (alter Run vor P5?).
      </div>
    )
  }

  const sum = data.summary
  const findings = data.findings ?? []

  return (
    <div className="space-y-4">
      {/* Summary Cards */}
      <div className="grid grid-cols-2 md:grid-cols-4 gap-3 text-xs">
        {Object.entries(SOURCE_LABEL).filter(([k]) => k !== 'all').map(([k, label]) => {
          const count = sum.by_source?.[k] ?? 0
          return (
            <button key={k}
              onClick={() => setSource(source === k ? 'all' : k)}
              className={`text-left rounded-lg border px-3 py-2 transition ${
                source === k
                  ? 'border-blue-500 bg-blue-50 text-blue-900'
                  : 'border-gray-200 hover:border-gray-300 bg-white'
              }`}>
              <div className="text-[10px] uppercase tracking-wide text-gray-500">{label}</div>
              <div className="text-lg font-semibold">{count}</div>
            </button>
          )
        })}
      </div>
      {/* Filter row */}
      <div className="flex flex-wrap gap-2 items-center text-xs">
        <select value={severity} onChange={e => setSeverity(e.target.value)}
          className="border border-gray-200 rounded px-2 py-1">
          {SEVERITY_OPTS.map(s => (
            <option key={s} value={s}>
              {s === 'all' ? 'Alle Severities' : s}
              {s !== 'all' && sum.by_severity?.[s] != null ? ` (${sum.by_severity[s]})` : ''}
            </option>
          ))}
        </select>
        <select value={status} onChange={e => setStatus(e.target.value)}
          className="border border-gray-200 rounded px-2 py-1">
          {STATUS_OPTS.map(s => (
            <option key={s} value={s}>
              {s === 'all' ? 'Alle Status' : STATUS_LABEL[s] ?? s}
              {s !== 'all' && sum.by_status?.[s] != null ? ` (${sum.by_status[s]})` : ''}
            </option>
          ))}
        </select>
        <select value={docType} onChange={e => setDocType(e.target.value)}
          className="border border-gray-200 rounded px-2 py-1">
          <option value="all">Alle Doc-Types</option>
          {docTypes.map(d => (
            <option key={d} value={d}>{d} ({sum.by_doc_type?.[d] ?? 0})</option>
          ))}
        </select>
        <input value={q} onChange={e => setQ(e.target.value)}
          placeholder="Suche Label / Anbieter…"
          className="border border-gray-200 rounded px-2 py-1 min-w-[180px]" />
        <button onClick={csvExport}
          className="ml-auto border border-gray-200 hover:border-gray-300 rounded px-2 py-1">
          CSV exportieren
        </button>
        <span className="text-gray-500">{data.count} Treffer</span>
      </div>
      {/* Findings table */}
      <div className="border rounded-lg overflow-hidden">
        <table className="w-full text-xs">
          <thead className="bg-gray-50 text-gray-600">
            <tr>
              <th className="px-3 py-2 text-left">Quelle</th>
              <th className="px-3 py-2 text-left">Doc</th>
              <th className="px-3 py-2 text-left">Sev</th>
              <th className="px-3 py-2 text-left">Status</th>
              <th className="px-3 py-2 text-left">Finding</th>
            </tr>
          </thead>
          <tbody>
            {findings.map(f => {
              // `payload` values are `unknown`; turn the label into a string once
              // so it can be used both as lookup key and as JSX text.
              const riskLabel = f.payload?.risk_label ? String(f.payload.risk_label) : ''
              return (
                <React.Fragment key={f.id}>
                  <tr className="border-t cursor-pointer hover:bg-gray-50"
                    onClick={() => setExpanded(expanded === f.id ? null : f.id)}>
                    <td className="px-3 py-2 text-gray-500 capitalize">{f.source_type}</td>
                    <td className="px-3 py-2 text-gray-700">{f.doc_type === '-' ? '—' : f.doc_type}</td>
                    <td className="px-3 py-2">
                      <span className={`px-2 py-0.5 rounded text-[10px] font-medium ${
                        SEVERITY_COLOR[f.severity] || 'bg-gray-100'
                      }`}>{f.severity}</span>
                    </td>
                    <td className="px-3 py-2 text-gray-600">{STATUS_LABEL[f.status] ?? f.status}</td>
                    <td className="px-3 py-2 text-gray-900">
                      {f.label}
                      {f.vendor_name && (
                        <span className="ml-2 text-[10px] text-gray-400">
                          · {f.vendor_name}
                        </span>
                      )}
                      {riskLabel && (
                        <span className={`ml-2 px-1.5 py-0.5 rounded text-[10px] font-medium ${
                          RISK_BADGE_CLASS[riskLabel] ?? 'bg-gray-100 text-gray-500'
                        }`}>Risk: {riskLabel}</span>
                      )}
                    </td>
                  </tr>
                  {expanded === f.id && (
                    <tr className="bg-gray-50/50">
                      <td colSpan={5} className="px-3 py-3 text-xs space-y-2">
                        {f.hint && (
                          <div className="text-gray-700">{f.hint}</div>
                        )}
                        {f.action_recipe?.fix_text && (
                          <div className="bg-amber-50 border-l-2 border-amber-300 pl-3 py-2">
                            <div className="font-medium text-amber-800 mb-1">Empfehlung</div>
                            <div className="whitespace-pre-line text-amber-900">
                              {f.action_recipe.fix_text}
                            </div>
                            {f.action_recipe.where && (
                              <div className="text-[10px] text-amber-700 mt-1">
                                Einfuegen in: {f.action_recipe.where}
                              </div>
                            )}
                          </div>
                        )}
                        {f.anchor_excerpt && (
                          <div className="bg-blue-50 border-l-2 border-blue-300 pl-3 py-2">
                            <div className="font-medium text-blue-800 mb-1">
                              Fundstelle im Dokument (Konfidenz {Math.round((f.anchor_conf || 0) * 100)}%)
                            </div>
                            <div className="italic text-blue-900">"{f.anchor_excerpt}"</div>
                          </div>
                        )}
                        <div className="text-[10px] text-gray-400">
                          Source: {f.source_type} · Regulation: {f.regulation || '—'}
                          {f.category && ` · Kategorie: ${f.category}`}
                        </div>
                      </td>
                    </tr>
                  )}
                </React.Fragment>
              )
            })}
            {findings.length === 0 && (
              <tr><td colSpan={5} className="px-3 py-6 text-center text-gray-400">
                Keine Findings fuer die aktuellen Filter.
              </td></tr>
            )}
          </tbody>
        </table>
      </div>
    </div>
  )
}
@@ -2,6 +2,7 @@
import React, { useEffect, useState, useMemo } from 'react'
import { use as useUnwrap } from 'react'
import FindingsTab from './FindingsTab'
type MCRow = {
id: number
@@ -67,6 +68,7 @@ export default function AuditPage(
const [filterReg, setFilterReg] = useState<string>('')
const [filterDoc, setFilterDoc] = useState<string>('')
const [expanded, setExpanded] = useState<number | null>(null)
const [tab, setTab] = useState<'mc' | 'all'>('all')
useEffect(() => {
let cancelled = false
@@ -127,6 +129,25 @@ export default function AuditPage(
</p>
</div>
{/* Tab switcher */}
<div className="flex gap-2 border-b border-gray-200">
{([
{ key: 'all', label: 'Voll-Audit (alle Findings)' },
{ key: 'mc', label: 'Nur MC-Scorecard' },
] as const).map(t => (
<button key={t.key}
onClick={() => setTab(t.key)}
className={`px-4 py-2 text-sm border-b-2 -mb-px transition ${
tab === t.key
? 'border-blue-600 text-blue-700 font-medium'
: 'border-transparent text-gray-500 hover:text-gray-700'
}`}>{t.label}</button>
))}
</div>
{tab === 'all' && <FindingsTab checkId={checkId} />}
{tab === 'mc' && <>
{/* Scorecard */}
<div className="border rounded-lg overflow-hidden">
<div className="px-4 py-3 bg-blue-50 border-b border-blue-100">
@@ -272,6 +293,7 @@ export default function AuditPage(
</tbody>
</table>
</div>
</>}
</div>
)
}
@@ -166,6 +166,33 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
except Exception:
pass
# P7: TDM-Reservation-Check der Base-Domain (§ 44b UrhG).
# Bei reserved/denied: Run sofort beenden, kein Crawl.
try:
from compliance.services.tdm_reservation_check import (
check_tdm_reservation, is_crawl_allowed,
)
first_url = next(
(d.url for d in req.documents if d.url), "",
)
if first_url:
tdm = await check_tdm_reservation(first_url)
_compliance_check_jobs[check_id]["tdm"] = tdm
if not is_crawl_allowed(tdm):
_compliance_check_jobs[check_id]["status"] = "skipped_tdm"
_compliance_check_jobs[check_id]["error"] = (
f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt "
f"(status={tdm.get('status')}) — Crawl nach § 44b "
f"UrhG nicht zulaessig. Signals: "
f"{[s.get('src') for s in tdm.get('signals', [])]}"
)
_compliance_check_jobs[check_id]["progress_pct"] = 100
logger.info("TDM-skip check_id=%s domain=%s status=%s",
check_id, tdm.get("domain"), tdm.get("status"))
return
except Exception as e:
logger.warning("TDM-check failed (proceeding): %s", e)
# Step 1: Resolve texts (fetch from URL if needed) — 0-30%
_update(check_id, "Texte werden geladen...", 1)
doc_texts: dict[str, str] = {}
@@ -526,15 +553,37 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
report_html = build_html_report(results, None, doc_texts)
profile_html = _build_profile_html(profile)
# O4: Vendor-Redundanz / EU-Alternativen + Cost-Savings-Block
# zwischen VVT und Doc-Report einsortiert, damit Geschaeftsfuehrung
# die Einsparung sieht bevor sie in die Detail-Pruefung geht.
# O4: Vendor-Redundanz / EU-Alternativen + Cost-Savings-Block
from .agent_doc_check_redundancy import build_redundancy_html
redundancy_html = build_redundancy_html(redundancy_report)
# P1: Executive-Summary GANZ oben — CFO/GF sieht 4 KPIs + 2 CTAs.
from .agent_doc_check_exec_summary import build_exec_summary_html
# Site-Name fuer Header bestimmen (gleiche Logik wie Email-Subject)
url_company_for_exec = _company_name_from_url(doc_entries)
domain_for_exec = _extract_domain(doc_entries)
site_name_for_exec = url_company_for_exec or domain_for_exec or ""
exec_summary_html = build_exec_summary_html(
scorecard=scorecard,
previous_scorecard=prev_scorecard,
cmp_vendors=cmp_vendors,
redundancy_report=redundancy_report,
site_name=site_name_for_exec,
)
# Reihenfolge — Sales-optimiert:
# 1) Exec-Summary (KPIs + Saving + CTAs)
# 2) summary_html (Konkrete Aufgaben fuer die Geschaeftsfuehrung)
# 3) scanned_urls (Quellen-Transparenz)
# 4) profile_html (Erkanntes Geschaeftsmodell)
# 5) scorecard_html (MC-Scorecard)
# 6) redundancy_html (Optimierungspotenzial — direkt nach Compliance-Score)
# 7) providers_html + vvt_html (Vendor-Liste)
# 8) report_html (Doc-Pruefung Details)
full_html = (
summary_html + scanned_html + profile_html + scorecard_html
+ providers_html + vvt_html + redundancy_html + report_html
exec_summary_html + summary_html + scanned_html + profile_html
+ scorecard_html + redundancy_html
+ providers_html + vvt_html + report_html
)
# Step 6: Send email — derive site name primarily from entered URL.
@@ -619,6 +668,21 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
vendors=cmp_vendors,
profile=extracted_profile,
)
# Unified findings (P5): bundle MC + Pflichtangaben + Vendor +
# Redundanz in one searchable table behind /agent/findings/<id>.
try:
from compliance.services.unified_findings_collector import collect
from compliance.services.unified_findings_store import record_findings
unified = collect(
check_id=check_id,
results=results,
cmp_vendors=cmp_vendors,
redundancy_report=redundancy_report,
doc_texts=doc_texts,
)
record_findings(check_id, unified)
except Exception as e:
logger.warning("Unified findings collect failed: %s", e)
except Exception as e:
logger.warning("Audit persistence skipped: %s", e)
@@ -696,11 +760,19 @@ async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
except Exception as e:
logger.warning("Consent-tester fetch failed for %s: %s", url, e)
# 2. Fallback: direct HTTP fetch (works for SSR pages like BMW)
# 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
# P7: kenntlicher UA + per-Domain Rate-Limit.
try:
import re as _re
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
resp = await client.get(url)
from compliance.services.compliance_user_agent import (
default_request_headers, DomainRateLimiter,
)
async with httpx.AsyncClient(
timeout=30.0, follow_redirects=True,
headers=default_request_headers(),
) as client:
async with DomainRateLimiter(url):
resp = await client.get(url)
if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""):
html = resp.text
# Strip HTML tags, decode entities
@@ -1135,8 +1207,25 @@ def _company_name_from_url(doc_entries: list[dict]) -> str | None:
def _get_skip_types(profile) -> dict[str, str]:
"""Doc_types to skip entirely. Currently empty — we check everything
and flag irrelevant items as INFO instead of skipping."""
"""Doc_types to skip entirely with a per-type reason message.
Heute primaer fuer OEM-Konfigurator-Pattern (BMW/Audi/Mercedes):
wenn die Site kein Direkt-Vertrieb macht, sind AGB/Widerruf/
Nutzungsbedingungen nicht Pflicht auf der Website — sie werden
beim Vertragshaendler ausgehaendigt.
"""
if getattr(profile, "no_direct_sales", False):
msg = (
"Nicht anwendbar — die Webseite schliesst keinen Direkt-"
"Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
"ueber Vertragshaendler). AGB/Widerruf werden beim "
"Haendler ausgehaendigt."
)
return {
"agb": msg,
"widerruf": msg,
"nutzungsbedingungen": msg,
}
return {}
@@ -0,0 +1,135 @@
"""
Executive-Summary-Block — der oberste Email-Abschnitt.
Zeigt CFO / GF in 4 Zahlen den Gesamt-Mehrwert des Compliance-Checks:
1) Compliance-Score (Trend vs Vorlauf)
2) Anzahl analysierter Anbieter
3) Geschaetztes jaehrliches Sparpotenzial (Range)
4) Konsolidierungs-Potenzial (Anbieter koennen reduziert werden)
Plus zwei Big-CTA-Buttons:
- "Compliance-Maengel im Detail" springt zum Doc-Pruefungs-Block
- "Konsolidierungs-Plan ansehen" springt zum Redundanz-Block
Ziel: in 5 Sekunden sieht der Vorstand den ROI. Wenn neugierig, scrollt
er weiter in die Detail-Bloecke (die UNTER dieser Summary liegen).
"""
from __future__ import annotations
def _fmt_eur_range(low: int, high: int) -> str:
if not low and not high:
return ""
if low == high:
return f"~{low:,}".replace(",", ".")
return f"{low:,}{high:,}".replace(",", ".")
def _exec_summary_int(value: object, default: int = 0) -> int:
    """Coerce ``value`` to ``int``; return ``default`` for ``None`` or garbage."""
    try:
        return int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError):
        return default


def build_exec_summary_html(
    scorecard: dict | None,
    previous_scorecard: dict | None,
    cmp_vendors: list[dict] | None,
    redundancy_report: dict | None,
    site_name: str = "",
) -> str:
    """Build the top-of-email Executive Summary with 4 KPIs + 2 CTAs.

    Missing or malformed inputs degrade to neutral values (0 %, 0 vendors,
    "n/a" saving) so that a partial result never prevents the email from
    being rendered.

    Args:
        scorecard: Current scorecard; ``totals.pct`` and ``totals.total`` are read.
        previous_scorecard: Scorecard of the previous run, used for the trend badge.
        cmp_vendors: Identified vendors; only the count is shown.
        redundancy_report: Report whose ``summary`` provides
            ``estimated_saving_year_eur`` (low/high pair),
            ``estimated_saving_pct`` and ``consolidation_potential``.
        site_name: Name shown in the heading; HTML-escaped before use.

    Returns:
        One self-contained HTML fragment with inline styles.
    """
    # Local import so this block does not depend on the module's import header.
    from html import escape

    # 1) Compliance-Score
    pct = 0
    delta_str = ""
    score_color = "#94a3b8"
    totals = (scorecard or {}).get("totals") or {}
    total_checks = _exec_summary_int(totals.get("total"))
    if scorecard:
        pct = _exec_summary_int(totals.get("pct"))
        score_color = ("#16a34a" if pct >= 80 else
                       "#d97706" if pct >= 50 else "#dc2626")
        if previous_scorecard:
            prev_totals = previous_scorecard.get("totals") or {}
            prev_pct = _exec_summary_int(prev_totals.get("pct"))
            d = pct - prev_pct
            if d:
                trend_color = "#16a34a" if d > 0 else "#dc2626"
                delta_str = (
                    f'<span style="font-size:14px;color:{trend_color};margin-left:6px">'
                    f'{"+" if d > 0 else ""}{d} pp</span>'
                )

    # 2) Vendor-Count
    n_vendors = len(cmp_vendors or [])

    # 3+4) Saving + consolidation
    s = (redundancy_report or {}).get("summary") or {}
    try:
        # The key may be present with None or a wrong-sized list.
        sav_low, sav_high = s.get("estimated_saving_year_eur") or [0, 0]
    except (TypeError, ValueError):
        sav_low, sav_high = 0, 0
    sav_low = _exec_summary_int(sav_low)
    sav_high = _exec_summary_int(sav_high)
    n_consolidation = escape(str(s.get("consolidation_potential") or 0))
    sav_pct = escape(str(s.get("estimated_saving_pct") or ""))
    # Without data show "n/a" rather than an empty amount followed by "()".
    saving_amount = _fmt_eur_range(sav_low, sav_high) or "n/a"
    sav_pct_span = (
        f'<span style="font-size:14px;color:#86efac;margin-left:8px">({sav_pct})</span>'
        if sav_pct else ""
    )
    # The site name is derived from user-entered URLs / crawled company names.
    safe_site_name = escape(str(site_name or ""))

    parts = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 18px;padding:18px 22px;'
        'background:linear-gradient(135deg,#1e293b 0%,#0f172a 100%);'
        'border-radius:10px;color:white">',
        f'<div style="font-size:11px;color:#94a3b8;text-transform:uppercase;'
        f'letter-spacing:1.5px;margin-bottom:6px">Executive Summary</div>',
        f'<h2 style="margin:0 0 16px;font-size:18px;color:white">'
        f'Compliance-Check {safe_site_name}</h2>',
        # 2x2 KPI grid
        '<table style="width:100%;border-collapse:separate;border-spacing:8px">',
        # Row 1: Compliance + Vendor count
        '<tr>',
        f'<td style="width:50%;padding:12px 14px;background:rgba(255,255,255,0.05);'
        f'border-radius:6px;border:1px solid rgba(255,255,255,0.08)">'
        f'<div style="font-size:10px;color:#94a3b8;text-transform:uppercase;'
        f'letter-spacing:1px;margin-bottom:4px">DSGVO / TDDDG / TMG Score</div>'
        f'<div style="font-size:28px;font-weight:700;color:{score_color}">'
        f'{pct}%{delta_str}</div>'
        f'<div style="font-size:11px;color:#cbd5e1;margin-top:2px">'
        f'aus {total_checks} Pflicht-Pruefungen</div>'
        f'</td>',
        f'<td style="width:50%;padding:12px 14px;background:rgba(255,255,255,0.05);'
        f'border-radius:6px;border:1px solid rgba(255,255,255,0.08)">'
        f'<div style="font-size:10px;color:#94a3b8;text-transform:uppercase;'
        f'letter-spacing:1px;margin-bottom:4px">Identifizierte Anbieter</div>'
        f'<div style="font-size:28px;font-weight:700;color:white">{n_vendors}</div>'
        f'<div style="font-size:11px;color:#cbd5e1;margin-top:2px">'
        f'davon {n_consolidation} konsolidierbar</div>'
        f'</td>',
        '</tr>',
        # Row 2: Saving + CTA hint
        '<tr>',
        f'<td colspan="2" style="padding:14px 16px;background:linear-gradient(90deg,'
        f'rgba(16,185,129,0.15) 0%,rgba(16,185,129,0.05) 100%);'
        f'border-radius:6px;border:1px solid rgba(16,185,129,0.3)">'
        f'<div style="font-size:10px;color:#86efac;text-transform:uppercase;'
        f'letter-spacing:1px;margin-bottom:4px">'
        f'Geschaetztes Sparpotenzial pro Jahr (Tool-Lizenzen, ohne Media-Spend)</div>'
        f'<div style="font-size:24px;font-weight:700;color:#34d399">'
        f'{saving_amount}'
        f'{sav_pct_span}</div>'
        f'<div style="font-size:11px;color:#cbd5e1;margin-top:4px">'
        f'durch Konsolidierung redundanter Anbieter auf je 1 EU-Tool pro '
        f'Funktions-Kategorie. <em>Schaetzbereich, mit dem Einkauf zu verifizieren.</em>'
        f'</div></td>',
        '</tr>',
        '</table>',
        # CTAs
        '<div style="margin-top:14px;padding-top:12px;border-top:1px solid '
        'rgba(255,255,255,0.1);text-align:center">',
        '<a href="#mc-scorecard" style="display:inline-block;padding:8px 16px;'
        'background:#7c3aed;color:white;text-decoration:none;border-radius:6px;'
        'font-size:12px;font-weight:600;margin-right:8px">'
        'Compliance-Maengel im Detail &rarr;</a>',
        '<a href="#optimierungspotenzial" style="display:inline-block;padding:8px 16px;'
        'background:#10b981;color:white;text-decoration:none;border-radius:6px;'
        'font-size:12px;font-weight:600">'
        'Konsolidierungs-Plan &rarr;</a>',
        '</div>',
        '</div>',
    ]
    return "".join(parts)
@@ -421,10 +421,18 @@ def _render_vendor_row_full(v: dict) -> str:
f'{", ".join(flags[:4])}</div>'
f'{actions_html}'
)
risk = v.get("compliance_risk") or {}
risk_label = risk.get("label") or ""
risk_badge = ""
if risk_label and risk_label != "unklar":
rc = {"kritisch": ("#dc2626", "#fff"), "hoch": ("#fecaca", "#991b1b"),
"mittel": ("#fde68a", "#92400e"), "gering": ("#d1fae5", "#065f46")}.get(risk_label, ("#e5e7eb", "#475569"))
risk_badge = (f'<span style="margin-left:6px;padding:1px 5px;border-radius:3px;font-size:9px;'
f'background:{rc[0]};color:{rc[1]}">Risk: {risk_label}</span>')
return (
f'<tr style="border-top:1px solid #e2e8f0">'
f'<td style="padding:6px 8px;color:#1e293b;font-size:11px">'
f'{name}{flag_str}</td>'
f'{name}{risk_badge}{flag_str}</td>'
f'<td style="padding:6px 8px;color:#475569;font-size:11px">{category}</td>'
f'<td style="padding:6px 8px;color:#475569;font-size:11px">{country}</td>'
f'<td style="padding:6px 8px;text-align:center;color:#475569;font-size:11px">'
@@ -28,9 +28,10 @@ def build_redundancy_html(report: dict | None) -> str:
pct = s.get("estimated_saving_pct") or "n/a"
parts = [
'<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
'max-width:700px;margin:0 auto 16px;padding:14px 18px;'
'background:#fef3c7;border:1px solid #fcd34d;border-radius:8px">',
'<div id="optimierungspotenzial" style="font-family:-apple-system,'
'BlinkMacSystemFont,sans-serif;max-width:700px;margin:0 auto 16px;'
'padding:14px 18px;background:#fef3c7;border:1px solid #fcd34d;'
'border-radius:8px">',
'<h3 style="margin:0 0 6px;font-size:14px;color:#92400e">'
'Optimierungspotenzial: Redundanzen + EU-Alternativen</h3>',
f'<p style="margin:0 0 10px;font-size:11px;color:#78350f">'
@@ -134,7 +134,9 @@ def build_management_summary(results: list[DocCheckResult]) -> str:
ok = [r for r in results if r.completeness_pct == 100 and not r.error]
fixable = [r for r in results if 0 < r.completeness_pct < 100 and not r.error]
critical = [r for r in results if r.completeness_pct == 0 and not r.error]
errors = [r for r in results if r.error]
not_applicable = [r for r in results if r.error
and r.error.startswith("Nicht anwendbar")]
errors = [r for r in results if r.error and r not in not_applicable]
html = [
'<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
@@ -150,17 +152,24 @@ def build_management_summary(results: list[DocCheckResult]) -> str:
html.append('<p>Keine Dokumente geprueft.</p></div>')
return "\n".join(html)
na_note = (
f' Zusaetzlich {len(not_applicable)} Dokument{"" if len(not_applicable) == 1 else "e"} '
f'als NICHT ANWENDBAR markiert (kein Direkt-Vertrieb — '
f'OEM-Konfigurator-Pattern).' if not_applicable else ""
)
if len(ok) == total:
html.append(
'<p style="color:#16a34a;font-weight:600;font-size:15px">'
'Alle Dokumente sind vollstaendig. Keine dringenden Massnahmen noetig.</p>'
f'<p style="color:#16a34a;font-weight:600;font-size:15px">'
f'Alle Dokumente sind vollstaendig. Keine dringenden Massnahmen noetig.'
f'{na_note}</p>'
)
else:
html.append(
f'<p style="font-size:14px;color:#475569">'
f'{len(ok)} von {total} Dokumenten sind vollstaendig. '
f'{len(fixable)} brauchen Korrekturen'
f'{f", {len(critical)} fehlen oder sind unbrauchbar" if critical else ""}.</p>'
f'{f", {len(critical)} fehlen oder sind unbrauchbar" if critical else ""}.'
f'{na_note}</p>'
)
# Concrete actions
@@ -279,10 +288,13 @@ def _render_document(html: list[str], r: DocCheckResult, doc_text: str = "") ->
r.error.startswith("Nicht eingereicht")
or r.error.startswith("Auf der Website nicht gefunden")
)
is_not_applicable = bool(r.error) and r.error.startswith("Nicht anwendbar")
if is_missing:
status_label = ("NICHT GEFUNDEN"
if r.error.startswith("Auf der Website")
else "NICHT EINGEREICHT")
elif is_not_applicable:
status_label = "NICHT ANWENDBAR"
elif r.error:
status_label = "FEHLER"
@@ -330,6 +342,13 @@ def _render_document(html: list[str], r: DocCheckResult, doc_text: str = "") ->
'background:#fafafa;border-top:1px solid #f3f4f6">'
+ body_msg + '</div>'
)
elif is_not_applicable:
html.append(
'<div style="padding:12px 16px;color:#475569;font-size:12px;'
'background:#f1f5f9;border-top:1px solid #cbd5e1;border-left:'
'3px solid #94a3b8">'
+ r.error + '</div>'
)
elif r.error:
html.append(f'<div style="padding:12px 16px;color:#991b1b">{r.error}</div>')
else:
@@ -44,7 +44,7 @@ def build_scorecard_html(
trend_str = _delta_badge(overall_pct, prev_total_pct) if prev_total_pct is not None else ""
head = (
'<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
'<div id="mc-scorecard" style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
'<h3 style="margin:0 0 6px;font-size:14px;color:#0369a1">'
@@ -0,0 +1,104 @@
"""
Voll-Audit Findings Router — unified view across all 4 finding sources.
Endpoint:
GET /api/compliance/agent/findings/{check_id}
?source=mc|pflichtangabe|vendor|redundanz|all
&severity=CRITICAL|HIGH|MEDIUM|LOW|INFO|all
&doc_type=impressum|dse|cookie|...|all
&status=failed|passed|skipped|na|info|all
&q=<freitext>
&limit=<int>
Liefert summary + filtered findings list. Frontend rendert daraus den
Voll-Audit-Tab unter /sdk/agent/audit/<check_id>.
"""
from __future__ import annotations
import logging
from urllib.parse import urlparse
from fastapi import APIRouter, HTTPException, Query
from compliance.services.unified_findings_store import (
findings_summary,
list_findings,
)
from compliance.services.compliance_audit_log import get_check_run
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
def _normalize_domain(d: str) -> str:
if not d:
return ""
if "://" not in d:
d = "https://" + d
host = urlparse(d).netloc.lower()
return host[4:] if host.startswith("www.") else host
def _filter_or_none(value: str | None) -> str | None:
    """Map the UI wildcard ``"all"`` and blank values to ``None`` (= no filter)."""
    if value is None:
        return None
    value = value.strip()
    return None if not value or value.lower() == "all" else value


@router.get("/findings/{check_id}")
def get_findings(
    check_id: str,
    source: str | None = Query(None, description="mc|pflichtangabe|vendor|redundanz|all"),
    severity: str | None = Query(None, description="CRITICAL|HIGH|MEDIUM|LOW|INFO|all"),
    doc_type: str | None = Query(None),
    status: str | None = Query(None, description="failed|passed|skipped|na|info|all"),
    q: str | None = Query(None, description="freitext-suche label/vendor"),
    limit: int = Query(1000, ge=1, le=5000),
    expected_domain: str | None = Query(
        None, description="Hard-Assertion: Run muss zu dieser Domain gehoeren (Cross-Tenant-Schutz)",
    ),
) -> dict:
    """Return aggregated findings + summary counters for a check run.

    Args:
        check_id: Identifier of the check run.
        source, severity, doc_type, status: Optional filters; ``"all"`` or an
            empty value means "no filter".
        q: Optional free-text search; blank means "no search".
        limit: Maximum number of findings returned (1..5000).
        expected_domain: Optional assertion that the run belongs to this domain.

    Returns:
        Dict with ``found``, ``check_id``, ``summary``, the effective
        ``filter``, ``count`` and ``findings``. If the store lookup fails the
        same shape is returned with ``found=False`` and an ``error`` text.

    Raises:
        HTTPException: 403 when ``expected_domain`` is given and the run is
            unknown or belongs to a different domain.
    """
    # Optional domain assertion: prevents a frontend from requesting a
    # check_id that belongs to another tenant's domain.
    if expected_domain:
        try:
            wanted = _normalize_domain(expected_domain)
        except ValueError:
            wanted = ""
        run = get_check_run(check_id)
        actual = _normalize_domain((run or {}).get("base_domain") or "")
        # An expected domain that normalises to "" must never match a run
        # without base_domain.
        if not run or not wanted or actual != wanted:
            # Keep the real domain in the server log only — echoing it back
            # would tell the caller which domain a foreign check_id belongs to.
            logger.warning(
                "Cross-tenant access blocked: check_id=%s actual=%s expected=%s",
                check_id, actual or "?", wanted or "?",
            )
            shown = wanted or "?"
            raise HTTPException(
                status_code=403,
                detail=f"Cross-tenant access blocked: check_id {check_id} "
                       f"gehoert nicht zu Domain '{shown}'",
            )

    # The UI sends "all" / "" for unset filters; None is this endpoint's own
    # default for "no filter", so pass that to the store instead of the wildcard.
    source = _filter_or_none(source)
    severity = _filter_or_none(severity)
    doc_type = _filter_or_none(doc_type)
    status = _filter_or_none(status)
    q = (q or "").strip() or None

    try:
        summary = findings_summary(check_id)
        findings = list_findings(
            check_id=check_id,
            source_type=source,
            severity=severity,
            doc_type=doc_type,
            status=status,
            q=q,
            limit=limit,
        )
        return {
            "found": summary.get("total", 0) > 0,
            "check_id": check_id,
            "summary": summary,
            "filter": {
                "source": source or "all",
                "severity": severity or "all",
                "doc_type": doc_type or "all",
                "status": status or "all",
                "q": q or "",
                "limit": limit,
            },
            "count": len(findings),
            "findings": findings,
        }
    except Exception as e:
        # Best effort: the audit UI must still render when the sidecar store
        # is unavailable, so the failure is reported in-band.
        logger.exception("get_findings failed for %s", check_id)
        return {
            "found": False,
            "check_id": check_id,
            "error": str(e)[:200],
            "summary": {},
            "count": 0,
            "findings": [],
        }
@@ -0,0 +1,196 @@
"""
Saving-Scan-Funnel Endpoint — Marketing-Lead Compliance-Check.
Externes Form (https://breakpilot.ai/savings-scan) postet hier:
POST /api/compliance/agent/saving-scan/start
Body: {"url": "...", "email": "..."}
Server-side:
1. Validierung URL + Email (E-Mail-Regex, URL-Schema).
2. Rate-Limit: max 1 vollstaendiger Scan / Domain / 24h
(saving_scan_allowed aus compliance_user_agent).
3. Lead persistieren (saving_scan_leads in Sidecar-SQLite) fuer
spaeteren Report-Versand + Sales-Follow-Up.
4. Compliance-Check starten mit Auto-Discovery (DocumentInput leer
ausser Homepage). Der bestehende Worker laeuft TDM-Check, dann
Discovery, dann Pruefung.
5. check_id zurueck — Frontend pollt /compliance-check/<check_id>.
"""
from __future__ import annotations
import logging
import os
import re
import sqlite3
import uuid as _uuid
from datetime import datetime, timezone
from pathlib import Path
import asyncio
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from compliance.services.compliance_user_agent import (
base_domain_of, saving_scan_allowed,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# SQLite file holding the leads table; overridable via COMPLIANCE_AUDIT_DB.
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
# Pragmatic e-mail shape check: local part, "@", dotted host, TLD of at least
# two letters. Not a full RFC 5322 validator.
_EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
# http/https URL whose host consists of letters, digits, dots and hyphens,
# optionally followed by a path starting with "/". As written, the pattern does
# not match hosts with an explicit port or a query string without a path.
_URL_RE = re.compile(r"^https?://[A-Za-z0-9.-]+(/.*)?$")
class SavingScanRequest(BaseModel):
    # Request body accepted by POST /saving-scan/start.

    # Website to scan; the route additionally checks it against _URL_RE.
    url: str = Field(..., min_length=4, max_length=400)
    # Address that receives the report; the route checks it against _EMAIL_RE.
    email: str = Field(..., min_length=5, max_length=200)
    # The route rejects the request with HTTP 400 when this is False.
    # NOTE(review): the default is True, so a request that omits the field is
    # treated as consent given — confirm that this is intended.
    consent: bool = Field(
        True, description="Marketing-Consent fuer Sales-Follow-Up — "
        "muss True sein laut Form-Checkbox.",
    )
class SavingScanResponse(BaseModel):
    # Response body of POST /saving-scan/start.

    # Identifier of the started check run.
    check_id: str
    # Job status; the start route returns "running".
    status: str
    # Human-readable note for the form UI; may be empty.
    message: str = ""
def _ensure_leads_table() -> None:
    """Create the ``saving_scan_leads`` table and its indexes if they are missing.

    Also creates the parent directory of ``DB_PATH``. Safe to call repeatedly:
    every statement uses ``IF NOT EXISTS``.
    """
    from contextlib import closing

    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    # sqlite3's own context manager only ends the transaction; closing() is
    # what actually releases the database handle.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS saving_scan_leads (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ts TEXT NOT NULL,
                email TEXT NOT NULL,
                url TEXT NOT NULL,
                base_domain TEXT NOT NULL,
                check_id TEXT,
                consent INTEGER NOT NULL,
                source TEXT
            );
            CREATE INDEX IF NOT EXISTS idx_leads_domain ON saving_scan_leads(base_domain, ts);
            CREATE INDEX IF NOT EXISTS idx_leads_email ON saving_scan_leads(email, ts);
        """)
        conn.commit()
def _persist_lead(email: str, url: str, check_id: str, consent: bool) -> None:
    """Store one marketing lead in ``saving_scan_leads`` (best effort).

    Args:
        email: Address entered in the form; stored lower-cased and stripped.
        url: URL as submitted; its base domain is stored alongside.
        check_id: Identifier of the check run started for this lead.
        consent: Marketing-consent flag; stored as 1/0.

    Any failure is logged as a warning and swallowed, so a broken lead store
    never blocks the scan itself.
    """
    from contextlib import closing

    try:
        _ensure_leads_table()
        # closing() releases the handle; sqlite3's own context manager would
        # only commit or roll back and leave the connection open.
        with closing(sqlite3.connect(DB_PATH)) as conn:
            conn.execute(
                "INSERT INTO saving_scan_leads "
                "(ts, email, url, base_domain, check_id, consent, source) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    datetime.now(timezone.utc).isoformat(),
                    email.lower().strip(),
                    url,
                    base_domain_of(url),
                    check_id,
                    1 if consent else 0,
                    "saving_scan_form",
                ),
            )
            conn.commit()
    except Exception as e:
        logger.warning("persist lead failed: %s", e)
def _normalize_url(url: str) -> str:
    """Reduce a URL to its homepage: scheme, host and a trailing ``/``.

    Path, query and fragment are dropped because discovery is expected to find
    the remaining pages. A missing scheme defaults to ``https``.
    """
    from urllib.parse import urlparse

    absolute = url if "://" in url else "https://" + url
    parts = urlparse(absolute)
    return "{}://{}/".format(parts.scheme, parts.netloc)
# Strong references to in-flight scan tasks. The event loop keeps only weak
# references to tasks, so a task nobody else references can be
# garbage-collected before it finishes.
_background_scan_tasks: set = set()


@router.post("/saving-scan/start", response_model=SavingScanResponse)
async def start_saving_scan(req: SavingScanRequest) -> SavingScanResponse:
    """Trigger a compliance check from the marketing-funnel form.

    Steps: validate e-mail, URL and consent; enforce the per-domain limit via
    ``saving_scan_allowed``; register a job entry; persist the lead; start the
    check as a background task with only the homepage as input.

    Returns:
        ``SavingScanResponse`` carrying the new ``check_id`` and status
        ``"running"``.

    Raises:
        HTTPException: 400 for an invalid e-mail or URL, missing consent or an
            undeterminable domain; 429 when ``saving_scan_allowed`` denies the
            run for this domain.
    """
    if not _EMAIL_RE.match(req.email):
        raise HTTPException(400, "Ungueltige E-Mail-Adresse.")
    if not _URL_RE.match(req.url):
        raise HTTPException(400, "URL muss mit http:// oder https:// beginnen.")
    if not req.consent:
        raise HTTPException(400, "Marketing-Consent erforderlich.")

    domain = base_domain_of(req.url)
    if not domain:
        raise HTTPException(400, "Konnte Domain nicht ermitteln.")

    allowed, wait_s = saving_scan_allowed(req.url)
    if not allowed:
        raise HTTPException(
            429,
            f"Fuer '{domain}' wurde in den letzten 24h bereits ein Scan "
            f"durchgefuehrt. Bitte in {wait_s // 3600}h {wait_s % 3600 // 60}min "
            f"erneut versuchen.",
        )

    # Lazy import to avoid circular dependency at module load.
    from compliance.api.agent_compliance_check_routes import (
        DocumentInput,
        ComplianceCheckRequest,
        _run_compliance_check,
        _compliance_check_jobs,
    )

    homepage = _normalize_url(req.url)
    check_id = str(_uuid.uuid4())[:8]
    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Saving-Scan gestartet — Auto-Discovery laeuft...",
        "progress_pct": 0,
        "result": None,
        "error": "",
    }
    # Single "other" entry forces auto-discovery to fill in the rest.
    docs = [DocumentInput(doc_type="other", url=homepage)]
    check_req = ComplianceCheckRequest(
        documents=docs, recipient=req.email.lower().strip(),
    )
    _persist_lead(req.email, req.url, check_id, req.consent)

    # Hold a reference until the task is done; without it the task could be
    # collected mid-run and the job would stay "running" forever.
    task = asyncio.create_task(_run_compliance_check(check_id, check_req))
    _background_scan_tasks.add(task)
    task.add_done_callback(_background_scan_tasks.discard)

    # Only the first three characters of the address are logged.
    logger.info("saving-scan start: check_id=%s domain=%s email=%s",
                check_id, domain, req.email[:3] + "***")
    return SavingScanResponse(
        check_id=check_id,
        status="running",
        message=f"Scan gestartet fuer {domain}. Bericht in ~3-5 Minuten.",
    )
@router.get("/saving-scan/lead-count")
def saving_scan_lead_count() -> dict:
    """Diagnostics for the sales dashboard.

    Returns:
        ``total_leads``, ``last_24h`` and the ten most frequent ``top_domains``.
        On failure a dict with a truncated ``error`` text is returned instead.
    """
    from contextlib import closing

    try:
        _ensure_leads_table()
        # closing() releases the handle; sqlite3's own context manager only
        # ends the transaction.
        with closing(sqlite3.connect(DB_PATH)) as conn:
            total = conn.execute(
                "SELECT COUNT(*) FROM saving_scan_leads",
            ).fetchone()[0]
            # ts is written as ISO-8601 with a "T" separator and UTC offset,
            # whereas datetime('now', ...) yields "YYYY-MM-DD HH:MM:SS". A raw
            # string comparison treats every row dated on the cutoff day as
            # newer ("T" sorts after " "), so both sides are normalised with
            # datetime() before comparing.
            last_24h = conn.execute(
                "SELECT COUNT(*) FROM saving_scan_leads "
                "WHERE datetime(ts) > datetime('now', '-1 day')",
            ).fetchone()[0]
            top_domains = conn.execute(
                "SELECT base_domain, COUNT(*) AS n FROM saving_scan_leads "
                "GROUP BY base_domain ORDER BY n DESC LIMIT 10",
            ).fetchall()
        return {
            "total_leads": total,
            "last_24h": last_24h,
            "top_domains": [{"domain": d, "scans": n} for d, n in top_domains],
        }
    except Exception as e:
        logger.warning("saving-scan lead-count failed: %s", e)
        return {"error": str(e)[:200]}
@@ -0,0 +1,149 @@
"""
k-Anonymitaets-Helper fuer Branchen-Benchmarks (P6-Vorbereitung).
Vor jeder Veroeffentlichung von Benchmark-Aussagen pruefen, ob die
zugrundeliegende Stichprobe gross genug ist, dass keine Re-Identifikation
einzelner Hersteller moeglich wird.
Default k=5: jede publizierbare Aussage muss auf mindestens 5 verschiedenen
Datensubjekten (z.B. OEM-Sites) beruhen. Bei OEM-Markt mit ~30 Spielern
ist k=5 das Minimum, um "ein deutscher Premium-Hersteller mit X Modellen"
auszuschliessen.
Memory: feedback_oem_data_legal.md + project_legal_contracts_2026_07.md.
Verwendung:
from compliance.services.benchmark_k_anonymity import (
enforce_k_anonymity, quantize_value, KAnonymityError,
)
rows = [...] # pro Hersteller 1 Row
safe_groups = enforce_k_anonymity(rows, group_keys=["segment", "country"])
# safe_groups: nur Gruppen mit count >= 5 zurueck
"""
from __future__ import annotations
from collections.abc import Iterable
from typing import Any
# Default anonymity threshold: minimum sample size behind a publishable statement.
DEFAULT_K = 5


class KAnonymityError(RuntimeError):
    """Raised when a sample is too small to back a publishable statement."""


def assert_min_sample(n: int, k: int = DEFAULT_K, context: str = "") -> None:
    """Raise ``KAnonymityError`` when ``n`` is smaller than ``k``.

    Args:
        n: Observed sample size.
        k: Required minimum.
        context: Optional free text appended to the error message.

    Raises:
        KAnonymityError: if ``n < k``.
    """
    if not n < k:
        return
    suffix = f" — Kontext: {context}" if context else ""
    raise KAnonymityError(
        f"Stichprobe zu klein fuer Publikation: n={n} < k={k}" + suffix
    )
def quantize_value(value: float | int, step: int = 5) -> int:
    """Round ``value`` down to a multiple of ``step`` (generalisation).

        quantize_value(67, 5)  -> 65
        quantize_value(83, 10) -> 80

    Coarsening numeric signals keeps exact values from identifying a subject.
    A non-positive ``step`` disables bucketing; the value is then only
    truncated to ``int``.
    """
    if step <= 0:
        return int(value)
    whole_steps = int(value // step)
    return whole_steps * step
def quantize_range(value: float | int, step: int = 10) -> str:
    """Return the range bucket containing ``value`` as text, e.g. '60-70%'.

    The lower bound comes from ``quantize_value``; the upper bound is the lower
    bound plus ``step``.
    """
    lower = quantize_value(value, step)
    upper = lower + step
    return "{}-{}%".format(lower, upper)
def group_and_count(
    rows: Iterable[dict],
    keys: list[str],
) -> dict[tuple, int]:
    """Count rows per combination of the values found under ``keys``.

    Each row is mapped to a tuple of its values for ``keys`` (a missing key
    counts as ``""``); the result maps every such tuple to its row count.
    """
    tally: dict[tuple, int] = {}
    for row in rows:
        signature = tuple(row.get(name, "") for name in keys)
        if signature in tally:
            tally[signature] += 1
        else:
            tally[signature] = 1
    return tally
def enforce_k_anonymity(
    rows: list[dict],
    group_keys: list[str],
    k: int = DEFAULT_K,
) -> list[dict]:
    """Keep only rows whose group has at least ``k`` members.

    Groups are formed over ``group_keys`` via ``group_and_count``. Rows in
    smaller groups are suppressed (removed); the order of the surviving rows is
    preserved.
    """
    sizes = group_and_count(rows, group_keys)
    kept: list[dict] = []
    for row in rows:
        signature = tuple(row.get(key, "") for key in group_keys)
        if sizes[signature] >= k:
            kept.append(row)
    return kept
def summarize_benchmark(
    rows: list[dict],
    group_keys: list[str],
    measure_key: str,
    k: int = DEFAULT_K,
    quantize_step: int = 5,
) -> list[dict]:
    """Build publishable aggregate rows for a benchmark.

    Rows are grouped by ``group_keys``; rows whose ``measure_key`` is ``None``
    or absent are ignored. A group is emitted only when it holds at least ``k``
    measured values. Each output row carries the group-key values, ``n``, the
    mean quantised to ``quantize_step`` and a range bucket twice that wide.
    The result is sorted by ``n`` in descending order.

    Example:
        rows = [{"segment": "premium", "consent_score": 84}, ...]
        summarize_benchmark(rows, ["segment"], "consent_score")
        -> [{"segment": "premium", "n": 8, "mean_quantized": 80, ...}, ...]
    """
    samples: dict[tuple, list[float]] = {}
    for row in rows:
        signature = tuple(row.get(name, "") for name in group_keys)
        measured = row.get(measure_key)
        if measured is None:
            continue
        samples.setdefault(signature, []).append(float(measured))

    summary: list[dict] = []
    for signature, values in samples.items():
        size = len(values)
        if size < k:
            continue  # group too small to publish
        average = sum(values) / size
        line: dict[str, Any] = dict(zip(group_keys, signature))
        line["n"] = size
        line["mean_quantized"] = quantize_value(average, quantize_step)
        line["mean_range"] = quantize_range(average, quantize_step * 2)
        summary.append(line)
    return sorted(summary, key=lambda item: item["n"], reverse=True)
def safe_to_publish(
    statement: str,
    sample_size: int,
    k: int = DEFAULT_K,
) -> tuple[bool, str]:
    """Validate a marketing or press statement against the sample-size rule.

    Returns:
        ``(ok, message)``. When ``ok`` is False the statement must not be
        published; the message quotes the first 60 characters of the statement.
    """
    if not sample_size < k:
        return True, f"OK (n={sample_size}, k={k})"
    excerpt = statement[:60]
    reason = (
        f'Aussage NICHT publizierbar: "{excerpt}" '
        f'(n={sample_size} < k={k}). Risiko: Re-Identifikation '
        f'einzelner Hersteller moeglich.'
    )
    return False, reason
@@ -28,6 +28,12 @@ class BusinessProfile:
needs_odr: bool = False # Online-Streitbeilegung
detected_services: list[str] = field(default_factory=list)
confidence: float = 0.0
# Wenn True: die Site selbst schliesst KEINEN Direktkauf-Vertrag
# (typisch OEM-Konfigurator-Sites BMW/Audi/Mercedes — Vertrag laeuft
# ueber den Vertragshaendler, nicht die Hersteller-Webseite).
# Konsequenz: AGB/Widerruf/Nutzungsbedingungen sind NICHT PFLICHT
# auf der Website, sondern werden beim Haendler ausgehaendigt.
no_direct_sales: bool = False
# ── Keyword lists ────────────────────────────────────────────────────
@@ -319,4 +325,49 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
"steuerberater": "finance", "architekt": "craft"}
profile.industry = prof_map.get(profile.regulated_profession_type, "unknown")
# ── no_direct_sales (OEM-Konfigurator-Pattern) ───────────────
# Hersteller-Sites die nur konfigurieren + zu Vertragshaendlern
# weiterleiten (BMW/Audi/Mercedes/VW/Porsche) schliessen KEINEN
# Direkt-Kaufvertrag. AGB/Widerruf/Nutzungsbedingungen sind dort
# nicht Pflicht — werden beim Haendler ausgehaendigt.
profile.no_direct_sales = _detect_no_direct_sales(full_text)
return profile
# Indicators that the site mainly refers visitors to authorised dealers or
# branches instead of offering its own checkout.
_NO_DIRECT_SALES_POSITIVE = [
    "vertragshaendler",
    "vertragshändler",
    "vertragspartner",
    "vertragswerkstatt",
    "haendlersuche",
    "händlersuche",
    "niederlassung",
    "vertretung",
    "autorisierter haendler",
    "autorisierter händler",
    "ihr haendler vor ort",
    "ihr händler vor ort",
    "haendler in ihrer naehe",
    "händler in ihrer nähe",
    "probefahrt vereinbaren",
    "anfrage an haendler",
    "anfrage an händler",
    "konfigurator",
    "fahrzeug konfigurieren",
    "ihre individuelle anfrage",
    # OEM brand names: manufacturer brands that usually sell through dealers.
    "bmw vertriebs",
    "audi vertriebs",
    "mercedes-benz vertriebs",
    "volkswagen vertriebs",
    "porsche zentrum",
]

# Indicators AGAINST no_direct_sales: genuine online-shop functions.
_DIRECT_SALES_NEGATIVE = [
    "in den warenkorb",
    "warenkorb hinzu",
    "zur kasse",
    "jetzt kaufen",
    "kostenpflichtig bestellen",
    "zahlungspflichtig bestellen",
    "sofort-kauf",
    "online bestellen",
    "lieferadresse",
    "rechnungsadresse",
]


def _detect_no_direct_sales(full_text: str) -> bool:
    """Heuristic: detect OEM configurator sites that do not sell directly.

    Counts how many distinct dealer keywords and shop keywords occur in the
    lower-cased text. The site is flagged when at least three dealer keywords
    are present and they outnumber the shop keywords. This avoids false
    positives for shops that merely offer a dealer search as a store finder.
    """
    haystack = full_text.lower()
    dealer_hits = len([kw for kw in _NO_DIRECT_SALES_POSITIVE if kw in haystack])
    shop_hits = len([kw for kw in _DIRECT_SALES_NEGATIVE if kw in haystack])
    if dealer_hits < 3:
        return False
    return dealer_hits > shop_hits
@@ -0,0 +1,141 @@
"""
Zentraler User-Agent-Provider + Domain-Rate-Limiter fuer alle Crawls.
UA-Switch ist Trigger-gebunden an Firmengruendung:
- aktuell (Vor-Gruendung): generischer Headless-Chrome-UA
- nach Gruendung: env BREAKPILOT_BRANDED_UA=1 setzen
-> "BreakPilot-Compliance-Scanner/1.0 (+https://...)"
Memory: project_legal_contracts_2026_07.md (Punkt 0).
Rate-Limit:
- Default 1 req/sec/Domain, max 2 concurrent pro Domain.
- Saving-Scan-Funnel separat: max 1 vollstaendiger Run / Domain / 24h.
"""
from __future__ import annotations
import asyncio
import os
import time
from collections import defaultdict
from urllib.parse import urlparse
# Branded UA, used once BREAKPILOT_BRANDED_UA is switched on.
_BRANDED_UA = (
    "BreakPilot-Compliance-Scanner/1.0 "
    "(+https://breakpilot.ai/scanner)"
)
# Generic headless-Chrome UA, used by default.
_NEUTRAL_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)


def crawler_user_agent() -> str:
    """Return the User-Agent string for all outgoing crawls.

    The branded UA is used when the environment variable
    ``BREAKPILOT_BRANDED_UA`` is set to ``1``, ``true`` or ``yes`` (case and
    surrounding whitespace are ignored); otherwise the neutral UA is returned.
    """
    flag = os.getenv("BREAKPILOT_BRANDED_UA") or ""
    use_brand = flag.strip().lower() in ("1", "true", "yes")
    return _BRANDED_UA if use_brand else _NEUTRAL_UA
def default_request_headers() -> dict:
    """Return the complete header set for outgoing httpx calls.

    The User-Agent comes from ``crawler_user_agent``; the Accept headers prefer
    HTML and German content.
    """
    headers = {"User-Agent": crawler_user_agent()}
    headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    headers["Accept-Language"] = "de-DE,de;q=0.9,en;q=0.8"
    return headers
def base_domain_of(url_or_host: str) -> str:
    """Return the lower-cased host of a URL or bare host, without a leading ``www.``.

    A missing scheme is filled in with ``https://`` so that ``urlparse`` puts
    the host into ``netloc``. Only a leading ``www.`` is removed; an inner
    occurrence (``awww.example.com``, ``shop.www.example.com``) stays intact.

    Returns:
        ``""`` for empty input. If no host can be extracted, the (scheme-
        prefixed) input is returned unchanged.
    """
    if not url_or_host:
        return ""
    if "://" not in url_or_host:
        url_or_host = "https://" + url_or_host
    netloc = urlparse(url_or_host).netloc.lower()
    # Strip the prefix only. A global replace would also rewrite hosts that
    # merely contain "www." somewhere in the middle.
    if netloc.startswith("www."):
        netloc = netloc[4:]
    return netloc or url_or_host
# --- per-Domain Rate-Limit ----------------------------------------------
_MIN_INTERVAL_S = 1.0  # 1 req/sec/domain
_MAX_CONCURRENT_PER_DOMAIN = 2
# Per domain: monotonic time of the most recently reserved request slot.
_last_request_at: dict[str, float] = defaultdict(float)
# Per domain: semaphore bounding the number of concurrent requests.
_semaphores: dict[str, asyncio.Semaphore] = {}
_locks_lock = asyncio.Lock()


async def _get_semaphore(domain: str) -> asyncio.Semaphore:
    """Return the semaphore for ``domain``, creating it on first use."""
    async with _locks_lock:
        sem = _semaphores.get(domain)
        if sem is None:
            sem = asyncio.Semaphore(_MAX_CONCURRENT_PER_DOMAIN)
            _semaphores[domain] = sem
        return sem


class DomainRateLimiter:
    """Async context manager: wait for the domain's next slot and hold a concurrency slot.

        async with DomainRateLimiter(url):
            resp = await client.get(url)
    """

    def __init__(self, url_or_domain: str):
        self.domain = base_domain_of(url_or_domain)

    async def __aenter__(self):
        sem = await _get_semaphore(self.domain)
        await sem.acquire()
        try:
            # Reserve the slot before sleeping. There is no await between the
            # read and the write, so two tasks entering together get
            # consecutive slots instead of both computing the same wait and
            # firing at the same instant.
            now = time.monotonic()
            slot = max(now, _last_request_at[self.domain] + _MIN_INTERVAL_S)
            _last_request_at[self.domain] = slot
            if slot > now:
                await asyncio.sleep(slot - now)
        except BaseException:
            # __aexit__ is not called when __aenter__ raises (for example on
            # cancellation during the sleep), so the slot is returned here.
            sem.release()
            raise
        self._sem = sem
        return self

    async def __aexit__(self, exc_type, exc, tb):
        self._sem.release()
        return False
# --- per-Domain "1 full run / 24h" (Saving-Scan) -----------------------
_DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
_SAVING_SCAN_INTERVAL_S = 24 * 3600


def saving_scan_allowed(domain_or_url: str) -> tuple[bool, int]:
    """Tell whether a new saving-scan may run for this domain.

    Looks up the newest ``ts`` in ``check_runs`` for the base domain and allows
    the run when that entry is at least 24 hours old or does not exist.
    NOTE(review): the query matches every row in ``check_runs`` for the domain,
    not only saving-scan runs — confirm this is intended.

    Returns:
        ``(allowed, seconds_until_allowed)``; the second value is 0 when
        allowed.

    Fails open: if the domain cannot be determined or the lookup raises, the
    scan is allowed and the failure is logged.
    """
    import logging
    import sqlite3
    from contextlib import closing
    from datetime import datetime

    domain = base_domain_of(domain_or_url)
    if not domain:
        return True, 0
    try:
        # closing() releases the handle; sqlite3's own context manager would
        # only end the transaction and leave the connection open.
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            row = conn.execute(
                "SELECT MAX(ts) FROM check_runs WHERE base_domain=?",
                (domain,),
            ).fetchone()
        last = row[0] if row else None
        if not last:
            return True, 0
        # NOTE(review): a timestamp without UTC offset is interpreted as local
        # time by .timestamp() — confirm the format written to check_runs.ts.
        elapsed = time.time() - datetime.fromisoformat(last).timestamp()
        if elapsed >= _SAVING_SCAN_INTERVAL_S:
            return True, 0
        return False, int(_SAVING_SCAN_INTERVAL_S - elapsed)
    except Exception:
        # Deliberately fail-open, but leave a trace so that a broken limiter
        # (missing table, unparsable ts) does not go unnoticed.
        logging.getLogger(__name__).warning(
            "saving_scan_allowed: lookup failed for %s, allowing scan",
            domain, exc_info=True,
        )
        return True, 0
@@ -129,20 +129,29 @@ def classify_cookie(cookie_name: str) -> tuple[str, str]:
def annotate_vendor_cookies(vendor: dict) -> dict:
    """Enrich a vendor record with functional_role + KB knowledge per cookie.

    For every cookie the functional role and blocking impact from
    ``classify_cookie`` are added, plus a ``knowledge`` entry when the cookie
    knowledge base has one for the cookie name.

    Returns:
        A copy of ``vendor`` with the annotated ``cookies``,
        ``role_distribution``, ``role_labels`` and a ``compliance_risk``
        summary computed over the annotated record.
    """
    from compliance.services.cookie_knowledge import (
        lookup_cookie, summarize_compliance_risk,
    )
    cookies = vendor.get("cookies") or []
    annotated = []
    role_counts: dict[str, int] = {}
    for c in cookies:
        role, impact = classify_cookie(c.get("name", ""))
        knowledge = lookup_cookie(c.get("name", ""))
        entry = {**c, "functional_role": role, "blocking_impact": impact}
        if knowledge:
            entry["knowledge"] = knowledge
        # Each cookie is appended exactly once, after enrichment.
        annotated.append(entry)
        role_counts[role] = role_counts.get(role, 0) + 1
    out = {
        **vendor,
        "cookies": annotated,
        "role_distribution": role_counts,
        "role_labels": {r: _FUNCTIONAL_LABEL.get(r, r) for r in role_counts},
    }
    out["compliance_risk"] = summarize_compliance_risk(out)
    return out
def aggregate_cookie_purposes(vendors: Iterable[dict]) -> dict:
@@ -0,0 +1,106 @@
"""
Cookie-Knowledge Facade — vereint die Basis-KB (cookie_knowledge_db) mit
der Erweiterung (cookie_knowledge_extended) hinter einer einzigen API.
Caller sollten von hier importieren statt von einer der beiden Sub-DBs.
from compliance.services.cookie_knowledge import (
lookup_cookie,
enrich_vendor_with_knowledge,
summarize_compliance_risk,
compliance_risk_label,
)
Lookup-Reihenfolge: Extended (kuratiert, juenger) vor Base. Dadurch
koennen wir Eintraege ueberschreiben ohne die Base zu touchen.
"""
from __future__ import annotations
from compliance.services.cookie_knowledge_db import (
CookieKnowledge,
lookup_cookie as _lookup_base,
)
from compliance.services.cookie_knowledge_extended import (
KB_EXT,
lookup_cookie_extended,
)
def lookup_cookie(name: str) -> CookieKnowledge | None:
    """Resolve a cookie name to its knowledge entry.

    The extended KB is consulted first; the base KB is used only when the
    extended lookup yields nothing.
    """
    extended_hit = lookup_cookie_extended(name)
    if extended_hit:
        return extended_hit
    return _lookup_base(name)
def enrich_vendor_with_knowledge(vendor: dict) -> dict:
    """Attach KB knowledge to each cookie and a risk summary to the vendor.

    Cookies without a KB entry are passed through unchanged. The input dict is
    not modified; a new record is returned.
    """
    enriched = []
    for cookie in vendor.get("cookies") or []:
        info = lookup_cookie(cookie.get("name", ""))
        if info:
            enriched.append({**cookie, "knowledge": info})
        else:
            enriched.append(cookie)
    result = {**vendor, "cookies": enriched}
    result["compliance_risk"] = summarize_compliance_risk(result)
    return result
def summarize_compliance_risk(vendor: dict) -> dict:
    """Aggregate re-identification risk and Schrems-II exposure over a vendor's cookies.

    Only cookies with a knowledge entry (attached under ``knowledge`` or found
    via ``lookup_cookie``) are counted. A cookie counts as Schrems-II affected
    when its ``vendor_country`` contains "us" or its ``schrems_ii_status``
    mentions "schrems" (both case-insensitive), and as strictly necessary when
    ``technical_necessity`` equals "full".
    """
    distribution = {"high": 0, "medium": 0, "low": 0}
    schrems_hits = 0
    necessary = 0
    known = 0
    for cookie in vendor.get("cookies") or []:
        info = cookie.get("knowledge") or lookup_cookie(cookie.get("name", ""))
        if not info:
            continue
        known += 1
        level = (info.get("reid_risk") or "low").lower()
        distribution[level] = distribution.get(level, 0) + 1
        country = (info.get("vendor_country") or "").lower()
        if "us" in country or "schrems" in (info.get("schrems_ii_status") or "").lower():
            schrems_hits += 1
        if info.get("technical_necessity") == "full":
            necessary += 1

    high = distribution["high"]
    badge = compliance_risk_label({
        "high_risk_cookie_count": high,
        "schrems_ii_affected_cookies": schrems_hits,
        "total_classified": known,
    })
    return {
        "reid_risk_distribution": distribution,
        "high_risk_cookie_count": high,
        "schrems_ii_affected_cookies": schrems_hits,
        "strictly_necessary_cookies": necessary,
        "total_classified": known,
        "label": badge,
    }
def compliance_risk_label(summary: dict) -> str:
    """Map a risk summary to a compact badge.

    Args:
        summary: Dict with ``total_classified``, ``high_risk_cookie_count`` and
            ``schrems_ii_affected_cookies``; missing counters default to 0.

    Returns:
        ``'unklar'`` when nothing was classified, otherwise one of
        ``'kritisch'``, ``'hoch'``, ``'mittel'`` or ``'gering'``, chosen by
        absolute thresholds on the two counters.
    """
    if not summary or not summary.get("total_classified"):
        return "unklar"
    high = summary.get("high_risk_cookie_count", 0)
    schrems = summary.get("schrems_ii_affected_cookies", 0)
    # The thresholds are absolute counts; the total only decides 'unklar' above.
    if high >= 3 and schrems >= 2:
        return "kritisch"
    if high >= 2 or (high >= 1 and schrems >= 1):
        return "hoch"
    if high >= 1 or schrems >= 1:
        return "mittel"
    return "gering"
def kb_size() -> dict:
    """Diagnostics for the admin/health endpoint.

    Reports the number of cookie names in the base KB and in the extended KB,
    how many names occur in both, and the number of distinct names overall.
    """
    from compliance.services.cookie_knowledge_db import KB as _KB_BASE

    base_names = set(_KB_BASE.keys())
    ext_names = set(KB_EXT.keys())
    shared = base_names & ext_names
    return {
        "base_entries": len(base_names),
        "extended_entries": len(ext_names),
        "extended_overrides_base": len(shared),
        "total_unique": len(base_names) + len(ext_names) - len(shared),
    }
@@ -0,0 +1,497 @@
"""
Cookie-Knowledge Erweiterung — Adobe, Meta erweitert, Microsoft, LinkedIn,
TikTok, Salesforce/HubSpot/Marketo, Hotjar/Mouseflow/FullStory, Live-Chat,
Cloudflare/Akamai, Payment, CMP-eigene Cookies, EU-Analytics.
Hinweis zu Rechten: Eintraege enthalten ausschliesslich Identitaetsfelder
(Cookie-Name, Anbieter, Sitzland) + EIGENE Knappformulierungen + Verweise
auf oeffentliche EuGH-/CNIL-/EDPB-Quellen. KEINE 1:1-Kopien aus OneTrust,
Cookiepedia oder Vendor-eigenen Beschreibungstexten.
Quellen-Pointer: IAB TCF v2.2 Vendor List, CNIL Cookies & Trackers
Guidelines 2024, EDPB Guidelines 2/2023, EuGH-Rechtsprechung (Schrems II,
Planet49), DSK-Orientierungshilfen 2021/2024.
"""
from __future__ import annotations
from compliance.services.cookie_knowledge_db import CookieKnowledge
# Shared vendor fields for Adobe cookies; spread into KB_EXT entries, which may
# override individual keys such as "vendor".
_ADOBE_BASE = {
    "vendor": "Adobe Inc.", "vendor_country": "US",
    "schrems_ii_status": "Drittlandtransfer US. Mit DPF (2023) wieder "
                         "zulaessig; EU-Datenresidenz-Option in Adobe "
                         "Experience Platform verfuegbar.",
    "eugh_rulings": [
        "EuGH C-311/18 (Schrems II)",
        "EDPB Recommendations 01/2020 — Supplementary Measures",
    ],
}
# Shared vendor fields for Meta cookies.
_META_BASE = {
    "vendor": "Meta Platforms Ireland Ltd.", "vendor_country": "IE",
    "schrems_ii_status": "Verarbeitung in IE + US-Transfer. DPC Ireland "
                         "Bussgeld 2023 (€1,2 Mrd) wegen unzureichender "
                         "Schutzmassnahmen — DPF deckt seit 2023.",
    "eugh_rulings": [
        "EuGH C-311/18 (Schrems II)",
        "DPC Ireland 2023 — Meta 1,2 Mrd. EUR",
    ],
}
# Shared vendor fields for Microsoft cookies.
_MICROSOFT_BASE = {
    "vendor": "Microsoft Corp.", "vendor_country": "US",
    "schrems_ii_status": "DPF-zertifiziert; EU Data Boundary fuer Azure/365 "
                         "seit 2024 verfuegbar.",
    "eugh_rulings": ["EuGH C-311/18 (Schrems II)"],
}
# Shared vendor fields for LinkedIn cookies.
_LINKEDIN_BASE = {
    "vendor": "LinkedIn Ireland Unlimited Co.", "vendor_country": "IE",
    "schrems_ii_status": "Microsoft-Konzern, EU-Hauptsitz IE, Transfer US.",
    "eugh_rulings": ["EuGH C-311/18 (Schrems II)"],
}
KB_EXT: dict[str, CookieKnowledge] = {
# --- Adobe Experience Cloud --------------------------------------
# AMCV_, s_cc, s_sq leben in Base-KB.
"demdex": {
**_ADOBE_BASE,
"vendor": "Adobe Inc. (Audience Manager)",
"exact_purpose": "Adobe Audience Manager DMP — Cross-Site-Profil "
"fuer Zielgruppen-Segmentierung.",
"data_collected": ["dpuuid", "segments"],
"ip_relevant": True,
"tcf_purpose_ids": [4, 9, 10],
"typical_lifetime": "180 Tage",
"reid_risk": "high", "technical_necessity": "none",
},
# --- Meta erweitert -----------------------------------------------
# fr, _fbc leben in Base-KB.
"datr": {
**_META_BASE,
"exact_purpose": "Facebook Browser-Identifier — Anti-Abuse/Bot-Schutz.",
"data_collected": ["browser_fingerprint_id"],
"ip_relevant": True,
"typical_lifetime": "2 Jahre",
"reid_risk": "high", "technical_necessity": "partial",
"notes": "Wird auch ohne Consent gesetzt; Meta argumentiert "
"Sicherheit. Trotzdem von DSK 2024 kritisch bewertet.",
},
# --- Microsoft / Bing ---------------------------------------------
# MUID lebt in Base-KB.
"MSCC": {
**_MICROSOFT_BASE,
"exact_purpose": "Microsoft Site Consent — Consent-Status-Speicherung "
"fuer Microsoft-eigene Properties.",
"data_collected": ["consent_string"],
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
"notes": "Strictly necessary nach §25(2) TDDDG.",
},
"ai_session": {
**_MICROSOFT_BASE,
"vendor": "Microsoft Corp. (Application Insights)",
"exact_purpose": "Azure Application Insights — Session-Tracking fuer "
"Telemetry.",
"data_collected": ["session_id"],
"typical_lifetime": "30 Minuten",
"reid_risk": "medium", "technical_necessity": "partial",
},
# --- LinkedIn ------------------------------------------------------
"li_at": {
**_LINKEDIN_BASE,
"exact_purpose": "LinkedIn-Authentifizierung — Login-Session.",
"data_collected": ["auth_token"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "full",
"notes": "Nur fuer eingeloggte Nutzer; auf externer Site = "
"Insight Tag (siehe li_sugr).",
},
"li_sugr": {
**_LINKEDIN_BASE,
"exact_purpose": "LinkedIn Insight Tag — Browser-ID fuer "
"Conversion-Tracking + Werbe-Targeting.",
"data_collected": ["browser_id"],
"ip_relevant": True,
"tcf_purpose_ids": [7, 9, 10],
"typical_lifetime": "90 Tage",
"reid_risk": "high", "technical_necessity": "none",
},
# bcookie, lidc leben in Base-KB.
# --- TikTok --------------------------------------------------------
"_ttp": {
"vendor": "TikTok Pte. Ltd.", "vendor_country": "SG/CN",
"exact_purpose": "TikTok Pixel — User-ID fuer Conversion-Tracking + "
"Werbeoptimierung.",
"data_collected": ["pixel_id", "browser_id"],
"ip_relevant": True,
"tcf_purpose_ids": [7, 9, 10],
"typical_lifetime": "13 Monate",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "Drittlandtransfer in Drittstaaten ohne "
"Angemessenheitsbeschluss. CNIL 2023 — "
"TikTok 5 Mio EUR Bussgeld.",
"eugh_rulings": [
"CNIL SAN-2022-027 — TikTok 5 Mio EUR",
"Italienische DPA 2024 — TikTok 10 Mio EUR",
],
},
"ttwid": {
"vendor": "TikTok Pte. Ltd.", "vendor_country": "SG/CN",
"exact_purpose": "TikTok Web-Identifier — eindeutige Browser-ID auch "
"ohne Login.",
"data_collected": ["ttwid"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "Wie _ttp.",
},
# --- HubSpot / Marketo / Salesforce ------------------------------
"hubspotutk": {
"vendor": "HubSpot Inc.", "vendor_country": "US",
"exact_purpose": "HubSpot User-Token — Cross-Visit-Identitaet fuer "
"Lead-Tracking.",
"data_collected": ["user_token"],
"ip_relevant": True,
"tcf_purpose_ids": [7, 8],
"typical_lifetime": "6 Monate",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
"__hssc": {
"vendor": "HubSpot Inc.", "vendor_country": "US",
"exact_purpose": "HubSpot Session-Tracking — Pageviews innerhalb "
"einer Session.",
"data_collected": ["session_count"],
"typical_lifetime": "30 Minuten",
"reid_risk": "low", "technical_necessity": "none",
},
"_mkto_trk": {
"vendor": "Adobe Inc. (Marketo)", "vendor_country": "US",
"exact_purpose": "Marketo Munchkin-Tracker — Lead-Identifikation "
"fuer Marketing-Automation.",
"data_collected": ["munchkin_id", "session_id"],
"ip_relevant": True,
"typical_lifetime": "2 Jahre",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": _ADOBE_BASE["schrems_ii_status"],
},
"BrowserId_sec": {
"vendor": "Salesforce.com Inc.", "vendor_country": "US",
"exact_purpose": "Salesforce Marketing Cloud Browser-Token — "
"Cross-Visit-Identifikation.",
"data_collected": ["browser_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "medium", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
# --- Session-Recording / Heatmaps ---------------------------------
"_hjSessionUser_": {
"vendor": "Hotjar Ltd.", "vendor_country": "MT",
"exact_purpose": "Hotjar User-ID — Cross-Visit-Identifikation fuer "
"Session-Recording + Heatmaps.",
"data_collected": ["user_id"],
"ip_relevant": True,
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "EU (Malta) — kein Drittland. Aber: parent "
"Contentsquare (FR) hostet teilweise in US.",
"notes": "Suffix `<site_id>`. Pattern-Match noetig. "
"DSGVO-Aufzeichnung = Einwilligung pflichtig.",
"eu_alternative_vendor": "Mouseflow / Smartlook (CZ)",
},
"_hjSession_": {
"vendor": "Hotjar Ltd.", "vendor_country": "MT",
"exact_purpose": "Hotjar Session-Token — eindeutige Session-ID "
"innerhalb 30min Inaktivitaet.",
"data_collected": ["session_id"],
"typical_lifetime": "30 Minuten",
"reid_risk": "medium", "technical_necessity": "none",
},
"fs_uid": {
"vendor": "FullStory Inc.", "vendor_country": "US",
"exact_purpose": "FullStory User-ID — Cross-Visit-Identifikation "
"fuer Session-Replay.",
"data_collected": ["user_id"],
"ip_relevant": True,
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert. EU-Region verfuegbar (opt-in).",
},
"mf_user": {
"vendor": "Mouseflow Aps", "vendor_country": "DK",
"exact_purpose": "Mouseflow User-ID — Cross-Visit-Identifikation fuer "
"Heatmap + Recording.",
"data_collected": ["user_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "medium", "technical_necessity": "none",
"schrems_ii_status": "EU (DK) — kein Drittland.",
},
# --- Live-Chat ----------------------------------------------------
"intercom-id-": {
"vendor": "Intercom Inc.", "vendor_country": "US",
"exact_purpose": "Intercom Visitor-ID — Wiedererkennung anonymer "
"Besucher fuer Chat-History.",
"data_collected": ["visitor_id"],
"typical_lifetime": "9 Monate",
"reid_risk": "medium", "technical_necessity": "partial",
"schrems_ii_status": "DPF-zertifiziert; EU-Datenresidenz optional.",
"notes": "Suffix `<app_id>`. Pattern-Match noetig.",
},
"driftt_aid": {
"vendor": "Salesforce.com Inc. (Drift)", "vendor_country": "US",
"exact_purpose": "Drift Anonymous-Visitor-ID fuer Chat-Personalisierung.",
"data_collected": ["visitor_id"],
"typical_lifetime": "2 Jahre",
"reid_risk": "medium", "technical_necessity": "partial",
},
"__zlcmid": {
"vendor": "Zendesk Inc.", "vendor_country": "US",
"exact_purpose": "Zendesk Chat Visitor-ID fuer Session-Tracking.",
"data_collected": ["chat_visitor_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "medium", "technical_necessity": "partial",
"schrems_ii_status": "DPF-zertifiziert; EU-Datacenter optional.",
},
# --- CDN / Sicherheit (strictly necessary) -----------------------
# __cf_bm, cf_clearance leben in Base-KB.
"AKA_A2": {
"vendor": "Akamai Technologies Inc.", "vendor_country": "US",
"exact_purpose": "Akamai Adaptive Acceleration — geroutete Best-Path-"
"Optimierung.",
"data_collected": ["a2_route"],
"typical_lifetime": "1 Stunde",
"reid_risk": "low", "technical_necessity": "full",
},
# --- Payment (strictly necessary fuer Checkout) ------------------
"__stripe_mid": {
"vendor": "Stripe Payments Europe Ltd.", "vendor_country": "IE",
"exact_purpose": "Stripe Fraud-Detection Merchant-ID — Risiko-Scoring "
"fuer Zahlungs-Authentifizierung.",
"data_collected": ["merchant_visitor_id"],
"ip_relevant": True,
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
"schrems_ii_status": "EU (IE) — kein Drittland.",
"notes": "Strictly necessary nach §25(2) TDDDG fuer Zahlungsabwicklung.",
},
"__stripe_sid": {
"vendor": "Stripe Payments Europe Ltd.", "vendor_country": "IE",
"exact_purpose": "Stripe Session-ID — temporaere Zahlungs-Session.",
"data_collected": ["session_id"],
"typical_lifetime": "30 Minuten",
"reid_risk": "low", "technical_necessity": "full",
},
# --- CMP-eigene Cookies (strictly necessary) ---------------------
"CookieConsent": {
"vendor": "Cybot A/S (Cookiebot)", "vendor_country": "DK",
"exact_purpose": "Cookiebot Consent-Speicherung — gewaehlte "
"Kategorien + Zeitstempel.",
"data_collected": ["consent_categories", "consent_timestamp"],
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
"schrems_ii_status": "EU (DK). Wenn EU-Cloud, kein Drittland.",
},
"OptanonConsent": {
"vendor": "OneTrust LLC", "vendor_country": "US",
"exact_purpose": "OneTrust Consent-Speicherung — Kategorien + "
"Vendor-Liste + Zeitstempel.",
"data_collected": ["consent_categories", "consent_string"],
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
"schrems_ii_status": "DPF-zertifiziert; EU-Cloud optional.",
},
"OptanonAlertBoxClosed": {
"vendor": "OneTrust LLC", "vendor_country": "US",
"exact_purpose": "OneTrust UI-Flag — verhindert Re-Display des "
"Banners nach Schliessung.",
"data_collected": ["closed_timestamp"],
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
},
"usercentrics-uuid": {
"vendor": "Usercentrics GmbH", "vendor_country": "DE",
"exact_purpose": "Usercentrics Consent-Speicherung — UUID-basiert.",
"data_collected": ["consent_uuid", "consent_settings"],
"typical_lifetime": "1 Jahr",
"reid_risk": "low", "technical_necessity": "full",
"schrems_ii_status": "DE — kein Drittland.",
},
# --- Weitere Social / Werbeplattformen ---------------------------
# _pin_unauth lebt in Base-KB.
"_scid": {
"vendor": "Snap Group Ltd.", "vendor_country": "GB/US",
"exact_purpose": "Snapchat Pixel — Conversion-Tracking fuer "
"Snap Ads.",
"data_collected": ["snap_visitor_id"],
"ip_relevant": True,
"tcf_purpose_ids": [7, 9, 10],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "Drittlandtransfer; UK seit 2021 mit "
"Angemessenheitsbeschluss.",
},
"guest_id": {
"vendor": "X Corp. (Twitter)", "vendor_country": "US",
"exact_purpose": "X/Twitter Guest-Identifier — Tracking nicht "
"eingeloggter Besucher inkl. Embeds.",
"data_collected": ["guest_id"],
"ip_relevant": True,
"tcf_purpose_ids": [4, 9, 10],
"typical_lifetime": "2 Jahre",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-Status unklar seit Eigentuemerwechsel 2022. "
"Erhoehtes Risiko, EDPB beobachtet.",
},
"VISITOR_INFO1_LIVE": {
"vendor": "Google Ireland Ltd. (YouTube)", "vendor_country": "IE",
"exact_purpose": "YouTube Embed Visitor-ID — Bandbreiten-Optimierung "
"+ Empfehlungsalgorithmus.",
"data_collected": ["youtube_visitor_id"],
"ip_relevant": True,
"tcf_purpose_ids": [8, 10],
"typical_lifetime": "6 Monate",
"reid_risk": "high", "technical_necessity": "none",
"notes": "YouTube-NoCookie-Domain (youtube-nocookie.com) reduziert "
"Tracking — DSGVO-konformer.",
},
"vuid": {
"vendor": "Vimeo Inc.", "vendor_country": "US",
"exact_purpose": "Vimeo User-Identifier — Wiedererkennung "
"wiederkehrender Besucher fuer Statistik.",
"data_collected": ["vimeo_user_id"],
"typical_lifetime": "2 Jahre",
"reid_risk": "medium", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
# --- Marketing-Automation / Email --------------------------------
"__kla_id": {
"vendor": "Klaviyo Inc.", "vendor_country": "US",
"exact_purpose": "Klaviyo Visitor-Tracking — fuer E-Mail-Marketing-"
"Attribution.",
"data_collected": ["klaviyo_id"],
"ip_relevant": True,
"typical_lifetime": "2 Jahre",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
"_mcid": {
"vendor": "Intuit Mailchimp", "vendor_country": "US",
"exact_purpose": "Mailchimp Email-Click-Tracking — Verknuepft "
"Pageviews mit gesendeter Kampagne.",
"data_collected": ["mc_email_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
# --- Product-Analytics / CDP -------------------------------------
"mp_": {
"vendor": "Mixpanel Inc.", "vendor_country": "US",
"exact_purpose": "Mixpanel Distinct-ID + Properties — "
"Pseudonyme Event-Analytics.",
"data_collected": ["distinct_id", "properties"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert; EU-Residency optional.",
"notes": "Suffix `<token>_mixpanel`. Pattern-Match noetig.",
},
"ajs_anonymous_id": {
"vendor": "Twilio Inc. (Segment)", "vendor_country": "US",
"exact_purpose": "Segment Anonymous-ID — Cross-Device-Identitaet "
"vor Login.",
"data_collected": ["anonymous_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert; EU-Datenresidenz optional.",
},
"AMP_": {
"vendor": "Amplitude Inc.", "vendor_country": "US",
"exact_purpose": "Amplitude Device-ID — Cross-Session-Identitaet "
"fuer Product-Analytics.",
"data_collected": ["device_id", "session_id"],
"typical_lifetime": "1 Jahr",
"reid_risk": "high", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
"notes": "Suffix `<api_key>`. Pattern-Match noetig.",
},
# --- A/B-Testing -------------------------------------------------
"optimizelyEndUserId": {
"vendor": "Optimizely Inc.", "vendor_country": "US",
"exact_purpose": "Optimizely End-User-ID — konsistente "
"Experiment-Zuteilung pro Besucher.",
"data_collected": ["end_user_id", "variation_assignments"],
"typical_lifetime": "6 Monate",
"reid_risk": "medium", "technical_necessity": "none",
"schrems_ii_status": "DPF-zertifiziert.",
},
# --- RUM / Monitoring (oft strictly necessary diskutiert) --------
"_dd_s": {
"vendor": "Datadog Inc.", "vendor_country": "US",
"exact_purpose": "Datadog RUM Session-Tracking — Performance- "
"Monitoring + Fehler-Telemetrie.",
"data_collected": ["session_id", "session_type"],
"typical_lifetime": "15 Minuten",
"reid_risk": "low", "technical_necessity": "partial",
"schrems_ii_status": "EU-Region (Frankfurt) verfuegbar.",
"notes": "Bei reiner Server-/Fehler-Telemetrie ohne Cross-Site-"
"Tracking Argument fuer berechtigtes Interesse moeglich.",
},
# --- EU-Analytics-Alternativen -----------------------------------
"_pk_ref": {
"vendor": "InnoCraft Ltd. (Matomo)", "vendor_country": "NZ",
"exact_purpose": "Matomo Referrer-Tracking — Quelle des Besuchs.",
"data_collected": ["referrer", "campaign"],
"typical_lifetime": "6 Monate",
"reid_risk": "low", "technical_necessity": "none",
"schrems_ii_status": "NZ hat Angemessenheitsbeschluss (2012). "
"Bei On-Premise-Hosting kein Transfer.",
"notes": "Self-Hosting empfohlen — dann zeroes Drittland.",
},
"_pk_cvar": {
"vendor": "InnoCraft Ltd. (Matomo)", "vendor_country": "NZ",
"exact_purpose": "Matomo Custom-Variables — pro Visit konfigurierbar.",
"data_collected": ["custom_vars"],
"typical_lifetime": "30 Minuten",
"reid_risk": "low", "technical_necessity": "none",
},
}
# Pattern lookups for dynamic cookie names.
# Each entry is (regex, KB_EXT key): lookup_cookie_extended() tries the regex
# against the cookie name when the exact name is not a KB_EXT key and, on a
# match, returns the KB_EXT entry stored under the second element.
_EXT_PATTERNS: list[tuple[str, str]] = [
    (r"^_hjSessionUser_", "_hjSessionUser_"),
    (r"^_hjSession_", "_hjSession_"),
    (r"^intercom-id-", "intercom-id-"),
    (r"^mp_", "mp_"),
    (r"^AMP_", "AMP_"),
]
def lookup_cookie_extended(name: str) -> CookieKnowledge | None:
    """Resolve a cookie name against the extension knowledge base (KB_EXT).

    Resolution order:
      1. exact key match,
      2. first matching regex from ``_EXT_PATTERNS`` (dynamic-suffix cookies),
      3. the part of the name before the first ``.`` if that is a key.

    Returns ``None`` for an empty name or when nothing matches.
    """
    import re

    if not name:
        return None

    # 1. Exact hit.
    try:
        return KB_EXT[name]
    except KeyError:
        pass

    # 2. First pattern that matches decides the result, even if its key is
    #    absent from KB_EXT (then the result is None).
    pattern_key = next(
        (kb_key for regex, kb_key in _EXT_PATTERNS if re.search(regex, name)),
        None,
    )
    if pattern_key is not None:
        return KB_EXT.get(pattern_key)

    # 3. Dotted name: retry with the segment before the first dot.
    head, dot, _tail = name.partition(".")
    if dot and head in KB_EXT:
        return KB_EXT[head]
    return None
@@ -0,0 +1,242 @@
"""
TDM-Reservation-Check (§ 44b UrhG / EU CDSM Art. 4).
Prueft pro Domain ob ein maschinenlesbarer Nutzungsvorbehalt fuer
Text-and-Data-Mining gesetzt ist. Quellen:
1. robots.txt User-agent: * Disallow: / (oder spezifisch fuer uns)
2. /ai.txt neuer OpenAI-Standard
3. HTTP-Header `tdm-reservation: 1` auf Homepage
4. HTML <meta name="tdm-reservation" content="1"> auf Homepage
5. HTML <meta name="robots" content="noai|noimageai"> Tags
Status-Interpretation:
status=allowed -> kein Vorbehalt, crawlbar
status=reserved -> expliziter Vorbehalt, NICHT crawlen
status=denied -> robots.txt-Zugriff aktiv blockiert (403/401)
=> konservativ: NICHT crawlen
status=unknown -> Server-Error (500/timeout/DNS) auf robots.txt
=> crawlbar, aber 24h-Recheck markiert
Cache via sidecar SQLite (gleiche DB wie compliance_audit_log), 24h TTL.
"""
from __future__ import annotations
import json
import logging
import os
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal
from urllib.parse import urlparse
import httpx
logger = logging.getLogger(__name__)
# SQLite file holding the TDM cache table; overridable via COMPLIANCE_AUDIT_DB.
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
# Cached verdicts older than this many seconds (24 hours) are ignored.
CACHE_TTL_SECONDS = 24 * 3600
# The four verdicts a reservation check can produce.
Status = Literal["allowed", "reserved", "denied", "unknown"]
# User-Agent sent with every probe request (a HeadlessChrome UA string).
_DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)
def _ensure_cache_table() -> None:
    """Create the TDM cache table (and its timestamp index) if missing.

    Also creates the parent directory of ``DB_PATH``. The connection is
    closed explicitly: sqlite3's connection context manager only commits or
    rolls back, it does not close the handle.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        with conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS tdm_reservation_cache (
                    domain TEXT PRIMARY KEY,
                    ts TEXT NOT NULL,
                    status TEXT NOT NULL,
                    signals TEXT NOT NULL -- JSON list[dict]
                );
                CREATE INDEX IF NOT EXISTS idx_tdm_ts ON tdm_reservation_cache(ts);
            """)
    finally:
        conn.close()
def _cache_get(domain: str) -> dict | None:
    """Return the cached verdict for ``domain`` if it is still fresh.

    Returns ``None`` when there is no row, when the row is older than
    ``CACHE_TTL_SECONDS``, or when anything goes wrong (the error is logged
    at debug level — the cache is best-effort).
    """
    try:
        _ensure_cache_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                "SELECT * FROM tdm_reservation_cache WHERE domain=?", (domain,),
            ).fetchone()
        finally:
            # The `with` form of a sqlite3 connection does not close it.
            conn.close()
        if not row:
            return None
        stored_at = datetime.fromisoformat(row["ts"]).timestamp()
        if time.time() - stored_at > CACHE_TTL_SECONDS:
            return None
        return {
            "domain": domain,
            "status": row["status"],
            "signals": json.loads(row["signals"]),
            "cached": True,
            "ts": row["ts"],
        }
    except Exception as e:
        logger.debug("tdm cache_get failed for %s: %s", domain, e)
        return None
def _cache_put(domain: str, status: Status, signals: list[dict]) -> None:
    """Insert or replace the cached verdict for ``domain``.

    The row is stamped with the current UTC time in ISO format and
    ``signals`` is stored as JSON. Failures are logged as warnings and
    swallowed — caching must never break a check.
    """
    try:
        _ensure_cache_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            # `with conn` commits on success / rolls back on error; the
            # explicit close below releases the handle, which it does not do.
            with conn:
                conn.execute(
                    "INSERT OR REPLACE INTO tdm_reservation_cache "
                    "(domain, ts, status, signals) VALUES (?, ?, ?, ?)",
                    (
                        domain,
                        datetime.now(timezone.utc).isoformat(),
                        status,
                        json.dumps(signals, ensure_ascii=False),
                    ),
                )
        finally:
            conn.close()
    except Exception as e:
        logger.warning("tdm cache_put failed for %s: %s", domain, e)
def _base_domain(url_or_domain: str) -> str:
if not url_or_domain:
return ""
if "://" not in url_or_domain:
url_or_domain = "https://" + url_or_domain
netloc = urlparse(url_or_domain).netloc.lower()
return netloc.replace("www.", "")
async def _fetch_status(client: httpx.AsyncClient, url: str) -> tuple[int, str, dict]:
    """GET ``url`` and return ``(status_code, body, headers)``.

    The body is the decoded response text truncated to 16384 characters
    (empty when the response has no content). Any exception is logged at
    debug level and reported as ``(0, "", {})``.
    """
    try:
        response = await client.get(url)
        if response.content:
            text = response.text[:16384]
        else:
            text = ""
        header_map = dict(response.headers)
        return response.status_code, text, header_map
    except Exception as exc:
        logger.debug("tdm fetch %s failed: %s", url, exc)
        return 0, "", {}
def _robots_disallows_us(body: str) -> bool:
"""Parse robots.txt — true if our group has Disallow: /."""
if not body:
return False
relevant_groups = ["*", "claudebot", "anthropic-ai", "gptbot",
"google-extended", "ccbot", "breakpilot"]
current_uas: list[str] = []
in_our_group = False
for raw in body.splitlines():
line = raw.split("#", 1)[0].strip()
if not line:
in_our_group = False
current_uas = []
continue
if ":" not in line:
continue
key, val = (s.strip().lower() for s in line.split(":", 1))
if key == "user-agent":
current_uas.append(val)
in_our_group = any(ua in relevant_groups for ua in current_uas)
elif key == "disallow" and in_our_group:
if val == "/" or val == "":
if val == "/":
return True
return False
def _meta_has_reservation(body: str) -> bool:
"""Detect <meta name="tdm-reservation|robots|googlebot"> with noai/noimageai/1."""
low = body.lower()
needles = [
'name="tdm-reservation" content="1"',
"name='tdm-reservation' content='1'",
'"noai"', '"noimageai"',
"content=\"noai", "content='noai",
]
return any(n in low for n in needles)
async def check_tdm_reservation(domain_or_url: str) -> dict:
    """Probe a domain for machine-readable TDM reservations.

    A fresh cache entry short-circuits the probe. Otherwise the checks run
    in this order, each later one only while the verdict is still
    ``"allowed"``: robots.txt (https first, http only if https could not be
    fetched at all), ``/ai.txt`` (any 200 counts as a reservation), then the
    homepage's ``tdm-reservation`` header and its meta tags. All probes go
    to the ``www.`` host of the normalised domain. The verdict is written
    to the cache before returning.

    Returns:
        ``{domain, status, signals: [{src, ...}], cached, ts}``; for an
        empty/unparseable input ``status`` is ``"unknown"`` and ``ts`` is
        omitted.
    """
    domain = _base_domain(domain_or_url)
    if not domain:
        return {"domain": "", "status": "unknown", "signals": [], "cached": False}

    hit = _cache_get(domain)
    if hit:
        return hit

    evidence: list[dict] = []
    verdict: Status = "allowed"
    request_headers = {"User-Agent": _DEFAULT_UA, "Accept": "*/*"}

    async with httpx.AsyncClient(
        timeout=12.0, follow_redirects=True, headers=request_headers,
    ) as http:
        # --- 1. robots.txt -------------------------------------------------
        for scheme in ("https", "http"):
            robots_url = f"{scheme}://www.{domain}/robots.txt"
            code, robots_txt, _unused = await _fetch_status(http, robots_url)
            if scheme == "https" and code == 0:
                # https unreachable: fall back to plain http once.
                continue
            entry = {"src": "robots.txt", "status_code": code,
                     "scheme": scheme}
            evidence.append(entry)
            if code in (401, 403):
                verdict = "denied"
            elif code == 200 and _robots_disallows_us(robots_txt):
                verdict = "reserved"
                entry["detail"] = "Disallow: / for relevant UA group"
            elif code not in (200, 404):
                verdict = "unknown"
            # The first scheme that produced a recorded result is final.
            break

        # --- 2. ai.txt -----------------------------------------------------
        if verdict == "allowed":
            ai_code, _ai_body, _ai_hdrs = await _fetch_status(
                http, f"https://www.{domain}/ai.txt",
            )
            if ai_code == 200:
                verdict = "reserved"
                evidence.append({"src": "ai.txt", "status_code": 200,
                                 "detail": "ai.txt present"})

        # --- 3. homepage header / meta -------------------------------------
        if verdict == "allowed":
            home_code, home_html, home_headers = await _fetch_status(
                http, f"https://www.{domain}/",
            )
            if home_code == 200:
                if home_headers.get("tdm-reservation") == "1":
                    verdict = "reserved"
                    evidence.append({"src": "http-header",
                                     "detail": "tdm-reservation: 1"})
                elif _meta_has_reservation(home_html):
                    verdict = "reserved"
                    evidence.append({"src": "html-meta",
                                     "detail": "noai/tdm-reservation meta"})

    _cache_put(domain, verdict, evidence)
    return {
        "domain": domain,
        "status": verdict,
        "signals": evidence,
        "cached": False,
        "ts": datetime.now(timezone.utc).isoformat(),
    }
def is_crawl_allowed(result: dict) -> bool:
    """Tell whether a reservation-check result permits crawling.

    Crawling is permitted for the verdicts ``"allowed"`` and ``"unknown"``;
    a missing or empty ``status`` is treated as ``"unknown"``.
    """
    verdict = result.get("status")
    if not verdict:
        verdict = "unknown"
    return verdict == "allowed" or verdict == "unknown"
@@ -0,0 +1,277 @@
"""
Aggregator: Doc-Check-Results + cmp_vendors + redundancy_report
-> einheitliche Finding-Records fuer unified_findings_store.
Speichert nur ABGELEITETE/normalisierte Findings (siehe Memory
'feedback_oem_data_legal.md'): keine rohen CMP-Cookie-Texte, keine
1:1-Spiegelung fremder Vendor-Listen — nur eigene Risk-/Status-Bewertung.
Hook:
from compliance.services.unified_findings_collector import collect
from compliance.services.unified_findings_store import record_findings
findings = collect(check_id, results, cmp_vendors, redundancy_report, doc_texts)
record_findings(check_id, findings)
"""
from __future__ import annotations
import logging
from typing import Any
logger = logging.getLogger(__name__)
# Fallback severity per source_type, used by _from_doc_check when a check
# carries no severity of its own.
_SEVERITY_DEFAULT = {
    "mc": "MEDIUM",
    "pflichtangabe": "MEDIUM",
    "vendor": "MEDIUM",
    "redundanz": "LOW",
}
# Maps a cmp_vendor flag to (default severity, human-readable label part).
# The flag itself is also what _from_vendors passes to the action-recipe lookup.
_VENDOR_FLAG_SEVERITY = {
    "no_cookies_listed": ("HIGH", "Cookie-Auflistung fehlt"),
    "no_country": ("MEDIUM", "Sitzland des Anbieters fehlt"),
    "no_privacy_url": ("HIGH", "Datenschutzerklaerung des Anbieters fehlt"),
    "broken_privacy_url": ("HIGH", "Datenschutz-URL nicht erreichbar"),
    "no_opt_out_url": ("MEDIUM", "Widerspruchs-/Opt-Out-Link fehlt"),
    "broken_opt_out": ("MEDIUM", "Opt-Out-Link nicht erreichbar"),
    "no_name": ("HIGH", "Anbieter-Name fehlt"),
    "no_purpose": ("HIGH", "Verarbeitungszweck fehlt"),
    "cookies_no_expiry": ("LOW", "Cookie-Speicherdauer fehlt"),
    "cookies_no_names": ("LOW", "Cookie-Namen fehlen"),
}
def _safe_recipe(key: str) -> dict:
"""Lookup mit lazy-import — recipes-Modul ist optional."""
try:
from compliance.services.finding_action_recipes import recipe_for
r = recipe_for(key)
return dict(r) if r else {}
except Exception:
return {}
def _safe_anchor(label: str, doc_text: str, doc_id: str) -> dict:
"""Anchor-Lookup mit lazy-import + best-effort."""
if not label or not doc_text:
return {}
try:
from compliance.services.doc_anchor_locator import locate_anchor
a = locate_anchor(label, doc_text, doc_id)
return a or {}
except Exception:
return {}
def _from_doc_check(
check_id: str,
r: Any,
doc_text: str,
) -> list[dict]:
"""Convert one DocCheckResult into unified-finding rows."""
out: list[dict] = []
if r.error and r.error.startswith("Nicht anwendbar"):
out.append({
"source_type": "pflichtangabe",
"doc_type": r.doc_type,
"severity": "INFO",
"status": "na",
"regulation": "",
"label": f"{r.label}: {r.error}",
"hint": r.error,
"action_recipe": {},
"payload": {"scenario": r.scenario},
})
return out
if r.error:
out.append({
"source_type": "pflichtangabe",
"doc_type": r.doc_type,
"severity": "HIGH",
"status": "failed",
"regulation": "",
"label": f"{r.label}: Dokument nicht erreichbar",
"hint": r.error[:400],
"action_recipe": {},
"payload": {},
})
return out
for c in (r.checks or []):
is_mc = (c.id or "").startswith("mc-")
source = "mc" if is_mc else "pflichtangabe"
if c.passed:
status = "passed"
elif c.skipped:
status = "skipped"
else:
status = "failed"
severity = (c.severity or _SEVERITY_DEFAULT[source]).upper()
# Nur fuer Fails Anchor + Recipe — Pass-Eintraege halten wir mager
recipe: dict = {}
anchor: dict = {}
if status == "failed":
# Recipe per Label-Substring (mehr als nur exakte Keys)
recipe = _safe_recipe(c.label or "") or _safe_recipe(c.id or "")
anchor = _safe_anchor(c.label or "", doc_text, r.doc_type)
out.append({
"source_type": source,
"doc_type": r.doc_type,
"severity": severity,
"status": status,
"regulation": c.regulation or "",
"label": c.label or "",
"hint": c.hint or "",
"action_recipe": recipe,
"anchor_excerpt": (anchor.get("anchor_phrase") or "")[:800],
"anchor_conf": _conf_to_score(anchor),
"payload": {
"mc_id": c.id,
"level": c.level,
"parent": c.parent,
"matched_text": (c.matched_text or "")[:300],
"article": c.article or "",
"anchor_method": anchor.get("method"),
"anchor_position": anchor.get("position_hint"),
},
})
return out
def _conf_to_score(anchor: dict) -> float:
if not anchor:
return 0.0
try:
return float(anchor.get("score") or 0.0)
except (TypeError, ValueError):
return 0.0
def _from_vendors(check_id: str, vendors: list[dict]) -> list[dict]:
"""Per-vendor flag -> finding row."""
out: list[dict] = []
for v in vendors or []:
name = v.get("name") or v.get("vendor_name") or "Unbekannter Anbieter"
country = v.get("country") or ""
risk = v.get("compliance_risk") or {}
for flag in (v.get("compliance_flags") or v.get("flags") or []):
sev, label = _VENDOR_FLAG_SEVERITY.get(
flag, ("LOW", flag.replace("_", " ").title()),
)
out.append({
"source_type": "vendor",
"doc_type": "-",
"severity": sev,
"status": "failed",
"regulation": "DSGVO",
"label": f"{name}{label}",
"hint": _vendor_hint(flag, name),
"action_recipe": _safe_recipe(flag),
"vendor_name": name,
"category": (v.get("category") or "")[:64],
"payload": {
"flag": flag,
"country": country,
"compliance_score": v.get("compliance_score"),
"category": v.get("category"),
"risk_label": risk.get("label"),
"high_risk_cookies": risk.get("high_risk_cookie_count"),
"schrems_ii_cookies": risk.get("schrems_ii_affected_cookies"),
},
})
return out
def _vendor_hint(flag: str, name: str) -> str:
hints = {
"no_cookies_listed":
f"Bei '{name}' sind keine Cookies dokumentiert — DSK-Orientierungshilfe "
"verlangt Name + Zweck + Speicherdauer pro Cookie.",
"no_country":
f"Sitzland von '{name}' fehlt — bei Drittland-Anbieter "
"Art. 44 ff. DSGVO erforderlich.",
"no_privacy_url":
f"Link zur Datenschutzerklaerung von '{name}' fehlt — Art. 13 Abs. 1 lit. e.",
"broken_privacy_url":
f"Privacy-URL von '{name}' nicht erreichbar (404/Timeout).",
"no_opt_out_url":
f"Opt-Out/Widerspruchs-Link fuer '{name}' fehlt — Art. 21 DSGVO.",
"broken_opt_out":
f"Opt-Out-Link von '{name}' nicht erreichbar.",
"no_name":
"Anbieter ohne Name erfasst — Art. 13 Abs. 1 lit. a.",
"no_purpose":
f"Verarbeitungszweck fuer '{name}' fehlt — Art. 13 Abs. 1 lit. c.",
}
return hints.get(flag, f"Flag: {flag}")
def _from_redundancies(check_id: str, report: dict | None) -> list[dict]:
"""Each redundancy category -> finding row (status='info', sev='LOW')."""
if not report:
return []
out: list[dict] = []
for r in (report.get("redundancies") or []):
cat = r.get("category_label") or r.get("category") or "Unbekannt"
vendors = r.get("vendors") or []
sav = r.get("estimated_saving_year_eur") or [0, 0]
out.append({
"source_type": "redundanz",
"doc_type": "-",
"severity": "LOW",
"status": "info",
"regulation": "Cost-Optimization",
"label": f"Mehrfach-Anbieter in '{cat}' ({len(vendors)} Tools)",
"hint": (
f"Anbieter: {', '.join(vendors[:6])}"
+ (f" (+{len(vendors)-6} weitere)" if len(vendors) > 6 else "")
+ (f" · EU-Empfehlung: {r['suggested_eu_tool']}"
if r.get("suggested_eu_tool") else "")
),
"action_recipe": {
"what": "Konsolidierung auf 1 Tool pro Kategorie pruefen.",
"why": (r.get("consolidation_hint") or
"Mehrfach-Lizenzen + Vertrags-Overhead reduzieren."),
"fix_text": "Migrations-Plan zu einem Anbieter erarbeiten; "
"Vertraege ueberlappend kuendigen.",
},
"category": cat,
"payload": {
"vendors": vendors[:20],
"saving_year_eur_low": sav[0],
"saving_year_eur_high": sav[1],
"suggested_eu_tool": r.get("suggested_eu_tool"),
"caveats": (r.get("caveats") or [])[:4],
},
})
return out
def collect(
    check_id: str,
    results: list[Any],
    cmp_vendors: list[dict] | None,
    redundancy_report: dict | None,
    doc_texts: dict[str, str] | None = None,
) -> list[dict]:
    """Gather findings from doc checks, vendors and redundancies into one list.

    Each source is converted independently; a failure in one source (or in a
    single doc result) is logged as a warning and does not prevent the
    others from contributing. ``doc_texts`` maps doc_type to document text
    and is used for anchor lookup of failed checks.
    """
    findings: list[dict] = []
    text_by_doc = doc_texts or {}

    # Doc-check results are isolated per result so one bad record is skipped
    # without losing the rest.
    for result in (results or []):
        try:
            doc_text = text_by_doc.get(result.doc_type, "")
            findings.extend(_from_doc_check(check_id, result, doc_text))
        except Exception as exc:
            logger.warning("collect: doc result %s failed: %s",
                           getattr(result, "doc_type", "?"), exc)

    # Vendor and redundancy sources share the same guard-and-log shape.
    guarded_sources = (
        ("collect: vendors failed: %s",
         lambda: _from_vendors(check_id, cmp_vendors or [])),
        ("collect: redundancies failed: %s",
         lambda: _from_redundancies(check_id, redundancy_report)),
    )
    for failure_msg, produce in guarded_sources:
        try:
            findings.extend(produce())
        except Exception as exc:
            logger.warning(failure_msg, exc)

    logger.info("collect: check=%s total_findings=%d", check_id, len(findings))
    return findings
@@ -0,0 +1,190 @@
"""
Unified-Findings sidecar store.
A compliance check produces findings from 4 sources today:
- Master-Controls (mc_results table already persisted)
- Pflichtangaben (L1/L2 doc checks, e.g. Impressum-Vollstaendigkeit)
- Vendor scans (per cmp_vendor: missing privacy url, no opt-out, ...)
- Redundancies (multi-vendor in same category)
Previously the DSB had to look in 4 different blocks of the email to
find everything. This store flattens all of them into ONE searchable
table so the /audit/<check_id> frontend can show a unified list with
source / severity / status / doc_type filters.
Sidecar SQLite (same DB as compliance_audit_log) — no Postgres
migration needed.
"""
from __future__ import annotations
import json
import logging
import os
import sqlite3
from pathlib import Path
logger = logging.getLogger(__name__)
# SQLite file holding the unified_findings table; overridable via COMPLIANCE_AUDIT_DB.
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
def _ensure_table() -> None:
    """Create the ``unified_findings`` table and its indexes if missing.

    Also creates the parent directory of ``DB_PATH``. The connection is
    closed explicitly: sqlite3's connection context manager only commits or
    rolls back, it does not close the handle.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        with conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS unified_findings (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    check_id TEXT NOT NULL,
                    source_type TEXT NOT NULL, -- mc|pflichtangabe|vendor|redundanz
                    doc_type TEXT, -- impressum|dse|cookie|... or '-' for vendor/redundanz
                    severity TEXT, -- CRITICAL|HIGH|MEDIUM|LOW|INFO
                    status TEXT, -- failed|passed|skipped|na|info
                    regulation TEXT,
                    label TEXT,
                    hint TEXT,
                    action_recipe TEXT, -- JSON {what,why,fix_text,where,example}
                    anchor_excerpt TEXT,
                    anchor_conf REAL,
                    vendor_name TEXT,
                    category TEXT,
                    payload TEXT -- JSON extras (matched_text, cookies count, ...)
                );
                CREATE INDEX IF NOT EXISTS idx_uf_check ON unified_findings(check_id);
                CREATE INDEX IF NOT EXISTS idx_uf_source ON unified_findings(check_id, source_type);
                CREATE INDEX IF NOT EXISTS idx_uf_status ON unified_findings(check_id, status);
                CREATE INDEX IF NOT EXISTS idx_uf_severity ON unified_findings(check_id, severity);
            """)
    finally:
        conn.close()
def record_findings(check_id: str, findings: list[dict]) -> int:
    """Replace all stored findings of ``check_id`` with ``findings``.

    Existing rows for the check are deleted first, so calling this twice
    with the same input leaves the same rows. Text fields are truncated to
    fixed maximum lengths; ``action_recipe`` and ``payload`` are stored as
    JSON. Delete and insert happen in one transaction.

    Returns:
        Number of rows written; 0 for an empty ``check_id``, an empty
        findings list, or on any error (logged as a warning).
    """
    if not check_id:
        return 0
    try:
        _ensure_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            # `with conn` commits on normal exit (including the early return)
            # and rolls back on error; it does not close the connection.
            with conn:
                conn.execute(
                    "DELETE FROM unified_findings WHERE check_id=?", (check_id,),
                )
                if not findings:
                    return 0
                rows = [
                    (
                        check_id,
                        (f.get("source_type") or "mc")[:24],
                        (f.get("doc_type") or "")[:32],
                        (f.get("severity") or "MEDIUM").upper()[:16],
                        (f.get("status") or "failed")[:16],
                        (f.get("regulation") or "")[:64],
                        (f.get("label") or "")[:400],
                        (f.get("hint") or "")[:1200],
                        json.dumps(f.get("action_recipe") or {}, ensure_ascii=False),
                        (f.get("anchor_excerpt") or "")[:800],
                        float(f.get("anchor_conf") or 0.0),
                        (f.get("vendor_name") or "")[:160],
                        (f.get("category") or "")[:64],
                        json.dumps(f.get("payload") or {}, ensure_ascii=False),
                    )
                    for f in findings
                ]
                conn.executemany(
                    "INSERT INTO unified_findings "
                    "(check_id, source_type, doc_type, severity, status, regulation, "
                    " label, hint, action_recipe, anchor_excerpt, anchor_conf, "
                    " vendor_name, category, payload) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    rows,
                )
        finally:
            conn.close()
        # Same `or "mc"` normalisation as for the stored rows: a None
        # source_type would otherwise make sorted() raise after the commit
        # and turn a successful write into a reported failure.
        sources = sorted({(f.get("source_type") or "mc") for f in findings})
        logger.info(
            "unified_findings: %s rows=%d sources=%s",
            check_id, len(rows), sources,
        )
        return len(rows)
    except Exception as e:
        logger.warning("record_findings failed for %s: %s", check_id, e)
        return 0
def list_findings(
    check_id: str,
    source_type: str | None = None,
    severity: str | None = None,
    doc_type: str | None = None,
    status: str | None = None,
    q: str | None = None,
    limit: int = 1000,
) -> list[dict]:
    """Return the stored findings of a check, optionally filtered.

    Each of ``source_type``, ``severity``, ``doc_type`` and ``status`` is an
    exact-match filter that is skipped when empty or ``"all"`` (severity is
    upper-cased before comparison). ``q`` is a substring match against the
    lower-cased ``label`` or ``vendor_name``. Rows are ordered by severity
    (CRITICAL, HIGH, MEDIUM, LOW, everything else), then source_type, then
    label, and capped at ``limit``. ``action_recipe`` and ``payload`` are
    decoded from JSON.

    Returns an empty list on any error (logged as a warning).
    """
    try:
        _ensure_table()
        where = ["check_id = ?"]
        params: list = [check_id]
        if source_type and source_type != "all":
            where.append("source_type = ?")
            params.append(source_type)
        if severity and severity != "all":
            where.append("severity = ?")
            params.append(severity.upper())
        if doc_type and doc_type != "all":
            where.append("doc_type = ?")
            params.append(doc_type)
        if status and status != "all":
            where.append("status = ?")
            params.append(status)
        if q:
            where.append("(LOWER(label) LIKE ? OR LOWER(vendor_name) LIKE ?)")
            needle = f"%{q.lower()}%"
            params.extend([needle, needle])
        # Only fixed fragments are concatenated; all values are bound.
        sql = ("SELECT * FROM unified_findings WHERE " + " AND ".join(where) +
               " ORDER BY CASE severity "
               " WHEN 'CRITICAL' THEN 0 WHEN 'HIGH' THEN 1 "
               " WHEN 'MEDIUM' THEN 2 WHEN 'LOW' THEN 3 "
               " ELSE 4 END, source_type, label LIMIT ?")
        params.append(int(limit))
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            rows = conn.execute(sql, params).fetchall()
        finally:
            # sqlite3's `with conn` would not close the handle.
            conn.close()
        out = []
        for r in rows:
            d = dict(r)
            d["action_recipe"] = json.loads(d.get("action_recipe") or "{}")
            d["payload"] = json.loads(d.get("payload") or "{}")
            out.append(d)
        return out
    except Exception as e:
        logger.warning("list_findings failed: %s", e)
        return []
def findings_summary(check_id: str) -> dict:
    """Return aggregate counts of a check's findings for the filter UI.

    The result has ``total`` plus one ``{value: count}`` dict each for
    source_type (``by_source``), severity, status and doc_type. NULL or
    empty column values are reported under the key ``"-"``; their counts
    are added to any real ``"-"`` value instead of overwriting it.

    On error the partially filled (initially all-zero) structure is
    returned and a warning is logged.
    """
    out = {
        "total": 0,
        "by_source": {},
        "by_severity": {},
        "by_status": {},
        "by_doc_type": {},
    }
    # Column name -> key in the result. Column names are fixed literals, so
    # interpolating them into the SQL below is safe.
    buckets = {
        "source_type": "by_source",
        "severity": "by_severity",
        "status": "by_status",
        "doc_type": "by_doc_type",
    }
    try:
        _ensure_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            for col, bucket in buckets.items():
                rows = conn.execute(
                    f"SELECT {col} AS k, COUNT(*) AS n FROM unified_findings "
                    f"WHERE check_id=? GROUP BY {col}",
                    (check_id,),
                ).fetchall()
                counts: dict = {}
                for r in rows:
                    key = r["k"] or "-"
                    # Sum, don't overwrite: NULL, '' and '-' share one key.
                    counts[key] = counts.get(key, 0) + r["n"]
                out[bucket] = counts
                out["total"] = max(out["total"], sum(counts.values()))
        finally:
            # sqlite3's `with conn` would not close the handle.
            conn.close()
        return out
    except Exception as e:
        logger.warning("findings_summary failed: %s", e)
        return out
+4
View File
@@ -50,6 +50,8 @@ from compliance.api.agent_recurring_routes import router as agent_recurring_rout
from compliance.api.agent_compare_routes import router as agent_compare_router
from compliance.api.agent_doc_check_routes import router as agent_doc_check_router
from compliance.api.agent_compliance_check_routes import router as agent_compliance_check_router
from compliance.api.agent_findings_routes import router as agent_findings_router
from compliance.api.saving_scan_routes import router as saving_scan_router
from compliance.api.agent_migration_routes import router as agent_migration_router
from compliance.api.vendor_assessment_routes import router as vendor_assessment_router
from compliance.api.cra_routes import router as cra_router
@@ -157,6 +159,8 @@ app.include_router(agent_recurring_router, prefix="/api")
app.include_router(agent_compare_router, prefix="/api")
app.include_router(agent_doc_check_router, prefix="/api")
app.include_router(agent_compliance_check_router, prefix="/api")
app.include_router(agent_findings_router, prefix="/api")
app.include_router(saving_scan_router, prefix="/api")
app.include_router(agent_migration_router, prefix="/api")
# Vendor Contract Assessment
@@ -0,0 +1,116 @@
"""
Tests for the saving-scan funnel endpoint.
Focus: input validation + lead persistence + rate-limit error path.
The actual compliance check is mocked — we only verify the route layer.
"""
import os
import sys
from unittest.mock import patch
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
# Put the parent of this tests directory on sys.path so `compliance` imports.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# Use a temp SQLite for the sidecar; remove any leftover file from earlier runs.
os.environ["COMPLIANCE_AUDIT_DB"] = "/tmp/test_saving_scan.db"
if os.path.exists("/tmp/test_saving_scan.db"):
    os.remove("/tmp/test_saving_scan.db")
# Imported only after the env var is set (hence the E402 suppression);
# presumably the routes module reads COMPLIANCE_AUDIT_DB at import time — TODO confirm.
from compliance.api.saving_scan_routes import router # noqa: E402
# Minimal app that mounts just the router under test.
app = FastAPI()
app.include_router(router, prefix="/api")
client = TestClient(app)
class TestStartSavingScanValidation:
    """Request-validation paths of the saving-scan start endpoint."""

    def test_missing_email_returns_422(self):
        # No email field at all -> rejected with 422.
        payload = {"url": "https://example.de"}
        resp = client.post("/api/compliance/agent/saving-scan/start",
                           json=payload)
        assert resp.status_code == 422

    def test_invalid_email_returns_400(self):
        # Syntactically invalid email -> 400 with a message mentioning "E-Mail".
        payload = {"url": "https://example.de", "email": "kein-email",
                   "consent": True}
        with patch("compliance.api.saving_scan_routes.asyncio.create_task"):
            resp = client.post(
                "/api/compliance/agent/saving-scan/start", json=payload,
            )
        assert resp.status_code == 400
        assert "E-Mail" in resp.json()["detail"]

    def test_invalid_url_returns_400(self):
        # Non-http(s) scheme -> 400.
        payload = {"url": "ftp://wrong.de", "email": "u@x.de",
                   "consent": True}
        with patch("compliance.api.saving_scan_routes.asyncio.create_task"):
            resp = client.post(
                "/api/compliance/agent/saving-scan/start", json=payload,
            )
        assert resp.status_code == 400

    def test_consent_required(self):
        # consent=False -> 400 with a message mentioning "Consent".
        payload = {"url": "https://example.de", "email": "u@x.de",
                   "consent": False}
        with patch("compliance.api.saving_scan_routes.asyncio.create_task"):
            resp = client.post(
                "/api/compliance/agent/saving-scan/start", json=payload,
            )
        assert resp.status_code == 400
        assert "Consent" in resp.json()["detail"]
def _patch_check_runner():
"""Stub the lazy-imported worker — avoids loading smtp_sender (Py3.10+)."""
import sys, types
fake = types.ModuleType("compliance.api.agent_compliance_check_routes")
class _DocInput:
def __init__(self, doc_type="other", url=""): self.doc_type, self.url = doc_type, url
class _Req:
def __init__(self, **kw): self.__dict__.update(kw)
async def _runner(*_a, **_kw): pass
fake.DocumentInput = _DocInput
fake.ComplianceCheckRequest = _Req
fake._run_compliance_check = _runner
fake._compliance_check_jobs = {}
sys.modules["compliance.api.agent_compliance_check_routes"] = fake
class TestStartSavingScanSuccess:
    """Happy path of the saving-scan start endpoint with the worker stubbed."""

    def test_valid_request_starts_check(self):
        _patch_check_runner()
        payload = {"url": "https://example-newdomain.de",
                   "email": "user@example.de", "consent": True}
        resp = client.post(
            "/api/compliance/agent/saving-scan/start", json=payload,
        )
        assert resp.status_code == 200, resp.text
        body = resp.json()
        # The response carries a check id, a running status and echoes the domain.
        assert "check_id" in body
        assert body["status"] == "running"
        assert "example-newdomain.de" in body["message"]
class TestLeadCount:
    """Lead persistence as seen through the lead-count endpoint."""

    def test_lead_count_after_submit(self):
        _patch_check_runner()
        # Submit one lead; the response itself is not inspected here.
        submission = {"url": "https://abc-leadtest.de",
                      "email": "lead@x.de", "consent": True}
        client.post(
            "/api/compliance/agent/saving-scan/start", json=submission,
        )
        resp = client.get("/api/compliance/agent/saving-scan/lead-count")
        assert resp.status_code == 200
        stats = resp.json()
        assert stats["total_leads"] >= 1
        assert "abc-leadtest.de" in str(stats["top_domains"])