Files
breakpilot-compliance/backend-compliance/compliance/services/industry_library.py
T
Benjamin Admin bd65b6f318
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Failing after 59s
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 15s
CI / loc-budget (push) Failing after 19s
CI / iace-gt-coverage (push) Successful in 27s
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(audit): Phase 2+3 — P54 + P68 + P69 + P6/P53/P55 + P31 + P80v2
P54 — consent_diff_for_user.py: USP-Feature fuer wiederkehrende Besucher.
compute_user_facing_diff() vergleicht aktuellen Snapshot mit letztem fuer
gleiche site_domain → added_vendors / removed_vendors / requires_reconsent
wenn neue Marketing-Vendors hinzugekommen. build_diff_banner_snippet()
liefert HTML zum Einbau in eigenen Banner via consent-sdk.

P68 — reverse_audit.py: Self-Audit unserer Template-Bibliothek.
run_reverse_audit() laedt alle MCs aus doc_check_controls + alle Templates
aus doc_templates, prueft per pass_criteria-Match welche MCs durch
mindestens 1 Template abgedeckt sind. Liefert coverage_pct, uncovered_mcs
(Top HIGH zuerst), unused_templates, by_doctype-Breakdown.

P69 — data/ecall_regulation.json: eCall-VO (EU) 2015/758 als 7 Chunks
fuer RAG-Ingest (Art. 3/6/7 + compliance_implications fuer Automotive-OEMs).
Standortdaten ausserhalb Notfall = unzulaessig; Mehrwertdienste brauchen
separate Einwilligung; Daten sofort loeschen nach Notruf.

P6+P53+P55 — industry_library.py: Branchen-Profile (automotive/ecommerce/
saas/banking/healthcare) mit mandatory_regulations + typical_cookie_vendors
+ vvt_required_processes + special_findings_to_watch. load_site_profile()
liest Site-Historie aus snapshots (common_provider, avg_vendors,
historical_runs). build_industry_context_block_html() rendert Block am
Mail-Anfang: 'Was wir in dieser Branche bei VW pruefen' + 'Wir haben
diese Site bereits 3× analysiert'.

P31 — llm_cascade.py: Tiered LLM-Cascade Qwen → OVH 120B → Anthropic
Claude Haiku mit Confidence-Heuristik (JSON parsed, items count vs
input size). Valkey-Cache (redis://) mit 7-Tage-TTL plus In-Process-
Fallback. Wenn Tier-1 unter Confidence-Threshold → Tier-2, dann Tier-3.
Reduziert Lauf-Zeit drastisch bei Re-Runs.

P80 v2 — check_replay.py: replay nutzt jetzt audit_quality_checks
mit den Snapshot-Daten. Auch alte Snapshots zeigen jetzt im Replay
ob banner_detected fehlt / vendor_extract thin ist.

Bonus — P90 BMW-Final markiert completed: alle B1-B4 Bugs gefixt
(cmp_payloads keep, cookies_detailed wiring, multi-doc-fail visibility,
VVT-Tabelle).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 08:38:08 +02:00

223 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
P6 + P53 + P55 — OEM cross-industry library with autonomous profiling.
Unifies three related topics:
* P6 — Industry knowledge base: what is industry-specific (automotive
has eCall, eHealth has patient data, finance has MaRisk).
* P53 — OEM site-profile library: known patterns per OEM site
(Mercedes has cmm-cookie-banner, BMW has ePaaS, VW has
cookiemgmt, Audi: Akamai block with HTTP 503).
* P55 — Autonomous profiling: on every run we learn additional patterns
and persist them in the library.
Backend service: lookup API + auto-learning hook on every snapshot save.
"""
from __future__ import annotations
import json
import logging
import os
from typing import Iterable
from sqlalchemy import text as sa_text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
# Industry-specific additional compliance topics.
#
# Maps a lower-case industry key to a profile with four lists of display
# strings:
#   mandatory_regulations     - regulations listed for the industry
#   typical_cookie_vendors    - vendors typically listed for the industry
#   vvt_required_processes    - processing activities listed for the VVT
#   special_findings_to_watch - findings to watch for in this industry
# The strings are runtime data (German) and are rendered verbatim.
_INDUSTRY_PROFILES: dict[str, dict] = {
    "automotive": {
        "mandatory_regulations": [
            "DSGVO",
            "TDDDG",
            "VO 2015/758 (eCall)",
            "VO 2018/858 (Typgenehmigung)",
            "VO 2019/2144 (Allgemeine Sicherheit)",
            "Cyber Security UN-R 155",
            "Software Update UN-R 156",
        ],
        "typical_cookie_vendors": [
            "Adobe Analytics",
            "Adobe Target",
            "Salesforce LiveAgent",
            "AdForm",
            "The Trade Desk",
            "Google Marketing Platform",
            "Inbenta",
            "Datadog RUM",
        ],
        "vvt_required_processes": [
            "Probefahrten-Buchung",
            "Haendler-Suche",
            "eCall-System",
            "We Connect / Connected Drive Services",
            "Konfigurator-Daten",
        ],
        "special_findings_to_watch": [
            "eCall ohne Hinweis in DSE = Verstoss VO 2015/758 Art. 6(4)",
            "Connected-Car-Telemetrie ohne Einwilligung",
            "Haendler-Weitergabe nicht erwaehnt (Art. 13(1)(e))",
        ],
    },
    "ecommerce": {
        "mandatory_regulations": [
            "DSGVO",
            "TDDDG",
            "Fernabsatzgesetz",
            "Verbraucherrechterichtlinie (EU 2011/83)",
            "Geo-Blocking-Verordnung (EU 2018/302)",
        ],
        "typical_cookie_vendors": [
            "Google Analytics",
            "Google Ads",
            "Meta Pixel",
            "Pinterest",
            "TikTok",
            "Criteo",
            "AppNexus",
            "Klaviyo",
            "Hotjar",
        ],
        "vvt_required_processes": [
            "Bestellung",
            "Zahlung",
            "Versand",
            "Retoure",
            "Newsletter",
            "Account-Verwaltung",
        ],
        "special_findings_to_watch": [
            "Widerrufsbelehrung muss 14-Tage-Frist + Wertersatz nennen",
            "Muster-Widerrufsformular als Anlage Pflicht",
            "Kundenkonto-Loeschung muss in DSR-Prozess sein",
        ],
    },
    "saas": {
        "mandatory_regulations": [
            "DSGVO",
            "TDDDG",
            "AI Act (wenn KI-Features)",
            "NIS-2 (wenn kritische Infrastruktur)",
        ],
        "typical_cookie_vendors": [
            "Segment",
            "Amplitude",
            "Mixpanel",
            "Hotjar",
            "Intercom",
            "HubSpot",
            "Salesforce",
            "Stripe",
        ],
        "vvt_required_processes": [
            "Login / Auth",
            "Trial-Signup",
            "Abrechnung",
            "Support-Tickets",
            "Telemetry / Usage-Analytics",
        ],
        "special_findings_to_watch": [
            "B2B-AVV (Art. 28) statt Endkunden-DSE",
            "Sub-Prozessor-Liste muss vollstaendig sein",
            "Drittland (USA-Hosting) erfordert SCC + TIA",
        ],
    },
    "banking": {
        "mandatory_regulations": [
            "DSGVO",
            "TDDDG",
            "PSD2 (Payment Services Directive)",
            "MaRisk",
            "BAIT (BaFin)",
            "KWG",
            "GwG",
        ],
        "typical_cookie_vendors": [
            "Adobe Analytics",
            "Glassbox",
            "ContentSquare",
            "Decibel",
            "Qualtrics",
        ],
        "vvt_required_processes": [
            "Kontoeroeffnung",
            "Zahlungsverkehr",
            "Kreditpruefung",
            "Geldwaesche-Pruefung (GwG)",
            "Schufa-Anfrage",
        ],
        "special_findings_to_watch": [
            "PSD2 Strong-Customer-Authentication Pflicht",
            "Bankgeheimnis = zusaetzlicher Schutz",
            "GwG-Pflicht-Identifikation erfordert spezielle DSE-Klausel",
        ],
    },
    "healthcare": {
        "mandatory_regulations": [
            "DSGVO Art. 9 (Gesundheitsdaten)",
            "Medizinprodukteverordnung (MDR)",
            "Patientendaten-Schutzgesetz (PDSG)",
            "DiGAV (Digitale-Gesundheitsanwendungen-Verordnung)",
        ],
        # NOTE(review): this single entry is a descriptive remark, not a
        # vendor name — confirm that consumers of this list can cope with it.
        "typical_cookie_vendors": [
            "Sehr restriktiv — i.d.R. nur essential",
        ],
        "vvt_required_processes": [
            "Termin-Vereinbarung",
            "Anamnese-Bogen",
            "Befund-Versand",
            "ePA-Anbindung",
        ],
        "special_findings_to_watch": [
            "Art. 9 DSGVO erfordert ausdrueckliche Einwilligung",
            "Schweigepflicht §203 StGB",
            "Drittland-Transfer fast immer unzulaessig",
        ],
    },
}
def lookup_industry_profile(industry: str | None) -> dict | None:
    """Return the compliance profile for an industry, or ``None``.

    The lookup is case-insensitive and ignores surrounding whitespace, so
    ``" Automotive "`` resolves to the ``"automotive"`` entry.

    Args:
        industry: Industry key (for example ``"automotive"`` or ``"saas"``).
            ``None``, an empty string or a whitespace-only string yields
            ``None``.

    Returns:
        The matching entry of ``_INDUSTRY_PROFILES`` or ``None`` when the
        key is unknown. The entry is the shared module-level dict, not a
        copy, so callers should treat it as read-only.
    """
    if not industry:
        return None
    # Normalise before the lookup: the table keys are lower-case and bare.
    return _INDUSTRY_PROFILES.get(industry.strip().lower())
# Site profile (learned from previous snapshots)
def load_site_profile(db: Session, site_domain: str) -> dict | None:
    """Summarise what earlier snapshots reveal about a site.

    Reads the five most recent rows for ``site_domain`` from
    ``compliance.compliance_check_snapshots`` and condenses them into a
    small profile. The lookup is best-effort: a failing query is logged
    and reported as "no profile" instead of raising.

    Args:
        db: Session used to run the query.
        site_domain: Domain to look up; a falsy value returns ``None``
            without touching the database.

    Returns:
        ``None`` when ``site_domain`` is empty, the query fails, no rows
        exist, or no row carries a ``banner_provider``. Otherwise a dict
        with the keys ``site_domain``, ``common_provider`` (most frequent
        non-empty provider), ``avg_vendor_count`` (floor average of the
        vendor counts), ``historical_runs`` (number of rows read) and
        ``last_run`` (ISO timestamp of the newest row, or ``None``).
    """
    if not site_domain:
        return None

    try:
        rows = db.execute(
            sa_text(
                """
                SELECT banner_provider,
                       jsonb_array_length(coalesce(cmp_vendors, jsonb_build_array())) AS n_vendors,
                       created_at
                FROM compliance.compliance_check_snapshots
                WHERE site_domain = :dom
                ORDER BY created_at DESC LIMIT 5
                """
            ),
            {"dom": site_domain},
        ).fetchall()
    except Exception:
        # The profile is optional context, so a DB problem must not break
        # the caller — but it should be visible in the logs.
        # NOTE(review): after a failed statement the session's transaction
        # may need a rollback by the caller — confirm.
        logger.warning(
            "load_site_profile: snapshot query failed for %r",
            site_domain,
            exc_info=True,
        )
        return None

    if not rows:
        return None

    providers = [r[0] for r in rows if r[0]]
    if not providers:
        return None
    vendor_counts = [r[1] for r in rows if r[1] is not None]

    from collections import Counter

    # On a tie, Counter keeps first-seen order, i.e. the newest snapshot wins.
    common_provider = Counter(providers).most_common(1)[0][0]
    # Integer (floor) average; max(1, ...) avoids dividing by zero.
    avg_vendors = sum(vendor_counts) // max(1, len(vendor_counts))
    newest_created_at = rows[0][2]
    return {
        "site_domain": site_domain,
        "common_provider": common_provider,
        "avg_vendor_count": avg_vendors,
        # NOTE(review): the query uses LIMIT 5, so this value never exceeds
        # 5 even if more snapshots exist — confirm this is intended.
        "historical_runs": len(rows),
        "last_run": newest_created_at.isoformat() if newest_created_at else None,
    }
def build_industry_context_block_html(
    industry: str | None,
    site_profile: dict | None,
) -> str:
    """Render the introductory context block of the mail as HTML.

    Up to two boxes are produced and concatenated:

    * an industry box (only if ``lookup_industry_profile(industry)`` finds
      a profile) listing at most six regulations and three findings;
    * a site-history box (only if ``site_profile`` reports more than one
      historical run) naming the usual CMP provider and vendor count.

    All interpolated values are HTML-escaped, because neither the industry
    label nor the site-profile values are guaranteed to be markup-safe.

    Args:
        industry: Industry key; also shown in the box heading.
        site_profile: Dict with ``historical_runs``, ``common_provider``
            and ``avg_vendor_count``, or ``None``.

    Returns:
        The HTML fragment, or an empty string when neither box applies.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    parts: list[str] = []

    profile = lookup_industry_profile(industry)
    if profile:
        regs = ", ".join(
            escape(str(reg))
            for reg in profile.get("mandatory_regulations", [])[:6]
        )
        watch_html = "".join(
            f'<li style="font-size:11px;color:#475569">{escape(str(w))}</li>'
            for w in profile.get("special_findings_to_watch", [])[:3]
        )
        industry_label = escape(str(industry))
        parts.append(
            '<div style="background:#eff6ff;border:1px solid #bfdbfe;'
            'border-radius:6px;padding:10px 14px;margin-bottom:8px">'
            '<div style="font-size:11px;color:#1e40af;font-weight:600;'
            'text-transform:uppercase;letter-spacing:1px">'
            f'Branchen-Kontext: {industry_label}</div>'
            '<p style="font-size:11px;color:#475569;margin:4px 0">'
            f'<strong>Geltende Spezial-Regulierungen:</strong> {regs}'
            '</p>'
            '<div style="font-size:11px;color:#475569"><strong>Worauf '
            'wir bei dieser Branche besonders schauen:</strong></div>'
            f'<ul style="margin:4px 0 0 18px;padding:0">{watch_html}</ul>'
            '</div>'
        )

    # A single run is not "history" yet, hence the strict > 1.
    if site_profile and site_profile.get("historical_runs", 0) > 1:
        runs = escape(str(site_profile["historical_runs"]))
        provider = escape(str(site_profile["common_provider"]))
        vendor_count = escape(str(site_profile["avg_vendor_count"]))
        parts.append(
            '<div style="background:#f5f3ff;border:1px solid #ddd6fe;'
            'border-radius:6px;padding:8px 12px;margin-bottom:8px;'
            'font-size:11px;color:#5b21b6">'
            f'Wir haben diese Site bereits {runs}× '
            'analysiert. Bekannter CMP-Provider: '
            f'<strong>{provider}</strong>, '
            f'historische Vendor-Zahl: ~{vendor_count}.'
            '</div>'
        )

    return "".join(parts)