Merge feat/zeroclaw-compliance-agent into main

Brings all compliance doc-check features:
- 162 regex checks + 1874 Master Controls
- LLM-agnostic agent with tool calling
- Banner check (46 checks, 30 CMPs, stealth, Shadow DOM)
- Impressum check (24 checks)
- Deep consent verification (DataLayer, GCM, TCF)
- CMP E2E tests (39 tests)
- HTML email reports, FAQ, persistent history

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-11 11:44:20 +02:00
175 changed files with 20063 additions and 1283 deletions
@@ -23,23 +23,10 @@ def consent_to_dict(c: BannerConsentDB) -> dict[str, Any]:
"device_fingerprint": c.device_fingerprint,
"categories": c.categories or [],
"vendors": c.vendors or [],
"vendor_consents": c.vendor_consents or {},
"ip_hash": c.ip_hash,
"user_agent": c.user_agent,
"consent_string": c.consent_string,
"linked_email": c.linked_email,
"consent_method": c.consent_method,
"banner_version": c.banner_version,
"banner_config_hash": c.banner_config_hash,
"geo_country": c.geo_country,
"geo_region": c.geo_region,
"consent_scope": c.consent_scope,
"page_url": c.page_url,
"referrer": c.referrer,
"device_type": c.device_type,
"browser": c.browser,
"os": c.os,
"screen_resolution": c.screen_resolution,
"session_id": c.session_id,
"expires_at": c.expires_at.isoformat() if c.expires_at else None,
"created_at": c.created_at.isoformat() if c.created_at else None,
"updated_at": c.updated_at.isoformat() if c.updated_at else None,
@@ -0,0 +1,95 @@
"""
Agent PDF Export — generates printable compliance scan reports.
Uses WeasyPrint to convert HTML report to PDF.
"""
import logging
from datetime import datetime, timezone
from io import BytesIO
logger = logging.getLogger(__name__)
def generate_scan_pdf(scan_data: dict) -> bytes:
    """Render the scan results as a PDF document and return its raw bytes.

    The report markup is produced by ``_build_report_html`` and converted
    to PDF in memory by WeasyPrint; nothing is written to disk.
    """
    # Function-scope import, as in the original — presumably so this module
    # can be imported when WeasyPrint is not installed (TODO confirm).
    from weasyprint import HTML

    document = HTML(string=_build_report_html(scan_data))
    out = BytesIO()
    document.write_pdf(out)
    return out.getvalue()
def _severity_color(sev: str) -> str:
return {"HIGH": "#dc2626", "CRITICAL": "#991b1b", "MEDIUM": "#ea580c", "LOW": "#2563eb"}.get(sev, "#6b7280")
def _build_report_html(data: dict) -> str:
"""Build HTML for the PDF report."""
url = data.get("url", "")
scan_type = data.get("scan_type", "scan")
mode = data.get("analysis_mode", "post_launch")
findings = data.get("findings", [])
services = data.get("services", [])
risk = data.get("risk_level", "")
score = data.get("risk_score", 0)
pages = data.get("pages_scanned", 0)
now = datetime.now(timezone.utc).strftime("%d.%m.%Y %H:%M UTC")
mode_label = "Live-Website Pruefung" if mode == "post_launch" else "Interne Pruefung"
type_label = {"quick": "Schnellanalyse", "scan": "Website-Scan", "consent_test": "Cookie-Test"}.get(scan_type, scan_type)
findings_rows = ""
for f in findings:
sev = f.get("severity", "MEDIUM") if isinstance(f, dict) else "MEDIUM"
text = f.get("text", str(f)) if isinstance(f, dict) else str(f)
color = _severity_color(sev)
findings_rows += f'<tr><td style="color:{color};font-weight:bold;padding:6px 8px;border-bottom:1px solid #e5e7eb;">{sev}</td><td style="padding:6px 8px;border-bottom:1px solid #e5e7eb;">{text}</td></tr>'
services_rows = ""
for s in services:
if isinstance(s, dict):
status_icon = "" if s.get("in_dse") or s.get("status") == "ok" else ""
status_color = "#16a34a" if status_icon == "" else "#dc2626"
services_rows += f'<tr><td style="color:{status_color};font-weight:bold;padding:4px 8px;border-bottom:1px solid #f3f4f6;">{status_icon}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("name","")}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("country","")}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("category","")}</td></tr>'
return f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<style>
body {{ font-family: -apple-system, Arial, sans-serif; font-size: 11px; color: #1e293b; margin: 40px; }}
h1 {{ font-size: 20px; color: #1e1b4b; margin-bottom: 4px; }}
h2 {{ font-size: 14px; color: #334155; border-bottom: 2px solid #e2e8f0; padding-bottom: 4px; margin-top: 24px; }}
.meta {{ color: #64748b; font-size: 10px; margin-bottom: 20px; }}
.badge {{ display: inline-block; padding: 2px 8px; border-radius: 4px; color: white; font-size: 10px; font-weight: bold; }}
table {{ width: 100%; border-collapse: collapse; }}
th {{ text-align: left; padding: 6px 8px; background: #f8fafc; border-bottom: 2px solid #e2e8f0; font-size: 10px; color: #64748b; }}
.warning {{ background: #fef2f2; border-left: 4px solid #dc2626; padding: 10px 14px; margin: 16px 0; }}
.footer {{ margin-top: 30px; padding-top: 10px; border-top: 1px solid #e2e8f0; color: #94a3b8; font-size: 9px; }}
</style></head><body>
<h1>Compliance Agent Report</h1>
<p class="meta">{type_label} | {mode_label} | {now}</p>
<table style="margin-bottom:20px;">
<tr><td style="padding:4px 0;color:#64748b;width:150px;">URL</td><td style="padding:4px 0;"><strong>{url}</strong></td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Risikobewertung</td><td style="padding:4px 0;"><span class="badge" style="background:{_severity_color(risk) if risk else '#6b7280'}">{risk} ({score}/100)</span></td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Seiten gescannt</td><td style="padding:4px 0;">{pages}</td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Findings</td><td style="padding:4px 0;"><strong>{len(findings)}</strong></td></tr>
</table>
{'<div class="warning"><strong>ACHTUNG:</strong> Maengel auf einer bereits veroeffentlichten Website. Sofortige Korrektur empfohlen.</div>' if mode == "post_launch" and findings else ''}
<h2>Findings ({len(findings)})</h2>
<table>
<tr><th>Schwere</th><th>Beschreibung</th></tr>
{findings_rows if findings_rows else '<tr><td colspan="2" style="padding:8px;color:#16a34a;">Keine Findings — alles OK</td></tr>'}
</table>
{'<h2>Dienstleister-Abgleich</h2><table><tr><th>Status</th><th>Dienst</th><th>Land</th><th>Kategorie</th></tr>' + services_rows + '</table>' if services_rows else ''}
<div class="footer">
Automatisch erstellt vom BreakPilot Compliance Agent | {now}<br>
Dieses Dokument ersetzt keine Rechtsberatung.
</div>
</body></html>"""
@@ -0,0 +1,193 @@
"""
Banner A/B Testing Service — variant assignment, stats, significance.
Deterministic variant assignment via device fingerprint hash ensures
the same device always sees the same variant (sticky bucketing).
"""
import hashlib
import math
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
class BannerABService:
    """A/B testing for consent banner variants.

    Provides variant CRUD, deterministic assignment of a device to a
    variant, and per-variant opt-in statistics with a significance estimate.
    """

    def __init__(self, db: "Session") -> None:
        """Keep the SQLAlchemy session used for every query of this service."""
        self.db = db

    # ------------------------------------------------------------------
    # Variant CRUD
    # ------------------------------------------------------------------

    def list_variants(self, tenant_id: str, site_config_id: str) -> list[dict]:
        """Return all variants of a site config as dicts, ordered by ``variant_key``."""
        q = text("""
            SELECT * FROM compliance_banner_variants
            WHERE tenant_id = :tid AND site_config_id = :scid
            ORDER BY variant_key
        """)
        rows = self.db.execute(q, {"tid": tenant_id, "scid": site_config_id}).fetchall()
        return [dict(r._mapping) for r in rows]

    def create_variant(self, tenant_id: str, site_config_id: str, data: dict) -> dict:
        """Insert a new variant, commit, and return the stored row as a dict.

        Missing keys in ``data`` fall back to: empty name, key ``"A"``,
        50 percent traffic, not a control, and ``"{}"`` theme overrides;
        the remaining banner fields default to NULL.
        """
        # NOTE(review): ``theme_overrides`` is passed to the driver as given;
        # callers presumably supply a JSON string — confirm before passing a dict.
        q = text("""
            INSERT INTO compliance_banner_variants
            (tenant_id, site_config_id, variant_name, variant_key, traffic_percent, is_control,
            banner_title, banner_description, position, style, primary_color, show_decline_all, theme_overrides)
            VALUES (:tid, :scid, :name, :key, :pct, :ctrl,
            :title, :desc, :pos, :style, :color, :decline, :theme)
            RETURNING *
        """)
        row = self.db.execute(q, {
            "tid": tenant_id, "scid": site_config_id,
            "name": data.get("variant_name", ""),
            "key": data.get("variant_key", "A"),
            "pct": data.get("traffic_percent", 50),
            "ctrl": data.get("is_control", False),
            "title": data.get("banner_title"),
            "desc": data.get("banner_description"),
            "pos": data.get("position"),
            "style": data.get("style"),
            "color": data.get("primary_color"),
            "decline": data.get("show_decline_all"),
            "theme": data.get("theme_overrides", "{}"),
        }).fetchone()
        self.db.commit()
        return dict(row._mapping)

    def update_variant(self, variant_id: str, data: dict) -> Optional[dict]:
        """Update the whitelisted fields present in ``data`` and return the row.

        Keys whose value is ``None`` are skipped, so a field cannot be reset
        to NULL through this method. Returns ``None`` when there is nothing
        to update or when no row has the given id.
        """
        sets, params = [], {"vid": variant_id}
        # Column names come only from this fixed whitelist, never from the
        # caller, which is what makes the f-string SQL below safe.
        for field in ["variant_name", "traffic_percent", "is_control", "banner_title",
                      "banner_description", "position", "style", "primary_color",
                      "show_decline_all", "is_active"]:
            if field in data and data[field] is not None:
                sets.append(f"{field} = :{field}")
                params[field] = data[field]
        if not sets:
            return None
        sets.append("updated_at = NOW()")
        q = text(f"UPDATE compliance_banner_variants SET {', '.join(sets)} WHERE id = :vid RETURNING *")
        row = self.db.execute(q, params).fetchone()
        self.db.commit()
        return dict(row._mapping) if row else None

    def delete_variant(self, variant_id: str) -> bool:
        """Delete a variant by id and commit; return True if a row was removed."""
        q = text("DELETE FROM compliance_banner_variants WHERE id = :vid")
        result = self.db.execute(q, {"vid": variant_id})
        self.db.commit()
        return result.rowcount > 0

    # ------------------------------------------------------------------
    # Variant Assignment (deterministic sticky bucketing)
    # ------------------------------------------------------------------

    def assign_variant(self, site_config_id: str, device_fingerprint: str) -> Optional[dict]:
        """Pick the active variant for a device, or ``None`` if the site has none.

        The device is hashed into a bucket 0-99; variants are walked in
        ``variant_key`` order and the first one whose cumulative
        ``traffic_percent`` exceeds the bucket is returned. The same
        (site, fingerprint) pair therefore always maps to the same bucket.
        """
        variants = self.db.execute(text("""
            SELECT * FROM compliance_banner_variants
            WHERE site_config_id = :scid AND is_active = TRUE
            ORDER BY variant_key
        """), {"scid": site_config_id}).fetchall()
        if not variants:
            return None
        # MD5 is used only to spread devices over buckets, not for security.
        bucket = int(hashlib.md5(f"{site_config_id}:{device_fingerprint}".encode()).hexdigest(), 16) % 100
        cumulative = 0
        for v in variants:
            cumulative += v.traffic_percent
            if bucket < cumulative:
                return dict(v._mapping)
        # Percentages summing to less than 100 leave some buckets uncovered;
        # those devices get the last variant.
        return dict(variants[-1]._mapping)

    # ------------------------------------------------------------------
    # Stats with statistical significance
    # ------------------------------------------------------------------

    def get_variant_stats(self, tenant_id: str, site_config_id: str) -> list[dict]:
        """Return per-variant totals and opt-in rates, plus a significance verdict.

        Each entry carries ``variant_id``, ``variant_key``, ``variant_name``,
        ``traffic_percent``, ``is_control``, ``total``, ``accepted`` and
        ``opt_in_rate`` (percent, one decimal). When a control variant and
        at least one other variant both have data, the control and the
        best non-control variant additionally get ``is_winner`` and
        ``significance`` (see ``_apply_significance``).
        """
        variants = self.list_variants(tenant_id, site_config_id)
        if not variants:
            return []
        # NOTE(review): the audit log is filtered by tenant and variant_key
        # only, not by site — confirm that variant keys are unique per tenant.
        q = text("""
            SELECT
            COUNT(*) AS total,
            COUNT(*) FILTER (WHERE action = 'consent_given') AS accepted,
            COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS rejected
            FROM compliance_banner_consent_audit_log
            WHERE tenant_id = :tid AND variant_key = :vkey
        """)
        results = []
        for v in variants:
            vkey = v["variant_key"]
            row = self.db.execute(q, {"tid": tenant_id, "vkey": vkey}).fetchone()
            total = row.total if row else 0
            accepted = row.accepted if row else 0
            results.append({
                "variant_id": str(v["id"]),
                "variant_key": vkey,
                "variant_name": v["variant_name"],
                "traffic_percent": v["traffic_percent"],
                "is_control": v["is_control"],
                "total": total,
                "accepted": accepted,
                "opt_in_rate": round(accepted / total * 100, 1) if total > 0 else 0,
            })
        self._apply_significance(results)
        return results

    @classmethod
    def _apply_significance(cls, results: list[dict]) -> None:
        """Annotate control and best challenger in ``results`` in place.

        Compares the control with the non-control variant that has the
        highest ``opt_in_rate``. Nothing is added when there is no control,
        no challenger, or either side has no recorded interactions.
        """
        control = next((r for r in results if r["is_control"]), None)
        if not control or len(results) < 2:
            return
        best = max((r for r in results if not r["is_control"]), key=lambda x: x["opt_in_rate"], default=None)
        if not best or control["total"] <= 0 or best["total"] <= 0:
            return
        sig = cls._chi_squared_significance(
            control["accepted"], control["total"],
            best["accepted"], best["total"],
        )
        # The chi-squared test is two-sided: it is just as "significant" when
        # the challenger is worse than the control. Require a genuinely higher
        # rate (cross-multiplied to avoid comparing rounded percentages).
        beats_control = best["accepted"] * control["total"] > control["accepted"] * best["total"]
        best["is_winner"] = sig > 0.95 and beats_control
        best["significance"] = round(sig * 100, 1)
        control["is_winner"] = False
        control["significance"] = round((1 - sig) * 100, 1)

    @staticmethod
    def _chi_squared_significance(a_success: int, a_total: int, b_success: int, b_total: int) -> float:
        """Chi-squared test for a 2x2 contingency table; returns confidence 0-1.

        The confidence is the chi-squared CDF for one degree of freedom,
        ``erf(sqrt(chi2 / 2))``, capped at 0.999. It is 0.0 when there is no
        data or the two groups are practically identical.
        """
        a_fail = a_total - a_success
        b_fail = b_total - b_success
        n = a_total + b_total
        if n == 0:
            return 0.0
        # Expected cell counts under the hypothesis of equal rates.
        successes = a_success + b_success
        failures = a_fail + b_fail
        cells = [
            (a_success, a_total * successes / n),
            (a_fail, a_total * failures / n),
            (b_success, b_total * successes / n),
            (b_fail, b_total * failures / n),
        ]
        # Cells with an expected count of zero are skipped to avoid dividing by zero.
        chi2 = sum((obs - exp) ** 2 / exp for obs, exp in cells if exp > 0)
        if chi2 < 0.001:
            return 0.0
        # Exact CDF instead of a coarse step table: the table returned exactly
        # 0.95 for chi2 in [3.841, 5.024), which a strict ``> 0.95`` check
        # then rejected.
        return min(math.erf(math.sqrt(chi2 / 2.0)), 0.999)
@@ -0,0 +1,135 @@
"""
Banner consent analytics — time-series, device breakdown, bounce rate.
Reads from BannerConsentAuditLogDB for aggregated analytics.
"""
import re
from datetime import datetime, timedelta, timezone
from typing import Any, Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
class BannerAnalyticsService:
    """Aggregated consent analytics for one site, read from the consent audit log."""

    def __init__(self, db: Session) -> None:
        self.db = db

    def get_time_series(
        self,
        tenant_id: str,
        site_id: str,
        period: str = "daily",
        days: int = 30,
    ) -> list[dict[str, Any]]:
        """Return per-period action counts and opt-in rate for the last ``days`` days.

        ``period`` of ``"daily"`` groups by day; any other value groups by week.
        """
        granularity = "week" if period != "daily" else "day"
        since = datetime.now(timezone.utc) - timedelta(days=days)
        stmt = text("""
            SELECT DATE_TRUNC(:trunc, created_at) AS period,
            COUNT(*) FILTER (WHERE action = 'consent_given') AS given,
            COUNT(*) FILTER (WHERE action = 'consent_updated') AS updated,
            COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS withdrawn,
            COUNT(*) AS total
            FROM compliance_banner_consent_audit_log
            WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
            GROUP BY 1 ORDER BY 1
        """)
        bind = {"tid": tenant_id, "sid": site_id, "cutoff": since, "trunc": granularity}

        series: list[dict[str, Any]] = []
        for row in self.db.execute(stmt, bind).fetchall():
            # Given and updated consents both count as opt-ins.
            if row.total > 0:
                rate = round((row.given + row.updated) / row.total * 100, 1)
            else:
                rate = 0
            series.append({
                "period": row.period.isoformat() if row.period else None,
                "given": row.given,
                "updated": row.updated,
                "withdrawn": row.withdrawn,
                "total": row.total,
                "opt_in_rate": rate,
            })
        return series

    def get_category_breakdown(
        self,
        tenant_id: str,
        site_id: str,
        days: int = 30,
    ) -> dict[str, dict[str, int]]:
        """Return, per category, how many given/updated consents included it.

        Each value holds ``count``, ``total`` (all matching log rows) and
        ``rate`` in percent; categories appear in sorted order.
        """
        since = datetime.now(timezone.utc) - timedelta(days=days)
        stmt = text("""
            SELECT categories FROM compliance_banner_consent_audit_log
            WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
            AND action IN ('consent_given', 'consent_updated')
        """)
        records = self.db.execute(stmt, {"tid": tenant_id, "sid": site_id, "cutoff": since}).fetchall()
        total = len(records)

        tally: dict[str, int] = {}
        for rec in records:
            # Rows whose categories value is not a list contribute nothing.
            if not isinstance(rec.categories, list):
                continue
            for name in rec.categories:
                tally[name] = tally.get(name, 0) + 1

        breakdown = {}
        for name in sorted(tally):
            hits = tally[name]
            breakdown[name] = {
                "count": hits,
                "total": total,
                "rate": round(hits / total * 100, 1) if total > 0 else 0,
            }
        return breakdown

    def get_device_breakdown(
        self,
        tenant_id: str,
        site_id: str,
        days: int = 30,
    ) -> dict[str, int]:
        """Count log rows per device class, classified from the user agent string."""
        since = datetime.now(timezone.utc) - timedelta(days=days)
        stmt = text("""
            SELECT user_agent FROM compliance_banner_consent_audit_log
            WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
            AND user_agent IS NOT NULL
        """)
        records = self.db.execute(stmt, {"tid": tenant_id, "sid": site_id, "cutoff": since}).fetchall()

        tally = {"desktop": 0, "mobile": 0, "tablet": 0, "unknown": 0}
        looks_mobile = re.compile(r"Mobile|Android|iPhone|iPod", re.IGNORECASE).search
        looks_tablet = re.compile(r"iPad|Tablet|PlayBook|Silk", re.IGNORECASE).search
        for rec in records:
            agent = rec.user_agent or ""
            # Tablet is tested first, so an agent matching both patterns is a tablet.
            if looks_tablet(agent):
                kind = "tablet"
            elif looks_mobile(agent):
                kind = "mobile"
            elif agent:
                kind = "desktop"
            else:
                kind = "unknown"
            tally[kind] += 1
        return tally

    def get_overview_stats(
        self,
        tenant_id: str,
        site_id: str,
        days: int = 30,
    ) -> dict[str, Any]:
        """Return headline numbers: interactions, given/updated/withdrawn, opt-in rate."""
        since = datetime.now(timezone.utc) - timedelta(days=days)
        stmt = text("""
            SELECT
            COUNT(*) FILTER (WHERE action = 'consent_given') AS given,
            COUNT(*) FILTER (WHERE action = 'consent_updated') AS updated,
            COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS withdrawn,
            COUNT(*) AS total
            FROM compliance_banner_consent_audit_log
            WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
        """)
        row = self.db.execute(stmt, {"tid": tenant_id, "sid": site_id, "cutoff": since}).fetchone()

        if row:
            total = row.total
            given, updated, withdrawn = row.given, row.updated, row.withdrawn
            # Given and updated consents both count as opt-ins.
            opted_in = (given or 0) + (updated or 0)
        else:
            total = given = updated = withdrawn = opted_in = 0

        return {
            "period_days": days,
            "total_interactions": total,
            "consents_given": given,
            "consents_updated": updated,
            "consents_withdrawn": withdrawn,
            "opt_in_rate": round(opted_in / total * 100, 1) if total > 0 else 0,
        }
@@ -73,9 +73,8 @@ class BannerConsentService:
ip_hash: Optional[str] = None,
banner_config_hash: Optional[str] = None,
consent_version: Optional[int] = None,
*,
consent_method: Optional[str] = None,
page_url: Optional[str] = None,
vendor_consents: Optional[dict[str, bool]] = None,
user_agent: Optional[str] = None,
) -> None:
entry = BannerConsentAuditLogDB(
tenant_id=tenant_id,
@@ -84,11 +83,11 @@ class BannerConsentService:
site_id=site_id,
device_fingerprint=device_fingerprint,
categories=categories or [],
vendor_consents=vendor_consents or {},
ip_hash=ip_hash,
user_agent=user_agent,
banner_config_hash=banner_config_hash,
consent_version=consent_version,
consent_method=consent_method,
page_url=page_url,
)
self.db.add(entry)
@@ -134,6 +133,24 @@ class BannerConsentService:
return max(v.retention_days for v in vendors if v.retention_days)
return max((CATEGORY_RETENTION_DAYS.get(c, 365) for c in categories), default=365)
def _maybe_generate_tc_string(
    self, tenant_id: uuid.UUID, site_id: str, categories: list[str],
) -> Optional[str]:
    """Generate a TC String if TCF is enabled for this site.

    Returns ``None`` when the site has no config, TCF is disabled, or the
    encoder cannot be loaded or fails. Generation is best-effort and never
    raises.
    """
    config = (
        self.db.query(BannerSiteConfigDB)
        .filter(BannerSiteConfigDB.tenant_id == tenant_id, BannerSiteConfigDB.site_id == site_id)
        .first()
    )
    if not config or not config.tcf_enabled:
        return None
    try:
        from compliance.services.tcf_encoder_service import TCFEncoderService
        encoder = TCFEncoderService()
        return encoder.encode_from_categories(categories)
    except Exception:
        # Still best-effort, but leave a trace: a silently missing TC string
        # on a TCF-enabled site is otherwise impossible to diagnose.
        import logging
        logging.getLogger(__name__).warning(
            "TC string generation failed for site %s", site_id, exc_info=True,
        )
        return None
# ------------------------------------------------------------------
# Consent CRUD (public SDK)
# ------------------------------------------------------------------
@@ -148,16 +165,7 @@ class BannerConsentService:
ip_address: Optional[str],
user_agent: Optional[str],
consent_string: Optional[str],
*,
consent_method: Optional[str] = None,
page_url: Optional[str] = None,
referrer: Optional[str] = None,
device_type: Optional[str] = None,
browser: Optional[str] = None,
os: Optional[str] = None,
screen_resolution: Optional[str] = None,
session_id: Optional[str] = None,
consent_scope: Optional[str] = None,
vendor_consents: Optional[dict[str, bool]] = None,
) -> dict[str, Any]:
"""Upsert a device consent row for (tenant, site, device_fingerprint).
@@ -173,20 +181,9 @@ class BannerConsentService:
expires_at = now + timedelta(days=retention)
config_hash, config_ver = self._compute_config_hash(tid, site_id)
# Vendor-agnostische Zusatzfelder
extra = {
"consent_method": consent_method,
"banner_version": config_ver,
"banner_config_hash": config_hash,
"page_url": page_url,
"referrer": referrer,
"device_type": device_type,
"browser": browser,
"os": os,
"screen_resolution": screen_resolution,
"session_id": session_id,
"consent_scope": consent_scope or "domain",
}
# Auto-generate TC String if TCF is enabled for this site
if not consent_string:
consent_string = self._maybe_generate_tc_string(tid, site_id, categories)
existing = (
self.db.query(BannerConsentDB)
@@ -201,18 +198,17 @@ class BannerConsentService:
if existing:
existing.categories = categories
existing.vendors = vendors
existing.vendor_consents = vendor_consents or {}
existing.ip_hash = ip_hash
existing.user_agent = user_agent
existing.consent_string = consent_string
existing.expires_at = expires_at
existing.updated_at = now
for key, val in extra.items():
setattr(existing, key, val)
self.db.flush()
self._log(
tid, existing.id, "consent_updated", site_id, device_fingerprint,
categories, ip_hash, config_hash, config_ver,
consent_method=consent_method, page_url=page_url,
vendor_consents=vendor_consents, user_agent=user_agent,
)
self.db.commit()
self.db.refresh(existing)
@@ -224,18 +220,18 @@ class BannerConsentService:
device_fingerprint=device_fingerprint,
categories=categories,
vendors=vendors,
vendor_consents=vendor_consents or {},
ip_hash=ip_hash,
user_agent=user_agent,
consent_string=consent_string,
expires_at=expires_at,
**extra,
)
self.db.add(consent)
self.db.flush()
self._log(
tid, consent.id, "consent_given", site_id, device_fingerprint,
categories, ip_hash, config_hash, config_ver,
consent_method=consent_method, page_url=page_url,
vendor_consents=vendor_consents, user_agent=user_agent,
)
self.db.commit()
self.db.refresh(consent)
@@ -383,14 +379,7 @@ class BannerConsentService:
total = base.count()
category_stats: dict[str, int] = {}
for c in base.all():
raw = c.categories or []
if isinstance(raw, str):
try:
import json
raw = json.loads(raw)
except (json.JSONDecodeError, TypeError):
raw = []
cats: list[str] = list(raw) if isinstance(raw, list) else []
cats: list[str] = list(c.categories or [])
for cat in cats:
category_stats[cat] = category_stats.get(cat, 0) + 1
return {
@@ -404,58 +393,3 @@ class BannerConsentService:
for cat, count in category_stats.items()
},
}
def list_consents(
self, tenant_id: str, site_id: Optional[str] = None,
limit: int = 50, offset: int = 0,
) -> dict[str, Any]:
"""List paginated banner consents with parsed categories."""
import json as _json
tid = uuid.UUID(tenant_id)
base = self.db.query(BannerConsentDB).filter(BannerConsentDB.tenant_id == tid)
if site_id:
base = base.filter(BannerConsentDB.site_id == site_id)
total = base.count()
rows = base.order_by(BannerConsentDB.created_at.desc()).offset(offset).limit(limit).all()
consents = []
for c in rows:
raw_cats = c.categories or []
if isinstance(raw_cats, str):
try:
raw_cats = _json.loads(raw_cats)
except (ValueError, TypeError):
raw_cats = []
raw_vendors = c.vendors or []
if isinstance(raw_vendors, str):
try:
raw_vendors = _json.loads(raw_vendors)
except (ValueError, TypeError):
raw_vendors = []
consents.append({
"id": str(c.id),
"site_id": c.site_id,
"device_fingerprint": c.device_fingerprint,
"categories": list(raw_cats) if isinstance(raw_cats, list) else [],
"vendors": list(raw_vendors) if isinstance(raw_vendors, list) else [],
"ip_hash": c.ip_hash,
"user_agent": c.user_agent,
"linked_email": c.linked_email,
"consent_string": c.consent_string,
"consent_method": c.consent_method,
"banner_version": c.banner_version,
"banner_config_hash": c.banner_config_hash,
"geo_country": c.geo_country,
"geo_region": c.geo_region,
"consent_scope": c.consent_scope,
"page_url": c.page_url,
"referrer": c.referrer,
"device_type": c.device_type,
"browser": c.browser,
"os": c.os,
"screen_resolution": c.screen_resolution,
"session_id": c.session_id,
"expires_at": c.expires_at.isoformat() if c.expires_at else None,
"created_at": c.created_at.isoformat() if c.created_at else None,
"updated_at": c.updated_at.isoformat() if c.updated_at else None,
})
return {"consents": consents, "total": total, "limit": limit, "offset": offset}
@@ -40,6 +40,22 @@ _CONTROL_COLUMNS = """
"""
def _ensure_list(val: Any) -> list:
"""Ensure a JSONB value is always a Python list."""
if isinstance(val, list):
return val
if val is None:
return []
if isinstance(val, str):
try:
import json
parsed = json.loads(val)
return parsed if isinstance(parsed, list) else []
except (json.JSONDecodeError, TypeError):
return []
return []
def _control_row(r: Any) -> dict[str, Any]:
"""Serialize a canonical_controls SELECT row to a response dict."""
return {
@@ -49,19 +65,19 @@ def _control_row(r: Any) -> dict[str, Any]:
"title": r.title,
"objective": r.objective,
"rationale": r.rationale,
"scope": r.scope,
"requirements": r.requirements,
"test_procedure": r.test_procedure,
"evidence": r.evidence,
"scope": r.scope if isinstance(r.scope, dict) else {},
"requirements": _ensure_list(r.requirements),
"test_procedure": _ensure_list(r.test_procedure),
"evidence": _ensure_list(r.evidence),
"severity": r.severity,
"risk_score": float(r.risk_score) if r.risk_score is not None else None,
"implementation_effort": r.implementation_effort,
"evidence_confidence": (
float(r.evidence_confidence) if r.evidence_confidence is not None else None
),
"open_anchors": r.open_anchors,
"open_anchors": _ensure_list(r.open_anchors),
"release_state": r.release_state,
"tags": r.tags or [],
"tags": _ensure_list(r.tags),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}
@@ -0,0 +1,216 @@
"""
Compliance Report PDF Generator — generates a comprehensive A4 PDF
covering all compliance modules for a project.
Uses reportlab (same as audit_pdf_generator.py).
"""
import io
import logging
from datetime import datetime, timezone
from typing import Any
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak,
)
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
# Colors — shared palette for the report layout
PURPLE = colors.HexColor("#7c3aed")  # title/section text and table header text
LIGHT_PURPLE = colors.HexColor("#f5f3ff")  # table header background
GRAY = colors.HexColor("#6b7280")  # text colour of the "Small" paragraph style
# GREEN, RED and YELLOW are not referenced in the visible part of this module.
GREEN = colors.HexColor("#16a34a")
RED = colors.HexColor("#dc2626")
YELLOW = colors.HexColor("#ca8a04")
def _styles():
    """Return reportlab's sample stylesheet extended with the report's own styles.

    Added styles: ``Title2`` (document title), ``Section`` (section
    heading), ``Body2`` (body text) and ``Small`` (grey fine print).
    """
    sheet = getSampleStyleSheet()
    report_styles = (
        ParagraphStyle("Title2", parent=sheet["Title"], fontSize=24, textColor=PURPLE, spaceAfter=6),
        ParagraphStyle("Section", parent=sheet["Heading2"], fontSize=14, textColor=PURPLE, spaceBefore=12, spaceAfter=6),
        ParagraphStyle("Body2", parent=sheet["Normal"], fontSize=10, leading=14, spaceAfter=4),
        ParagraphStyle("Small", parent=sheet["Normal"], fontSize=8, textColor=GRAY),
    )
    for style in report_styles:
        sheet.add(style)
    return sheet
class CompliancePDFGenerator:
"""Generates a full compliance status report as PDF."""
def __init__(self, db: Session) -> None:
self.db = db
def generate(self, tenant_id: str, project_id: str | None = None, language: str = "de") -> tuple[bytes, str]:
buf = io.BytesIO()
doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=20 * mm, rightMargin=20 * mm, topMargin=25 * mm, bottomMargin=20 * mm)
ss = _styles()
story: list = []
now = datetime.now(timezone.utc)
story.append(Paragraph("Compliance-Report", ss["Title2"]))
story.append(Paragraph(f"Stand: {now.strftime('%d.%m.%Y %H:%M')} UTC", ss["Small"]))
story.append(Spacer(1, 10 * mm))
# Company Profile
self._add_company_section(story, ss, tenant_id, project_id)
# TOM
self._add_count_section(story, ss, "TOM (Technisch-Organisatorische Massnahmen)",
"compliance_toms", tenant_id)
# VVT
self._add_count_section(story, ss, "VVT (Verarbeitungstaetigkeiten)",
"compliance_vvt_activities", tenant_id)
# DSFA
self._add_count_section(story, ss, "Datenschutz-Folgenabschaetzungen",
"compliance_dsfa_assessments", tenant_id)
# Risks
self._add_risk_section(story, ss, tenant_id)
# Vendors
self._add_count_section(story, ss, "Auftragsverarbeiter",
"compliance_vendor_assessments", tenant_id)
# Incidents
self._add_count_section(story, ss, "Datenschutz-Vorfaelle",
"compliance_notfallplan_incidents", tenant_id)
# Document Reviews
self._add_review_section(story, ss, tenant_id)
# Banner Consents
self._add_consent_section(story, ss, tenant_id)
# Org Roles
self._add_role_section(story, ss, tenant_id, project_id)
# Footer
story.append(Spacer(1, 15 * mm))
story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK", ss["Small"]))
doc.build(story)
filename = f"compliance-report-{now.strftime('%Y%m%d')}.pdf"
return buf.getvalue(), filename
def _add_company_section(self, story, ss, tid, pid):
story.append(Paragraph("Unternehmensprofil", ss["Section"]))
try:
where = "tenant_id = :tid"
params: dict[str, Any] = {"tid": tid}
if pid:
where += " AND project_id = :pid"
params["pid"] = pid
row = self.db.execute(text(f"SELECT * FROM compliance_company_profiles WHERE {where} LIMIT 1"), params).fetchone()
if row:
d = dict(row._mapping)
data = [
["Feld", "Wert"],
["Firma", d.get("company_name", "-")],
["Branche", d.get("industry", "-")],
["Rechtsform", d.get("legal_form", "-")],
["Mitarbeiter", str(d.get("employee_count", "-"))],
]
t = Table(data, colWidths=[60 * mm, 100 * mm])
t.setStyle(TableStyle([
("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
("TEXTCOLOR", (0, 0), (-1, 0), PURPLE),
("FONTSIZE", (0, 0), (-1, -1), 9),
("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
("VALIGN", (0, 0), (-1, -1), "TOP"),
]))
story.append(t)
else:
story.append(Paragraph("Kein Unternehmensprofil hinterlegt.", ss["Body2"]))
except Exception as e:
story.append(Paragraph(f"Fehler beim Laden: {e}", ss["Small"]))
story.append(Spacer(1, 5 * mm))
def _add_count_section(self, story, ss, title, table_name, tid):
story.append(Paragraph(title, ss["Section"]))
try:
count = self.db.execute(text(f"SELECT COUNT(*) FROM {table_name} WHERE tenant_id = :tid"), {"tid": tid}).scalar()
story.append(Paragraph(f"Eintraege: <b>{count or 0}</b>", ss["Body2"]))
except Exception:
story.append(Paragraph("Tabelle nicht vorhanden oder leer.", ss["Small"]))
story.append(Spacer(1, 3 * mm))
def _add_risk_section(self, story, ss, tid):
story.append(Paragraph("Risikobewertung", ss["Section"]))
try:
q = text("""
SELECT severity, COUNT(*) as cnt FROM compliance_risks
WHERE tenant_id = :tid GROUP BY severity ORDER BY severity
""")
rows = self.db.execute(q, {"tid": tid}).fetchall()
if rows:
data = [["Schweregrad", "Anzahl"]]
for r in rows:
data.append([r.severity or "UNKNOWN", str(r.cnt)])
t = Table(data, colWidths=[80 * mm, 40 * mm])
t.setStyle(TableStyle([
("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
("TEXTCOLOR", (0, 0), (-1, 0), PURPLE),
("FONTSIZE", (0, 0), (-1, -1), 9),
("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
]))
story.append(t)
else:
story.append(Paragraph("Keine Risiken erfasst.", ss["Body2"]))
except Exception:
story.append(Paragraph("Risiko-Tabelle nicht vorhanden.", ss["Small"]))
story.append(Spacer(1, 3 * mm))
def _add_review_section(self, story, ss, tid):
story.append(Paragraph("Dokumenten-Reviews", ss["Section"]))
try:
q = text("SELECT status, COUNT(*) as cnt FROM compliance_document_reviews WHERE tenant_id = :tid GROUP BY status")
rows = self.db.execute(q, {"tid": tid}).fetchall()
if rows:
data = [["Status", "Anzahl"]]
for r in rows:
data.append([r.status, str(r.cnt)])
t = Table(data, colWidths=[80 * mm, 40 * mm])
t.setStyle(TableStyle([
("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
("FONTSIZE", (0, 0), (-1, -1), 9),
("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
]))
story.append(t)
else:
story.append(Paragraph("Keine Reviews vorhanden.", ss["Body2"]))
except Exception:
story.append(Paragraph("Review-Tabelle nicht vorhanden.", ss["Small"]))
story.append(Spacer(1, 3 * mm))
def _add_consent_section(self, story, ss, tid):
story.append(Paragraph("Banner-Consents", ss["Section"]))
try:
count = self.db.execute(text("SELECT COUNT(*) FROM compliance_banner_consents WHERE tenant_id = :tid"), {"tid": tid}).scalar()
story.append(Paragraph(f"Gesamte Consents: <b>{count or 0}</b>", ss["Body2"]))
except Exception:
story.append(Paragraph("Banner-Tabelle nicht vorhanden.", ss["Small"]))
story.append(Spacer(1, 3 * mm))
def _add_role_section(self, story, ss, tid, pid):
    """Append the "Rollenkonzept" section listing the assigned organisational roles.

    Reads ``compliance_org_roles`` for the tenant; when ``pid`` is truthy the
    result is narrowed to roles of that project plus roles without a project.
    Best-effort: any exception is caught and replaced by a placeholder paragraph.

    Args:
        story: List of flowables the section is appended to.
        ss: Style sheet; the "Section", "Body2" and "Small" styles are used.
        tid: Tenant id bound to the ``:tid`` query parameter.
        pid: Optional project id bound to ``:pid``.
    """
    story.append(Paragraph("Rollenkonzept", ss["Section"]))
    try:
        where = "tenant_id = :tid"
        params: dict[str, Any] = {"tid": tid}
        if pid:
            where += " AND (project_id = :pid OR project_id IS NULL)"
            params["pid"] = pid
        # `where` consists of fixed fragments only; all values are bound parameters.
        rows = self.db.execute(text(f"SELECT role_key, role_label, person_name, person_email FROM compliance_org_roles WHERE {where} ORDER BY role_key"), params).fetchall()
        if rows:
            data = [["Rolle", "Name", "E-Mail"]]
            for r in rows:
                data.append([r.role_label or r.role_key, r.person_name or "-", r.person_email or "-"])
            t = Table(data, colWidths=[60 * mm, 50 * mm, 50 * mm])
            t.setStyle(TableStyle([
                ("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
                ("TEXTCOLOR", (0, 0), (-1, 0), PURPLE),
                ("FONTSIZE", (0, 0), (-1, -1), 9),
                ("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
            ]))
            story.append(t)
        else:
            story.append(Paragraph("Keine Rollen zugewiesen.", ss["Body2"]))
    except Exception:
        # A failed statement can leave the transaction in an aborted state;
        # roll back so the session stays usable for whatever runs next.
        try:
            self.db.rollback()
        except Exception:
            pass
        story.append(Paragraph("Rollen-Tabelle nicht vorhanden.", ss["Small"]))
@@ -87,9 +87,10 @@ def compare_services(
for key, svc in detected_names.items():
# Skip CMP — consent managers don't need DSE mention
if svc.get("category") == "other" and svc.get("id") == "cmp":
if svc.get("category") == "cmp" or (svc.get("category") == "other" and svc.get("id") == "cmp"):
continue
matched = False
# Method 1: Match against LLM-extracted service list
for dse_key, dse_svc in dse_names.items():
if key == dse_key or _fuzzy_match(svc["name"], dse_svc["name"]):
documented.append({"detected": svc, "dse": dse_svc, "status": "ok"})
@@ -0,0 +1,100 @@
"""
DSR Art. 11 Service — handles "data subject not identifiable" rejections.
Art. 11 Abs. 1 DSGVO: If the controller is unable to identify the data
subject, it is not obligated to obtain additional information solely to
comply with Art. 15-20 requests.
Common scenario: Website visitor requests access, but only anonymous
cookies/IP-hashes are stored — there is no way to link them to a person.
"""
import logging
from datetime import datetime, timezone
from typing import Any, Dict
from sqlalchemy.orm import Session
from compliance.domain import ValidationError
logger = logging.getLogger(__name__)
class DSRArt11Service:
    """Handles Art. 11 DSGVO rejections for non-identifiable data subjects."""

    def __init__(self, db: Session) -> None:
        """Store the SQLAlchemy session used for all reads and writes."""
        self._db = db

    def reject_not_identifiable(
        self, dsr_id: str, tenant_id: str, notes: str = "",
    ) -> Dict[str, Any]:
        """Reject a DSR because the data subject cannot be identified.

        Records a history entry, marks the request as rejected with
        Art. 11 Abs. 1 DSGVO as legal basis, commits, and then tries to notify
        the requester by e-mail (notification failures are only logged).

        Args:
            dsr_id: Id of the request to reject.
            tenant_id: Tenant the request must belong to.
            notes: Optional free text appended to the stored rejection reason.

        Returns:
            The updated request as produced by ``_dsr_to_dict``.

        Raises:
            ValidationError: If the request does not exist for this tenant or
                is already completed, rejected or cancelled.
        """
        from compliance.db.dsr_models import DSRRequestDB
        from compliance.services.dsr_workflow_service import _dsr_to_dict, _record_history

        dsr = (
            self._db.query(DSRRequestDB)
            .filter(DSRRequestDB.id == dsr_id, DSRRequestDB.tenant_id == tenant_id)
            .first()
        )
        if not dsr:
            raise ValidationError("DSR not found")
        if dsr.status in ("completed", "rejected", "cancelled"):
            raise ValidationError("DSR already closed")

        now = datetime.now(timezone.utc)
        reason = (
            "Die bei uns gespeicherten Daten (anonymisierte Cookies, IP-Hashes, "
            "Device-Fingerprints) erlauben keine Identifikation der betroffenen Person. "
            "Gemaess Art. 11 Abs. 1 DSGVO sind wir nicht verpflichtet, zusaetzliche "
            "Informationen zu erheben, um die betroffene Person zu identifizieren."
        )
        if notes:
            reason += f" Ergaenzung: {notes}"

        # History is written before the status fields are changed.
        _record_history(self._db, dsr, "rejected",
                        comment="Art. 11 DSGVO — Identifikation nicht moeglich")

        dsr.status = "rejected"
        dsr.rejection_reason = reason
        dsr.rejection_legal_basis = "Art. 11 Abs. 1 DSGVO"
        dsr.identity_verified = False
        dsr.verification_method = "art11_not_identifiable"
        dsr.verification_notes = "Daten erlauben keine Identifikation der betroffenen Person"
        dsr.completed_at = now
        dsr.updated_at = now
        self._db.commit()
        self._db.refresh(dsr)

        # Send rejection notification (after the commit, so a mail failure
        # cannot undo the rejection).
        self._send_art11_notification(dsr)
        return _dsr_to_dict(dsr)

    def _send_art11_notification(self, dsr: Any) -> None:
        """Send the Art. 11 rejection e-mail; failures are logged, never raised.

        Does nothing when the request has no requester e-mail address.
        """
        if not dsr.requester_email:
            return
        try:
            import html

            from compliance.services.email_delivery_service import EmailDeliveryService

            delivery = EmailDeliveryService(self._db)
            requester_name = dsr.requester_name or "Antragsteller/in"
            # Same "missing -> empty" rule for the subject as for the template
            # variable, so the subject never contains the text "None".
            reference_number = dsr.request_number or ""
            variables = {
                "requester_name": requester_name,
                "reference_number": reference_number,
                "rejection_reason": "Identifikation nicht moeglich — Art. 11 Abs. 1 DSGVO",
                "legal_basis": "Art. 11 Abs. 1 DSGVO",
                "sender_name": "Datenschutzbeauftragter",
            }
            # The name is supplied by the requester; escape it before placing
            # it into the inline HTML fallback body.
            fallback_html = (
                f"<p>Sehr geehrte/r {html.escape(requester_name)},</p>\n"
                "<p>wir koennen die bei uns gespeicherten Daten keiner identifizierbaren Person zuordnen.\n"
                "Gemaess Art. 11 Abs. 1 DSGVO ist eine Auskunftserteilung nicht moeglich.</p>\n"
                "<p>Mit freundlichen Gruessen<br/>Datenschutzbeauftragter</p>"
            )
            # Use published dsr_rejection template, fallback to inline
            delivery.send(
                tenant_id=str(dsr.tenant_id),
                template_type="dsr_rejection",
                recipient=dsr.requester_email,
                variables=variables,
                fallback_subject=f"Zu Ihrer Anfrage {reference_number} — Art. 11 DSGVO",
                fallback_html=fallback_html,
            )
        except Exception as e:
            logger.warning("Art. 11 notification failed: %s", e)
@@ -0,0 +1,273 @@
"""
DSR User Data Export Service — aggregates all CMP data about a user.
Supports Art. 15 (access right, PDF) and Art. 20 (data portability, JSON/CSV).
Collects from: Banner Consents, Einwilligungen, Consent Audit Trail, DSR History.
"""
import csv
import io
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.services.banner_dsr_service import BannerDSRService
logger = logging.getLogger(__name__)
PURPLE = colors.HexColor("#7c3aed")
LIGHT_PURPLE = colors.HexColor("#f5f3ff")
GRAY = colors.HexColor("#6b7280")
class DSRExportService:
    """Aggregates and exports all user data stored in the CMP.

    ``aggregate_user_data`` collects the data; ``export_json``, ``export_csv``
    and ``export_pdf`` render that same structure and each return a
    ``(content_bytes, filename)`` tuple.
    """

    # Audit-trail rows rendered into the PDF; the JSON export stays complete.
    _PDF_AUDIT_TRAIL_LIMIT = 50

    def __init__(self, db: Session) -> None:
        """Store the SQLAlchemy session used for all queries."""
        self.db = db

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _jsonable(value: Any) -> Any:
        """Convert datetime to ISO-8601 and UUID to str; pass everything else through."""
        if isinstance(value, datetime):
            return value.isoformat()
        if isinstance(value, uuid.UUID):
            return str(value)
        return value

    @classmethod
    def _row_to_dict(cls, row: Any) -> dict[str, Any]:
        """Turn a result row into a plain dict with JSON-friendly values."""
        return {key: cls._jsonable(val) for key, val in dict(row._mapping).items()}

    def _safe_rollback(self) -> None:
        """Roll the session back after a failed query so later queries can run."""
        try:
            self.db.rollback()
        except Exception:
            # Best effort only: the original failure has already been logged.
            pass

    @staticmethod
    def _export_filename(email: str, extension: str) -> str:
        """Build the download filename from the local part of the e-mail address.

        Characters other than ASCII letters, digits, ".", "_" and "-" are
        replaced by "_" so the name is safe in paths and HTTP headers.
        """
        local_part = (email or "").split("@")[0]
        cleaned = "".join(
            ch if (ch.isascii() and ch.isalnum()) or ch in "._-" else "_"
            for ch in local_part
        ).strip(".")
        return f"dsr-export-{cleaned or 'user'}.{extension}"

    @staticmethod
    def _join_categories(value: Any) -> str:
        """Render a category collection as "a, b, c"; tolerate None and non-strings."""
        if not value:
            return ""
        if isinstance(value, str):
            return value
        return ", ".join(str(item) for item in value)

    @staticmethod
    def _text(value: Any, limit: Optional[int] = None, empty: str = "-") -> str:
        """Render a PDF table cell: ``empty`` for missing values, else str (optionally cut)."""
        if value is None or value == "":
            return empty
        rendered = str(value)
        return rendered if limit is None else rendered[:limit]

    # --------------------------------------------------------------- aggregation

    def aggregate_user_data(self, tenant_id: str, email: str) -> dict[str, Any]:
        """Collect ALL data about a user from all CMP sources.

        Each source is queried independently; if one fails, the failure is
        logged, the session is rolled back and that part of the result stays
        empty (or partially filled) while the other sources are still read.

        Args:
            tenant_id: Tenant whose data is searched.
            email: E-mail address identifying the data subject.

        Returns:
            Dict with the keys ``export_date``, ``data_subject``,
            ``banner_consents``, ``consent_audit_trail``, ``einwilligungen``,
            ``dsr_requests`` and ``metadata``.
        """
        now = datetime.now(timezone.utc)
        tid = tenant_id  # Keep as string — let PostgreSQL cast

        # 1. Banner consents + audit trail
        banner_data: dict[str, Any] = {"banner_consents": [], "audit_trail": []}
        try:
            banner_svc = BannerDSRService(self.db)
            banner_data = banner_svc.export_for_dsr(tenant_id, email) or banner_data
        except Exception as e:
            logger.warning("Banner DSR export failed: %s", e)
            self._safe_rollback()

        # 2. Einwilligungen (user-based consents), each with its history
        einwilligungen: list[dict] = []
        try:
            q = text("""
                SELECT c.id, c.data_point_id, c.granted, c.granted_at, c.revoked_at,
                       c.consent_version, c.source, c.ip_address, c.user_agent, c.created_at
                FROM compliance_einwilligungen_consents c
                WHERE c.tenant_id = CAST(:tid AS VARCHAR) AND c.user_id = :email
                ORDER BY c.created_at DESC
            """)
            hist_q = text("""
                SELECT action, consent_version, ip_address, user_agent, source, created_at
                FROM compliance_einwilligungen_consent_history
                WHERE consent_id = :cid ORDER BY created_at
            """)
            rows = self.db.execute(q, {"tid": tid, "email": email}).fetchall()
            for r in rows:
                entry = self._row_to_dict(r)
                hist = self.db.execute(hist_q, {"cid": entry["id"]}).fetchall()
                entry["history"] = [self._row_to_dict(h) for h in hist]
                einwilligungen.append(entry)
        except Exception as e:
            logger.warning("Einwilligungen export failed: %s", e)
            self._safe_rollback()

        # 3. DSR requests by this user
        dsr_requests: list[dict] = []
        try:
            q = text("""
                SELECT id, request_number, request_type, status, received_at, deadline_at, completed_at
                FROM compliance_dsr_requests
                WHERE tenant_id = :tid AND requester_email = :email
                ORDER BY received_at DESC
            """)
            rows = self.db.execute(q, {"tid": tid, "email": email}).fetchall()
            for r in rows:
                dsr_requests.append(self._row_to_dict(r))
        except Exception as e:
            logger.warning("DSR requests export failed: %s", e)
            self._safe_rollback()

        return {
            "export_date": now.isoformat(),
            "data_subject": {"email": email},
            "banner_consents": banner_data.get("banner_consents", []),
            "consent_audit_trail": banner_data.get("audit_trail", []),
            "einwilligungen": einwilligungen,
            "dsr_requests": dsr_requests,
            "metadata": {
                "tenant_id": tenant_id,
                "data_categories": ["Banner-Consents", "Einwilligungen", "Audit-Trail", "DSR-Anfragen"],
                "legal_basis": "Art. 15 / Art. 20 DSGVO",
            },
        }

    # -------------------------------------------------------------------- export

    def export_json(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as pretty-printed UTF-8 JSON plus a filename."""
        data = self.aggregate_user_data(tenant_id, email)
        data["metadata"]["export_format"] = "json"
        content = json.dumps(data, indent=2, ensure_ascii=False, default=str).encode("utf-8")
        return content, self._export_filename(email, "json")

    def export_csv(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as a flat CSV (UTF-8 with BOM) plus a filename."""
        data = self.aggregate_user_data(tenant_id, email)
        buf = io.StringIO()
        writer = csv.writer(buf)
        writer.writerow(["Kategorie", "Schluessel", "Wert", "Zeitpunkt", "Quelle"])

        # Banner consents
        for c in data.get("banner_consents", []):
            writer.writerow(["Banner-Consent", "site_id", c.get("site_id", ""), c.get("created_at", ""), "CMP"])
            writer.writerow(["Banner-Consent", "categories", self._join_categories(c.get("categories")), c.get("updated_at", ""), "CMP"])
            writer.writerow(["Banner-Consent", "ip_hash", c.get("ip_hash", ""), c.get("created_at", ""), "CMP"])

        # Audit trail
        for a in data.get("consent_audit_trail", []):
            writer.writerow(["Audit-Trail", a.get("action", ""), self._join_categories(a.get("categories")), a.get("created_at", ""), "CMP"])

        # Einwilligungen
        for e in data.get("einwilligungen", []):
            status = "Erteilt" if e.get("granted") else "Widerrufen"
            writer.writerow(["Einwilligung", e.get("data_point_id", ""), status, e.get("granted_at", ""), e.get("source", "")])

        # DSR requests
        for d in data.get("dsr_requests", []):
            writer.writerow(["DSR-Anfrage", d.get("request_type", ""), d.get("status", ""), d.get("received_at", ""), ""])

        content = buf.getvalue().encode("utf-8-sig")  # BOM for Excel
        return content, self._export_filename(email, "csv")

    def export_pdf(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as an A4 PDF report plus a filename.

        The audit trail is cut to ``_PDF_AUDIT_TRAIL_LIMIT`` rows in the PDF;
        a note points to the JSON export for the remainder.
        """
        # Paragraph text is parsed as markup, so "&" and "<" must be escaped.
        from xml.sax.saxutils import escape

        data = self.aggregate_user_data(tenant_id, email)
        buf = io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=20 * mm, rightMargin=20 * mm, topMargin=25 * mm, bottomMargin=20 * mm)
        ss = getSampleStyleSheet()
        ss.add(ParagraphStyle("Title2", parent=ss["Title"], fontSize=20, textColor=PURPLE, spaceAfter=6))
        ss.add(ParagraphStyle("Section", parent=ss["Heading2"], fontSize=13, textColor=PURPLE, spaceBefore=10))
        ss.add(ParagraphStyle("Body2", parent=ss["Normal"], fontSize=9, leading=13))
        ss.add(ParagraphStyle("Small", parent=ss["Normal"], fontSize=8, textColor=GRAY))

        story: list = []

        # Cover
        story.append(Paragraph("Datenauskunft gemaess Art. 15 DSGVO", ss["Title2"]))
        story.append(Paragraph(f"Betroffene Person: {escape(email)}", ss["Body2"]))
        story.append(Paragraph(f"Erstellt am: {data['export_date'][:10]}", ss["Small"]))
        story.append(Spacer(1, 8 * mm))

        tbl_style = TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
            ("TEXTCOLOR", (0, 0), (-1, 0), PURPLE),
            ("FONTSIZE", (0, 0), (-1, -1), 8),
            ("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
            ("VALIGN", (0, 0), (-1, -1), "TOP"),
            ("TOPPADDING", (0, 0), (-1, -1), 3),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 3),
        ])

        def add_table(rows: list, col_widths: list) -> None:
            table = Table(rows, colWidths=col_widths)
            table.setStyle(tbl_style)
            story.append(table)

        # Section 1: Banner Consents
        consents = data.get("banner_consents", [])
        story.append(Paragraph(f"1. Banner-Consents ({len(consents)})", ss["Section"]))
        if consents:
            rows = [["Site", "Kategorien", "IP-Hash", "Erstellt", "Aktualisiert"]]
            for c in consents:
                ip_hash = c.get("ip_hash")
                rows.append([
                    self._text(c.get("site_id"), empty=""),
                    self._join_categories(c.get("categories")),
                    str(ip_hash)[:12] + "..." if ip_hash else "-",
                    self._text(c.get("created_at"), 10),
                    self._text(c.get("updated_at"), 10),
                ])
            add_table(rows, [30 * mm, 40 * mm, 30 * mm, 25 * mm, 25 * mm])
        else:
            story.append(Paragraph("Keine Banner-Consents gespeichert.", ss["Body2"]))

        # Section 2: Einwilligungen
        einw = data.get("einwilligungen", [])
        story.append(Paragraph(f"2. Einwilligungen ({len(einw)})", ss["Section"]))
        if einw:
            rows = [["Datenpunkt", "Status", "Erteilt am", "Widerrufen am", "IP-Adresse"]]
            for e in einw:
                rows.append([
                    self._text(e.get("data_point_id"), empty=""),
                    "Erteilt" if e.get("granted") else "Widerrufen",
                    self._text(e.get("granted_at"), 10),
                    self._text(e.get("revoked_at"), 10),
                    self._text(e.get("ip_address"), 15),
                ])
            add_table(rows, [35 * mm, 25 * mm, 25 * mm, 25 * mm, 35 * mm])
        else:
            story.append(Paragraph("Keine Einwilligungen gespeichert.", ss["Body2"]))

        # Section 3: Audit Trail
        trail = data.get("consent_audit_trail", [])
        limit = self._PDF_AUDIT_TRAIL_LIMIT
        story.append(Paragraph(f"3. Consent-Audit-Trail ({len(trail)})", ss["Section"]))
        if trail:
            rows = [["Aktion", "Kategorien", "Datum"]]
            for a in trail[:limit]:  # Limit rows for PDF
                rows.append([
                    self._text(a.get("action"), empty=""),
                    self._join_categories(a.get("categories")),
                    self._text(a.get("created_at"), 19),
                ])
            add_table(rows, [40 * mm, 60 * mm, 45 * mm])
            if len(trail) > limit:
                story.append(Paragraph(f"... und {len(trail) - limit} weitere Eintraege (im JSON-Export enthalten)", ss["Small"]))
        else:
            story.append(Paragraph("Kein Audit-Trail vorhanden.", ss["Body2"]))

        # Section 4: DSR Requests
        dsrs = data.get("dsr_requests", [])
        story.append(Paragraph(f"4. Bisherige DSR-Anfragen ({len(dsrs)})", ss["Section"]))
        if dsrs:
            rows = [["Typ", "Status", "Eingegangen", "Abgeschlossen"]]
            for d in dsrs:
                rows.append([
                    self._text(d.get("request_type"), empty=""),
                    self._text(d.get("status"), empty=""),
                    self._text(d.get("received_at"), 10),
                    self._text(d.get("completed_at"), 10),
                ])
            add_table(rows, [35 * mm, 30 * mm, 35 * mm, 35 * mm])
        else:
            story.append(Paragraph("Keine bisherigen DSR-Anfragen vorhanden.", ss["Body2"]))

        # Footer
        story.append(Spacer(1, 15 * mm))
        story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK | Art. 15 DSGVO Datenauskunft", ss["Small"]))

        doc.build(story)
        return buf.getvalue(), self._export_filename(email, "pdf")
@@ -0,0 +1,122 @@
"""
Email Template Delivery Service — the missing integration layer.
Combines: template loading → published version → variable rendering → SMTP → audit log.
Used by DSR workflow, document reviews, and other modules that need to send
templated emails.
"""
import logging
import uuid
from typing import Any, Optional
from sqlalchemy.orm import Session
from compliance.db.email_template_models import (
EmailSendLogDB,
EmailTemplateDB,
EmailTemplateVersionDB,
)
logger = logging.getLogger(__name__)
def _render(html: str, variables: dict[str, str]) -> str:
"""Replace {{variable}} placeholders with values."""
result = html
for key, value in variables.items():
result = result.replace(f"{{{{{key}}}}}", str(value))
return result
class EmailDeliveryService:
    """Load template → render → send via SMTP → log."""

    def __init__(self, db: Session) -> None:
        """Store the SQLAlchemy session used for template lookup and audit logging."""
        self.db = db

    def get_published_version(
        self, tenant_id: str, template_type: str,
    ) -> Optional[EmailTemplateVersionDB]:
        """Get the latest published version of a template by type.

        Args:
            tenant_id: Tenant UUID string.
            template_type: Template type to look up.

        Returns:
            The most recently created version with status "published", or
            ``None`` when the tenant has no such template or no published version.

        Raises:
            ValueError: If ``tenant_id`` is not a valid UUID string.
        """
        tid = uuid.UUID(tenant_id)
        template = (
            self.db.query(EmailTemplateDB)
            .filter(EmailTemplateDB.tenant_id == tid, EmailTemplateDB.template_type == template_type)
            .first()
        )
        if not template:
            return None
        return (
            self.db.query(EmailTemplateVersionDB)
            .filter(
                EmailTemplateVersionDB.template_id == template.id,
                EmailTemplateVersionDB.status == "published",
            )
            .order_by(EmailTemplateVersionDB.created_at.desc())
            .first()
        )

    def send(
        self,
        tenant_id: str,
        template_type: str,
        recipient: str,
        variables: dict[str, str],
        fallback_subject: Optional[str] = None,
        fallback_html: Optional[str] = None,
    ) -> dict[str, Any]:
        """Send a templated email. Falls back to inline HTML if no published template.

        Args:
            tenant_id: Tenant UUID string.
            template_type: E.g. 'dsr_receipt', 'dsr_completion'.
            recipient: Email address.
            variables: Dict of {{key}}: value for rendering.
            fallback_subject: Subject if no template found.
            fallback_html: HTML body if no template found.

        Returns:
            ``{"success": False, "error": ...}`` when neither a published
            template nor a complete fallback exists; otherwise a dict with
            ``success``, ``template_type``, ``recipient``, ``subject``,
            ``used_template`` and ``status``.

        Raises:
            ValueError: If ``tenant_id`` is not a valid UUID string.
        """
        from compliance.services.smtp_sender import send_email

        tid = uuid.UUID(tenant_id)
        version = self.get_published_version(tenant_id, template_type)

        if version:
            subject = _render(version.subject, variables)
            body_html = _render(version.body_html, variables)
            version_id = version.id
        elif fallback_subject and fallback_html:
            subject = _render(fallback_subject, variables)
            body_html = _render(fallback_html, variables)
            version_id = None
        else:
            logger.warning("No published template for '%s' and no fallback provided", template_type)
            return {"success": False, "error": f"No template for {template_type}"}

        result = send_email(recipient=recipient, subject=subject, body_html=body_html)

        # Audit log — best effort, must never turn a sent mail into a failure.
        try:
            log = EmailSendLogDB(
                tenant_id=tid,
                template_type=template_type,
                version_id=version_id,
                recipient=recipient,
                subject=subject,
                status=result.get("status", "unknown"),
                variables=variables,
                error_message=result.get("error"),
            )
            self.db.add(log)
            self.db.commit()
        except Exception as e:
            logger.warning("Failed to log email send: %s", e)
            # After a failed flush/commit the session must be rolled back
            # before the caller can use it again.
            try:
                self.db.rollback()
            except Exception:
                pass

        return {
            "success": result.get("status") == "sent",
            "template_type": template_type,
            "recipient": recipient,
            "subject": subject,
            "used_template": version is not None,
            "status": result.get("status"),
        }
@@ -0,0 +1,179 @@
"""
Intake Extractor — derives UCCA intake flags from DETECTED SERVICES,
not from website text content.
The actual data processing happens through APIs, scripts, and cookies —
NOT through visible text on the page. A news website reporting about
healthcare does NOT process health data.
Flags are derived deterministically from:
1. Which third-party services are embedded (Google Analytics → tracking)
2. Which payment providers are used (Stripe → payment_data)
3. Which CDN/fonts are loaded (Google Fonts → cross_border_transfer)
"""
import logging
logger = logging.getLogger(__name__)
# Service category → intake flags mapping
# This is the ONLY source of truth for what a service category implies.
# Looked up by the "category" value of each detected service; only entries
# whose value is True are applied by extract_intake_flags_from_services().
SERVICE_TO_FLAGS: dict[str, dict[str, bool]] = {
    # Tracking & Analytics → personal_data + tracking
    "tracking": {
        "personal_data": True,
        "tracking": True,
    },
    # Marketing → marketing + tracking + third_party_sharing
    "marketing": {
        "personal_data": True,
        "tracking": True,
        "marketing": True,
        "third_party_sharing": True,
    },
    # Heatmap/Session Recording → tracking + profiling
    "heatmap": {
        "personal_data": True,
        "tracking": True,
        "profiling": True,
    },
    # Payment → payment_data
    "payment": {
        "personal_data": True,
        "payment_data": True,
    },
    # Chatbot → personal_data (user sends messages)
    "chatbot": {
        "personal_data": True,
        "customer_data": True,
    },
    # CRM → customer_data + profiling
    "crm": {
        "personal_data": True,
        "customer_data": True,
        "profiling": True,
    },
    # CDN → personal_data only. cross_border_transfer is NOT set per category;
    # extract_intake_flags_from_services() sets it per service when the
    # service's "eu_adequate" value is falsy.
    "cdn": {
        "personal_data": True,
    },
}
# Specific services with special flags.
# Looked up by the "id" value of each detected service and applied in
# addition to the category-level flags above.
SPECIFIC_SERVICE_FLAGS: dict[str, dict[str, bool]] = {
    "klarna": {"automated_decisions": True, "payment_data": True},
    "paypal": {"cross_border_transfer": True, "payment_data": True},
    "stripe": {"cross_border_transfer": True, "payment_data": True},
    "google_analytics": {"cross_border_transfer": True, "tracking": True},
    "facebook_pixel": {"cross_border_transfer": True, "marketing": True, "profiling": True},
    "hotjar": {"profiling": True, "tracking": True},
    "ms_clarity": {"cross_border_transfer": True, "profiling": True},
    "tiktok_pixel": {"cross_border_transfer": True, "marketing": True},
    "intercom": {"cross_border_transfer": True, "ai_usage": True},
}
def extract_intake_flags_from_services(detected_services: list[dict]) -> dict:
    """Derive intake flags from the detected third-party services.

    Flags are switched on (never off) from three sources per service: the
    category table, the service-specific table, and the service's
    ``eu_adequate`` marker. No text analysis or guessing is involved.

    Args:
        detected_services: Service dicts; the keys "category", "id" and
            "eu_adequate" are read, each with a default when missing.

    Returns:
        Dict of flag name to bool, containing at least the fourteen standard
        flags listed below.
    """
    flag_names = (
        "personal_data",
        "customer_data",
        "payment_data",
        "location_data",
        "biometric_data",
        "minor_data",
        "health_data",
        "marketing",
        "profiling",
        "automated_decisions",
        "third_party_sharing",
        "cross_border_transfer",
        "tracking",
        "ai_usage",
    )
    flags = dict.fromkeys(flag_names, False)

    for service in detected_services:
        category_flags = SERVICE_TO_FLAGS.get(service.get("category", "other"), {})
        specific_flags = SPECIFIC_SERVICE_FLAGS.get(service.get("id", ""), {})
        adequate = service.get("eu_adequate", True)

        # Category-level flags first, then the service-specific ones.
        for source in (category_flags, specific_flags):
            for name, enabled in source.items():
                if enabled:
                    flags[name] = True

        # A service without EU adequacy implies a transfer to a third party abroad.
        if not adequate:
            flags["cross_border_transfer"] = True
            flags["third_party_sharing"] = True

    # Any detected service means personal data is processed (the IP at minimum).
    if detected_services:
        flags["personal_data"] = True

    active = {name: state for name, state in flags.items() if state}
    logger.info("Intake flags from %d services: %s", len(detected_services), active)
    return flags
# Retained only so existing callers of the old text-based API keep working.
async def extract_intake_flags(text: str) -> dict:
    """DEPRECATED — use extract_intake_flags_from_services() instead.

    The ``text`` argument is ignored. A deprecation warning is logged and a
    fixed minimal flag set is returned.
    """
    deprecation_notice = (
        "extract_intake_flags(text) called — DEPRECATED. "
        "Use extract_intake_flags_from_services(detected_services) instead."
    )
    logger.warning(deprecation_notice)
    # Minimal assumption: a website exists, so personal data (the IP) is processed.
    minimal_flags = {"personal_data": True, "tracking": False}
    return minimal_flags
def flags_to_ucca_intake(flags: dict) -> dict:
    """Translate the flat intake flags into the nested UCCA intake structure.

    Missing flags count as False. The result has the sections "data_types",
    "purpose", "automation", "outputs" and "hosting".
    """
    def flag(name: str):
        return flags.get(name, False)

    # Automated decisions take precedence over mere AI usage.
    if flags.get("automated_decisions"):
        automation_level = "fully_automated"
    elif flags.get("ai_usage"):
        automation_level = "partially_automated"
    else:
        automation_level = "manual"

    if flags.get("cross_border_transfer"):
        hosting_region = "non_eu"
    else:
        hosting_region = "eu"

    data_types = {
        "personal_data": flag("personal_data"),
        "customer_data": flag("customer_data"),
        "location_data": flag("location_data"),
        "biometric_data": flag("biometric_data"),
        "minor_data": flag("minor_data"),
        "images": False,
        "audio": False,
        "financial_data": flag("payment_data"),
        "employee_data": False,
        # Health and biometric data both count as Art. 9 special categories here.
        "article_9_data": flag("health_data") or flag("biometric_data"),
    }
    purpose = {
        "marketing": flag("marketing"),
        "analytics": flag("tracking"),
        "profiling": flag("profiling"),
        "automation": flag("ai_usage"),
        "customer_support": False,
        "evaluation_scoring": flag("automated_decisions"),
        "decision_making": flag("automated_decisions"),
    }
    outputs = {
        "recommendations_to_users": flag("profiling"),
        "data_export": flag("cross_border_transfer"),
        "legal_effects": flag("automated_decisions"),
    }

    return {
        "data_types": data_types,
        "purpose": purpose,
        "automation": automation_level,
        "outputs": outputs,
        "hosting": {"region": hosting_region},
    }
@@ -0,0 +1,152 @@
"""
Control Relevance Filter — filters out controls that are not relevant
for the analyzed document based on keyword matching.
Prevents false positives like C_TRANSPARENCY being recommended when
no AI usage is evident.
"""
import logging
import re
logger = logging.getLogger(__name__)
# Top controls with their relevance conditions.
# A control is only relevant if ANY keyword from 'requires_any' matches the text.
# If 'requires_any' is empty, the control is always relevant.
# Keywords are lower-case because they are compared against lower-cased text;
# umlaut keywords are listed in both spellings (ü / ue).
CONTROL_RELEVANCE: dict[str, dict] = {
    "C_TRANSPARENCY": {
        # Transparency obligations are Art. 50 in the adopted AI Act
        # (Regulation (EU) 2024/1689); "Art. 52" was the draft numbering.
        "description": "KI-Transparenz-Hinweis (Art. 50 AI Act)",
        "requires_any": [
            "künstliche intelligenz", "kuenstliche intelligenz",
            "artificial intelligence", "machine learning", "maschinelles lernen",
            "ki-gestützt", "ki-gestuetzt", "ai-powered", "ai system",
            "chatbot", "neural", "deep learning", "algorithmus", "algorithmen",
            "automatisierte entscheidung", "automated decision",
        ],
        "reason": "Nur relevant wenn KI/ML tatsaechlich eingesetzt wird",
    },
    "C_DSFA_REQUIRED": {
        "description": "Datenschutz-Folgenabschaetzung durchfuehren",
        "requires_any": [
            "gesundheit", "biometrisch", "genetisch", "health", "biometric",
            "scoring", "profiling", "systematisch", "umfangreich",
            "videoüberwachung", "videoueberwachung", "kamera",
            "minderjährig", "minderjaehrig", "kinder",
        ],
        "reason": "Nur bei hohem Risiko (Art. 9 Daten, Profiling, Ueberwachung)",
    },
    "C_ART22_INFO": {
        "description": "Info ueber automatisierte Einzelentscheidung (Art. 22 DSGVO)",
        "requires_any": [
            "automatisierte entscheidung", "automated decision", "scoring",
            "bonitaet", "kredit", "rating", "algorithmische entscheidung",
            "profiling", "klarna", "ratenzahlung",
        ],
        "reason": "Nur bei automatisierten Einzelentscheidungen mit Rechtswirkung",
    },
    "C_DPO_REQUIRED": {
        "description": "Datenschutzbeauftragten bestellen",
        "requires_any": [],  # Always relevant — empty means no filter
        "reason": "Generell relevant fuer Unternehmen",
    },
    "C_EXPLICIT_CONSENT": {
        "description": "Explizite Einwilligung einholen",
        "requires_any": [
            "cookie", "tracking", "analytics", "pixel", "marketing",
            "werbung", "newsletter", "remarketing", "retargeting",
            "einwilligung", "consent", "opt-in",
        ],
        "reason": "Nur bei Tracking/Marketing das Einwilligung erfordert",
    },
    "C_CHILD_PROTECTION": {
        "description": "Besonderer Schutz fuer Minderjaehrige",
        "requires_any": [
            "kinder", "minderjährig", "minderjaehrig", "jugend",
            "under 16", "unter 16", "schüler", "schueler", "child",
        ],
        "reason": "Nur wenn Daten von Minderjaehrigen verarbeitet werden",
    },
    "C_THIRD_COUNTRY_SAFEGUARDS": {
        "description": "Drittlandtransfer absichern (Art. 44-49 DSGVO)",
        # NOTE(review): short keywords such as "usa" or "meta" also occur inside
        # ordinary words ("zusammen", "metadaten") when matched as substrings —
        # confirm whether that over-matching is acceptable.
        "requires_any": [
            "usa", "united states", "drittland", "drittst", "third countr",
            "standardvertragsklausel", "sccs", "binding corporate",
            "angemessenheitsbeschluss", "adequacy",
            "google", "meta", "facebook", "amazon", "microsoft", "apple",
            "cloudflare", "stripe", "paypal",
        ],
        "reason": "Nur bei Datentransfer in Drittlaender",
    },
}
def filter_controls(
    controls: list[str],
    source_text: str,
    intake_flags: dict | None = None,
) -> list[str]:
    """Drop controls whose relevance rule is not met by the analysed text.

    A control is kept when it has no parsable id, has no rule in
    CONTROL_RELEVANCE, has an empty keyword list, contains at least one
    keyword as a substring of the lower-cased text, or is made relevant by
    the intake flags. Everything else is removed and logged.

    NOTE(review): matching is plain substring search, so short keywords can
    match inside unrelated words (e.g. "usa" in "zusammen") — confirm whether
    this is intended.

    Args:
        controls: Control strings such as "[C_TRANSPARENCY] Nutzer informieren...".
        source_text: Text of the analysed document.
        intake_flags: Optional flag dict used as a fallback signal.

    Returns:
        The kept controls in their original order (the input itself when empty).
    """
    if not controls:
        return controls

    haystack = source_text.lower()
    kept: list[str] = []
    dropped: list[tuple] = []

    for entry in controls:
        cid = _extract_control_id(entry)
        rule = CONTROL_RELEVANCE[cid] if cid and cid in CONTROL_RELEVANCE else None

        # Unknown control — keep it (don't filter what we don't understand)
        if rule is None:
            kept.append(entry)
            continue

        required_terms = rule["requires_any"]
        is_relevant = (
            not required_terms                                      # no filter defined
            or any(term in haystack for term in required_terms)     # keyword hit
            or bool(intake_flags and _check_flags(cid, intake_flags))  # flag fallback
        )
        if is_relevant:
            kept.append(entry)
        else:
            dropped.append((cid, rule["reason"]))

    if dropped:
        summary = ", ".join(f"{cid} ({reason})" for cid, reason in dropped)
        logger.info("Relevance filter removed %d controls: %s", len(dropped), summary)

    return kept
def _extract_control_id(control: str) -> str | None:
"""Extract control ID from '[C_XXX] description' format."""
match = re.match(r"\[([A-Z_0-9]+)\]", control)
return match.group(1) if match else None
def _check_flags(control_id: str, flags: dict) -> bool:
"""Check if intake flags make a control relevant."""
flag_map = {
"C_TRANSPARENCY": flags.get("ai_usage", False),
"C_DSFA_REQUIRED": flags.get("health_data", False) or flags.get("biometric_data", False),
"C_ART22_INFO": flags.get("automated_decisions", False),
"C_EXPLICIT_CONSENT": flags.get("tracking", False) or flags.get("marketing", False),
"C_CHILD_PROTECTION": flags.get("minor_data", False),
"C_THIRD_COUNTRY_SAFEGUARDS": flags.get("cross_border_transfer", False),
}
return flag_map.get(control_id, False)
@@ -0,0 +1,209 @@
"""
TCF 2.2 TC String Encoder — generates IAB Transparency & Consent strings.
Implements the TC String v2.2 format per IAB specification.
The TC String is a base64url-encoded bitfield containing:
- CMP metadata (ID, version, screen, consent language)
- Purpose consents (12 standard IAB purposes)
- Vendor consents (per IAB vendor ID)
- Legitimate interest signals
Reference: https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework
NOTE: This is a simplified encoder for CMP integration. For full GVL
(Global Vendor List) support, integrate with the IAB GVL API.
"""
import base64
import math
from datetime import datetime, timezone
from typing import Any
# IAB TCF 2.2 Standard Purposes.
# TCF 2.2 defines purposes 1-11. The former entry 12 ("Use limited data to
# select ads") is not an IAB purpose; that meaning is covered by purpose 2.
# Names follow the TCF 2.2 wording, German names use ASCII transliteration.
IAB_PURPOSES = {
    1: {"name": "Store and/or access information on a device", "name_de": "Speichern von oder Zugriff auf Informationen auf einem Endgeraet"},
    2: {"name": "Use limited data to select advertising", "name_de": "Verwendung reduzierter Daten zur Auswahl von Werbeanzeigen"},
    3: {"name": "Create profiles for personalised advertising", "name_de": "Erstellung von Profilen fuer personalisierte Werbung"},
    4: {"name": "Use profiles to select personalised advertising", "name_de": "Verwendung von Profilen zur Auswahl personalisierter Werbung"},
    5: {"name": "Create profiles to personalise content", "name_de": "Erstellung von Profilen zur Personalisierung von Inhalten"},
    6: {"name": "Use profiles to select personalised content", "name_de": "Verwendung von Profilen zur Auswahl personalisierter Inhalte"},
    7: {"name": "Measure advertising performance", "name_de": "Messung der Werbeleistung"},
    8: {"name": "Measure content performance", "name_de": "Messung der Performance von Inhalten"},
    9: {"name": "Understand audiences through statistics or combinations of data from different sources", "name_de": "Analyse von Zielgruppen durch Statistiken oder Kombinationen von Daten aus verschiedenen Quellen"},
    10: {"name": "Develop and improve services", "name_de": "Entwicklung und Verbesserung der Angebote"},
    11: {"name": "Use limited data to select content", "name_de": "Verwendung reduzierter Daten zur Auswahl von Inhalten"},
}
# IAB Special Features
IAB_SPECIAL_FEATURES = {
    1: {"name": "Use precise geolocation data", "name_de": "Praezise Standortdaten verwenden"},
    2: {"name": "Actively scan device characteristics for identification", "name_de": "Geraetemerkmale aktiv scannen"},
}
# Category-to-Purpose mapping (how our banner categories map to IAB purposes).
# Every id listed here must exist in IAB_PURPOSES.
CATEGORY_PURPOSE_MAP = {
    "necessary": [],  # No consent needed
    "functional": [1, 11],  # Device access + limited data for content
    "statistics": [1, 7, 8, 9, 10],  # Device access + measurement + research
    "marketing": [1, 2, 3, 4, 5, 6, 7],  # Advertising, profiling and ad measurement
}
def _int_to_bits(value: int, length: int) -> str:
"""Convert integer to fixed-length bit string."""
return bin(value)[2:].zfill(length)
def _datetime_to_deciseconds(dt: datetime) -> int:
"""Convert datetime to deciseconds since epoch (IAB format)."""
epoch = datetime(2000, 1, 1, tzinfo=timezone.utc)
return int((dt - epoch).total_seconds() * 10)
def _bits_to_base64url(bits: str) -> str:
"""Convert bit string to base64url encoding (TC String format)."""
# Pad to multiple of 8
padding = (8 - len(bits) % 8) % 8
bits += "0" * padding
# Convert to bytes
byte_array = bytearray()
for i in range(0, len(bits), 8):
byte_array.append(int(bits[i:i+8], 2))
# Base64url encode (no padding)
return base64.urlsafe_b64encode(bytes(byte_array)).rstrip(b"=").decode("ascii")
class TCFEncoderService:
    """Generates TC Strings per IAB TCF 2.2 specification.

    Only the core segment is produced. Both vendor sections use bitfield
    encoding and the publisher-restrictions section is written as empty.
    """

    # PublisherCC written into every generated string.
    _PUBLISHER_CC = "DE"
    # MaxVendorId is a 16-bit field.
    _MAX_VENDOR_ID = 0xFFFF

    def __init__(
        self,
        cmp_id: int = 1,
        cmp_version: int = 1,
        consent_screen: int = 1,
        consent_language: str = "DE",
    ):
        """Store the CMP metadata that is embedded in every TC String.

        Args:
            cmp_id: Value of the 12-bit CmpId field.
            cmp_version: Value of the 12-bit CmpVersion field.
            consent_screen: Value of the 6-bit ConsentScreen field.
            consent_language: Language code; its first two letters are encoded.
        """
        self.cmp_id = cmp_id
        self.cmp_version = cmp_version
        self.consent_screen = consent_screen
        self.consent_language = consent_language

    @staticmethod
    def _letters_to_bits(code: str, field: str) -> str:
        """Encode the first two letters of *code* as 2 x 6 bits (A=0 .. Z=25)."""
        letters = (code or "").upper()[:2]
        if len(letters) != 2 or not all("A" <= ch <= "Z" for ch in letters):
            raise ValueError(f"{field} must start with two letters A-Z, got {code!r}")
        return "".join(_int_to_bits(ord(ch) - ord("A"), 6) for ch in letters)

    @staticmethod
    def _flags_to_bits(flags: dict[int, bool], count: int) -> str:
        """Render ids 1..count as one bit each ('1' when the flag is truthy)."""
        return "".join("1" if flags.get(i, False) else "0" for i in range(1, count + 1))

    @classmethod
    def _vendor_section(cls, vendors: dict[int, bool]) -> str:
        """Build MaxVendorId (16 bits) + IsRangeEncoding=0 + one bit per vendor."""
        # Non-positive ids cannot be represented; treat them as "no vendors".
        max_vendor = max(max(vendors, default=0), 0)
        if max_vendor > cls._MAX_VENDOR_ID:
            raise ValueError(
                f"vendor id {max_vendor} exceeds the 16-bit MaxVendorId field"
            )
        return _int_to_bits(max_vendor, 16) + "0" + cls._flags_to_bits(vendors, max_vendor)

    def encode(
        self,
        purpose_consents: dict[int, bool],
        vendor_consents: dict[int, bool],
        purpose_li: dict[int, bool] | None = None,
        special_features: dict[int, bool] | None = None,
        vendor_li: dict[int, bool] | None = None,
    ) -> str:
        """Generate a TC String from consent decisions.

        Args:
            purpose_consents: {purpose_id: True/False}; ids 1-24 are encoded.
            vendor_consents: {vendor_id: True/False} for IAB vendor IDs.
            purpose_li: Legitimate interest signals per purpose (ids 1-24).
            special_features: Special feature opt-ins (ids 1-12).
            vendor_li: Legitimate interest signals per vendor. When omitted
                the section mirrors ``vendor_consents``, which is the
                behaviour this method always had.

        Returns:
            Base64url-encoded TC String.

        Raises:
            ValueError: If the consent language does not start with two
                letters A-Z or a vendor id does not fit into 16 bits.
        """
        purpose_consents = purpose_consents or {}
        vendor_consents = vendor_consents or {}
        if vendor_li is None:
            # NOTE(review): mirroring consents asserts legitimate interest for
            # every consented vendor; pass vendor_li explicitly to avoid that.
            vendor_li = vendor_consents

        created = _datetime_to_deciseconds(datetime.now(timezone.utc))
        updated = created  # a freshly generated string has no later update

        # Collect the fields in wire order and join once instead of growing
        # a string with repeated "+=".
        segments = [
            _int_to_bits(2, 6),                    # Version (6 bits) = 2
            _int_to_bits(created, 36),             # Created (36 bits)
            _int_to_bits(updated, 36),             # LastUpdated (36 bits)
            _int_to_bits(self.cmp_id, 12),         # CmpId (12 bits)
            _int_to_bits(self.cmp_version, 12),    # CmpVersion (12 bits)
            _int_to_bits(self.consent_screen, 6),  # ConsentScreen (6 bits)
            # ConsentLanguage (12 bits = 2 x 6-bit letters)
            self._letters_to_bits(self.consent_language, "consent_language"),
            _int_to_bits(0, 12),                   # VendorListVersion: 0, GVL is not fetched
            _int_to_bits(4, 6),                    # TcfPolicyVersion (6 bits) = 4 for TCF 2.2
            "1",                                   # IsServiceSpecific
            "0",                                   # UseNonStandardTexts
            self._flags_to_bits(special_features or {}, 12),  # SpecialFeatureOptIns
            self._flags_to_bits(purpose_consents, 24),        # PurposesConsent
            self._flags_to_bits(purpose_li or {}, 24),        # PurposesLITransparency
            "0",                                   # PurposeOneTreatment
            self._letters_to_bits(self._PUBLISHER_CC, "publisher country code"),  # PublisherCC
            self._vendor_section(vendor_consents),  # Vendor consents (bitfield)
            self._vendor_section(vendor_li),        # Vendor legitimate interests (bitfield)
            # NumPubRestrictions (12 bits): the core segment must end with the
            # publisher-restrictions section even when it is empty.
            _int_to_bits(0, 12),
        ]
        return _bits_to_base64url("".join(segments))

    def encode_from_categories(
        self,
        categories: list[str],
        vendor_consents: dict[int, bool] | None = None,
    ) -> str:
        """Generate TC String from banner category selections.

        Maps our banner categories (necessary, statistics, marketing,
        functional) to IAB purposes via CATEGORY_PURPOSE_MAP and encodes the
        result. Unknown categories contribute no purposes.
        """
        purpose_consents: dict[int, bool] = {
            purpose_id: True
            for cat in categories
            for purpose_id in CATEGORY_PURPOSE_MAP.get(cat, [])
        }
        return self.encode(
            purpose_consents=purpose_consents,
            vendor_consents=vendor_consents or {},
        )

    @staticmethod
    def get_purposes() -> list[dict[str, Any]]:
        """Return every IAB purpose in IAB_PURPOSES with its translations."""
        return [
            {"id": pid, "name": info["name"], "name_de": info["name_de"]}
            for pid, info in IAB_PURPOSES.items()
        ]

    @staticmethod
    def get_special_features() -> list[dict[str, Any]]:
        """Return every IAB special feature with its translations."""
        return [
            {"id": fid, "name": info["name"], "name_de": info["name_de"]}
            for fid, info in IAB_SPECIAL_FEATURES.items()
        ]

    @staticmethod
    def get_category_purpose_map() -> dict[str, list[int]]:
        """Return a copy of the category-to-purpose mapping.

        A copy is returned so callers cannot mutate the module-level constant
        that ``encode_from_categories`` relies on.
        """
        return {category: list(purposes) for category, purposes in CATEGORY_PURPOSE_MAP.items()}
@@ -0,0 +1,159 @@
"""
Training Link Service bridges document review approvals with the Academy.
After a document is approved, checks which roles need training on that
document type and identifies gaps (missing/overdue assignments).
Gracefully handles missing training tables (Go service not migrated yet).
"""
import logging
from typing import Any
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
class TrainingLinkService:
    """Links document approvals to training requirements.

    Every query failure is handled as "no data": the error is logged and the
    session is rolled back so that later statements on the same session do
    not run inside an aborted transaction.
    """

    def __init__(self, db: Session) -> None:
        self.db = db

    def _training_tables_exist(self) -> bool:
        """Check if the Go-managed training tables exist."""
        try:
            self.db.execute(text("SELECT 1 FROM training_modules LIMIT 0"))
            return True
        except Exception:
            self.db.rollback()
            return False

    def get_role_codes_for_document(self, tenant_id: str, document_type: str) -> list[dict]:
        """Map document type -> org roles -> training role codes.

        Rows owned by the tenant and rows of the ``__default__`` tenant are
        both considered. Returns a list of
        ``{"role_key": ..., "training_role_code": ...}`` dicts, or an empty
        list when the query fails.
        """
        try:
            # The tenant alternatives must be parenthesised: AND binds tighter
            # than OR, so without the parentheses every row of the tenant
            # matched regardless of its document type.
            q = text("""
                SELECT DISTINCT m.role_key, t.training_role_code
                FROM compliance_document_role_mapping m
                LEFT JOIN compliance_role_training_mapping t
                    ON t.org_role_key = m.role_key
                    AND (t.tenant_id = :tid OR t.tenant_id = '__default__')
                WHERE (m.tenant_id = :tid OR m.tenant_id = '__default__')
                    AND m.document_type = :dt
            """)
            rows = self.db.execute(q, {"tid": tenant_id, "dt": document_type}).fetchall()
            return [{"role_key": r.role_key, "training_role_code": r.training_role_code} for r in rows]
        except Exception as e:
            logger.warning("Failed to get role codes: %s", e)
            self.db.rollback()
            return []

    def get_training_requirements(self, tenant_id: str, document_type: str) -> dict[str, Any]:
        """Get training modules required for roles associated with a document type.

        The result always carries ``academy_available`` and ``requirements``;
        ``message``, ``total`` or ``error`` are added depending on the outcome.
        """
        if not self._training_tables_exist():
            return {
                "academy_available": False,
                "message": "Academy noch nicht eingerichtet. Training-Module werden nach Aktivierung automatisch verknuepft.",
                "requirements": [],
            }

        role_mappings = self.get_role_codes_for_document(tenant_id, document_type)
        if not role_mappings:
            return {"academy_available": True, "message": "Keine Rollen-Zuordnung fuer diesen Dokumenttyp.", "requirements": []}

        # dict.fromkeys drops duplicates while keeping first-seen order.
        role_codes = list(dict.fromkeys(
            r["training_role_code"] for r in role_mappings if r.get("training_role_code")
        ))
        if not role_codes:
            return {"academy_available": True, "message": "Keine Training-Codes konfiguriert.", "requirements": []}

        try:
            # Only generated placeholder names are interpolated into the SQL;
            # the role codes themselves travel as bound parameters.
            placeholders = ",".join(f":rc{i}" for i in range(len(role_codes)))
            params: dict[str, Any] = {"tid": tenant_id}
            for i, rc in enumerate(role_codes):
                params[f"rc{i}"] = rc
            q = text(f"""
                SELECT tm.role_code, m.module_code, m.title, m.description,
                       m.frequency_type, m.duration_minutes, tm.is_mandatory
                FROM training_matrix tm
                JOIN training_modules m ON m.id = tm.module_id
                WHERE tm.tenant_id = :tid AND tm.role_code IN ({placeholders})
                    AND m.is_active = TRUE
                ORDER BY tm.role_code, m.sort_order
            """)
            rows = self.db.execute(q, params).fetchall()
            reqs = [dict(r._mapping) for r in rows]
            return {"academy_available": True, "requirements": reqs, "total": len(reqs)}
        except Exception as e:
            logger.warning("Failed to query training requirements: %s", e)
            self.db.rollback()
            return {"academy_available": True, "requirements": [], "error": str(e)}

    def _find_role_holder(self, tenant_id: str, role_key: str, project_id: str | None) -> Any:
        """Return one compliance_org_roles row for the role, or None on miss/failure."""
        where = "tenant_id = :tid AND role_key = :rk"
        params: dict[str, Any] = {"tid": tenant_id, "rk": role_key}
        if project_id:
            # NOTE(review): LIMIT 1 without ORDER BY does not guarantee that a
            # project-specific row wins over a project-independent one.
            where += " AND (project_id = :pid OR project_id IS NULL)"
            params["pid"] = project_id
        try:
            # "where" is assembled from the fixed fragments above only.
            return self.db.execute(text(
                f"SELECT person_name, person_email, role_label FROM compliance_org_roles WHERE {where} LIMIT 1"
            ), params).fetchone()
        except Exception as e:
            logger.warning("Failed to look up person for role %s: %s", role_key, e)
            self.db.rollback()
            return None

    def _mandatory_modules(self, tenant_id: str, role_code: str) -> list:
        """Return active mandatory modules for a training role code ([] on failure)."""
        try:
            return self.db.execute(text("""
                SELECT m.id, m.module_code, m.title FROM training_matrix tm
                JOIN training_modules m ON m.id = tm.module_id
                WHERE tm.tenant_id = :tid AND tm.role_code = :rc AND m.is_active = TRUE AND tm.is_mandatory = TRUE
            """), {"tid": tenant_id, "rc": role_code}).fetchall()
        except Exception as e:
            logger.warning("Failed to load modules for role code %s: %s", role_code, e)
            self.db.rollback()
            return []

    def _latest_assignment(self, tenant_id: str, module_id: Any, email: Any) -> Any:
        """Return the newest training assignment row, or None on miss/failure."""
        try:
            return self.db.execute(text("""
                SELECT status, progress_percent FROM training_assignments
                WHERE tenant_id = :tid AND module_id = :mid AND user_email = :email
                ORDER BY created_at DESC LIMIT 1
            """), {"tid": tenant_id, "mid": module_id, "email": email}).fetchone()
        except Exception as e:
            logger.warning("Failed to load training assignment: %s", e)
            self.db.rollback()
            return None

    def check_training_gaps(
        self, tenant_id: str, document_type: str, project_id: str | None = None,
    ) -> dict[str, Any]:
        """Check which persons assigned to roles have outstanding training.

        A gap is reported for every mandatory module whose newest assignment
        for the role holder is missing or not in status ``completed`` or
        ``passed``. Roles without a training code or without a named person
        are skipped.
        """
        if not self._training_tables_exist():
            return {"academy_available": False, "gaps": [], "total_gaps": 0}

        role_mappings = self.get_role_codes_for_document(tenant_id, document_type)
        if not role_mappings:
            return {"academy_available": True, "gaps": [], "total_gaps": 0}

        gaps = []
        for rm in role_mappings:
            role_key = rm["role_key"]
            role_code = rm.get("training_role_code")
            if not role_code:
                continue

            person = self._find_role_holder(tenant_id, role_key, project_id)
            if not person or not person.person_name:
                continue

            for mod in self._mandatory_modules(tenant_id, role_code):
                assignment = self._latest_assignment(tenant_id, mod.id, person.person_email)
                if assignment and assignment.status in ("completed", "passed"):
                    continue
                gaps.append({
                    "person_name": person.person_name,
                    "person_email": person.person_email,
                    "role": person.role_label,
                    "role_key": role_key,
                    "module_code": mod.module_code,
                    "module_title": mod.title,
                    "status": assignment.status if assignment else "nicht_begonnen",
                    "progress": assignment.progress_percent if assignment else 0,
                })

        return {"academy_available": True, "gaps": gaps, "total_gaps": len(gaps)}
@@ -0,0 +1,148 @@
"""
Website Compliance Checks — checks a public website for consumer-protection
compliance (§312k BGB, §5 TMG, Art. 13 DSGVO, Cookie-Banner).
Extracted from agent_analyze_routes.py to keep route files slim.
"""
import re
import httpx
class FollowUpQuestion:
    """Plain value holder for a follow-up question raised by a website check."""

    def __init__(
        self,
        id: str,
        question: str,
        legal_basis: str,
        severity: str,
        finding_if_no: str,
    ) -> None:
        # Every argument is stored unchanged under the attribute of the same name.
        self.id: str = id
        self.question: str = question
        self.legal_basis: str = legal_basis
        self.severity: str = severity
        self.finding_if_no: str = finding_if_no
def _matches_any(patterns: list[str], haystack: str) -> bool:
    """Return True if at least one regex in *patterns* is found in *haystack*."""
    return any(re.search(pattern, haystack) for pattern in patterns)


async def check_website_compliance(
    client: httpx.AsyncClient, url: str, html: str,
) -> tuple[list[str], list[FollowUpQuestion]]:
    """Scan public website for consumer protection compliance.

    Args:
        client: HTTP client used to probe well-known cancellation URLs.
        url: Address the HTML was fetched from; scheme and host are reused
            for the probes.
        html: Markup of the page; matched case-insensitively.

    Returns:
        ``(findings, follow_ups)``: finding texts for detected problems and
        follow-up questions for points the markup alone cannot settle.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlsplit

    findings: list[str] = []
    follow_ups: list[FollowUpQuestion] = []
    html_lower = html.lower()

    # urlsplit copes with URLs that have a query but no path; a URL without
    # a scheme is parsed as a network location and probed via https.
    parts = urlsplit(url if "://" in url else f"//{url}")
    scheme = parts.scheme if parts.scheme in ("http", "https") else "https"
    base_domain = parts.netloc

    # E-Commerce detection — §312k only applies to sites with online contracts
    # NOTE(review): short indicators such as "abo" or "order" also match
    # inside unrelated words ("about", "border"); confirm the intended recall.
    ecommerce_indicators = [
        r"warenkorb", r"cart", r"shop", r"bestell", r"order",
        r"checkout", r"kasse", r"kaufen", r"add.?to.?cart",
        r"stripe|paypal|klarna|mollie|adyen",
        r"abo", r"mitgliedschaft", r"subscription", r"premium",
    ]
    is_ecommerce = _matches_any(ecommerce_indicators, html_lower)

    # --- §312k BGB: cancellation button (ONLY for e-commerce/subscription sites) ---
    cancel_patterns = [
        r'href="[^"]*(?:kuendig|kündig|cancel|vertrag.?beenden|abo.?beenden|mitgliedschaft.?beenden)[^"]*"',
        r'(?:kündigen|kuendigen|vertrag beenden|abo beenden|mitgliedschaft kündigen)',
    ]
    has_cancel_link = _matches_any(cancel_patterns, html_lower)

    # The probe result only matters for shops, so skip the HEAD requests
    # (up to five, 5 s timeout each) for every other page.
    if is_ecommerce and not has_cancel_link and base_domain:
        cancel_urls_to_probe = [
            f"{scheme}://{base_domain}/kuendigen",
            f"{scheme}://{base_domain}/cancel",
            f"{scheme}://{base_domain}/vertrag-kuendigen",
            f"{scheme}://{base_domain}/abo-kuendigen",
            f"{scheme}://{base_domain}/account/cancel",
        ]
        for probe_url in cancel_urls_to_probe:
            try:
                probe = await client.head(probe_url, follow_redirects=True, timeout=5.0)
            except Exception:
                # Best effort: an unreachable probe URL is not a finding.
                continue
            if probe.status_code < 400:
                has_cancel_link = True
                break

    if not has_cancel_link and is_ecommerce:
        findings.append(
            "[§312k BGB] Kein oeffentlich sichtbarer Kuendigungsbutton gefunden. "
            "Seit 01.07.2022 muessen online geschlossene Vertraege mit max. 2 Klicks kuendbar sein."
        )
        follow_ups.append(FollowUpQuestion(
            id="cancel_button_312k",
            question="Koennen Sie nach Login im Kundenbereich innerhalb von 2 Klicks Ihren Vertrag kuendigen?",
            legal_basis="§ 312k BGB (Kuendigungsbutton), Omnibus-Richtlinie (EU) 2019/2161",
            severity="high",
            finding_if_no=(
                "[§312k BGB] VERSTOSS: Kein funktionaler Kuendigungsbutton vorhanden. "
                "Der Anbieter ist verpflichtet, einen leicht auffindbaren Kuendigungsbutton "
                "bereitzustellen (max. 2 Klicks). Ein Zwang zur telefonischen Kuendigung "
                "oder Kuendigung per Brief ist rechtswidrig."
            ),
        ))

    # --- Imprint obligation (§5 TMG / §18 MStV) ---
    imprint_patterns = [
        r'href="[^"]*(?:impressum|imprint|legal.?notice|about.?us/legal)[^"]*"',
        r'>impressum<',
    ]
    if not _matches_any(imprint_patterns, html_lower):
        findings.append(
            "[§5 TMG] Kein Impressum-Link auf der Seite gefunden. "
            "Geschaeftsmaessige Online-Dienste muessen ein leicht erreichbares Impressum bereitstellen."
        )

    # --- Is a privacy policy linked? ---
    privacy_patterns = [
        r'href="[^"]*(?:datenschutz|privacy|dsgvo)[^"]*"',
        r'>datenschutz<',
    ]
    if not _matches_any(privacy_patterns, html_lower):
        findings.append(
            "[Art. 13 DSGVO] Kein Link zur Datenschutzerklaerung gefunden. "
            "Nutzer muessen ueber die Verarbeitung personenbezogener Daten informiert werden."
        )

    # --- Cookie consent banner ---
    cookie_patterns = [
        r'(?:cookie.?consent|cookie.?banner|consent.?manager|didomi|cookiebot|onetrust|usercentrics)',
        r'(?:gdpr|dsgvo).?(?:consent|einwilligung)',
    ]
    if not _matches_any(cookie_patterns, html_lower):
        # No banner marker in the markup: ask instead of reporting a finding.
        follow_ups.append(FollowUpQuestion(
            id="cookie_consent",
            question="Wird beim ersten Besuch der Website ein Cookie-Consent-Banner angezeigt?",
            legal_basis="§ 25 TDDDG (ehem. TTDSG), Art. 5(3) ePrivacy-Richtlinie",
            severity="medium",
            finding_if_no=(
                "[§25 TDDDG] Kein Cookie-Consent-Banner erkannt. "
                "Vor dem Setzen nicht-essentieller Cookies ist eine Einwilligung erforderlich."
            ),
        ))

    return findings, follow_ups
def to_string_list(items: list) -> list[str]:
    """Flatten a list of dicts and/or scalars into display strings.

    Dicts are rendered as ``"[<code>] <description>"``. The description is the
    first present key of ``description``, ``name``, ``code`` (falling back to
    the dict's own string form); the code is the first present key of
    ``code``, ``id``. Without a truthy code only the description is emitted.
    Anything that is not a dict goes through ``str()``. A falsy ``items``
    yields an empty list.
    """
    def _first_present(mapping, keys, fallback):
        # Mirrors chained dict.get defaults: a present key wins even when
        # its value is None or empty.
        for key in keys:
            if key in mapping:
                return mapping[key]
        return fallback

    if not items:
        return []

    rendered = []
    for entry in items:
        if not isinstance(entry, dict):
            rendered.append(str(entry))
            continue
        text_part = _first_present(entry, ("description", "name", "code"), str(entry))
        code_part = _first_present(entry, ("code", "id"), "")
        if code_part:
            rendered.append(f"[{code_part}] {text_part}")
        else:
            rendered.append(str(text_part))
    return rendered
def risk_to_escalation(risk_level: str) -> str:
    """Translate a UCCA risk level into an escalation level.

    The lookup is case-insensitive; an empty, missing or unknown level
    falls back to ``"E0"``.
    """
    if not risk_level:
        return "E0"

    escalation_by_risk = {
        "MINIMAL": "E0",
        "LIMITED": "E1",
        "HIGH": "E2",
        "UNACCEPTABLE": "E3",
    }
    normalized = risk_level.upper()
    if normalized in escalation_by_risk:
        return escalation_by_risk[normalized]
    return "E0"
@@ -40,107 +40,8 @@ class ScanResult:
missing_pages: dict = field(default_factory=dict) # url -> status_code
# ── Service Registry ──────────────────────────────────────────────────────────
# Each entry: regex pattern -> service metadata.
# The key is a regular-expression string; the value describes the service it
# identifies: "id", "name", "category", "provider", "country", "eu_adequate",
# "requires_consent" and "legal_ref" (free-text legal reference).
# NOTE(review): in this view the literal is followed by an import of
# SERVICE_REGISTRY from compliance.services.service_registry, which rebinds
# the name — confirm which definition is meant to be authoritative.
SERVICE_REGISTRY: dict[str, dict] = {
# --- Tracking & Analytics ---
r"google.?analytics|gtag\(|UA-\d+|G-\w{5,}": {
"id": "google_analytics", "name": "Google Analytics", "category": "tracking",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
r"googletagmanager|gtm\.js": {
"id": "google_tag_manager", "name": "Google Tag Manager", "category": "tracking",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO",
},
r"facebook\.net/.*fbevents|fbq\(": {
"id": "facebook_pixel", "name": "Meta/Facebook Pixel", "category": "marketing",
"provider": "Meta Platforms", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
r"hotjar\.com|_hjSettings": {
"id": "hotjar", "name": "Hotjar", "category": "tracking",
"provider": "Hotjar Ltd", "country": "MT", "eu_adequate": True,
"requires_consent": True, "legal_ref": "§25 TDDDG (Session Recording)",
},
r"clarity\.ms": {
"id": "ms_clarity", "name": "Microsoft Clarity", "category": "tracking",
"provider": "Microsoft", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "§25 TDDDG (Session Replay), Art. 44 DSGVO",
},
r"matomo|piwik": {
"id": "matomo", "name": "Matomo", "category": "tracking",
"provider": "InnoCraft/Self-hosted", "country": "EU/Self", "eu_adequate": True,
"requires_consent": False, "legal_ref": "Cookieless moeglich, §25 TDDDG",
},
r"plausible\.io": {
"id": "plausible", "name": "Plausible Analytics", "category": "tracking",
"provider": "Plausible Insights", "country": "EE", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU-Anbieter, cookieless",
},
# --- CDN & Fonts ---
r"fonts\.googleapis\.com|fonts\.gstatic\.com": {
"id": "google_fonts", "name": "Google Fonts (remote)", "category": "cdn",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "LG Muenchen I, Az. 3 O 17493/20",
},
r"cdn\.cloudflare\.com|cdnjs\.cloudflare\.com": {
"id": "cloudflare_cdn", "name": "Cloudflare CDN", "category": "cdn",
"provider": "Cloudflare Inc", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 44-49 DSGVO, berechtigtes Interesse",
},
# --- Chatbots ---
r"widget\.intercom\.io|intercomcdn": {
"id": "intercom", "name": "Intercom", "category": "chatbot",
"provider": "Intercom Inc", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, KI-gestuetzt",
},
r"tidio\.co|tidioChatApi": {
"id": "tidio", "name": "Tidio Chat", "category": "chatbot",
"provider": "Tidio LLC", "country": "PL", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU-Anbieter",
},
r"zendesk\.com/embeddable|zdassets": {
"id": "zendesk", "name": "Zendesk", "category": "chatbot",
"provider": "Zendesk Inc", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO",
},
# --- Payment ---
r"js\.stripe\.com|stripe\.com/v3": {
"id": "stripe", "name": "Stripe", "category": "payment",
"provider": "Stripe Inc", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 6(1)(b) Vertragserfuellung, SCCs",
},
r"paypal\.com/sdk|paypalobjects": {
"id": "paypal", "name": "PayPal", "category": "payment",
"provider": "PayPal Holdings", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 6(1)(b) Vertragserfuellung",
},
r"klarna\.com|klarna-payments": {
"id": "klarna", "name": "Klarna", "category": "payment",
"provider": "Klarna AB", "country": "SE", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU, aber Art. 22 DSGVO bei Bonitaetspruefung!",
},
# --- Captcha ---
r"recaptcha|grecaptcha": {
"id": "recaptcha", "name": "Google reCAPTCHA", "category": "other",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
# --- Video ---
r"youtube\.com/embed|youtube-nocookie|ytimg": {
"id": "youtube", "name": "YouTube", "category": "other",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, 2-Klick empfohlen",
},
# --- Consent Management ---
r"didomi|cookiebot|onetrust|usercentrics|consentmanager|quantcast": {
"id": "cmp", "name": "Consent Management Platform", "category": "other",
"provider": "Various", "country": "EU", "eu_adequate": True,
"requires_consent": False, "legal_ref": "CMP vorhanden — gut",
},
}
# ── Service Registry (imported from master) ──────────────────────────────────
from compliance.services.service_registry import SERVICE_REGISTRY # noqa: E402
AI_TEXT_PATTERNS = [
r"k(?:ue|ü)nstliche.?intelligenz",
@@ -157,9 +58,13 @@ AI_TEXT_PATTERNS = [
# (regex, label) pairs used to discover legal pages from anchor hrefs.
# Group 1 of every regex captures the href value.
FOOTER_LINK_PATTERNS = [
    (r'href="([^"]*(?:impressum|imprint|legal-notice)[^"]*)"', "impressum"),
    # The older pattern without "hinweise.?zum.?datenschutz" matched a strict
    # subset of this one under the same label, so it is not listed again.
    (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
    (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
    (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
    # Deep DSE links (regional pages, sub-pages, service marks)
    (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
    # Navigation links often contain DSB/privacy sub-pages
    (r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"),
]
@@ -183,15 +88,46 @@ async def scan_website(base_url: str) -> ScanResult:
href = match.group(1)
if href.startswith("/"):
href = urljoin(origin, href)
if href.startswith(origin):
if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href):
page_urls.add(href)
# 3. Scan all pages (max 10)
for url in list(page_urls)[:10]:
html = start_html if url == origin else await _fetch_page(client, url, result)
if html:
# 3. Scan all pages in PARALLEL (max 10)
import asyncio
other_urls = [u for u in list(page_urls)[:10] if u != origin]
fetch_tasks = [_fetch_page(client, u, result) for u in other_urls]
other_htmls = await asyncio.gather(*fetch_tasks, return_exceptions=True)
# Process start page
_detect_services(start_html, origin, result)
_detect_ai_mentions(start_html, origin, result)
# Process other pages + discover DSE-internal links
dse_internal_urls = set()
for url, html in zip(other_urls, other_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
_detect_ai_mentions(html, url, result)
# If this is a DSE page, find links within it (SAME DOMAIN only)
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
for pattern, _ in FOOTER_LINK_PATTERNS:
for match in re.finditer(pattern, html, re.IGNORECASE):
href = match.group(1)
if href.startswith("/"):
href = urljoin(origin, href)
# IMPORTANT: Only follow links on the SAME domain
# External links (etracker.com, google.de) must NOT be scanned
if href.startswith(origin) and href not in page_urls:
dse_internal_urls.add(href)
# 4. Follow DSE-internal links (additional pages linked from privacy policy)
if dse_internal_urls:
extra_urls = [u for u in list(dse_internal_urls)[:5] if u not in page_urls]
if extra_urls:
extra_tasks = [_fetch_page(client, u, result) for u in extra_urls]
extra_htmls = await asyncio.gather(*extra_tasks, return_exceptions=True)
for url, html in zip(extra_urls, extra_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
# Deduplicate services
seen = set()