Merge feat/zeroclaw-compliance-agent into main

Brings all compliance doc-check features:
- 162 regex checks + 1874 Master Controls
- LLM-agnostic agent with tool calling
- Banner check (46 checks, 30 CMPs, stealth, Shadow DOM)
- Impressum check (24 checks)
- Deep consent verification (DataLayer, GCM, TCF)
- CMP E2E tests (39 tests)
- HTML email reports, FAQ, persistent history

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-11 11:44:20 +02:00
175 changed files with 20063 additions and 1283 deletions
@@ -63,6 +63,13 @@ _ROUTER_MODULES = [
"tom_mapping_routes",
"llm_audit_routes",
"assertion_routes",
"org_role_routes",
"document_review_routes",
"banner_analytics_routes",
"banner_ab_routes",
"compliance_report_routes",
"whistleblower_routes",
"tcf_routes",
]
_loaded_count = 0
@@ -15,6 +15,14 @@ from fastapi import APIRouter
from pydantic import BaseModel
from compliance.services.smtp_sender import send_email
from compliance.services.intake_extractor import extract_intake_flags_from_services, flags_to_ucca_intake
from compliance.services.relevance_filter import filter_controls
from compliance.services.website_compliance_checks import (
check_website_compliance as _check_website_compliance,
FollowUpQuestion,
to_string_list as _to_string_list,
risk_to_escalation as _risk_to_escalation,
)
logger = logging.getLogger(__name__)
@@ -77,21 +85,32 @@ async def analyze_url(req: AnalyzeRequest):
# Step 2: Classify via SDK LLM
classification = await _classify(client, text)
# Step 3: Assess via UCCA
assessment = await _assess(client, text, classification)
# Step 3: Detect services from HTML (deterministic, no LLM needed)
from compliance.services.service_registry import SERVICE_REGISTRY
detected_services = []
html_lower = raw_html.lower()
for pattern, meta in SERVICE_REGISTRY.items():
if re.search(pattern, html_lower):
detected_services.append(meta)
# Step 4: Determine role
# Step 4: Derive intake flags from DETECTED SERVICES (not from text!)
intake_flags = extract_intake_flags_from_services(detected_services)
# Step 5: Assess via UCCA with service-derived flags
assessment = await _assess(client, text, classification, intake_flags)
# Step 5: Determine role
esc_level = assessment.get("escalation_level", "E0")
role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"])
# Step 5: Website compliance checks (§312k BGB etc.)
# Step 6: Website compliance checks (§312k BGB etc.)
site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html)
# Step 6: Merge findings
# Step 7: Merge and filter findings/controls
findings = assessment.get("triggered_rules", [])
controls = assessment.get("required_controls", [])
findings_str = _to_string_list(findings) + site_findings
controls_str = _to_string_list(controls)
controls_str = filter_controls(_to_string_list(controls), text, intake_flags)
# Escalate if website checks found issues
if site_findings and esc_level == "E0":
@@ -105,7 +124,7 @@ async def analyze_url(req: AnalyzeRequest):
email_result = send_email(
recipient=req.recipient,
subject=f"[{mode_label}] Compliance-Finding: {classification}{req.url[:60]}",
body_html=f"<div>{summary}</div>",
body_html=summary,
)
return AnalyzeResponse(
@@ -179,34 +198,24 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str:
return "other"
async def _assess(client: httpx.AsyncClient, text: str, classification: str) -> dict:
async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict:
"""Run UCCA assessment via SDK. Returns flattened result dict."""
try:
# UCCA expects boolean intake flags, not string categories
# Use LLM-extracted flags if available, otherwise minimal defaults
if intake_flags:
ucca_intake = flags_to_ucca_intake(intake_flags)
else:
ucca_intake = {
"data_types": {"personal_data": True},
"purpose": {},
"automation": "manual",
"outputs": {},
}
resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={
"use_case_text": text[:3000],
"domain": classification,
"data_types": {
"personal_data": True,
"customer_data": True,
"location_data": "tracking" in text.lower() or "standort" in text.lower(),
"images": False,
"biometric_data": "biometrisch" in text.lower(),
"minor_data": "kinder" in text.lower() or "minderjährig" in text.lower(),
},
"purpose": {
"marketing": "werbung" in text.lower() or "marketing" in text.lower(),
"analytics": "analyse" in text.lower() or "analytics" in text.lower(),
"profiling": "profil" in text.lower() or "personalis" in text.lower(),
"automation": False,
"customer_support": False,
},
"automation": "partially_automated",
"outputs": {
"content_generation": False,
"recommendations_to_users": "empfehl" in text.lower(),
"data_export": "export" in text.lower() or "uebertrag" in text.lower(),
},
**ucca_intake,
})
data = resp.json()
# Flatten: UCCA wraps result under "assessment" and "result"
@@ -227,126 +236,27 @@ async def _assess(client: httpx.AsyncClient, text: str, classification: str) ->
return {"risk_level": "unknown", "risk_score": 0, "escalation_level": "E0"}
async def _check_website_compliance(
    client: httpx.AsyncClient, url: str, html: str,
) -> tuple[list[str], list[FollowUpQuestion]]:
    """Heuristically check a public page for mandatory consumer-protection elements.

    The lower-cased HTML is searched with regular expressions for a contract
    cancellation entry point (§312k BGB), an imprint link, a privacy-policy
    link and a cookie-consent mechanism. If no cancellation link is present in
    the markup, a handful of conventional cancellation URLs on the same host
    are probed with HEAD requests.

    Args:
        client: HTTP client used for the HEAD probes.
        url: Address of the scanned page; its host part is used for the probes.
        html: Raw HTML of the scanned page.

    Returns:
        A ``(findings, follow_ups)`` tuple: finding texts that can be reported
        directly, and questions to put to the operator where the markup alone
        is inconclusive.
    """
    page = html.lower()

    def found(*patterns: str) -> bool:
        # True as soon as one of the patterns occurs in the lower-cased page.
        return any(re.search(pattern, page) for pattern in patterns)

    issues: list[str] = []
    questions: list[FollowUpQuestion] = []
    host = re.sub(r"https?://([^/]+).*", r"\1", url)

    # --- §312k BGB: cancellation button ---
    cancel_found = found(
        r'href="[^"]*(?:kuendig|kündig|cancel|vertrag.?beenden|abo.?beenden|mitgliedschaft.?beenden)[^"]*"',
        r'(?:kündigen|kuendigen|vertrag beenden|abo beenden|mitgliedschaft kündigen)',
    )
    # Conventional cancellation URLs, only probed when the markup has no link.
    probe_targets = [
        f"https://{host}/kuendigen",
        f"https://{host}/cancel",
        f"https://{host}/vertrag-kuendigen",
        f"https://{host}/abo-kuendigen",
        f"https://{host}/account/cancel",
    ]
    if not cancel_found:
        for target in probe_targets:
            try:
                reply = await client.head(target, follow_redirects=True, timeout=5.0)
                if reply.status_code < 400:
                    cancel_found = True
                    break
            except Exception:
                # Best effort: an unreachable probe URL just means "not found here".
                continue

    if not cancel_found:
        issues.append(
            "[§312k BGB] Kein oeffentlich sichtbarer Kuendigungsbutton gefunden. "
            "Seit 01.07.2022 muessen online geschlossene Vertraege mit max. 2 Klicks kuendbar sein."
        )
        questions.append(FollowUpQuestion(
            id="cancel_button_312k",
            question="Koennen Sie nach Login im Kundenbereich innerhalb von 2 Klicks Ihren Vertrag kuendigen?",
            legal_basis="§ 312k BGB (Kuendigungsbutton), Omnibus-Richtlinie (EU) 2019/2161",
            severity="high",
            finding_if_no=(
                "[§312k BGB] VERSTOSS: Kein funktionaler Kuendigungsbutton vorhanden. "
                "Der Anbieter ist verpflichtet, einen leicht auffindbaren Kuendigungsbutton "
                "bereitzustellen (max. 2 Klicks). Ein Zwang zur telefonischen Kuendigung "
                "oder Kuendigung per Brief ist rechtswidrig."
            ),
        ))

    # --- Imprint obligation (§5 TMG / §18 MStV) ---
    if not found(
        r'href="[^"]*(?:impressum|imprint|legal.?notice|about.?us/legal)[^"]*"',
        r'>impressum<',
    ):
        issues.append(
            "[§5 TMG] Kein Impressum-Link auf der Seite gefunden. "
            "Geschaeftsmaessige Online-Dienste muessen ein leicht erreichbares Impressum bereitstellen."
        )

    # --- Is a privacy policy linked? ---
    if not found(
        r'href="[^"]*(?:datenschutz|privacy|dsgvo)[^"]*"',
        r'>datenschutz<',
    ):
        issues.append(
            "[Art. 13 DSGVO] Kein Link zur Datenschutzerklaerung gefunden. "
            "Nutzer muessen ueber die Verarbeitung personenbezogener Daten informiert werden."
        )

    # --- Cookie consent banner ---
    if not found(
        r'(?:cookie.?consent|cookie.?banner|consent.?manager|didomi|cookiebot|onetrust|usercentrics)',
        r'(?:gdpr|dsgvo).?(?:consent|einwilligung)',
    ):
        # Absence in static markup is inconclusive, so this is only a question.
        questions.append(FollowUpQuestion(
            id="cookie_consent",
            question="Wird beim ersten Besuch der Website ein Cookie-Consent-Banner angezeigt?",
            legal_basis="§ 25 TDDDG (ehem. TTDSG), Art. 5(3) ePrivacy-Richtlinie",
            severity="medium",
            finding_if_no=(
                "[§25 TDDDG] Kein Cookie-Consent-Banner erkannt. "
                "Vor dem Setzen nicht-essentieller Cookies ist eine Einwilligung erforderlich."
            ),
        ))

    return issues, questions
# _check_website_compliance, _to_string_list, _risk_to_escalation
# → extracted to compliance/services/website_compliance_checks.py
def _to_string_list(items: list) -> list[str]:
    """Render a list of strings and/or UCCA-style dicts as display strings.

    Dict entries are rendered as ``"[<code>] <text>"`` where the text is taken
    from ``description``, then ``name``, then ``code`` (falling back to the
    dict's own string form) and the code from ``code`` or ``id``. Entries
    without a code are rendered as the text alone. Anything that is not a dict
    is passed through ``str``. ``None`` or an empty list yields ``[]``.
    """

    def render(entry) -> str:
        if not isinstance(entry, dict):
            return str(entry)
        # UCCA returns {code, category, description} or {id, name, description}
        last_resort = entry.get("code", str(entry))
        label = entry.get("description", entry.get("name", last_resort))
        ident = entry.get("code", entry.get("id", ""))
        if ident:
            return f"[{ident}] {label}"
        return str(label)

    return [render(entry) for entry in (items or [])]
# German display labels for document classification keys; used when rendering
# the "Dokumenttyp" row of the HTML summary.
DOC_TYPE_LABELS = {
    "privacy_policy": "Datenschutzerklaerung",
    "cookie_banner": "Cookie-Banner",
    "terms_of_service": "AGB",
    "imprint": "Impressum",
    "dpa": "Auftragsverarbeitung (AVV)",
    "other": "Sonstiges",
}
def _risk_to_escalation(risk_level: str) -> str:
    """Translate a UCCA risk level into an escalation level (E0-E3).

    The lookup is case-insensitive. Missing, empty or unknown risk levels map
    to ``"E0"``.
    """
    if not risk_level:
        return "E0"
    escalation_by_risk = {
        "MINIMAL": "E0",
        "LIMITED": "E1",
        "HIGH": "E2",
        "UNACCEPTABLE": "E3",
    }
    return escalation_by_risk.get(risk_level.upper(), "E0")
# Risk level -> (CSS hex colour, German label) for the risk badge in the HTML
# summary. Keys are upper-case.
RISK_COLORS = {
    "MINIMAL": ("#16a34a", "Niedrig"),
    "LOW": ("#ca8a04", "Gering"),
    "LIMITED": ("#ea580c", "Mittel"),
    "HIGH": ("#dc2626", "Hoch"),
    "UNACCEPTABLE": ("#991b1b", "Kritisch"),
}
def _build_summary(
@@ -354,48 +264,54 @@ def _build_summary(
findings_str: list[str], controls_str: list[str],
mode: str = "post_launch",
) -> str:
"""Build a German manager summary, adapted to pre/post-launch context."""
"""Build HTML summary for email and frontend."""
risk = assessment.get("risk_level", "unbekannt")
score = assessment.get("risk_score", 0)
recommendation = assessment.get("recommendation", "")
dsfa = assessment.get("dsfa_recommended", False)
is_live = mode == "post_launch"
risk_color, risk_label = RISK_COLORS.get(risk, ("#6b7280", risk))
doc_label = DOC_TYPE_LABELS.get(classification, classification)
findings_text = "\n".join(f"- {f}" for f in findings_str[:5]) if findings_str else "Keine"
controls_text = "\n".join(f"- {c}" for c in controls_str[:5]) if controls_str else "Keine"
mode_header = (
"PRUEFUNG LIVE-WEBSITE — Das Dokument ist bereits oeffentlich zugaenglich."
mode_banner = (
'<div style="background:#fef2f2;border-left:4px solid #dc2626;padding:12px 16px;margin-bottom:16px;">'
'<strong style="color:#991b1b;">LIVE-WEBSITE</strong> — Das Dokument ist bereits oeffentlich zugaenglich.</div>'
if is_live else
"INTERNE PRUEFUNG — Das Dokument ist noch nicht veroeffentlicht."
'<div style="background:#eff6ff;border-left:4px solid #3b82f6;padding:12px 16px;margin-bottom:16px;">'
'<strong style="color:#1e40af;">INTERNE PRUEFUNG</strong> — Dokument noch nicht veroeffentlicht.</div>'
)
parts = [
mode_header,
"",
f"Dokumenttyp: {classification}",
f"Quelle: {url}",
f"Risikobewertung: {risk} ({score}/100)",
f"Zustaendig: {role}",
f"DSFA empfohlen: {'Ja' if dsfa else 'Nein'}",
"",
f"Findings:\n{findings_text}",
"",
f"Erforderliche Massnahmen:\n{controls_text}",
]
findings_html = "".join(f'<li style="margin-bottom:4px;">{f}</li>' for f in findings_str[:8]) if findings_str else '<li style="color:#6b7280;">Keine</li>'
controls_html = "".join(f'<li style="margin-bottom:4px;">{c}</li>' for c in controls_str[:8]) if controls_str else '<li style="color:#6b7280;">Keine</li>'
warning = ""
if is_live and findings_str:
parts.extend([
"",
"ACHTUNG: Diese Maengel sind bereits oeffentlich sichtbar. "
"Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.",
])
warning = (
'<div style="background:#fef2f2;border:1px solid #fecaca;border-radius:8px;padding:12px 16px;margin-top:16px;">'
'<strong style="color:#dc2626;">⚠ ACHTUNG:</strong> Diese Maengel sind bereits oeffentlich sichtbar. '
'Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.</div>'
)
elif not is_live and controls_str:
parts.extend([
"",
"Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.",
])
warning = (
'<div style="background:#f0fdf4;border:1px solid #bbf7d0;border-radius:8px;padding:12px 16px;margin-top:16px;">'
'Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.</div>'
)
if recommendation:
parts.extend(["", f"Weitere Empfehlung: {recommendation}"])
return "\n".join(parts)
rec_html = f'<p style="color:#475569;margin-top:12px;"><em>{recommendation}</em></p>' if recommendation else ""
return f"""
{mode_banner}
<table style="width:100%;border-collapse:collapse;margin-bottom:16px;">
<tr><td style="padding:6px 0;color:#64748b;width:180px;">Dokumenttyp</td><td style="padding:6px 0;font-weight:600;">{doc_label}</td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Quelle</td><td style="padding:6px 0;"><a href="{url}" style="color:#6366f1;">{url}</a></td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Risikobewertung</td><td style="padding:6px 0;"><span style="background:{risk_color};color:white;padding:2px 8px;border-radius:4px;font-size:13px;">{risk_label} ({score}/100)</span></td></tr>
<tr><td style="padding:6px 0;color:#64748b;">Zustaendig</td><td style="padding:6px 0;font-weight:600;">{role}</td></tr>
<tr><td style="padding:6px 0;color:#64748b;">DSFA empfohlen</td><td style="padding:6px 0;">{'Ja' if dsfa else 'Nein'}</td></tr>
</table>
<h3 style="color:#1e293b;font-size:15px;margin:16px 0 8px;">Findings</h3>
<ul style="margin:0;padding-left:20px;color:#334155;">{findings_html}</ul>
<h3 style="color:#1e293b;font-size:15px;margin:16px 0 8px;">Erforderliche Massnahmen</h3>
<ul style="margin:0;padding-left:20px;color:#334155;">{controls_html}</ul>
{warning}
{rec_html}
"""
@@ -0,0 +1,94 @@
"""
Agent Compare Routes — scan multiple websites and compare compliance posture.
POST /api/compliance/agent/compare
"""
import asyncio
import logging
from datetime import datetime, timezone
import httpx
from fastapi import APIRouter
from pydantic import BaseModel
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# All routes of this module live under /compliance/agent and share the "agent" tag.
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# Request body of POST /compare.
class CompareRequest(BaseModel):
    # 2-5 URLs to compare. The bounds are not validated here; the handler
    # only scans the first five entries.
    urls: list[str]
    # Forwarded unchanged as "mode" to the scan endpoint.
    mode: str = "post_launch"
# Condensed scan outcome for one website, as returned by POST /compare.
class SiteResult(BaseModel):
    url: str  # URL as submitted by the caller
    domain: str  # host part derived from the URL
    risk_level: str = ""  # "risk_level" of the scan response
    risk_score: float = 0  # "risk_score" of the scan response
    findings_count: int = 0  # number of entries in the scan's "findings"
    services_count: int = 0  # number of entries in the scan's "services"
    has_impressum: bool = False  # True when no finding code contains "IMPRESSUM"
    has_datenschutz: bool = False  # True when no finding code contains "DATENSCHUTZ"
    has_cookie_banner: bool = False  # set from "chatbot_detected" or a service with id "cmp"
    has_google_fonts: bool = False  # True when a service with id "google_fonts" was detected
    tracking_before_consent: int = 0  # not populated by compare_websites; stays at the default
    classification: str = ""  # "classification" of the scan response
    # "pending" by default; compare_websites sets "completed", "failed"
    # (non-200 scan response) or "error" (exception during the scan).
    scan_status: str = "pending"
# Response body of POST /compare.
class CompareResponse(BaseModel):
    sites: list[SiteResult]  # one result per scanned URL
    compared_at: str  # ISO-8601 UTC timestamp taken when the comparison finished
@router.post("/compare", response_model=CompareResponse)
async def compare_websites(req: CompareRequest):
    """Scan multiple websites and compare their compliance posture.

    At most the first five URLs of the request are scanned, in parallel, by
    calling the scan endpoint once per URL. A failing site never aborts the
    comparison; it is reported with ``scan_status`` "failed" (non-200 answer)
    or "error" (exception).

    Args:
        req: URLs to compare and the analysis mode forwarded to each scan.

    Returns:
        CompareResponse with one SiteResult per scanned URL and a UTC timestamp.
    """
    from urllib.parse import urlsplit

    urls = req.urls[:5]  # Max 5

    def domain_of(url: str) -> str:
        # urlsplit only recognises a host after "//", so scheme-less input such
        # as "example.com/a/b" is prefixed before parsing. Splitting on "/"
        # picked the wrong segment for such URLs and for "/" inside a query.
        try:
            host = urlsplit(url if "//" in url else f"//{url}").netloc
        except ValueError:
            # e.g. malformed IPv6 literal; fall back to the raw input
            host = ""
        return host or url

    def has_finding(findings: list, marker: str) -> bool:
        # A finding's "code" may be missing or null; treat both as empty.
        return any(
            marker in str(f.get("code") or "")
            for f in findings
            if isinstance(f, dict)
        )

    def has_service(services: list, service_id: str) -> bool:
        return any(
            s.get("id") == service_id for s in services if isinstance(s, dict)
        )

    async def scan_one(url: str) -> SiteResult:
        domain = domain_of(url)
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                resp = await client.post(
                    "http://localhost:8002/api/compliance/agent/scan",
                    json={"url": url, "mode": req.mode},
                )
            if resp.status_code != 200:
                return SiteResult(url=url, domain=domain, scan_status="failed")
            data = resp.json()
            services = data.get("services", [])
            findings = data.get("findings", [])
            return SiteResult(
                url=url,
                domain=domain,
                risk_level=data.get("risk_level", ""),
                risk_score=data.get("risk_score", 0),
                findings_count=len(findings),
                services_count=len(services),
                # A document counts as present when no finding complains about it.
                has_impressum=not has_finding(findings, "IMPRESSUM"),
                has_datenschutz=not has_finding(findings, "DATENSCHUTZ"),
                # NOTE(review): "chatbot_detected" feeding has_cookie_banner looks
                # like a copy/paste slip; kept as-is, confirm the intended field.
                has_cookie_banner=data.get("chatbot_detected", False)
                or has_service(services, "cmp"),
                has_google_fonts=has_service(services, "google_fonts"),
                classification=data.get("classification", ""),
                scan_status="completed",
            )
        except Exception as e:
            logger.error("Compare scan failed for %s: %s", url, e)
            return SiteResult(url=url, domain=domain, scan_status="error")

    # Scan all in parallel; scan_one never raises, so gather cannot fail partway.
    results = await asyncio.gather(*[scan_one(u) for u in urls])
    return CompareResponse(
        sites=list(results),
        compared_at=datetime.now(timezone.utc).isoformat(),
    )
@@ -0,0 +1,220 @@
"""
Agent History Routes — persist and retrieve scan results.
GET /api/compliance/agent/scans — list recent scans
GET /api/compliance/agent/scans/{id} — get single scan
POST /api/compliance/agent/scans — save a scan result
"""
import json
import logging
import os
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, Query
from fastapi.responses import Response
from pydantic import BaseModel
from compliance.services.agent_pdf_export import generate_scan_pdf
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# All routes of this module live under /compliance/agent and share the "agent" tag.
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# Connection string for the scan history database: COMPLIANCE_DATABASE_URL wins,
# DATABASE_URL is the fallback, and an empty string means "no database".
DATABASE_URL = os.environ.get(
    "COMPLIANCE_DATABASE_URL",
    os.environ.get("DATABASE_URL", ""),
)
# Request body of POST /scans and POST /scans/pdf.
class SaveScanRequest(BaseModel):
    url: str  # scanned URL
    scan_type: str = "scan"
    analysis_mode: str = "post_launch"
    result: dict  # Full scan result JSON
# One row of the scan history listing returned by GET /scans.
class ScanHistoryItem(BaseModel):
    id: str  # scan UUID rendered as a string
    url: str
    scan_type: str
    analysis_mode: str
    risk_level: str | None = None
    risk_score: float = 0
    findings_count: int = 0  # length of the stored findings list
    pages_scanned: int = 0
    email_sent: bool = False
    created_at: str  # ISO-8601 timestamp, or "" when the row has none
# Full stored scan returned by GET /scans/{scan_id}. When the scan cannot be
# loaded, the handler returns this model with empty fields.
class ScanDetail(BaseModel):
    id: str
    url: str
    scan_type: str
    analysis_mode: str
    result: dict  # scan payload rebuilt from the stored columns
    created_at: str  # ISO-8601 timestamp, or "" when unknown
async def _get_pool():
    """Open a new asyncpg connection pool for DATABASE_URL.

    A fresh pool is created on every call; callers are expected to close it
    when done.

    Returns:
        The pool, or None when no DATABASE_URL is configured or the pool could
        not be created (the failure is logged as a warning).
    """
    import asyncpg

    if DATABASE_URL:
        try:
            pool = await asyncpg.create_pool(DATABASE_URL, min_size=1, max_size=3)
        except Exception as exc:
            logger.warning("DB connection failed: %s", exc)
        else:
            return pool
    return None
@router.post("/scans")
async def save_scan(req: SaveScanRequest):
    """Persist one scan result as a row in compliance_agent_scans.

    Returns:
        ``{"status": "saved", "id": ...}`` on success,
        ``{"status": "skipped", "reason": "no database"}`` when no pool is
        available, or ``{"status": "error", "error": ...}`` when the insert fails.
    """
    pool = await _get_pool()
    if not pool:
        return {"status": "skipped", "reason": "no database"}

    new_id = str(uuid.uuid4())
    payload = req.result
    insert_sql = """
        INSERT INTO compliance_agent_scans
        (id, url, scan_type, analysis_mode, classification, risk_level,
        risk_score, escalation_level, responsible_role, services,
        findings, summary_html, pages_scanned, pages_list, email_sent,
        created_at)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
    """
    try:
        async with pool.acquire() as conn:
            # Positional values in the same order as the column list above.
            row_values = (
                uuid.UUID(new_id),
                req.url,
                req.scan_type,
                req.analysis_mode,
                payload.get("classification", ""),
                payload.get("risk_level", ""),
                payload.get("risk_score", 0),
                payload.get("escalation_level", ""),
                payload.get("responsible_role", ""),
                json.dumps(payload.get("services", [])),
                json.dumps(payload.get("findings", [])),
                # Accept the summary under either key.
                payload.get("summary", payload.get("summary_html", "")),
                payload.get("pages_scanned", 0),
                json.dumps(payload.get("pages_list", [])),
                payload.get("email_status") == "sent",
                datetime.now(timezone.utc),
            )
            await conn.execute(insert_sql, *row_values)
        return {"status": "saved", "id": new_id}
    except Exception as exc:
        logger.error("Failed to save scan: %s", exc)
        return {"status": "error", "error": str(exc)}
    finally:
        # The pool was opened for this request only.
        await pool.close()
@router.get("/scans", response_model=list[ScanHistoryItem])
async def list_scans(
    limit: int = Query(20, le=100),
    scan_type: str | None = None,
):
    """List recent scans, newest first.

    Args:
        limit: Maximum number of rows to return (capped at 100 by validation).
        scan_type: When given, only scans of this type are returned.

    Returns:
        A list of ScanHistoryItem; empty when no database is available or the
        query fails (the failure is logged).
    """
    pool = await _get_pool()
    if not pool:
        return []

    def count_findings(raw) -> int:
        # The findings column is written as a JSON string by save_scan. A row
        # with missing or undecodable findings counts as 0 instead of making
        # the whole listing fail.
        if not raw:
            return 0
        try:
            decoded = json.loads(raw) if isinstance(raw, (str, bytes)) else raw
            return len(decoded)
        except (TypeError, ValueError):
            return 0

    query = """
        SELECT id, url, scan_type, analysis_mode, risk_level, risk_score,
        findings, pages_scanned, email_sent, created_at
        FROM compliance_agent_scans
    """
    params: list = []
    if scan_type:
        params.append(scan_type)
        query += f" WHERE scan_type = ${len(params)}"
    # Bind LIMIT as a query parameter instead of concatenating it into the SQL
    # text; negative values are clamped because PostgreSQL rejects them.
    params.append(max(limit, 0))
    query += f" ORDER BY created_at DESC LIMIT ${len(params)}"

    try:
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [
            ScanHistoryItem(
                id=str(r["id"]),
                url=r["url"],
                scan_type=r["scan_type"],
                analysis_mode=r["analysis_mode"],
                risk_level=r["risk_level"],
                risk_score=r["risk_score"] or 0,
                findings_count=count_findings(r["findings"]),
                pages_scanned=r["pages_scanned"] or 0,
                email_sent=r["email_sent"] or False,
                created_at=r["created_at"].isoformat() if r["created_at"] else "",
            )
            for r in rows
        ]
    except Exception as e:
        logger.error("Failed to list scans: %s", e)
        return []
    finally:
        # The pool was opened for this request only.
        await pool.close()
@router.get("/scans/{scan_id}", response_model=ScanDetail)
async def get_scan(scan_id: str):
    """Load one stored scan by its UUID.

    Returns:
        The stored scan as ScanDetail. When no database is available, the id
        is unknown, or loading fails (including a malformed UUID), a ScanDetail
        carrying only the requested id and otherwise empty fields is returned.
    """

    def blank() -> ScanDetail:
        # Placeholder answer used for every "could not load" case.
        return ScanDetail(id=scan_id, url="", scan_type="", analysis_mode="", result={}, created_at="")

    pool = await _get_pool()
    if not pool:
        return blank()

    try:
        async with pool.acquire() as conn:
            record = await conn.fetchrow("""
                SELECT * FROM compliance_agent_scans WHERE id = $1
            """, uuid.UUID(scan_id))
        if not record:
            return blank()

        # Rebuild the result payload; JSON columns are stored as text.
        stored_result = {
            "classification": record["classification"],
            "risk_level": record["risk_level"],
            "risk_score": record["risk_score"],
            "services": json.loads(record["services"] or "[]"),
            "findings": json.loads(record["findings"] or "[]"),
            "summary": record["summary_html"],
            "pages_scanned": record["pages_scanned"],
            "pages_list": json.loads(record["pages_list"] or "[]"),
        }
        created = record["created_at"]
        return ScanDetail(
            id=str(record["id"]),
            url=record["url"],
            scan_type=record["scan_type"],
            analysis_mode=record["analysis_mode"],
            result=stored_result,
            created_at=created.isoformat() if created else "",
        )
    except Exception as exc:
        logger.error("Failed to get scan: %s", exc)
        return blank()
    finally:
        # The pool was opened for this request only.
        await pool.close()
@router.post("/scans/pdf")
async def export_scan_pdf(req: SaveScanRequest):
    """Generate a PDF report from scan results (no DB required).

    Args:
        req: Scan metadata plus the full result dict; keys in ``req.result``
            are merged over the metadata before rendering.

    Returns:
        The PDF as an ``application/pdf`` attachment, or ``{"error": ...}``
        when rendering fails.
    """
    import re
    from urllib.parse import urlsplit

    # Derive the download file name from the host. Indexing url.split("/")[2]
    # raised IndexError for scheme-less URLs after the PDF was already rendered,
    # and the raw value went unescaped into the header.
    try:
        host = urlsplit(req.url if "//" in req.url else f"//{req.url}").netloc
    except ValueError:
        # e.g. malformed IPv6 literal
        host = ""
    # Keep only characters that are safe inside a quoted header value and a file name.
    safe_host = re.sub(r"[^A-Za-z0-9.-]+", "_", host)[:30] or "scan"

    try:
        pdf_bytes = generate_scan_pdf({
            "url": req.url,
            "scan_type": req.scan_type,
            "analysis_mode": req.analysis_mode,
            **req.result,
        })
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": f'attachment; filename="compliance-report-{safe_host}.pdf"'},
        )
    except Exception as e:
        logger.error("PDF generation failed: %s", e)
        return {"error": str(e)}
@@ -0,0 +1,111 @@
"""
Agent Recurring Scan Routes — schedule and run automated periodic scans.
POST /api/compliance/agent/monitored-urls — add URL to monitoring
GET /api/compliance/agent/monitored-urls — list monitored URLs
POST /api/compliance/agent/run-scheduled — trigger all scheduled scans
"""
import json
import logging
import os
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter
from pydantic import BaseModel
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# All routes of this module live under /compliance/agent and share the "agent" tag.
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# Database connection string: COMPLIANCE_DATABASE_URL wins, DATABASE_URL is the
# fallback. The routes visible in this module do not read it.
DATABASE_URL = os.environ.get(
    "COMPLIANCE_DATABASE_URL",
    os.environ.get("DATABASE_URL", ""),
)
# In-memory fallback when no DB available
# (the list lives in this module's globals, so it is lost on restart).
_monitored_urls: list[dict] = []
# Request body of POST /monitored-urls: one URL to scan periodically.
class MonitoredURL(BaseModel):
    url: str
    scan_type: str = "scan"  # scan, consent_test
    # daily, weekly, monthly; stored with the entry, but run_scheduled_scans
    # does not evaluate it and scans every enabled entry on each call.
    frequency: str = "weekly"
    recipient: str = "dsb@breakpilot.local"  # passed as "recipient" to the scan endpoint
    enabled: bool = True  # disabled entries are skipped by run_scheduled_scans
@router.post("/monitored-urls")
async def add_monitored_url(req: MonitoredURL):
    """Register a URL for monitoring and return the stored entry.

    The entry receives a fresh UUID and creation timestamp and is appended to
    the in-memory monitoring list. The response is the entry plus
    ``"status": "added"``.
    """
    record = dict(
        id=str(uuid.uuid4()),
        url=req.url,
        scan_type=req.scan_type,
        frequency=req.frequency,
        recipient=req.recipient,
        enabled=req.enabled,
        created_at=datetime.now(timezone.utc).isoformat(),
        last_scan_at=None,  # filled in by the first scheduled run
    )
    _monitored_urls.append(record)
    logger.info("Added monitored URL: %s (%s)", req.url, req.frequency)

    response = {"status": "added"}
    response.update(record)
    return response
@router.get("/monitored-urls")
async def list_monitored_urls():
    """Return the current monitoring list under the ``"urls"`` key."""
    listing = {"urls": _monitored_urls}
    return listing
@router.delete("/monitored-urls/{url_id}")
async def remove_monitored_url(url_id: str):
    """Drop the entry with the given id from the monitoring list.

    The answer is ``{"status": "removed"}`` whether or not an entry matched.
    """
    global _monitored_urls
    remaining = []
    for item in _monitored_urls:
        if item["id"] != url_id:
            remaining.append(item)
    # Rebind rather than mutate in place, as before.
    _monitored_urls = remaining
    return {"status": "removed"}
@router.post("/run-scheduled")
async def run_scheduled_scans():
    """Run a scan for every enabled monitored URL. Called by cron/ZeroClaw.

    Entries of type "consent_test" are sent to the consent tester service; all
    others go to the agent scan endpoint in post-launch mode. Entries are
    processed one after another, and a failure of one entry is recorded
    without stopping the rest.

    Returns:
        ``{"scans_triggered": <count>, "results": [...]}`` with one result per
        attempted scan.
    """
    import httpx

    backend_url = "http://localhost:8002"
    outcomes = []

    for monitored in _monitored_urls:
        if not monitored["enabled"]:
            continue

        target = monitored["url"]
        kind = monitored["scan_type"]
        logger.info("Running scheduled %s for %s", kind, target)

        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                if kind == "consent_test":
                    endpoint = "http://bp-compliance-consent-tester:8094/scan"
                    payload = {"url": target}
                else:
                    endpoint = f"{backend_url}/api/compliance/agent/scan"
                    payload = {"url": target, "mode": "post_launch", "recipient": monitored["recipient"]}
                resp = await client.post(endpoint, json=payload)

                # Stamped for every answered request, whatever its status code.
                monitored["last_scan_at"] = datetime.now(timezone.utc).isoformat()
                if resp.status_code == 200:
                    verdict = "completed"
                else:
                    verdict = "failed"
                outcomes.append({
                    "url": target,
                    "scan_type": kind,
                    "status": verdict,
                    "status_code": resp.status_code,
                })
        except Exception as exc:
            logger.error("Scheduled scan failed for %s: %s", target, exc)
            outcomes.append({"url": target, "scan_type": kind, "status": "error", "error": str(exc)})

    return {"scans_triggered": len(outcomes), "results": outcomes}
@@ -73,6 +73,7 @@ def build_scan_summary(
f"Findings: {n_findings} ({high} mit hoher Prioritaet)",
])
<<<<<<< HEAD
# DSI Documents section — grouped with their findings
if discovered_docs:
parts.extend(["", f"Rechtliche Dokumente ({len(discovered_docs)})"])
@@ -108,6 +109,27 @@ def build_scan_summary(
marker = "!!" if sev == "HIGH" else "!" if sev == "MEDIUM" else "i"
parts.append(f" [{marker}] {txt}")
elif findings:
=======
# DSI Documents section
if discovered_docs:
parts.extend([
"",
f"Rechtliche Dokumente gefunden: {len(discovered_docs)}",
])
for doc in discovered_docs:
pct = doc.completeness_pct if hasattr(doc, 'completeness_pct') else 0
fc = doc.findings_count if hasattr(doc, 'findings_count') else 0
wc = doc.word_count if hasattr(doc, 'word_count') else 0
status = "OK" if pct >= 80 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
dt = doc.doc_type if hasattr(doc, 'doc_type') else "unknown"
title = doc.title if hasattr(doc, 'title') else "?"
parts.append(
f" [{status}] {title} ({dt}, {wc} Woerter, "
f"{pct}% vollstaendig, {fc} Maengel)"
)
if findings:
>>>>>>> feat/zeroclaw-compliance-agent
parts.append("")
for f in findings[:20]:
sev = f.severity if hasattr(f, 'severity') else "?"
@@ -123,6 +145,7 @@ def build_scan_summary(
])
return "\n".join(parts)
<<<<<<< HEAD
async def fetch_dse_text(url: str, scanned_pages: list[str]) -> str:
@@ -161,3 +184,5 @@ async def fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
return resp.text
except Exception:
return ""
=======
>>>>>>> feat/zeroclaw-compliance-agent
@@ -23,9 +23,13 @@ from compliance.services.mandatory_content_checker import (
check_mandatory_documents, check_dse_mandatory_content, MandatoryFinding,
)
from compliance.services.legal_basis_validator import validate_legal_bases
<<<<<<< HEAD
from compliance.api.agent_scan_helpers import (
add_corrections, build_scan_summary, fetch_dse_text, fetch_dse_html,
)
=======
from compliance.api.agent_scan_helpers import add_corrections, build_scan_summary
>>>>>>> feat/zeroclaw-compliance-agent
logger = logging.getLogger(__name__)
@@ -79,7 +83,10 @@ class ScanFinding(BaseModel):
severity: str
text: str
correction: str = ""
<<<<<<< HEAD
doc_title: str = ""
=======
>>>>>>> feat/zeroclaw-compliance-agent
text_reference: TextReferenceModel | None = None
@@ -219,17 +226,69 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
else:
scan = await scan_website(req.url)
<<<<<<< HEAD
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
_progress(f"Schritt 2/7: Rechtliche Dokumente suchen... ({len(scan.pages_scanned)} Seiten gescannt)")
=======
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
playwright_htmls: dict[str, str] = {}
try:
async with httpx.AsyncClient(timeout=120.0) as pw_client:
pw_resp = await pw_client.post(
"http://bp-compliance-consent-tester:8094/website-scan",
json={"url": req.url, "max_pages": 15, "click_nav": True},
)
if pw_resp.status_code == 200:
pw_data = pw_resp.json()
playwright_htmls = pw_data.get("page_htmls", {})
logger.info("Playwright scan: %d pages, %d scripts",
pw_data.get("pages_count", 0), len(pw_data.get("external_scripts", [])))
except Exception as e:
logger.warning("Playwright scanner unavailable, falling back to httpx: %s", e)
# Use Playwright results if available, otherwise fall back to httpx scanner
if playwright_htmls:
# Build ScanResult from Playwright data
from compliance.services.website_scanner import ScanResult, DetectedService, _detect_services, _detect_ai_mentions
from compliance.services.service_registry import SERVICE_REGISTRY
scan = ScanResult()
scan.pages_scanned = list(playwright_htmls.keys())
for page_url, html in playwright_htmls.items():
_detect_services(html, page_url, scan)
_detect_ai_mentions(html, page_url, scan)
# Deduplicate
seen = set()
unique = []
for svc in scan.detected_services:
if svc.id not in seen:
seen.add(svc.id)
unique.append(svc)
scan.detected_services = unique
scan.chatbot_detected = any(s.category == "chatbot" for s in scan.detected_services)
if scan.chatbot_detected:
scan.chatbot_provider = next(s.name for s in scan.detected_services if s.category == "chatbot")
else:
scan = await scan_website(req.url)
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
>>>>>>> feat/zeroclaw-compliance-agent
# Step 1b: DSI Discovery — find all legal documents on the website
discovered_docs: list[DiscoveredDocument] = []
dsi_findings: list[ScanFinding] = []
try:
<<<<<<< HEAD
async with httpx.AsyncClient(timeout=300.0) as dsi_client:
dsi_resp = await dsi_client.post(
"http://bp-compliance-consent-tester:8094/dsi-discovery",
json={"url": req.url, "max_documents": 30},
=======
async with httpx.AsyncClient(timeout=180.0) as dsi_client:
dsi_resp = await dsi_client.post(
"http://bp-compliance-consent-tester:8094/dsi-discovery",
json={"url": req.url, "max_documents": 20},
>>>>>>> feat/zeroclaw-compliance-agent
)
if dsi_resp.status_code == 200:
dsi_data = dsi_resp.json()
@@ -241,12 +300,17 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
)
for doc in dsi_data.get("documents", []):
doc_type = classify_document_type(doc["title"], doc["url"])
<<<<<<< HEAD
doc_text = doc.get("full_text", "") or doc.get("text_preview", "")
logger.info("DSI check: '%s' type=%s text_len=%d full_text_len=%d preview_len=%d",
doc["title"][:50], doc_type, len(doc_text),
len(doc.get("full_text", "")), len(doc.get("text_preview", "")))
doc_findings = check_document_completeness(
doc_text, doc_type, doc["title"], doc["url"],
=======
doc_findings = check_document_completeness(
doc.get("text_preview", ""), doc_type, doc["title"], doc["url"],
>>>>>>> feat/zeroclaw-compliance-agent
)
# Count completeness
score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
@@ -268,6 +332,7 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
if "SCORE" not in df.get("code", ""):
dsi_findings.append(ScanFinding(
code=df["code"], severity=df["severity"], text=df["text"],
<<<<<<< HEAD
doc_title=doc["title"],
))
except Exception as e:
@@ -296,6 +361,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
pass
if not dse_text:
dse_text = await fetch_dse_text(req.url, scan.pages_scanned)
=======
))
except Exception as e:
logger.warning("DSI discovery failed: %s", e)
# Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
dse_text = ""
for page_url, html in playwright_htmls.items():
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
import re as _re
clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
clean = _re.sub(r"<[^>]+>", " ", clean)
clean = _re.sub(r"\s+", " ", clean).strip()
dse_text = clean[:4000]
break
if not dse_text:
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
>>>>>>> feat/zeroclaw-compliance-agent
# Step 3: Extract services mentioned in DSE via LLM + text fallback
dse_services = await extract_dse_services(dse_text) if dse_text else []
@@ -320,11 +403,18 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
dse_html = html
break
if not dse_html:
<<<<<<< HEAD
dse_html = await fetch_dse_html(req.url, scan.pages_scanned)
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
logger.info("Parsed %d DSE sections", len(dse_sections))
_progress("Schritt 4/7: SOLL/IST Vergleich...")
=======
dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
logger.info("Parsed %d DSE sections", len(dse_sections))
>>>>>>> feat/zeroclaw-compliance-agent
# Step 5: SOLL/IST comparison
detected_dicts = [_service_to_dict(s) for s in scan.detected_services]
comparison = compare_services(detected_dicts, dse_services)
@@ -363,7 +453,10 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
# Step 8c: Add DSI document findings
findings.extend(dsi_findings)
<<<<<<< HEAD
_progress(f"Schritt 5/7: Korrekturen generieren... ({len(findings)} Findings)")
=======
>>>>>>> feat/zeroclaw-compliance-agent
# Step 9: Generate corrections for pre-launch mode
if not is_live and findings:
await add_corrections(findings, dse_text)
@@ -400,6 +493,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
    """Fetch the raw HTML of the privacy policy page (for structured parsing).

    Args:
        url: Fallback URL, requested when none of the scanned pages looks
            like a privacy policy.
        scanned_pages: URLs collected by the scan. The first one containing
            "datenschutz", "privacy" or "dsgvo" (case-insensitive) is fetched.

    Returns:
        The response body as text, or an empty string when the request fails.
    """
    import re

    dse_pattern = re.compile(r"datenschutz|privacy|dsgvo", re.IGNORECASE)
    # First scanned page that looks like a privacy policy, else the site URL.
    dse_url = next((page for page in scanned_pages if dse_pattern.search(page)), url)

    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
            return resp.text
    except Exception as exc:
        # Best effort: callers treat "" as "no policy HTML available", but the
        # failure is logged so it is not silently lost.
        logger.warning("Fetching DSE HTML from %s failed: %s", dse_url, exc)
        return ""
def _service_to_dict(svc: DetectedService) -> dict:
return {
"id": svc.id, "name": svc.name, "category": svc.category,
@@ -0,0 +1,120 @@
"""
FastAPI routes for Banner A/B Testing.
Endpoints:
GET /banner/ab/{site_config_id}/variants list variants
POST /banner/ab/{site_config_id}/variants create variant
PUT /banner/ab/variants/{variant_id} update variant
DELETE /banner/ab/variants/{variant_id} delete variant
GET /banner/ab/{site_config_id}/stats per-variant stats
GET /banner/ab/assign assign variant for device
"""
import logging
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from compliance.services.banner_ab_service import BannerABService
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/banner/ab", tags=["banner-ab-testing"])
class VariantCreate(BaseModel):
    """Request body accepted by the create-variant endpoint."""

    # Required name plus defaults for key, traffic share and control flag.
    variant_name: str
    variant_key: str = "A"
    traffic_percent: int = 50
    is_control: bool = False

    # Optional banner overrides; every field defaults to None when omitted.
    banner_title: Optional[str] = None
    banner_description: Optional[str] = None
    position: Optional[str] = None
    style: Optional[str] = None
    primary_color: Optional[str] = None
    show_decline_all: Optional[bool] = None
    theme_overrides: Optional[dict] = None
class VariantUpdate(BaseModel):
    """Request body accepted by the update-variant endpoint.

    All fields are optional; the update route drops fields that are None
    before handing the payload to the service.
    """

    variant_name: Optional[str] = None
    traffic_percent: Optional[int] = None
    is_control: Optional[bool] = None

    # Banner overrides
    banner_title: Optional[str] = None
    banner_description: Optional[str] = None
    position: Optional[str] = None
    style: Optional[str] = None
    primary_color: Optional[str] = None
    show_decline_all: Optional[bool] = None

    is_active: Optional[bool] = None
@router.get("/{site_config_id}/variants")
def list_variants(
    site_config_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the service's variant list for the tenant and site config."""
    return BannerABService(db).list_variants(tenant_id, site_config_id)
@router.post("/{site_config_id}/variants")
def create_variant(
    site_config_id: str,
    body: VariantCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Create a variant for the site config from the full request payload."""
    ab_service = BannerABService(db)
    payload = body.model_dump()
    return ab_service.create_variant(tenant_id, site_config_id, payload)
@router.put("/variants/{variant_id}")
def update_variant(
    variant_id: str,
    body: VariantUpdate,
    db: Session = Depends(get_db),
):
    """Apply the non-None fields of the payload to a variant.

    Raises:
        HTTPException: 404 when the service returns a falsy result.
    """
    # NOTE(review): unlike the list/create/stats routes, this route takes no
    # tenant dependency — confirm the service restricts updates by tenant.
    ab_service = BannerABService(db)
    changes = body.model_dump(exclude_none=True)
    updated = ab_service.update_variant(variant_id, changes)
    if updated:
        return updated
    raise HTTPException(404, "Variant not found")
@router.delete("/variants/{variant_id}")
def delete_variant(
    variant_id: str,
    db: Session = Depends(get_db),
):
    """Delete a variant by id.

    Raises:
        HTTPException: 404 when the service reports nothing was deleted.
    """
    was_deleted = BannerABService(db).delete_variant(variant_id)
    if was_deleted:
        return {"deleted": True}
    raise HTTPException(404, "Variant not found")
@router.get("/{site_config_id}/stats")
def variant_stats(
    site_config_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the service's per-variant statistics for the site config."""
    return BannerABService(db).get_variant_stats(tenant_id, site_config_id)
@router.get("/assign")
def assign_variant(
    site_config_id: str = Query(...),
    device_fingerprint: str = Query(...),
    db: Session = Depends(get_db),
):
    """Ask the service for the variant assigned to a device.

    Returns:
        {"variant": <variant>} when the service yields one, otherwise a
        payload with variant None and an explanatory message.
    """
    assigned = BannerABService(db).assign_variant(site_config_id, device_fingerprint)
    if assigned:
        return {"variant": assigned}
    return {"variant": None, "message": "No active A/B test"}
@@ -0,0 +1,67 @@
"""
FastAPI routes for Banner Consent Analytics.
Endpoints:
GET /banner/analytics/{site_id}/overview high-level stats
GET /banner/analytics/{site_id}/time-series opt-in rate over time
GET /banner/analytics/{site_id}/categories acceptance per category
GET /banner/analytics/{site_id}/devices mobile/desktop/tablet breakdown
"""
import logging
from typing import Optional
from fastapi import APIRouter, Depends, Query
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from compliance.services.banner_analytics_service import BannerAnalyticsService
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/banner/analytics", tags=["banner-analytics"])
@router.get("/{site_id}/overview")
def analytics_overview(
    site_id: str,
    days: int = Query(30, le=365),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return overview stats for a site over the last `days` days (max 365)."""
    analytics = BannerAnalyticsService(db)
    return analytics.get_overview_stats(tenant_id, site_id, days)
@router.get("/{site_id}/time-series")
def analytics_time_series(
    site_id: str,
    period: str = Query("daily"),
    days: int = Query(30, le=365),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the service's time series for a site, bucketed by `period`."""
    analytics = BannerAnalyticsService(db)
    return analytics.get_time_series(tenant_id, site_id, period, days)
@router.get("/{site_id}/categories")
def analytics_categories(
    site_id: str,
    days: int = Query(30, le=365),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the service's category breakdown for a site (max 365 days)."""
    analytics = BannerAnalyticsService(db)
    return analytics.get_category_breakdown(tenant_id, site_id, days)
@router.get("/{site_id}/devices")
def analytics_devices(
    site_id: str,
    days: int = Query(30, le=365),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the service's device breakdown for a site (max 365 days)."""
    analytics = BannerAnalyticsService(db)
    return analytics.get_device_breakdown(tenant_id, site_id, days)
@@ -74,6 +74,7 @@ async def record_consent(
device_fingerprint=body.device_fingerprint,
categories=body.categories,
vendors=body.vendors,
vendor_consents=body.vendor_consents,
ip_address=body.ip_address,
user_agent=body.user_agent,
consent_string=body.consent_string,
@@ -0,0 +1,38 @@
"""
FastAPI route for Compliance Report PDF generation.
Endpoint:
GET /compliance/report/pdf generate comprehensive compliance report as PDF
"""
import logging
from typing import Optional
from fastapi import APIRouter, Depends, Query
from fastapi.responses import StreamingResponse
from sqlalchemy.orm import Session
import io
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from compliance.services.compliance_pdf_generator import CompliancePDFGenerator
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/report", tags=["compliance-report"])
@router.get("/pdf")
def generate_compliance_report_pdf(
    project_id: Optional[str] = Query(None),
    language: str = Query("de"),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Generate a comprehensive compliance PDF report for a project.

    The generator returns the PDF bytes and a filename; both are sent back
    as an attachment download.
    """
    pdf_bytes, filename = CompliancePDFGenerator(db).generate(tenant_id, project_id, language)
    download_headers = {"Content-Disposition": f'attachment; filename="{filename}"'}
    pdf_stream = io.BytesIO(pdf_bytes)
    return StreamingResponse(
        pdf_stream,
        media_type="application/pdf",
        headers=download_headers,
    )
@@ -0,0 +1,380 @@
"""
FastAPI routes for Document Review Workflow.
Tracks which compliance documents have been sent for review, their status,
and handles email notifications to reviewers.
Endpoints:
GET /document-reviews list reviews with filters
GET /document-reviews/stats counts by status
POST /document-reviews create review (auto-assign from mapping)
GET /document-reviews/{id} single review
POST /document-reviews/{id}/send send notification email
POST /document-reviews/{id}/approve mark as approved
POST /document-reviews/{id}/reject mark as rejected
GET /document-reviews/for-document reviews for a specific doc type
"""
import hashlib
import logging
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/document-reviews", tags=["document-reviews"])
# =============================================================================
# Schemas
# =============================================================================
class ReviewCreate(BaseModel):
    """Request body for creating document reviews."""

    # Required: document_type is used to look up the reviewer mapping.
    document_type: str
    document_title: str

    # Optional: only a SHA-256 hash of document_content is stored.
    document_content: Optional[str] = None
    project_id: Optional[str] = None
    submitted_by: Optional[str] = None
    review_link: Optional[str] = None
class ReviewReject(BaseModel):
    """Request body for rejecting a review; the comment is mandatory."""

    comment: str
# =============================================================================
# Routes
# =============================================================================
@router.get("")
def list_reviews(
    project_id: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
    document_type: Optional[str] = Query(None),
    reviewer_role_key: Optional[str] = Query(None),
    limit: int = Query(50, le=200),
    offset: int = Query(0),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """List the tenant's reviews, newest first, with optional filters.

    Each truthy filter adds an equality condition; `limit`/`offset` paginate.
    """
    # (SQL condition, bind name, value) for every optional filter.
    optional_filters = (
        ("project_id = :pid", "pid", project_id),
        ("status = :status", "status", status),
        ("document_type = :dt", "dt", document_type),
        ("reviewer_role_key = :rrk", "rrk", reviewer_role_key),
    )
    conditions = ["tenant_id = :tid"]
    bind = {"tid": tenant_id, "lim": limit, "off": offset}
    for condition, bind_name, value in optional_filters:
        if value:
            conditions.append(condition)
            bind[bind_name] = value

    # Only fixed condition strings are interpolated; values go through binds.
    stmt = text(f"""
        SELECT * FROM compliance_document_reviews
        WHERE {' AND '.join(conditions)}
        ORDER BY created_at DESC LIMIT :lim OFFSET :off
    """)
    return [_row_to_dict(found) for found in db.execute(stmt, bind).fetchall()]
@router.get("/stats")
def review_stats(
    project_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the number of reviews per status for the tenant.

    Args:
        project_id: When given, only reviews of this project are counted.

    Returns:
        A dict mapping each status to its review count.
    """
    where = "tenant_id = :tid"
    params = {"tid": tenant_id}
    if project_id:
        where += " AND project_id = :pid"
        params["pid"] = project_id
    q = text(f"SELECT status, COUNT(*) as count FROM compliance_document_reviews WHERE {where} GROUP BY status")
    rows = db.execute(q, params).fetchall()
    # Unpack positionally: on a SQLAlchemy Row, attribute access `.count`
    # resolves to the tuple-style Row.count() method, not the "count" column.
    return {status: count for status, count in rows}
@router.get("/for-document")
def reviews_for_document(
    document_type: str = Query(...),
    project_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the 10 newest reviews of one document type for the tenant."""
    conditions = ["tenant_id = :tid", "document_type = :dt"]
    bind = {"tid": tenant_id, "dt": document_type}
    if project_id:
        conditions.append("project_id = :pid")
        bind["pid"] = project_id
    stmt = text(
        f"SELECT * FROM compliance_document_reviews WHERE {' AND '.join(conditions)} ORDER BY created_at DESC LIMIT 10"
    )
    return [_row_to_dict(found) for found in db.execute(stmt, bind).fetchall()]
@router.post("")
def create_review(
    body: ReviewCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Create one review row per role mapped to the document type.

    Reviewer name and email come from the joined org-role rows and may be
    NULL when no person is assigned to the role.

    Raises:
        HTTPException: 404 when no mapping exists for the document type.
    """
    # Find reviewer(s) from mapping + org_roles, primary mappings first.
    reviewer_stmt = text("""
        SELECT m.role_key, m.is_primary, r.person_name, r.person_email, r.role_label
        FROM compliance_document_role_mapping m
        LEFT JOIN compliance_org_roles r
        ON r.tenant_id = m.tenant_id AND r.role_key = m.role_key
        AND (r.project_id = :pid OR r.project_id IS NULL)
        WHERE m.tenant_id = :tid AND m.document_type = :dt
        ORDER BY m.is_primary DESC
    """)
    mappings = db.execute(
        reviewer_stmt,
        {"tid": tenant_id, "dt": body.document_type, "pid": body.project_id},
    ).fetchall()
    if not mappings:
        raise HTTPException(404, f"No reviewer mapping found for document type '{body.document_type}'")

    # Only a fingerprint of the content is persisted, never the content.
    content_hash = None
    if body.document_content:
        content_hash = hashlib.sha256(body.document_content.encode()).hexdigest()

    insert_stmt = text("""
        INSERT INTO compliance_document_reviews
        (tenant_id, project_id, document_type, document_title, document_content_hash,
        reviewer_role_key, reviewer_name, reviewer_email, submitted_by, review_link, submitted_at)
        VALUES (:tid, :pid, :dt, :title, :hash, :rrk, :rn, :re, :sb, :rl, NOW())
        RETURNING *
    """)
    created = []
    for mapping in mappings:
        reviewer = _row_to_dict(mapping)
        inserted = db.execute(insert_stmt, {
            "tid": tenant_id,
            "pid": body.project_id,
            "dt": body.document_type,
            "title": body.document_title,
            "hash": content_hash,
            "rrk": reviewer["role_key"],
            "rn": reviewer.get("person_name"),
            "re": reviewer.get("person_email"),
            "sb": body.submitted_by,
            "rl": body.review_link,
        }).fetchone()
        created.append(_row_to_dict(inserted))
    db.commit()
    return created
# =============================================================================
# Training Integration
# =============================================================================
# These fixed-path GET routes are declared BEFORE GET "/{review_id}" on
# purpose: routes are matched in declaration order, so declaring them later
# lets "/{review_id}" capture "training-requirements" / "training-gaps" as an id.


@router.get("/training-requirements")
def get_training_requirements(
    document_type: str = Query(...),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the training requirements linked to a document type."""
    from compliance.services.training_link_service import TrainingLinkService
    service = TrainingLinkService(db)
    return service.get_training_requirements(tenant_id, document_type)


@router.get("/training-gaps")
def get_training_gaps(
    document_type: str = Query(...),
    project_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return the training gaps the service finds for a document type."""
    from compliance.services.training_link_service import TrainingLinkService
    service = TrainingLinkService(db)
    return service.check_training_gaps(tenant_id, document_type, project_id)


# =============================================================================
# Single review + workflow transitions
# =============================================================================


def _html_escape(value) -> str:
    """Escape a value for safe interpolation into an HTML email body."""
    import html
    return html.escape(str(value), quote=True)


@router.get("/{review_id}")
def get_review(
    review_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return a single review of the tenant.

    Raises:
        HTTPException: 404 when the review does not exist for this tenant.
    """
    q = text("SELECT * FROM compliance_document_reviews WHERE id = :rid AND tenant_id = :tid")
    row = db.execute(q, {"rid": review_id, "tid": tenant_id}).fetchone()
    if not row:
        raise HTTPException(404, "Review not found")
    return _row_to_dict(row)


@router.post("/{review_id}/send")
def send_notification(
    review_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Email the reviewer and mark the review as 'in_review'.

    Raises:
        HTTPException: 404 when the review is not found, 400 when it has no
            reviewer email, 500 when sending or the status update fails.
    """
    q = text("SELECT * FROM compliance_document_reviews WHERE id = :rid AND tenant_id = :tid")
    row = db.execute(q, {"rid": review_id, "tid": tenant_id}).fetchone()
    if not row:
        raise HTTPException(404, "Review not found")
    review = _row_to_dict(row)
    if not review.get("reviewer_email"):
        raise HTTPException(400, "No email for reviewer — assign a person to this role first")
    try:
        from compliance.services.smtp_sender import send_email

        # Title, names and link are user-supplied: escape them before they
        # are placed into the HTML body.
        reviewer = _html_escape(review.get("reviewer_name") or "Pruefer/in")
        title = _html_escape(review["document_title"])
        doc_type = _html_escape(review["document_type"])
        submitter = _html_escape(review.get("submitted_by") or "System")
        link_html = ""
        if review.get("review_link"):
            link = _html_escape(review["review_link"])
            link_html = (
                f'<p><a href="{link}" style="background:#7c3aed;color:white;padding:10px 20px;'
                'border-radius:6px;text-decoration:none;">Dokument oeffnen</a></p>'
            )

        result = send_email(
            recipient=review["reviewer_email"],
            subject=f"[BreakPilot] Dokument zur Pruefung: {review['document_title']}",
            body_html=f"""
            <h2>Dokument zur Pruefung</h2>
            <p>Sehr geehrte/r <strong>{reviewer}</strong>,</p>
            <p>das folgende Dokument wurde Ihnen zur inhaltlichen Pruefung zugewiesen:</p>
            <table style="border-collapse:collapse;margin:16px 0;">
            <tr><td style="padding:4px 12px 4px 0;font-weight:bold;">Dokument:</td>
            <td>{title}</td></tr>
            <tr><td style="padding:4px 12px 4px 0;font-weight:bold;">Typ:</td>
            <td>{doc_type}</td></tr>
            <tr><td style="padding:4px 12px 4px 0;font-weight:bold;">Eingereicht von:</td>
            <td>{submitter}</td></tr>
            </table>
            <p>Bitte pruefen Sie das Dokument auf <strong>inhaltliche Richtigkeit</strong>,
            <strong>Vollstaendigkeit</strong> und <strong>Umsetzbarkeit</strong>.</p>
            {link_html}
            <p style="color:#888;font-size:12px;">BreakPilot Compliance SDK</p>
            """,
        )
        # Update review status (tenant-scoped like every other statement here)
        db.execute(text("""
            UPDATE compliance_document_reviews
            SET status = 'in_review', email_sent = TRUE, email_sent_at = NOW(), updated_at = NOW()
            WHERE id = :rid AND tenant_id = :tid
        """), {"rid": review_id, "tid": tenant_id})
        db.commit()
        return {"sent": True, "email": review["reviewer_email"], "result": result}
    except Exception as e:
        logger.error("Failed to send review email: %s", e)
        raise HTTPException(500, f"Email sending failed: {e}") from e


@router.post("/{review_id}/approve")
def approve_review(
    review_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Mark a review as approved and trigger follow-up notifications.

    The approval is committed first; approval mails and the training-gap
    check are best effort and never fail the request.

    Returns:
        The updated review plus a "training" entry with the gap summary.

    Raises:
        HTTPException: 404 when the review does not exist for this tenant.
    """
    q = text("""
        UPDATE compliance_document_reviews
        SET status = 'approved', reviewed_at = NOW(), updated_at = NOW()
        WHERE id = :rid AND tenant_id = :tid
        RETURNING *
    """)
    row = db.execute(q, {"rid": review_id, "tid": tenant_id}).fetchone()
    if not row:
        raise HTTPException(404, "Review not found")
    db.commit()
    review = _row_to_dict(row)

    # Notify all OTHER roles mapped to this document type about the approval
    _notify_approval(db, tenant_id, review)

    # Check training gaps
    training_info = {"training_gaps": 0, "academy_available": False}
    try:
        from compliance.services.training_link_service import TrainingLinkService
        tls = TrainingLinkService(db)
        gaps = tls.check_training_gaps(tenant_id, review["document_type"], review.get("project_id"))
        training_info = {
            "training_gaps": gaps.get("total_gaps", 0),
            "academy_available": gaps.get("academy_available", False),
        }
        # Send training notification emails for each gap
        if gaps.get("gaps"):
            _notify_training_gaps(gaps["gaps"], review)
    except Exception as e:
        logger.warning("Training gap check failed (non-blocking): %s", e)
    review["training"] = training_info
    return review


def _notify_approval(db: Session, tenant_id: str, review: dict):
    """Send approval notification to all other roles mapped to this document type.

    Only roles with an email address are contacted, and the reviewer's own
    role is excluded. Failures are logged and swallowed (non-blocking).
    """
    try:
        from compliance.services.smtp_sender import send_email
        q = text("""
            SELECT DISTINCT r.person_name, r.person_email, r.role_label
            FROM compliance_document_role_mapping m
            JOIN compliance_org_roles r
            ON r.tenant_id = m.tenant_id AND r.role_key = m.role_key
            AND (r.project_id = :pid OR r.project_id IS NULL)
            WHERE m.tenant_id = :tid AND m.document_type = :dt
            AND m.role_key != :reviewer_key AND r.person_email IS NOT NULL
        """)
        others = db.execute(q, {
            "tid": tenant_id, "dt": review["document_type"],
            "pid": review.get("project_id"), "reviewer_key": review["reviewer_role_key"],
        }).fetchall()

        # Escaped once: these values are the same for every recipient.
        title = _html_escape(review["document_title"])
        approver = _html_escape(review.get("reviewer_name") or review["reviewer_role_key"])
        for other in others:
            o = _row_to_dict(other)
            salutation = _html_escape(o.get("person_name") or o["role_label"])
            send_email(
                recipient=o["person_email"],
                subject=f"[BreakPilot] Freigabe: {review['document_title']}",
                body_html=f"""
                <h2>Dokument freigegeben</h2>
                <p>Sehr geehrte/r <strong>{salutation}</strong>,</p>
                <p>das Dokument <strong>{title}</strong> wurde von
                {approver} freigegeben.</p>
                <p>Bitte pruefen Sie, ob fuer Ihren Verantwortungsbereich Handlungsbedarf besteht
                (z.B. Schulungsbedarf, Prozessanpassungen).</p>
                <p style="color:#888;font-size:12px;">BreakPilot Compliance SDK</p>
                """,
            )
        logger.info("Notified %d other roles about approval of %s", len(others), review["document_title"])
    except Exception as e:
        logger.warning("Approval notification failed (non-blocking): %s", e)


def _notify_training_gaps(gaps: list[dict], review: dict):
    """Send training requirement emails to persons with outstanding modules.

    Gaps without a person email are skipped. Failures are logged and
    swallowed (non-blocking).
    """
    try:
        from compliance.services.smtp_sender import send_email
        title = _html_escape(review["document_title"])
        sent = 0
        for gap in gaps:
            if not gap.get("person_email"):
                continue
            module_title = _html_escape(gap["module_title"])
            send_email(
                recipient=gap["person_email"],
                subject=f"[BreakPilot] Schulungsbedarf: {gap['module_title']}",
                body_html=f"""
                <h2>Schulungsbedarf nach Dokument-Freigabe</h2>
                <p>Sehr geehrte/r <strong>{_html_escape(gap['person_name'])}</strong>,</p>
                <p>nach Freigabe des Dokuments <strong>{title}</strong>
                ist fuer Ihre Rolle (<strong>{_html_escape(gap['role'])}</strong>) eine Schulung erforderlich:</p>
                <p><strong>{module_title}</strong> ({_html_escape(gap['module_code'])})</p>
                <p>Status: {_html_escape(gap['status'])}</p>
                <p><a href="/sdk/training/learner" style="background:#7c3aed;color:white;padding:10px 20px;border-radius:6px;text-decoration:none;">Zur Academy</a></p>
                <p style="color:#888;font-size:12px;">BreakPilot Compliance SDK</p>
                """,
            )
            sent += 1
        # Log the number of mails actually sent, not the number of gaps.
        logger.info("Sent %d training gap notifications for %s", sent, review["document_title"])
    except Exception as e:
        logger.warning("Training notification failed (non-blocking): %s", e)


@router.post("/{review_id}/reject")
def reject_review(
    review_id: str,
    body: ReviewReject,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Mark a review as rejected and store the reviewer's comment.

    Raises:
        HTTPException: 404 when the review does not exist for this tenant.
    """
    q = text("""
        UPDATE compliance_document_reviews
        SET status = 'rejected', reviewed_at = NOW(), review_comment = :comment, updated_at = NOW()
        WHERE id = :rid AND tenant_id = :tid
        RETURNING *
    """)
    row = db.execute(q, {"rid": review_id, "tid": tenant_id, "comment": body.comment}).fetchone()
    if not row:
        raise HTTPException(404, "Review not found")
    db.commit()
    return _row_to_dict(row)
@@ -243,6 +243,19 @@ async def change_status(
return svc.change_status(dsr_id, body, tenant_id)
@router.post("/{dsr_id}/reject-art11")
async def reject_art11(
    dsr_id: str,
    notes: str = Query(""),
    tenant_id: str = Depends(_get_tenant),
    db: Session = Depends(get_db),
):
    """Reject DSR under Art. 11 DSGVO — data subject not identifiable."""
    from compliance.services.dsr_art11_service import DSRArt11Service

    # The service is built and called inside the error-translation context.
    with translate_domain_errors():
        art11_service = DSRArt11Service(db)
        return art11_service.reject_not_identifiable(dsr_id, tenant_id, notes)
@router.post("/{dsr_id}/verify-identity")
async def verify_identity(
dsr_id: str,
@@ -367,3 +380,42 @@ async def update_exception_check(
):
with translate_domain_errors():
return svc.update_exception_check(dsr_id, check_id, body, tenant_id)
# =============================================================================
# User Data Export (Art. 15 / Art. 20)
# =============================================================================
@router.get("/{dsr_id}/export-user-data")
async def export_user_data(
    dsr_id: str,
    format: str = Query("json"),
    tenant_id: str = Depends(_get_tenant),
    svc: DSRService = Depends(_dsr_svc),
    db: Session = Depends(get_db),
):
    """Export all CMP data about the data subject as JSON, CSV, or PDF.

    The data subject is identified by the DSR's requester email. The export
    is returned as an attachment download.

    Raises:
        HTTPException: 400 when the DSR has no requester email.
    """
    import io
    from compliance.services.dsr_export_service import DSRExportService
    # NOTE(review): the extent of this `with` body is a best guess from the
    # flattened source; confirm against the original file.
    with translate_domain_errors():
        dsr = svc.get(dsr_id, tenant_id)
    email = dsr.get("requester_email")
    if not email:
        from fastapi import HTTPException
        raise HTTPException(400, "DSR has no requester email")
    export_svc = DSRExportService(db)
    # "pdf" and "csv" are matched exactly; any other value falls back to JSON.
    if format == "pdf":
        content, filename = export_svc.export_pdf(tenant_id, email)
        return StreamingResponse(io.BytesIO(content), media_type="application/pdf",
                                 headers={"Content-Disposition": f'attachment; filename="{filename}"'})
    elif format == "csv":
        content, filename = export_svc.export_csv(tenant_id, email)
        return StreamingResponse(io.BytesIO(content), media_type="text/csv",
                                 headers={"Content-Disposition": f'attachment; filename="{filename}"'})
    else:
        content, filename = export_svc.export_json(tenant_id, email)
        return StreamingResponse(io.BytesIO(content), media_type="application/json",
                                 headers={"Content-Disposition": f'attachment; filename="{filename}"'})
@@ -0,0 +1,255 @@
"""
FastAPI routes for Organizational Compliance Roles.
Manages the 7 standard compliance roles (DSB, GF, IT-Leiter, etc.)
and the document-to-role mapping that determines who reviews which documents.
Endpoints:
GET /org-roles list roles for tenant/project
POST /org-roles create/upsert a role
PUT /org-roles/{id} update role details
DELETE /org-roles/{id} remove a role
GET /org-roles/defaults 7 standard role definitions
POST /org-roles/seed seed default roles for a project
POST /org-roles/{id}/send-test send test email to role
GET /org-roles/mapping document-to-role mapping
PUT /org-roles/mapping update mapping
"""
import logging
from typing import Optional, List
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/org-roles", tags=["org-roles"])
# =============================================================================
# Standard role definitions
# =============================================================================
# Standard roles returned by the defaults endpoint and inserted by the seed
# endpoint; each entry is {"role_key": ..., "role_label": ...}.
DEFAULT_ROLES = [
    {"role_key": key, "role_label": label}
    for key, label in (
        ("dsb", "Datenschutzbeauftragter (DSB)"),
        ("gf", "Geschaeftsfuehrung"),
        ("it_leiter", "IT-Leiter / CISO"),
        ("hr_leitung", "HR-Leitung"),
        ("marketing_leitung", "Marketing-Leitung"),
        ("compliance_beauftragter", "Compliance-Beauftragter"),
        ("einkauf", "Einkauf / Vendor Management"),
    )
]
# =============================================================================
# Schemas
# =============================================================================
class OrgRoleCreate(BaseModel):
    """Request body for creating (or upserting) an organizational role."""

    # Required role identity
    role_key: str
    role_label: str

    # Optional assignee details and project scope
    person_name: Optional[str] = None
    person_email: Optional[str] = None
    department: Optional[str] = None
    project_id: Optional[str] = None
class OrgRoleUpdate(BaseModel):
    """Partial update of a role; fields left as None are not changed."""

    role_label: Optional[str] = None
    person_name: Optional[str] = None
    person_email: Optional[str] = None
    department: Optional[str] = None
    is_active: Optional[bool] = None
class MappingEntry(BaseModel):
    """One document-type-to-role assignment; primary unless stated otherwise."""

    document_type: str
    role_key: str
    is_primary: bool = True
class MappingUpdate(BaseModel):
    """Request body for the mapping update: a list of entries to upsert."""

    entries: List[MappingEntry]
# =============================================================================
# Routes
# =============================================================================
@router.get("")
def list_roles(
    project_id: Optional[str] = Query(None),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """List the tenant's roles for one project, ordered by role key.

    Without a project_id only roles whose project_id is NULL are returned.
    """
    stmt = text("""
        SELECT * FROM compliance_org_roles
        WHERE tenant_id = :tid AND (project_id = :pid OR (:pid IS NULL AND project_id IS NULL))
        ORDER BY role_key
    """)
    bind = {"tid": tenant_id, "pid": project_id}
    return [_row_to_dict(found) for found in db.execute(stmt, bind).fetchall()]
@router.get("/defaults")
def get_defaults():
    """Return the built-in standard role definitions."""
    return DEFAULT_ROLES
@router.post("")
def create_role(
    body: OrgRoleCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Insert a role, or update it when (tenant, project, role_key) exists.

    On conflict the label is always overwritten, while name, email and
    department keep their stored value when the request sends NULL.
    """
    upsert_stmt = text("""
        INSERT INTO compliance_org_roles (tenant_id, project_id, role_key, role_label, person_name, person_email, department)
        VALUES (:tid, :pid, :rk, :rl, :pn, :pe, :dept)
        ON CONFLICT (tenant_id, project_id, role_key) DO UPDATE
        SET role_label = EXCLUDED.role_label,
        person_name = COALESCE(EXCLUDED.person_name, compliance_org_roles.person_name),
        person_email = COALESCE(EXCLUDED.person_email, compliance_org_roles.person_email),
        department = COALESCE(EXCLUDED.department, compliance_org_roles.department),
        updated_at = NOW()
        RETURNING *
    """)
    bind = {
        "tid": tenant_id,
        "pid": body.project_id,
        "rk": body.role_key,
        "rl": body.role_label,
        "pn": body.person_name,
        "pe": body.person_email,
        "dept": body.department,
    }
    saved = db.execute(upsert_stmt, bind).fetchone()
    db.commit()
    return _row_to_dict(saved)
@router.put("/{role_id}")
def update_role(
role_id: str,
body: OrgRoleUpdate,
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
sets, params = [], {"rid": role_id, "tid": tenant_id}
for field in ["role_label", "person_name", "person_email", "department", "is_active"]:
val = getattr(body, field, None)
if val is not None:
sets.append(f"{field} = :{field}")
params[field] = val
if not sets:
raise HTTPException(400, "No fields to update")
sets.append("updated_at = NOW()")
q = text(f"UPDATE compliance_org_roles SET {', '.join(sets)} WHERE id = :rid AND tenant_id = :tid RETURNING *")
row = db.execute(q, params).fetchone()
if not row:
raise HTTPException(404, "Role not found")
db.commit()
return _row_to_dict(row)
@router.delete("/{role_id}")
def delete_role(
role_id: str,
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
q = text("DELETE FROM compliance_org_roles WHERE id = :rid AND tenant_id = :tid")
result = db.execute(q, {"rid": role_id, "tid": tenant_id})
db.commit()
if result.rowcount == 0:
raise HTTPException(404, "Role not found")
return {"deleted": True}
@router.post("/seed")
def seed_roles(
project_id: Optional[str] = Query(None),
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
created = 0
for role in DEFAULT_ROLES:
q = text("""
INSERT INTO compliance_org_roles (tenant_id, project_id, role_key, role_label)
VALUES (:tid, :pid, :rk, :rl)
ON CONFLICT (tenant_id, project_id, role_key) DO NOTHING
""")
result = db.execute(q, {"tid": tenant_id, "pid": project_id, "rk": role["role_key"], "rl": role["role_label"]})
created += result.rowcount
db.commit()
return {"seeded": created, "total": len(DEFAULT_ROLES)}
@router.post("/{role_id}/send-test")
def send_test_email(
role_id: str,
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
q = text("SELECT * FROM compliance_org_roles WHERE id = :rid AND tenant_id = :tid")
role = db.execute(q, {"rid": role_id, "tid": tenant_id}).fetchone()
if not role:
raise HTTPException(404, "Role not found")
role_dict = _row_to_dict(role)
if not role_dict.get("person_email"):
raise HTTPException(400, "No email configured for this role")
try:
from compliance.services.smtp_sender import send_email
result = send_email(
recipient=role_dict["person_email"],
subject=f"[BreakPilot] Test-E-Mail fuer {role_dict['role_label']}",
body_html=f"""
<h2>Test-E-Mail</h2>
<p>Diese E-Mail bestaetigt, dass die Zustellung an die Rolle
<strong>{role_dict['role_label']}</strong> funktioniert.</p>
<p>Empfaenger: {role_dict['person_name'] or 'N/A'} ({role_dict['person_email']})</p>
<p style="color:#888;font-size:12px;">Gesendet von BreakPilot Compliance SDK</p>
""",
)
return {"sent": True, "email": role_dict["person_email"], "result": result}
except Exception as e:
logger.error("Failed to send test email: %s", e)
raise HTTPException(500, f"Email sending failed: {e}")
# =============================================================================
# Document-to-Role Mapping
# =============================================================================
@router.get("/mapping")
def get_mapping(
db: Session = Depends(get_db),
tenant_id: str = Depends(_get_tenant_id),
):
q = text("""
SELECT * FROM compliance_document_role_mapping
WHERE tenant_id = :tid
ORDER BY document_type, role_key
""")
rows = db.execute(q, {"tid": tenant_id}).fetchall()
return [_row_to_dict(r) for r in rows]
@router.put("/mapping")
def update_mapping(
    body: MappingUpdate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Upsert the submitted document-type/role mapping entries for the tenant.

    Each entry is inserted; on a (tenant, document_type, role_key) conflict
    only ``is_primary`` is overwritten. All entries are committed together.
    Returns the number of entries processed.
    """
    # The statement text does not depend on the entry, so build it once.
    upsert = text("""
INSERT INTO compliance_document_role_mapping (tenant_id, document_type, role_key, is_primary)
VALUES (:tid, :dt, :rk, :ip)
ON CONFLICT (tenant_id, document_type, role_key) DO UPDATE
SET is_primary = EXCLUDED.is_primary
""")
    for item in body.entries:
        bind = {
            "tid": tenant_id,
            "dt": item.document_type,
            "rk": item.role_key,
            "ip": item.is_primary,
        }
        db.execute(upsert, bind)
    db.commit()
    processed = len(body.entries)
    return {"updated": processed}
@@ -0,0 +1,95 @@
"""
FastAPI routes for IAB TCF 2.2 (Transparency & Consent Framework).
Endpoints:
GET /tcf/purposes list 12 IAB purposes with translations
GET /tcf/special-features list 2 IAB special features
GET /tcf/category-mapping banner category → IAB purpose mapping
POST /tcf/encode generate TC String from consent decisions
POST /tcf/encode-categories generate TC String from banner categories
"""
import logging
from typing import Optional, List, Dict
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from compliance.services.tcf_encoder_service import TCFEncoderService
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/tcf", tags=["tcf"])
class TCFEncodeRequest(BaseModel):
    """Request body for ``POST /tcf/encode``.

    The consent maps are passed to ``TCFEncoderService.encode`` and the CMP
    settings to the ``TCFEncoderService`` constructor.
    """

    # presumably IAB purpose id -> consent flag; verify against the encoder
    purpose_consents: Dict[int, bool] = {}
    # presumably IAB vendor id -> consent flag; verify against the encoder
    vendor_consents: Dict[int, bool] = {}
    # Forwarded as ``purpose_li``; None when the client omits it.
    purpose_li: Optional[Dict[int, bool]] = None
    # Forwarded as ``special_features``; None when the client omits it.
    special_features: Optional[Dict[int, bool]] = None
    cmp_id: int = 1
    cmp_version: int = 1
    consent_language: str = "DE"
class TCFCategoryEncodeRequest(BaseModel):
    """Request body for ``POST /tcf/encode-categories``."""

    # Banner category keys; looked up in CATEGORY_PURPOSE_MAP by the route.
    categories: List[str] = []
    # Forwarded unchanged to ``encode_from_categories``; None when omitted.
    vendor_consents: Optional[Dict[int, bool]] = None
    cmp_id: int = 1
    consent_language: str = "DE"
@router.get("/purposes")
def list_purposes():
    """Return the purpose catalogue provided by ``TCFEncoderService``."""
    purposes = TCFEncoderService.get_purposes()
    return purposes
@router.get("/special-features")
def list_special_features():
    """Return the special-feature catalogue provided by ``TCFEncoderService``."""
    features = TCFEncoderService.get_special_features()
    return features
@router.get("/category-mapping")
def get_category_mapping():
    """Return the category/purpose map provided by ``TCFEncoderService``."""
    mapping = TCFEncoderService.get_category_purpose_map()
    return mapping
@router.post("/encode")
def encode_tc_string(body: TCFEncodeRequest):
    """Encode explicit consent decisions into a TC string.

    Builds a ``TCFEncoderService`` from the CMP settings in the request and
    returns the encoded string together with the fixed version marker ``2``.
    """
    encoder_settings = {
        "cmp_id": body.cmp_id,
        "cmp_version": body.cmp_version,
        "consent_language": body.consent_language,
    }
    decisions = {
        "purpose_consents": body.purpose_consents,
        "vendor_consents": body.vendor_consents,
        "purpose_li": body.purpose_li,
        "special_features": body.special_features,
    }
    encoded = TCFEncoderService(**encoder_settings).encode(**decisions)
    return {"tc_string": encoded, "version": 2}
@router.post("/encode-categories")
def encode_from_categories(body: TCFCategoryEncodeRequest):
    """Encode a TC string from banner categories.

    Besides the encoded string, the response lists the purpose ids that
    ``CATEGORY_PURPOSE_MAP`` associates with the submitted categories
    (unknown categories contribute nothing) and echoes the categories.
    """
    service = TCFEncoderService(
        cmp_id=body.cmp_id,
        consent_language=body.consent_language,
    )
    encoded = service.encode_from_categories(
        categories=body.categories,
        vendor_consents=body.vendor_consents,
    )

    # Also report which purposes the categories map to.
    from compliance.services.tcf_encoder_service import CATEGORY_PURPOSE_MAP
    consented = set()
    for category in body.categories:
        for purpose_id in CATEGORY_PURPOSE_MAP.get(category, []):
            consented.add(purpose_id)

    response = {
        "tc_string": encoded,
        "version": 2,
        "purposes_consented": sorted(consented),
        "categories": body.categories,
    }
    return response
@@ -0,0 +1,310 @@
"""
FastAPI routes for Whistleblower (HinSchG) — Hinweisgeberschutz.
Admin endpoints for managing reports + public endpoint for anonymous submissions.
Deadlines: 7 days acknowledgment (§ 17 Abs. 1), 3 months feedback (§ 17 Abs. 2).
Endpoints:
GET /whistleblower/reports list with filters
GET /whistleblower/reports/stats counts by status/category
POST /whistleblower/reports create report (admin)
GET /whistleblower/reports/{id} single report with messages
PUT /whistleblower/reports/{id} update status/priority/assignment
POST /whistleblower/reports/{id}/acknowledge send acknowledgment
POST /whistleblower/reports/{id}/close close report
POST /whistleblower/reports/{id}/messages add message
GET /whistleblower/reports/{id}/measures list measures
POST /whistleblower/reports/{id}/measures add measure
POST /whistleblower/submit public anonymous submission
GET /whistleblower/check/{access_key} reporter checks status
"""
import logging
import secrets
import string
from datetime import datetime, timedelta, timezone
from typing import Optional, List
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/whistleblower", tags=["whistleblower"])
# Category keys for whistleblower reports.
# NOTE(review): presumably the closed set the API should accept — confirm.
VALID_CATEGORIES = {"corruption", "fraud", "data_protection", "discrimination",
                    "environment", "competition", "product_safety", "tax_evasion", "other"}
# Status keys for whistleblower reports; 'new', 'acknowledged', 'closed' and
# 'rejected' are the ones referenced by the SQL in this module.
VALID_STATUSES = {"new", "acknowledged", "under_review", "investigation",
                  "measures_taken", "closed", "rejected"}
def _gen_ref(tenant_id: str, db: Session) -> str:
    """Build the next report reference, formatted ``WB-<year>-<6-digit seq>``.

    The sequence is the tenant's current report count plus one.

    NOTE(review): a COUNT-based sequence can hand out the same number to two
    concurrent requests, or after a deletion. Confirm a unique constraint or
    a DB sequence backs this.
    """
    current_year = datetime.now().year
    count_stmt = text("SELECT COUNT(*) FROM compliance_whistleblower_reports WHERE tenant_id = :tid")
    existing = db.execute(count_stmt, {"tid": tenant_id}).scalar() or 0
    sequence = existing + 1
    return "WB-{}-{:06d}".format(current_year, sequence)
def _gen_access_key() -> str:
chars = string.ascii_uppercase + string.digits
parts = [''.join(secrets.choice(chars) for _ in range(4)) for _ in range(3)]
return '-'.join(parts)
# =============================================================================
# Schemas
# =============================================================================
class ReportCreate(BaseModel):
    """Payload for creating a report (admin create and public submit)."""

    category: str = "other"
    title: str
    description: str
    # The public /submit route forces this to True before storing.
    is_anonymous: bool = True
    # Optional contact details; stored as given.
    reporter_name: Optional[str] = None
    reporter_email: Optional[str] = None
    reporter_phone: Optional[str] = None
    priority: str = "normal"
class ReportUpdate(BaseModel):
    """Partial update of a report; only non-None fields are written."""

    status: Optional[str] = None
    priority: Optional[str] = None
    assigned_to: Optional[str] = None
    category: Optional[str] = None
class MessageCreate(BaseModel):
    """Payload for adding a message to a report."""

    message: str
    sender_type: str = "admin"
    # Internal messages are excluded from the reporter-facing status check.
    is_internal: bool = False
class MeasureCreate(BaseModel):
    """Payload for adding a follow-up measure to a report."""

    title: str
    description: Optional[str] = None
    responsible: Optional[str] = None
    # Passed to the INSERT as a string, unparsed.
    due_date: Optional[str] = None
# =============================================================================
# Admin Routes
# =============================================================================
@router.get("/reports")
def list_reports(
    status: Optional[str] = Query(None),
    category: Optional[str] = Query(None),
    # ge=0: a negative LIMIT is a database error, so reject it at validation.
    limit: int = Query(50, ge=0, le=200),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """List the tenant's reports, newest first.

    Args:
        status: optional exact-match filter on the status column.
        category: optional exact-match filter on the category column.
        limit: maximum number of rows (0-200, default 50).

    Returns:
        A list of report rows as dicts.
    """
    # Only fixed SQL fragments are concatenated; every value is a bind parameter.
    where = ["tenant_id = :tid"]
    params = {"tid": tenant_id, "lim": limit}
    if status:
        where.append("status = :st")
        params["st"] = status
    if category:
        where.append("category = :cat")
        params["cat"] = category
    q = text(f"SELECT * FROM compliance_whistleblower_reports WHERE {' AND '.join(where)} ORDER BY received_at DESC LIMIT :lim")
    return [_row_to_dict(r) for r in db.execute(q, params).fetchall()]
@router.get("/reports/stats")
def report_stats(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Aggregate report counts for the tenant.

    Returns totals per status and per category, plus the number of reports
    past their acknowledgment deadline (still 'new', not acknowledged) and
    past their feedback deadline (neither 'closed' nor 'rejected').
    """
    now = datetime.now(timezone.utc)

    status_stmt = text("SELECT status, COUNT(*) as cnt FROM compliance_whistleblower_reports WHERE tenant_id = :tid GROUP BY status")
    by_status = {}
    for rec in db.execute(status_stmt, {"tid": tenant_id}).fetchall():
        by_status[rec.status] = rec.cnt

    category_stmt = text("SELECT category, COUNT(*) as cnt FROM compliance_whistleblower_reports WHERE tenant_id = :tid GROUP BY category")
    by_category = {}
    for rec in db.execute(category_stmt, {"tid": tenant_id}).fetchall():
        by_category[rec.category] = rec.cnt

    ack_stmt = text("SELECT COUNT(*) FROM compliance_whistleblower_reports WHERE tenant_id = :tid AND deadline_acknowledgment < :now AND acknowledged_at IS NULL AND status = 'new'")
    overdue_ack = db.execute(ack_stmt, {"tid": tenant_id, "now": now}).scalar() or 0

    feedback_stmt = text("SELECT COUNT(*) FROM compliance_whistleblower_reports WHERE tenant_id = :tid AND deadline_feedback < :now AND status NOT IN ('closed', 'rejected')")
    overdue_fb = db.execute(feedback_stmt, {"tid": tenant_id, "now": now}).scalar() or 0

    summary = {
        "total": sum(by_status.values()),
        "by_status": by_status,
        "by_category": by_category,
        "overdue_acknowledgment": overdue_ack,
        "overdue_feedback": overdue_fb,
    }
    return summary
@router.post("/reports")
def create_report(
    body: ReportCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Create a whistleblower report and return the stored row as a dict.

    A reference number and an access key are generated. The acknowledgment
    deadline is set to 7 days and the feedback deadline to 90 days after
    receipt.

    Raises:
        HTTPException 400: ``body.category`` is not in ``VALID_CATEGORIES``.
    """
    # Enforce the module's category whitelist before anything is written.
    if body.category not in VALID_CATEGORIES:
        raise HTTPException(400, f"Invalid category: {body.category}")

    now = datetime.now(timezone.utc)
    ref = _gen_ref(tenant_id, db)
    ak = _gen_access_key()
    q = text("""
INSERT INTO compliance_whistleblower_reports
(tenant_id, reference_number, access_key, category, title, description,
is_anonymous, reporter_name, reporter_email, reporter_phone, priority,
received_at, deadline_acknowledgment, deadline_feedback)
VALUES (:tid, :ref, :ak, :cat, :title, :desc,
:anon, :rn, :re, :rp, :pri,
:now, :dl_ack, :dl_fb)
RETURNING *
""")
    row = db.execute(q, {
        "tid": tenant_id, "ref": ref, "ak": ak,
        "cat": body.category, "title": body.title, "desc": body.description,
        "anon": body.is_anonymous, "rn": body.reporter_name,
        "re": body.reporter_email, "rp": body.reporter_phone,
        "pri": body.priority, "now": now,
        "dl_ack": now + timedelta(days=7),
        "dl_fb": now + timedelta(days=90),
    }).fetchone()
    db.commit()
    return _row_to_dict(row)
@router.get("/reports/{report_id}")
def get_report(
    report_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Return one report of the tenant with its messages and measures.

    Messages and measures are attached under the ``messages`` and
    ``measures`` keys, each ordered by creation time.

    Raises:
        HTTPException 404: no such report for this tenant.
    """
    report_stmt = text("SELECT * FROM compliance_whistleblower_reports WHERE id = :rid AND tenant_id = :tid")
    report_row = db.execute(report_stmt, {"rid": report_id, "tid": tenant_id}).fetchone()
    if not report_row:
        raise HTTPException(404, "Report not found")
    report = _row_to_dict(report_row)

    message_stmt = text("SELECT * FROM compliance_whistleblower_messages WHERE report_id = :rid ORDER BY created_at")
    message_rows = db.execute(message_stmt, {"rid": report_id}).fetchall()
    report["messages"] = list(map(_row_to_dict, message_rows))

    measure_stmt = text("SELECT * FROM compliance_whistleblower_measures WHERE report_id = :rid ORDER BY created_at")
    measure_rows = db.execute(measure_stmt, {"rid": report_id}).fetchall()
    report["measures"] = list(map(_row_to_dict, measure_rows))
    return report
@router.put("/reports/{report_id}")
def update_report(
    report_id: str,
    body: ReportUpdate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Update status, priority, assignment and/or category of a report.

    Only fields that are not None are written; ``updated_at`` is always
    refreshed. Returns the updated row as a dict.

    Raises:
        HTTPException 400: no field supplied, or ``status`` / ``category``
            is outside ``VALID_STATUSES`` / ``VALID_CATEGORIES``.
        HTTPException 404: no such report for this tenant.
    """
    # Enforce the module's whitelists before anything is written.
    if body.status is not None and body.status not in VALID_STATUSES:
        raise HTTPException(400, f"Invalid status: {body.status}")
    if body.category is not None and body.category not in VALID_CATEGORIES:
        raise HTTPException(400, f"Invalid category: {body.category}")

    sets, params = [], {"rid": report_id, "tid": tenant_id}
    # Column names come from this fixed list, never from the request.
    for field in ["status", "priority", "assigned_to", "category"]:
        val = getattr(body, field, None)
        if val is not None:
            sets.append(f"{field} = :{field}")
            params[field] = val
    if not sets:
        raise HTTPException(400, "No fields to update")
    sets.append("updated_at = NOW()")
    q = text(f"UPDATE compliance_whistleblower_reports SET {', '.join(sets)} WHERE id = :rid AND tenant_id = :tid RETURNING *")
    row = db.execute(q, params).fetchone()
    if not row:
        raise HTTPException(404, "Report not found")
    db.commit()
    return _row_to_dict(row)
@router.post("/reports/{report_id}/acknowledge")
def acknowledge_report(
    report_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Mark a report as acknowledged and stamp ``acknowledged_at``.

    NOTE(review): a repeated call overwrites the earlier timestamp and resets
    the status to 'acknowledged' whatever it was — confirm this is intended.

    Raises:
        HTTPException 404: no such report for this tenant.
    """
    stmt = text("""
UPDATE compliance_whistleblower_reports
SET status = 'acknowledged', acknowledged_at = NOW(), updated_at = NOW()
WHERE id = :rid AND tenant_id = :tid RETURNING *
""")
    bind = {"rid": report_id, "tid": tenant_id}
    updated = db.execute(stmt, bind).fetchone()
    if not updated:
        raise HTTPException(404, "Report not found")
    db.commit()
    return _row_to_dict(updated)
@router.post("/reports/{report_id}/close")
def close_report(
    report_id: str,
    reason: str = Query(""),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Close a report, recording the closure time and the given reason.

    Raises:
        HTTPException 404: no such report for this tenant.
    """
    stmt = text("""
UPDATE compliance_whistleblower_reports
SET status = 'closed', closed_at = NOW(), closure_reason = :reason, updated_at = NOW()
WHERE id = :rid AND tenant_id = :tid RETURNING *
""")
    bind = {"rid": report_id, "tid": tenant_id, "reason": reason}
    closed = db.execute(stmt, bind).fetchone()
    if not closed:
        raise HTTPException(404, "Report not found")
    db.commit()
    return _row_to_dict(closed)
@router.post("/reports/{report_id}/messages")
def add_message(
    report_id: str,
    body: MessageCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Append a message to a report and return the stored message row.

    Raises:
        HTTPException 404: the report does not exist for this tenant.
    """
    # The messages table is keyed by report_id only, so verify ownership
    # first; otherwise any tenant could write into another tenant's report.
    owned = db.execute(
        text("SELECT 1 FROM compliance_whistleblower_reports WHERE id = :rid AND tenant_id = :tid"),
        {"rid": report_id, "tid": tenant_id},
    ).fetchone()
    if not owned:
        raise HTTPException(404, "Report not found")

    q = text("""
INSERT INTO compliance_whistleblower_messages (report_id, sender_type, message, is_internal)
VALUES (:rid, :st, :msg, :internal) RETURNING *
""")
    row = db.execute(q, {"rid": report_id, "st": body.sender_type, "msg": body.message, "internal": body.is_internal}).fetchone()
    db.commit()
    return _row_to_dict(row)
@router.get("/reports/{report_id}/measures")
def list_measures(
    report_id: str,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """List the measures of a report, oldest first.

    Raises:
        HTTPException 404: the report does not exist for this tenant.
    """
    # Measures are keyed by report_id only; check the report belongs to the
    # caller's tenant so measures of other tenants cannot be read.
    owned = db.execute(
        text("SELECT 1 FROM compliance_whistleblower_reports WHERE id = :rid AND tenant_id = :tid"),
        {"rid": report_id, "tid": tenant_id},
    ).fetchone()
    if not owned:
        raise HTTPException(404, "Report not found")

    return [_row_to_dict(r) for r in db.execute(text(
        "SELECT * FROM compliance_whistleblower_measures WHERE report_id = :rid ORDER BY created_at"
    ), {"rid": report_id}).fetchall()]
@router.post("/reports/{report_id}/measures")
def add_measure(
    report_id: str, body: MeasureCreate,
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Add a follow-up measure to a report and return the stored row.

    Raises:
        HTTPException 404: the report does not exist for this tenant.
    """
    # Measures are keyed by report_id only; check ownership so a measure
    # cannot be attached to another tenant's report.
    owned = db.execute(
        text("SELECT 1 FROM compliance_whistleblower_reports WHERE id = :rid AND tenant_id = :tid"),
        {"rid": report_id, "tid": tenant_id},
    ).fetchone()
    if not owned:
        raise HTTPException(404, "Report not found")

    q = text("""
INSERT INTO compliance_whistleblower_measures (report_id, title, description, responsible, due_date)
VALUES (:rid, :title, :desc, :resp, :due) RETURNING *
""")
    row = db.execute(q, {"rid": report_id, "title": body.title, "desc": body.description,
                         "resp": body.responsible, "due": body.due_date}).fetchone()
    db.commit()
    return _row_to_dict(row)
# =============================================================================
# Public Routes (Anonymous)
# =============================================================================
@router.post("/submit")
def submit_report(body: ReportCreate, db: Session = Depends(get_db), tenant_id: str = Depends(_get_tenant_id)):
    """Public submission: store the report as anonymous and return its keys.

    Delegates to ``create_report`` after forcing ``is_anonymous`` to True,
    and exposes only the access key, the reference number and a
    confirmation message.
    """
    body.is_anonymous = True
    created = create_report(body, db, tenant_id)
    confirmation = {
        "access_key": created["access_key"],
        "reference_number": created["reference_number"],
        "message": "Ihre Meldung wurde erfolgreich eingereicht. Nutzen Sie den Zugangscode um den Status zu pruefen.",
    }
    return confirmation
@router.get("/check/{access_key}")
def check_status(access_key: str, db: Session = Depends(get_db), tenant_id: str = Depends(_get_tenant_id)):
    """Let a reporter look up the status of a report via its access key.

    Returns a reduced view of the report plus all non-internal messages in
    chronological order.

    Raises:
        HTTPException 404: no report with this access key for the tenant.
    """
    lookup = text(
        "SELECT id, reference_number, status, category, received_at, acknowledged_at FROM compliance_whistleblower_reports WHERE access_key = :ak AND tenant_id = :tid"
    )
    report = db.execute(lookup, {"ak": access_key, "tid": tenant_id}).fetchone()
    if not report:
        raise HTTPException(404, "Meldung nicht gefunden")
    status_view = _row_to_dict(report)

    visible = text(
        "SELECT message, sender_type, created_at FROM compliance_whistleblower_messages WHERE report_id = :rid AND is_internal = FALSE ORDER BY created_at"
    )
    message_rows = db.execute(visible, {"rid": status_view["id"]}).fetchall()
    status_view["messages"] = list(map(_row_to_dict, message_rows))
    return status_view
@@ -31,24 +31,11 @@ class BannerConsentDB(Base):
device_fingerprint = Column(Text, nullable=False)
categories = Column(JSON, default=list)
vendors = Column(JSON, default=list)
vendor_consents = Column(JSON, default=dict) # {"vendor_id": true/false}
ip_hash = Column(Text)
user_agent = Column(Text)
consent_string = Column(Text)
linked_email = Column(Text)
# Vendor-agnostische Felder (Migration 107)
consent_method = Column(Text) # accept_all / reject_all / custom_selection
banner_version = Column(Integer)
banner_config_hash = Column(Text)
geo_country = Column(Text)
geo_region = Column(Text)
consent_scope = Column(Text, default='domain')
page_url = Column(Text)
referrer = Column(Text)
device_type = Column(Text) # mobile / desktop / tablet
browser = Column(Text)
os = Column(Text)
screen_resolution = Column(Text)
session_id = Column(Text)
expires_at = Column(DateTime)
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -74,11 +61,11 @@ class BannerConsentAuditLogDB(Base):
site_id = Column(Text, nullable=False)
device_fingerprint = Column(Text)
categories = Column(JSON, default=list)
vendor_consents = Column(JSON, default=dict)
ip_hash = Column(Text)
user_agent = Column(Text)
banner_config_hash = Column(Text)
consent_version = Column(Integer)
consent_method = Column(Text)
page_url = Column(Text)
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
__table_args__ = (
@@ -16,6 +16,7 @@ class ConsentCreate(BaseModel):
device_fingerprint: str
categories: List[str] = []
vendors: List[str] = []
vendor_consents: dict[str, bool] = {}
ip_address: Optional[str] = None
user_agent: Optional[str] = None
consent_string: Optional[str] = None
@@ -23,23 +23,10 @@ def consent_to_dict(c: BannerConsentDB) -> dict[str, Any]:
"device_fingerprint": c.device_fingerprint,
"categories": c.categories or [],
"vendors": c.vendors or [],
"vendor_consents": c.vendor_consents or {},
"ip_hash": c.ip_hash,
"user_agent": c.user_agent,
"consent_string": c.consent_string,
"linked_email": c.linked_email,
"consent_method": c.consent_method,
"banner_version": c.banner_version,
"banner_config_hash": c.banner_config_hash,
"geo_country": c.geo_country,
"geo_region": c.geo_region,
"consent_scope": c.consent_scope,
"page_url": c.page_url,
"referrer": c.referrer,
"device_type": c.device_type,
"browser": c.browser,
"os": c.os,
"screen_resolution": c.screen_resolution,
"session_id": c.session_id,
"expires_at": c.expires_at.isoformat() if c.expires_at else None,
"created_at": c.created_at.isoformat() if c.created_at else None,
"updated_at": c.updated_at.isoformat() if c.updated_at else None,
@@ -0,0 +1,95 @@
"""
Agent PDF Export — generates printable compliance scan reports.
Uses WeasyPrint to convert HTML report to PDF.
"""
import logging
from datetime import datetime, timezone
from io import BytesIO
logger = logging.getLogger(__name__)
def generate_scan_pdf(scan_data: dict) -> bytes:
    """Render the scan results as a PDF document and return its bytes.

    The HTML report is built by ``_build_report_html`` and converted with
    WeasyPrint, which is imported lazily on first use.
    """
    from weasyprint import HTML

    markup = _build_report_html(scan_data)
    with BytesIO() as sink:
        HTML(string=markup).write_pdf(sink)
        return sink.getvalue()
def _severity_color(sev: str) -> str:
return {"HIGH": "#dc2626", "CRITICAL": "#991b1b", "MEDIUM": "#ea580c", "LOW": "#2563eb"}.get(sev, "#6b7280")
def _build_report_html(data: dict) -> str:
"""Build HTML for the PDF report."""
url = data.get("url", "")
scan_type = data.get("scan_type", "scan")
mode = data.get("analysis_mode", "post_launch")
findings = data.get("findings", [])
services = data.get("services", [])
risk = data.get("risk_level", "")
score = data.get("risk_score", 0)
pages = data.get("pages_scanned", 0)
now = datetime.now(timezone.utc).strftime("%d.%m.%Y %H:%M UTC")
mode_label = "Live-Website Pruefung" if mode == "post_launch" else "Interne Pruefung"
type_label = {"quick": "Schnellanalyse", "scan": "Website-Scan", "consent_test": "Cookie-Test"}.get(scan_type, scan_type)
findings_rows = ""
for f in findings:
sev = f.get("severity", "MEDIUM") if isinstance(f, dict) else "MEDIUM"
text = f.get("text", str(f)) if isinstance(f, dict) else str(f)
color = _severity_color(sev)
findings_rows += f'<tr><td style="color:{color};font-weight:bold;padding:6px 8px;border-bottom:1px solid #e5e7eb;">{sev}</td><td style="padding:6px 8px;border-bottom:1px solid #e5e7eb;">{text}</td></tr>'
services_rows = ""
for s in services:
if isinstance(s, dict):
status_icon = "" if s.get("in_dse") or s.get("status") == "ok" else ""
status_color = "#16a34a" if status_icon == "" else "#dc2626"
services_rows += f'<tr><td style="color:{status_color};font-weight:bold;padding:4px 8px;border-bottom:1px solid #f3f4f6;">{status_icon}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("name","")}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("country","")}</td><td style="padding:4px 8px;border-bottom:1px solid #f3f4f6;">{s.get("category","")}</td></tr>'
return f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<style>
body {{ font-family: -apple-system, Arial, sans-serif; font-size: 11px; color: #1e293b; margin: 40px; }}
h1 {{ font-size: 20px; color: #1e1b4b; margin-bottom: 4px; }}
h2 {{ font-size: 14px; color: #334155; border-bottom: 2px solid #e2e8f0; padding-bottom: 4px; margin-top: 24px; }}
.meta {{ color: #64748b; font-size: 10px; margin-bottom: 20px; }}
.badge {{ display: inline-block; padding: 2px 8px; border-radius: 4px; color: white; font-size: 10px; font-weight: bold; }}
table {{ width: 100%; border-collapse: collapse; }}
th {{ text-align: left; padding: 6px 8px; background: #f8fafc; border-bottom: 2px solid #e2e8f0; font-size: 10px; color: #64748b; }}
.warning {{ background: #fef2f2; border-left: 4px solid #dc2626; padding: 10px 14px; margin: 16px 0; }}
.footer {{ margin-top: 30px; padding-top: 10px; border-top: 1px solid #e2e8f0; color: #94a3b8; font-size: 9px; }}
</style></head><body>
<h1>Compliance Agent Report</h1>
<p class="meta">{type_label} | {mode_label} | {now}</p>
<table style="margin-bottom:20px;">
<tr><td style="padding:4px 0;color:#64748b;width:150px;">URL</td><td style="padding:4px 0;"><strong>{url}</strong></td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Risikobewertung</td><td style="padding:4px 0;"><span class="badge" style="background:{_severity_color(risk) if risk else '#6b7280'}">{risk} ({score}/100)</span></td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Seiten gescannt</td><td style="padding:4px 0;">{pages}</td></tr>
<tr><td style="padding:4px 0;color:#64748b;">Findings</td><td style="padding:4px 0;"><strong>{len(findings)}</strong></td></tr>
</table>
{'<div class="warning"><strong>ACHTUNG:</strong> Maengel auf einer bereits veroeffentlichten Website. Sofortige Korrektur empfohlen.</div>' if mode == "post_launch" and findings else ''}
<h2>Findings ({len(findings)})</h2>
<table>
<tr><th>Schwere</th><th>Beschreibung</th></tr>
{findings_rows if findings_rows else '<tr><td colspan="2" style="padding:8px;color:#16a34a;">Keine Findings — alles OK</td></tr>'}
</table>
{'<h2>Dienstleister-Abgleich</h2><table><tr><th>Status</th><th>Dienst</th><th>Land</th><th>Kategorie</th></tr>' + services_rows + '</table>' if services_rows else ''}
<div class="footer">
Automatisch erstellt vom BreakPilot Compliance Agent | {now}<br>
Dieses Dokument ersetzt keine Rechtsberatung.
</div>
</body></html>"""
@@ -0,0 +1,193 @@
"""
Banner A/B Testing Service — variant assignment, stats, significance.
Deterministic variant assignment via device fingerprint hash ensures
the same device always sees the same variant (sticky bucketing).
"""
import hashlib
import math
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
class BannerABService:
    """A/B testing for consent banner variants.

    Covers variant CRUD, deterministic assignment of a variant to a device,
    and per-variant opt-in statistics with a significance estimate.
    """

    def __init__(self, db: "Session") -> None:
        self.db = db

    # ------------------------------------------------------------------
    # Variant CRUD
    # ------------------------------------------------------------------

    def list_variants(self, tenant_id: str, site_config_id: str) -> list[dict]:
        """Return all variants of a site config for the tenant, by variant key."""
        q = text("""
SELECT * FROM compliance_banner_variants
WHERE tenant_id = :tid AND site_config_id = :scid
ORDER BY variant_key
""")
        rows = self.db.execute(q, {"tid": tenant_id, "scid": site_config_id}).fetchall()
        return [dict(r._mapping) for r in rows]

    def create_variant(self, tenant_id: str, site_config_id: str, data: dict) -> dict:
        """Insert a variant and return the stored row.

        Missing keys in ``data`` fall back to: empty name, key ``"A"``,
        50 percent traffic, not control, ``"{}"`` theme overrides; the
        remaining presentation fields default to None.
        """
        q = text("""
INSERT INTO compliance_banner_variants
(tenant_id, site_config_id, variant_name, variant_key, traffic_percent, is_control,
banner_title, banner_description, position, style, primary_color, show_decline_all, theme_overrides)
VALUES (:tid, :scid, :name, :key, :pct, :ctrl,
:title, :desc, :pos, :style, :color, :decline, :theme)
RETURNING *
""")
        row = self.db.execute(q, {
            "tid": tenant_id, "scid": site_config_id,
            "name": data.get("variant_name", ""),
            "key": data.get("variant_key", "A"),
            "pct": data.get("traffic_percent", 50),
            "ctrl": data.get("is_control", False),
            "title": data.get("banner_title"),
            "desc": data.get("banner_description"),
            "pos": data.get("position"),
            "style": data.get("style"),
            "color": data.get("primary_color"),
            "decline": data.get("show_decline_all"),
            "theme": data.get("theme_overrides", "{}"),
        }).fetchone()
        self.db.commit()
        return dict(row._mapping)

    def update_variant(self, variant_id: str, data: dict) -> Optional[dict]:
        """Update the given fields of a variant.

        Only whitelisted fields that are present and not None are written.
        Returns the updated row, or None when nothing was supplied or the
        variant does not exist.

        NOTE(review): the UPDATE is keyed by id only, not by tenant —
        confirm callers verify ownership.
        """
        sets, params = [], {"vid": variant_id}
        # Column names come from this fixed list, never from the caller.
        for field in ["variant_name", "traffic_percent", "is_control", "banner_title",
                      "banner_description", "position", "style", "primary_color",
                      "show_decline_all", "is_active"]:
            if field in data and data[field] is not None:
                sets.append(f"{field} = :{field}")
                params[field] = data[field]
        if not sets:
            return None
        sets.append("updated_at = NOW()")
        q = text(f"UPDATE compliance_banner_variants SET {', '.join(sets)} WHERE id = :vid RETURNING *")
        row = self.db.execute(q, params).fetchone()
        self.db.commit()
        return dict(row._mapping) if row else None

    def delete_variant(self, variant_id: str) -> bool:
        """Delete a variant by id; return True if a row was removed.

        NOTE(review): keyed by id only, not by tenant — confirm callers
        verify ownership.
        """
        q = text("DELETE FROM compliance_banner_variants WHERE id = :vid")
        result = self.db.execute(q, {"vid": variant_id})
        self.db.commit()
        return result.rowcount > 0

    # ------------------------------------------------------------------
    # Variant Assignment (deterministic sticky bucketing)
    # ------------------------------------------------------------------

    def assign_variant(self, site_config_id: str, device_fingerprint: str) -> Optional[dict]:
        """Pick the active variant for a device, or None if none is active.

        The device is hashed into a bucket 0-99; active variants (ordered by
        key) claim consecutive ranges of ``traffic_percent`` buckets. If the
        percentages sum to less than 100, leftover buckets go to the last
        variant.
        """
        variants = self.db.execute(text("""
SELECT * FROM compliance_banner_variants
WHERE site_config_id = :scid AND is_active = TRUE
ORDER BY variant_key
"""), {"scid": site_config_id}).fetchall()
        if not variants:
            return None

        # MD5 is used only to spread devices over buckets, not for security.
        bucket = int(hashlib.md5(f"{site_config_id}:{device_fingerprint}".encode()).hexdigest(), 16) % 100

        cumulative = 0
        for v in variants:
            cumulative += v.traffic_percent
            if bucket < cumulative:
                return dict(v._mapping)
        # Fallback to last variant
        return dict(variants[-1]._mapping)

    # ------------------------------------------------------------------
    # Stats with statistical significance
    # ------------------------------------------------------------------

    def get_variant_stats(self, tenant_id: str, site_config_id: str) -> list[dict]:
        """Return per-variant totals and opt-in rates, with significance.

        When a control variant exists, the non-control variant with the
        highest opt-in rate is compared against it. That variant gets
        ``significance`` (percent) and ``is_winner``, which is True only if
        confidence is at least 95 % and it outperforms the control.

        NOTE(review): the audit-log query filters by tenant and variant key
        only, not by site — confirm variant keys are unique per tenant.
        """
        variants = self.list_variants(tenant_id, site_config_id)
        if not variants:
            return []

        # The statement is identical for every variant; build it once.
        q = text("""
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE action = 'consent_given') AS accepted,
COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS rejected
FROM compliance_banner_consent_audit_log
WHERE tenant_id = :tid AND variant_key = :vkey
""")

        results = []
        for v in variants:
            vid = str(v["id"])
            vkey = v["variant_key"]
            row = self.db.execute(q, {"tid": tenant_id, "vkey": vkey}).fetchone()
            total = row.total if row else 0
            accepted = row.accepted if row else 0
            results.append({
                "variant_id": vid,
                "variant_key": vkey,
                "variant_name": v["variant_name"],
                "traffic_percent": v["traffic_percent"],
                "is_control": v["is_control"],
                "total": total,
                "accepted": accepted,
                "opt_in_rate": round(accepted / total * 100, 1) if total > 0 else 0,
            })

        # Chi-squared test between control and best variant
        control = next((r for r in results if r["is_control"]), None)
        if control and len(results) > 1:
            best = max((r for r in results if not r["is_control"]), key=lambda x: x["opt_in_rate"], default=None)
            if best and control["total"] > 0 and best["total"] > 0:
                sig = self._chi_squared_significance(
                    control["accepted"], control["total"],
                    best["accepted"], best["total"],
                )
                # The test is two-sided, so a significant difference alone
                # does not mean the variant is better. Compare the raw rates
                # by cross-multiplying the integer counts.
                beats_control = best["accepted"] * control["total"] > control["accepted"] * best["total"]
                best["is_winner"] = sig >= 0.95 and beats_control
                best["significance"] = round(sig * 100, 1)
                control["is_winner"] = False
                control["significance"] = round((1 - sig) * 100, 1)

        return results

    @staticmethod
    def _chi_squared_significance(a_success: int, a_total: int, b_success: int, b_total: int) -> float:
        """Pearson chi-squared test on a 2x2 table; returns confidence 0-1.

        The confidence is the chi-squared CDF with one degree of freedom,
        ``erf(sqrt(chi2 / 2))``, capped at 0.999. Returns 0.0 when there are
        no observations or the statistic is negligible (< 0.001).
        """
        a_fail = a_total - a_success
        b_fail = b_total - b_success
        n = a_total + b_total
        if n == 0:
            return 0.0

        successes = a_success + b_success
        failures = a_fail + b_fail
        # (observed, expected) per cell under the pooled rates.
        cells = [
            (a_success, a_total * successes / n),
            (a_fail, a_total * failures / n),
            (b_success, b_total * successes / n),
            (b_fail, b_total * failures / n),
        ]
        chi2 = sum((obs - exp) ** 2 / exp for obs, exp in cells if exp > 0)

        if chi2 < 0.001:
            return 0.0
        # With 1 df, chi2 is the square of a standard normal variate, so
        # P(X <= chi2) = P(|Z| <= sqrt(chi2)) = erf(sqrt(chi2 / 2)).
        return min(math.erf(math.sqrt(chi2 / 2)), 0.999)
@@ -0,0 +1,135 @@
"""
Banner consent analytics — time-series, device breakdown, bounce rate.
Reads from BannerConsentAuditLogDB for aggregated analytics.
"""
import re
from datetime import datetime, timedelta, timezone
from typing import Any, Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
class BannerAnalyticsService:
"""Provides aggregated consent analytics for a site."""
def __init__(self, db: Session) -> None:
self.db = db
def get_time_series(
self,
tenant_id: str,
site_id: str,
period: str = "daily",
days: int = 30,
) -> list[dict[str, Any]]:
"""Opt-in rate per day/week over the last N days."""
trunc = "day" if period == "daily" else "week"
cutoff = datetime.now(timezone.utc) - timedelta(days=days)
q = text(f"""
SELECT DATE_TRUNC(:trunc, created_at) AS period,
COUNT(*) FILTER (WHERE action = 'consent_given') AS given,
COUNT(*) FILTER (WHERE action = 'consent_updated') AS updated,
COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS withdrawn,
COUNT(*) AS total
FROM compliance_banner_consent_audit_log
WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
GROUP BY 1 ORDER BY 1
""")
rows = self.db.execute(q, {"tid": tenant_id, "sid": site_id, "cutoff": cutoff, "trunc": trunc}).fetchall()
return [
{
"period": r.period.isoformat() if r.period else None,
"given": r.given,
"updated": r.updated,
"withdrawn": r.withdrawn,
"total": r.total,
"opt_in_rate": round((r.given + r.updated) / r.total * 100, 1) if r.total > 0 else 0,
}
for r in rows
]
def get_category_breakdown(
self,
tenant_id: str,
site_id: str,
days: int = 30,
) -> dict[str, dict[str, int]]:
"""Acceptance count per category."""
cutoff = datetime.now(timezone.utc) - timedelta(days=days)
q = text("""
SELECT categories FROM compliance_banner_consent_audit_log
WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
AND action IN ('consent_given', 'consent_updated')
""")
rows = self.db.execute(q, {"tid": tenant_id, "sid": site_id, "cutoff": cutoff}).fetchall()
counts: dict[str, int] = {}
total = len(rows)
for r in rows:
cats = r.categories if isinstance(r.categories, list) else []
for cat in cats:
counts[cat] = counts.get(cat, 0) + 1
return {
cat: {"count": count, "total": total, "rate": round(count / total * 100, 1) if total > 0 else 0}
for cat, count in sorted(counts.items())
}
def get_device_breakdown(
self,
tenant_id: str,
site_id: str,
days: int = 30,
) -> dict[str, int]:
"""Mobile/Desktop/Tablet classification from user_agent."""
cutoff = datetime.now(timezone.utc) - timedelta(days=days)
q = text("""
SELECT user_agent FROM compliance_banner_consent_audit_log
WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
AND user_agent IS NOT NULL
""")
rows = self.db.execute(q, {"tid": tenant_id, "sid": site_id, "cutoff": cutoff}).fetchall()
result = {"desktop": 0, "mobile": 0, "tablet": 0, "unknown": 0}
mobile_re = re.compile(r"Mobile|Android|iPhone|iPod", re.IGNORECASE)
tablet_re = re.compile(r"iPad|Tablet|PlayBook|Silk", re.IGNORECASE)
for r in rows:
ua = r.user_agent or ""
if tablet_re.search(ua):
result["tablet"] += 1
elif mobile_re.search(ua):
result["mobile"] += 1
elif ua:
result["desktop"] += 1
else:
result["unknown"] += 1
return result
def get_overview_stats(
    self,
    tenant_id: str,
    site_id: str,
    days: int = 30,
) -> dict[str, Any]:
    """Summarise consent interactions of the last ``days`` days for one site.

    Counts given, updated and withdrawn/revoked consents plus all audit-log
    rows in the window. ``opt_in_rate`` is the share of given + updated
    consents among all interactions, as a percentage rounded to one decimal
    (0 when there are no interactions).
    """
    window_start = datetime.now(timezone.utc) - timedelta(days=days)
    stmt = text("""
        SELECT
        COUNT(*) FILTER (WHERE action = 'consent_given') AS given,
        COUNT(*) FILTER (WHERE action = 'consent_updated') AS updated,
        COUNT(*) FILTER (WHERE action IN ('consent_withdrawn', 'consent_revoked')) AS withdrawn,
        COUNT(*) AS total
        FROM compliance_banner_consent_audit_log
        WHERE tenant_id = :tid AND site_id = :sid AND created_at >= :cutoff
    """)
    params = {"tid": tenant_id, "sid": site_id, "cutoff": window_start}
    row = self.db.execute(stmt, params).fetchone()

    if row:
        interactions = row.total
        given_count = row.given
        updated_count = row.updated
        withdrawn_count = row.withdrawn
        accepted = (row.given or 0) + (row.updated or 0)
    else:
        interactions = given_count = updated_count = withdrawn_count = accepted = 0

    if interactions > 0:
        rate = round(accepted / interactions * 100, 1)
    else:
        rate = 0

    return {
        "period_days": days,
        "total_interactions": interactions,
        "consents_given": given_count,
        "consents_updated": updated_count,
        "consents_withdrawn": withdrawn_count,
        "opt_in_rate": rate,
    }
@@ -73,9 +73,8 @@ class BannerConsentService:
ip_hash: Optional[str] = None,
banner_config_hash: Optional[str] = None,
consent_version: Optional[int] = None,
*,
consent_method: Optional[str] = None,
page_url: Optional[str] = None,
vendor_consents: Optional[dict[str, bool]] = None,
user_agent: Optional[str] = None,
) -> None:
entry = BannerConsentAuditLogDB(
tenant_id=tenant_id,
@@ -84,11 +83,11 @@ class BannerConsentService:
site_id=site_id,
device_fingerprint=device_fingerprint,
categories=categories or [],
vendor_consents=vendor_consents or {},
ip_hash=ip_hash,
user_agent=user_agent,
banner_config_hash=banner_config_hash,
consent_version=consent_version,
consent_method=consent_method,
page_url=page_url,
)
self.db.add(entry)
@@ -134,6 +133,24 @@ class BannerConsentService:
return max(v.retention_days for v in vendors if v.retention_days)
return max((CATEGORY_RETENTION_DAYS.get(c, 365) for c in categories), default=365)
def _maybe_generate_tc_string(
    self, tenant_id: uuid.UUID, site_id: str, categories: list[str],
) -> Optional[str]:
    """Generate a TC String if TCF is enabled for this site.

    Looks up the site's banner configuration; when none exists or its
    ``tcf_enabled`` flag is falsy, returns ``None``. Otherwise the accepted
    ``categories`` are encoded via ``TCFEncoderService``.

    Encoding is best-effort: any failure is logged and ``None`` is returned
    so that recording the consent itself is never blocked by the encoder.
    """
    config = (
        self.db.query(BannerSiteConfigDB)
        .filter(BannerSiteConfigDB.tenant_id == tenant_id, BannerSiteConfigDB.site_id == site_id)
        .first()
    )
    if not config or not config.tcf_enabled:
        return None
    try:
        # Imported lazily, so an import failure is also treated as best-effort.
        from compliance.services.tcf_encoder_service import TCFEncoderService
        return TCFEncoderService().encode_from_categories(categories)
    except Exception:
        # Do not swallow silently: a missing TC string for a TCF-enabled site
        # must be visible in the logs.
        import logging
        logging.getLogger(__name__).warning(
            "TC string generation failed for site %s", site_id, exc_info=True,
        )
        return None
# ------------------------------------------------------------------
# Consent CRUD (public SDK)
# ------------------------------------------------------------------
@@ -148,16 +165,7 @@ class BannerConsentService:
ip_address: Optional[str],
user_agent: Optional[str],
consent_string: Optional[str],
*,
consent_method: Optional[str] = None,
page_url: Optional[str] = None,
referrer: Optional[str] = None,
device_type: Optional[str] = None,
browser: Optional[str] = None,
os: Optional[str] = None,
screen_resolution: Optional[str] = None,
session_id: Optional[str] = None,
consent_scope: Optional[str] = None,
vendor_consents: Optional[dict[str, bool]] = None,
) -> dict[str, Any]:
"""Upsert a device consent row for (tenant, site, device_fingerprint).
@@ -173,20 +181,9 @@ class BannerConsentService:
expires_at = now + timedelta(days=retention)
config_hash, config_ver = self._compute_config_hash(tid, site_id)
# Vendor-agnostische Zusatzfelder
extra = {
"consent_method": consent_method,
"banner_version": config_ver,
"banner_config_hash": config_hash,
"page_url": page_url,
"referrer": referrer,
"device_type": device_type,
"browser": browser,
"os": os,
"screen_resolution": screen_resolution,
"session_id": session_id,
"consent_scope": consent_scope or "domain",
}
# Auto-generate TC String if TCF is enabled for this site
if not consent_string:
consent_string = self._maybe_generate_tc_string(tid, site_id, categories)
existing = (
self.db.query(BannerConsentDB)
@@ -201,18 +198,17 @@ class BannerConsentService:
if existing:
existing.categories = categories
existing.vendors = vendors
existing.vendor_consents = vendor_consents or {}
existing.ip_hash = ip_hash
existing.user_agent = user_agent
existing.consent_string = consent_string
existing.expires_at = expires_at
existing.updated_at = now
for key, val in extra.items():
setattr(existing, key, val)
self.db.flush()
self._log(
tid, existing.id, "consent_updated", site_id, device_fingerprint,
categories, ip_hash, config_hash, config_ver,
consent_method=consent_method, page_url=page_url,
vendor_consents=vendor_consents, user_agent=user_agent,
)
self.db.commit()
self.db.refresh(existing)
@@ -224,18 +220,18 @@ class BannerConsentService:
device_fingerprint=device_fingerprint,
categories=categories,
vendors=vendors,
vendor_consents=vendor_consents or {},
ip_hash=ip_hash,
user_agent=user_agent,
consent_string=consent_string,
expires_at=expires_at,
**extra,
)
self.db.add(consent)
self.db.flush()
self._log(
tid, consent.id, "consent_given", site_id, device_fingerprint,
categories, ip_hash, config_hash, config_ver,
consent_method=consent_method, page_url=page_url,
vendor_consents=vendor_consents, user_agent=user_agent,
)
self.db.commit()
self.db.refresh(consent)
@@ -383,14 +379,7 @@ class BannerConsentService:
total = base.count()
category_stats: dict[str, int] = {}
for c in base.all():
raw = c.categories or []
if isinstance(raw, str):
try:
import json
raw = json.loads(raw)
except (json.JSONDecodeError, TypeError):
raw = []
cats: list[str] = list(raw) if isinstance(raw, list) else []
cats: list[str] = list(c.categories or [])
for cat in cats:
category_stats[cat] = category_stats.get(cat, 0) + 1
return {
@@ -404,58 +393,3 @@ class BannerConsentService:
for cat, count in category_stats.items()
},
}
def list_consents(
    self, tenant_id: str, site_id: Optional[str] = None,
    limit: int = 50, offset: int = 0,
) -> dict[str, Any]:
    """Return one page of banner consents for a tenant, newest first.

    ``categories`` and ``vendors`` are always returned as lists: values
    stored as JSON strings are decoded, and anything that is not a list
    after decoding becomes an empty list. The result also carries the
    unpaginated ``total`` together with the ``limit`` and ``offset`` used.
    """
    import json as _json

    def _as_list(value: Any) -> list:
        # Normalise a JSON column that may arrive as a list, a JSON string or NULL.
        value = value or []
        if isinstance(value, str):
            try:
                value = _json.loads(value)
            except (ValueError, TypeError):
                value = []
        return list(value) if isinstance(value, list) else []

    def _iso(moment: Any) -> Optional[str]:
        return moment.isoformat() if moment else None

    tid = uuid.UUID(tenant_id)
    query = self.db.query(BannerConsentDB).filter(BannerConsentDB.tenant_id == tid)
    if site_id:
        query = query.filter(BannerConsentDB.site_id == site_id)
    total = query.count()
    page = (
        query.order_by(BannerConsentDB.created_at.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )

    consents = []
    for item in page:
        parsed_categories = _as_list(item.categories)
        parsed_vendors = _as_list(item.vendors)
        consents.append({
            "id": str(item.id),
            "site_id": item.site_id,
            "device_fingerprint": item.device_fingerprint,
            "categories": parsed_categories,
            "vendors": parsed_vendors,
            "ip_hash": item.ip_hash,
            "user_agent": item.user_agent,
            "linked_email": item.linked_email,
            "consent_string": item.consent_string,
            "consent_method": item.consent_method,
            "banner_version": item.banner_version,
            "banner_config_hash": item.banner_config_hash,
            "geo_country": item.geo_country,
            "geo_region": item.geo_region,
            "consent_scope": item.consent_scope,
            "page_url": item.page_url,
            "referrer": item.referrer,
            "device_type": item.device_type,
            "browser": item.browser,
            "os": item.os,
            "screen_resolution": item.screen_resolution,
            "session_id": item.session_id,
            "expires_at": _iso(item.expires_at),
            "created_at": _iso(item.created_at),
            "updated_at": _iso(item.updated_at),
        })
    return {"consents": consents, "total": total, "limit": limit, "offset": offset}
@@ -40,6 +40,22 @@ _CONTROL_COLUMNS = """
"""
def _ensure_list(val: Any) -> list:
"""Ensure a JSONB value is always a Python list."""
if isinstance(val, list):
return val
if val is None:
return []
if isinstance(val, str):
try:
import json
parsed = json.loads(val)
return parsed if isinstance(parsed, list) else []
except (json.JSONDecodeError, TypeError):
return []
return []
def _control_row(r: Any) -> dict[str, Any]:
"""Serialize a canonical_controls SELECT row to a response dict."""
return {
@@ -49,19 +65,19 @@ def _control_row(r: Any) -> dict[str, Any]:
"title": r.title,
"objective": r.objective,
"rationale": r.rationale,
"scope": r.scope,
"requirements": r.requirements,
"test_procedure": r.test_procedure,
"evidence": r.evidence,
"scope": r.scope if isinstance(r.scope, dict) else {},
"requirements": _ensure_list(r.requirements),
"test_procedure": _ensure_list(r.test_procedure),
"evidence": _ensure_list(r.evidence),
"severity": r.severity,
"risk_score": float(r.risk_score) if r.risk_score is not None else None,
"implementation_effort": r.implementation_effort,
"evidence_confidence": (
float(r.evidence_confidence) if r.evidence_confidence is not None else None
),
"open_anchors": r.open_anchors,
"open_anchors": _ensure_list(r.open_anchors),
"release_state": r.release_state,
"tags": r.tags or [],
"tags": _ensure_list(r.tags),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}
@@ -0,0 +1,216 @@
"""
Compliance Report PDF Generator — generates a comprehensive A4 PDF
covering all compliance modules for a project.
Uses reportlab (same as audit_pdf_generator.py).
"""
import io
import logging
from datetime import datetime, timezone
from typing import Any
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak,
)
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
# Colors
PURPLE = colors.HexColor("#7c3aed")
LIGHT_PURPLE = colors.HexColor("#f5f3ff")
GRAY = colors.HexColor("#6b7280")
GREEN = colors.HexColor("#16a34a")
RED = colors.HexColor("#dc2626")
YELLOW = colors.HexColor("#ca8a04")
def _styles():
    """Return the ReportLab sample stylesheet extended with the report's own styles.

    Adds ``Title2``, ``Section``, ``Body2`` and ``Small``, each derived from
    one of the stock styles.
    """
    sheet = getSampleStyleSheet()
    report_styles = (
        ParagraphStyle("Title2", parent=sheet["Title"], fontSize=24, textColor=PURPLE, spaceAfter=6),
        ParagraphStyle("Section", parent=sheet["Heading2"], fontSize=14, textColor=PURPLE, spaceBefore=12, spaceAfter=6),
        ParagraphStyle("Body2", parent=sheet["Normal"], fontSize=10, leading=14, spaceAfter=4),
        ParagraphStyle("Small", parent=sheet["Normal"], fontSize=8, textColor=GRAY),
    )
    for style in report_styles:
        sheet.add(style)
    return sheet
class CompliancePDFGenerator:
    """Generates a full compliance status report as PDF.

    Every section runs its own query and is rendered best-effort: if the
    query fails (for example because the module's table does not exist), a
    short notice is printed instead. The session is then rolled back so the
    failed statement cannot poison the queries of the following sections.
    """

    def __init__(self, db: Session) -> None:
        self.db = db

    def generate(self, tenant_id: str, project_id: str | None = None, language: str = "de") -> tuple[bytes, str]:
        """Build the report and return ``(pdf_bytes, filename)``.

        Args:
            tenant_id: Tenant whose data is reported.
            project_id: Optional project filter; only the company-profile and
                role sections use it.
            language: Currently unused; all report texts are German.
        """
        buf = io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=20 * mm, rightMargin=20 * mm, topMargin=25 * mm, bottomMargin=20 * mm)
        ss = _styles()
        story: list = []
        now = datetime.now(timezone.utc)

        story.append(Paragraph("Compliance-Report", ss["Title2"]))
        story.append(Paragraph(f"Stand: {now.strftime('%d.%m.%Y %H:%M')} UTC", ss["Small"]))
        story.append(Spacer(1, 10 * mm))

        # Company Profile
        self._add_company_section(story, ss, tenant_id, project_id)
        # TOM
        self._add_count_section(story, ss, "TOM (Technisch-Organisatorische Massnahmen)",
                                "compliance_toms", tenant_id)
        # VVT
        self._add_count_section(story, ss, "VVT (Verarbeitungstaetigkeiten)",
                                "compliance_vvt_activities", tenant_id)
        # DSFA
        self._add_count_section(story, ss, "Datenschutz-Folgenabschaetzungen",
                                "compliance_dsfa_assessments", tenant_id)
        # Risks
        self._add_risk_section(story, ss, tenant_id)
        # Vendors
        self._add_count_section(story, ss, "Auftragsverarbeiter",
                                "compliance_vendor_assessments", tenant_id)
        # Incidents
        self._add_count_section(story, ss, "Datenschutz-Vorfaelle",
                                "compliance_notfallplan_incidents", tenant_id)
        # Document Reviews
        self._add_review_section(story, ss, tenant_id)
        # Banner Consents
        self._add_consent_section(story, ss, tenant_id)
        # Org Roles
        self._add_role_section(story, ss, tenant_id, project_id)

        # Footer
        story.append(Spacer(1, 15 * mm))
        story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK", ss["Small"]))

        doc.build(story)
        filename = f"compliance-report-{now.strftime('%Y%m%d')}.pdf"
        return buf.getvalue(), filename

    def _section_failed(self, section: str, exc: Exception) -> None:
        """Log a failed section query and reset the session for the next section."""
        logger.warning("Compliance report section '%s' failed: %s", section, exc)
        try:
            # A failed statement leaves the transaction aborted; without a
            # rollback every later query in this report would fail as well.
            self.db.rollback()
        except Exception:
            logger.debug("Rollback after failed report query also failed", exc_info=True)

    @staticmethod
    def _table(data, col_widths, *, header_text_color=True, valign_top=False):
        """Build a grid table with the shared light-purple header row."""
        commands = [("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE)]
        if header_text_color:
            commands.append(("TEXTCOLOR", (0, 0), (-1, 0), PURPLE))
        commands.append(("FONTSIZE", (0, 0), (-1, -1), 9))
        commands.append(("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey))
        if valign_top:
            commands.append(("VALIGN", (0, 0), (-1, -1), "TOP"))
        table = Table(data, colWidths=col_widths)
        table.setStyle(TableStyle(commands))
        return table

    def _add_company_section(self, story, ss, tid, pid):
        """Append the company profile table (first matching profile row)."""
        story.append(Paragraph("Unternehmensprofil", ss["Section"]))
        try:
            where = "tenant_id = :tid"
            params: dict[str, Any] = {"tid": tid}
            if pid:
                where += " AND project_id = :pid"
                params["pid"] = pid
            row = self.db.execute(text(f"SELECT * FROM compliance_company_profiles WHERE {where} LIMIT 1"), params).fetchone()
            if row:
                d = dict(row._mapping)
                # NULL columns are present in the mapping with value None, so a
                # plain .get(key, "-") default would never apply to them.
                employees = d.get("employee_count")
                data = [
                    ["Feld", "Wert"],
                    ["Firma", d.get("company_name") or "-"],
                    ["Branche", d.get("industry") or "-"],
                    ["Rechtsform", d.get("legal_form") or "-"],
                    ["Mitarbeiter", "-" if employees is None else str(employees)],
                ]
                story.append(self._table(data, [60 * mm, 100 * mm], valign_top=True))
            else:
                story.append(Paragraph("Kein Unternehmensprofil hinterlegt.", ss["Body2"]))
        except Exception as e:
            self._section_failed("Unternehmensprofil", e)
            # Paragraph text is parsed as markup; escape the raw error text so
            # characters like '<' or '&' cannot break PDF generation.
            import html
            story.append(Paragraph(f"Fehler beim Laden: {html.escape(str(e), quote=False)}", ss["Small"]))
        story.append(Spacer(1, 5 * mm))

    def _add_count_section(self, story, ss, title, table_name, tid):
        """Append a section that shows the tenant's row count of ``table_name``.

        ``table_name`` is interpolated into the SQL text (identifiers cannot be
        bound parameters), so it must only ever be a trusted constant, as in
        the calls made from ``generate``.
        """
        story.append(Paragraph(title, ss["Section"]))
        try:
            count = self.db.execute(text(f"SELECT COUNT(*) FROM {table_name} WHERE tenant_id = :tid"), {"tid": tid}).scalar()
            story.append(Paragraph(f"Eintraege: <b>{count or 0}</b>", ss["Body2"]))
        except Exception as e:
            self._section_failed(title, e)
            story.append(Paragraph("Tabelle nicht vorhanden oder leer.", ss["Small"]))
        story.append(Spacer(1, 3 * mm))

    def _add_risk_section(self, story, ss, tid):
        """Append the number of risks per severity."""
        story.append(Paragraph("Risikobewertung", ss["Section"]))
        try:
            q = text("""
                SELECT severity, COUNT(*) as cnt FROM compliance_risks
                WHERE tenant_id = :tid GROUP BY severity ORDER BY severity
            """)
            rows = self.db.execute(q, {"tid": tid}).fetchall()
            if rows:
                data = [["Schweregrad", "Anzahl"]]
                for r in rows:
                    data.append([r.severity or "UNKNOWN", str(r.cnt)])
                story.append(self._table(data, [80 * mm, 40 * mm]))
            else:
                story.append(Paragraph("Keine Risiken erfasst.", ss["Body2"]))
        except Exception as e:
            self._section_failed("Risikobewertung", e)
            story.append(Paragraph("Risiko-Tabelle nicht vorhanden.", ss["Small"]))
        story.append(Spacer(1, 3 * mm))

    def _add_review_section(self, story, ss, tid):
        """Append the number of document reviews per status."""
        story.append(Paragraph("Dokumenten-Reviews", ss["Section"]))
        try:
            q = text("SELECT status, COUNT(*) as cnt FROM compliance_document_reviews WHERE tenant_id = :tid GROUP BY status")
            rows = self.db.execute(q, {"tid": tid}).fetchall()
            if rows:
                data = [["Status", "Anzahl"]]
                for r in rows:
                    data.append([r.status, str(r.cnt)])
                story.append(self._table(data, [80 * mm, 40 * mm], header_text_color=False))
            else:
                story.append(Paragraph("Keine Reviews vorhanden.", ss["Body2"]))
        except Exception as e:
            self._section_failed("Dokumenten-Reviews", e)
            story.append(Paragraph("Review-Tabelle nicht vorhanden.", ss["Small"]))
        story.append(Spacer(1, 3 * mm))

    def _add_consent_section(self, story, ss, tid):
        """Append the total number of stored banner consents."""
        story.append(Paragraph("Banner-Consents", ss["Section"]))
        try:
            count = self.db.execute(text("SELECT COUNT(*) FROM compliance_banner_consents WHERE tenant_id = :tid"), {"tid": tid}).scalar()
            story.append(Paragraph(f"Gesamte Consents: <b>{count or 0}</b>", ss["Body2"]))
        except Exception as e:
            self._section_failed("Banner-Consents", e)
            story.append(Paragraph("Banner-Tabelle nicht vorhanden.", ss["Small"]))
        story.append(Spacer(1, 3 * mm))

    def _add_role_section(self, story, ss, tid, pid):
        """Append the assigned organisational roles.

        With a project filter, roles of that project and roles without any
        project are both listed.
        """
        story.append(Paragraph("Rollenkonzept", ss["Section"]))
        try:
            where = "tenant_id = :tid"
            params: dict[str, Any] = {"tid": tid}
            if pid:
                where += " AND (project_id = :pid OR project_id IS NULL)"
                params["pid"] = pid
            rows = self.db.execute(text(f"SELECT role_key, role_label, person_name, person_email FROM compliance_org_roles WHERE {where} ORDER BY role_key"), params).fetchall()
            if rows:
                data = [["Rolle", "Name", "E-Mail"]]
                for r in rows:
                    data.append([r.role_label or r.role_key, r.person_name or "-", r.person_email or "-"])
                story.append(self._table(data, [60 * mm, 50 * mm, 50 * mm]))
            else:
                story.append(Paragraph("Keine Rollen zugewiesen.", ss["Body2"]))
        except Exception as e:
            self._section_failed("Rollenkonzept", e)
            story.append(Paragraph("Rollen-Tabelle nicht vorhanden.", ss["Small"]))
@@ -87,9 +87,10 @@ def compare_services(
for key, svc in detected_names.items():
# Skip CMP — consent managers don't need DSE mention
if svc.get("category") == "other" and svc.get("id") == "cmp":
if svc.get("category") == "cmp" or (svc.get("category") == "other" and svc.get("id") == "cmp"):
continue
matched = False
# Method 1: Match against LLM-extracted service list
for dse_key, dse_svc in dse_names.items():
if key == dse_key or _fuzzy_match(svc["name"], dse_svc["name"]):
documented.append({"detected": svc, "dse": dse_svc, "status": "ok"})
@@ -0,0 +1,100 @@
"""
DSR Art. 11 Service — handles "data subject not identifiable" rejections.
Art. 11 Abs. 1 DSGVO: If the controller is unable to identify the data
subject, it is not obligated to obtain additional information solely to
comply with Art. 15-20 requests.
Common scenario: Website visitor requests access, but only anonymous
cookies/IP-hashes are stored — there is no way to link them to a person.
"""
import logging
from datetime import datetime, timezone
from typing import Any, Dict
from sqlalchemy.orm import Session
from compliance.domain import ValidationError
logger = logging.getLogger(__name__)
class DSRArt11Service:
    """Handles Art. 11 DSGVO rejections for non-identifiable data subjects."""

    def __init__(self, db: Session) -> None:
        self._db = db

    def reject_not_identifiable(
        self, dsr_id: str, tenant_id: str, notes: str = "",
    ) -> Dict[str, Any]:
        """Reject a DSR because the data subject cannot be identified.

        Records a history entry, marks the request as rejected with
        Art. 11 Abs. 1 DSGVO as legal basis, commits, and then tries to
        notify the requester by e-mail.

        Args:
            dsr_id: ID of the request to reject.
            tenant_id: Tenant the request must belong to.
            notes: Optional free text appended to the rejection reason.

        Returns:
            The updated request serialised by ``_dsr_to_dict``.

        Raises:
            ValidationError: If the request does not exist for this tenant or
                is already completed, rejected or cancelled.
        """
        from compliance.db.dsr_models import DSRRequestDB
        from compliance.services.dsr_workflow_service import _dsr_to_dict, _record_history

        dsr = (
            self._db.query(DSRRequestDB)
            .filter(DSRRequestDB.id == dsr_id, DSRRequestDB.tenant_id == tenant_id)
            .first()
        )
        if not dsr:
            raise ValidationError("DSR not found")
        if dsr.status in ("completed", "rejected", "cancelled"):
            raise ValidationError("DSR already closed")

        now = datetime.now(timezone.utc)
        reason = (
            "Die bei uns gespeicherten Daten (anonymisierte Cookies, IP-Hashes, "
            "Device-Fingerprints) erlauben keine Identifikation der betroffenen Person. "
            "Gemaess Art. 11 Abs. 1 DSGVO sind wir nicht verpflichtet, zusaetzliche "
            "Informationen zu erheben, um die betroffene Person zu identifizieren."
        )
        if notes:
            reason += f" Ergaenzung: {notes}"

        # History is recorded before dsr.status is overwritten below.
        _record_history(self._db, dsr, "rejected",
                        comment="Art. 11 DSGVO — Identifikation nicht moeglich")
        dsr.status = "rejected"
        dsr.rejection_reason = reason
        dsr.rejection_legal_basis = "Art. 11 Abs. 1 DSGVO"
        dsr.identity_verified = False
        dsr.verification_method = "art11_not_identifiable"
        dsr.verification_notes = "Daten erlauben keine Identifikation der betroffenen Person"
        dsr.completed_at = now
        dsr.updated_at = now
        self._db.commit()
        self._db.refresh(dsr)

        # Send rejection notification (best-effort, after the commit)
        self._send_art11_notification(dsr)
        return _dsr_to_dict(dsr)

    def _send_art11_notification(self, dsr: Any) -> None:
        """E-mail the Art. 11 rejection to the requester, if an address is known.

        Uses the ``dsr_rejection`` template with an inline fallback. Any
        failure is logged as a warning and never propagates, because the
        rejection itself has already been committed.
        """
        if not dsr.requester_email:
            return
        try:
            import html
            from compliance.services.email_delivery_service import EmailDeliveryService

            delivery = EmailDeliveryService(self._db)
            display_name = dsr.requester_name or "Antragsteller/in"
            # Avoid a literal "None" in the subject when no number was assigned.
            subject_ref = f" {dsr.request_number}" if dsr.request_number else ""
            variables = {
                "requester_name": display_name,
                "reference_number": dsr.request_number or "",
                "rejection_reason": "Identifikation nicht moeglich — Art. 11 Abs. 1 DSGVO",
                "legal_basis": "Art. 11 Abs. 1 DSGVO",
                "sender_name": "Datenschutzbeauftragter",
            }
            # Use published dsr_rejection template, fallback to inline.
            # The requester's name is user-supplied, so it is HTML-escaped
            # before being embedded in the fallback markup.
            delivery.send(
                tenant_id=str(dsr.tenant_id),
                template_type="dsr_rejection",
                recipient=dsr.requester_email,
                variables=variables,
                fallback_subject=f"Zu Ihrer Anfrage{subject_ref} — Art. 11 DSGVO",
                fallback_html=f"""<p>Sehr geehrte/r {html.escape(display_name)},</p>
<p>wir koennen die bei uns gespeicherten Daten keiner identifizierbaren Person zuordnen.
Gemaess Art. 11 Abs. 1 DSGVO ist eine Auskunftserteilung nicht moeglich.</p>
<p>Mit freundlichen Gruessen<br/>Datenschutzbeauftragter</p>""",
            )
        except Exception as e:
            logger.warning("Art. 11 notification failed: %s", e)
@@ -0,0 +1,273 @@
"""
DSR User Data Export Service — aggregates all CMP data about a user.
Supports Art. 15 (access right, PDF) and Art. 20 (data portability, JSON/CSV).
Collects from: Banner Consents, Einwilligungen, Consent Audit Trail, DSR History.
"""
import csv
import io
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.services.banner_dsr_service import BannerDSRService
logger = logging.getLogger(__name__)
PURPLE = colors.HexColor("#7c3aed")
LIGHT_PURPLE = colors.HexColor("#f5f3ff")
GRAY = colors.HexColor("#6b7280")
class DSRExportService:
    """Aggregates and exports all user data stored in the CMP."""

    def __init__(self, db: Session) -> None:
        self.db = db

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _rollback_quietly(self) -> None:
        """Reset the session after a failed query so the next source can be read."""
        try:
            self.db.rollback()
        except Exception:
            pass

    @staticmethod
    def _jsonable(row: Any) -> dict[str, Any]:
        """Turn a result row into a dict with datetimes as ISO strings and UUIDs as strings."""
        return {
            key: (
                value.isoformat() if isinstance(value, datetime)
                else str(value) if isinstance(value, uuid.UUID)
                else value
            )
            for key, value in dict(row._mapping).items()
        }

    @staticmethod
    def _join_categories(value: Any) -> str:
        """Render a categories value as a comma-separated string without raising."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    @staticmethod
    def _short(value: Any, width: int) -> str:
        """Return the first ``width`` characters of a value, or "-" when it is missing."""
        return str(value)[:width] if value else "-"

    @staticmethod
    def _filename(email: str, extension: str) -> str:
        """Build the download filename from the e-mail's local part.

        The local part is user-supplied, so everything except letters,
        digits, dot, underscore and hyphen is replaced to keep the name safe
        for file systems and HTTP headers.
        """
        import re
        local_part = re.sub(r"[^A-Za-z0-9._-]+", "_", email.split("@")[0]).strip("._")
        return f"dsr-export-{local_part or 'user'}.{extension}"

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def aggregate_user_data(self, tenant_id: str, email: str) -> dict[str, Any]:
        """Collect ALL data about a user from all CMP sources.

        Each source is read best-effort: a failure is logged, the session is
        rolled back, and the export continues with whatever was collected.
        """
        now = datetime.now(timezone.utc)
        tid = tenant_id  # Keep as string — let PostgreSQL cast

        # 1. Banner consents + audit trail
        banner_data: dict[str, Any] = {"banner_consents": [], "audit_trail": []}
        try:
            banner_svc = BannerDSRService(self.db)
            banner_data = banner_svc.export_for_dsr(tenant_id, email)
        except Exception as e:
            logger.warning("Banner DSR export failed: %s", e)
            self._rollback_quietly()

        # 2. Einwilligungen (user-based consents)
        einwilligungen: list[dict] = []
        try:
            q = text("""
                SELECT c.id, c.data_point_id, c.granted, c.granted_at, c.revoked_at,
                c.consent_version, c.source, c.ip_address, c.user_agent, c.created_at
                FROM compliance_einwilligungen_consents c
                WHERE c.tenant_id = CAST(:tid AS VARCHAR) AND c.user_id = :email
                ORDER BY c.created_at DESC
            """)
            hist_q = text("""
                SELECT action, consent_version, ip_address, user_agent, source, created_at
                FROM compliance_einwilligungen_consent_history
                WHERE consent_id = :cid ORDER BY created_at
            """)
            rows = self.db.execute(q, {"tid": tid, "email": email}).fetchall()
            for r in rows:
                entry = self._jsonable(r)
                # Get history
                hist = self.db.execute(hist_q, {"cid": entry["id"]}).fetchall()
                entry["history"] = [self._jsonable(h) for h in hist]
                einwilligungen.append(entry)
        except Exception as e:
            logger.warning("Einwilligungen export failed: %s", e)
            self._rollback_quietly()

        # 3. DSR requests by this user
        dsr_requests: list[dict] = []
        try:
            q = text("""
                SELECT id, request_number, request_type, status, received_at, deadline_at, completed_at
                FROM compliance_dsr_requests
                WHERE tenant_id = :tid AND requester_email = :email
                ORDER BY received_at DESC
            """)
            rows = self.db.execute(q, {"tid": tid, "email": email}).fetchall()
            for r in rows:
                dsr_requests.append(self._jsonable(r))
        except Exception as e:
            logger.warning("DSR requests export failed: %s", e)
            self._rollback_quietly()

        return {
            "export_date": now.isoformat(),
            "data_subject": {"email": email},
            "banner_consents": banner_data.get("banner_consents", []),
            "consent_audit_trail": banner_data.get("audit_trail", []),
            "einwilligungen": einwilligungen,
            "dsr_requests": dsr_requests,
            "metadata": {
                "tenant_id": tenant_id,
                "data_categories": ["Banner-Consents", "Einwilligungen", "Audit-Trail", "DSR-Anfragen"],
                "legal_basis": "Art. 15 / Art. 20 DSGVO",
            },
        }

    # ------------------------------------------------------------------
    # Export formats
    # ------------------------------------------------------------------

    def export_json(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as UTF-8 JSON bytes plus a filename."""
        data = self.aggregate_user_data(tenant_id, email)
        data["metadata"]["export_format"] = "json"
        content = json.dumps(data, indent=2, ensure_ascii=False, default=str).encode("utf-8")
        return content, self._filename(email, "json")

    def export_csv(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as a flat CSV (UTF-8 with BOM) plus a filename."""
        data = self.aggregate_user_data(tenant_id, email)
        buf = io.StringIO()
        writer = csv.writer(buf)
        writer.writerow(["Kategorie", "Schluessel", "Wert", "Zeitpunkt", "Quelle"])

        # Banner consents
        for c in data.get("banner_consents", []):
            writer.writerow(["Banner-Consent", "site_id", c.get("site_id", ""), c.get("created_at", ""), "CMP"])
            writer.writerow(["Banner-Consent", "categories", self._join_categories(c.get("categories")), c.get("updated_at", ""), "CMP"])
            writer.writerow(["Banner-Consent", "ip_hash", c.get("ip_hash", ""), c.get("created_at", ""), "CMP"])
        # Audit trail
        for a in data.get("consent_audit_trail", []):
            writer.writerow(["Audit-Trail", a.get("action", ""), self._join_categories(a.get("categories")), a.get("created_at", ""), "CMP"])
        # Einwilligungen
        for e in data.get("einwilligungen", []):
            status = "Erteilt" if e.get("granted") else "Widerrufen"
            writer.writerow(["Einwilligung", e.get("data_point_id", ""), status, e.get("granted_at", ""), e.get("source", "")])
        # DSR requests
        for d in data.get("dsr_requests", []):
            writer.writerow(["DSR-Anfrage", d.get("request_type", ""), d.get("status", ""), d.get("received_at", ""), ""])

        content = buf.getvalue().encode("utf-8-sig")  # BOM for Excel
        return content, self._filename(email, "csv")

    def export_pdf(self, tenant_id: str, email: str) -> tuple[bytes, str]:
        """Return the aggregated data as an A4 PDF (Art. 15 access report) plus a filename.

        The audit trail is limited to its first 50 entries in the PDF; the
        JSON export contains all of them.
        """
        import html

        data = self.aggregate_user_data(tenant_id, email)
        buf = io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=A4, leftMargin=20 * mm, rightMargin=20 * mm, topMargin=25 * mm, bottomMargin=20 * mm)
        ss = getSampleStyleSheet()
        ss.add(ParagraphStyle("Title2", parent=ss["Title"], fontSize=20, textColor=PURPLE, spaceAfter=6))
        ss.add(ParagraphStyle("Section", parent=ss["Heading2"], fontSize=13, textColor=PURPLE, spaceBefore=10))
        ss.add(ParagraphStyle("Body2", parent=ss["Normal"], fontSize=9, leading=13))
        ss.add(ParagraphStyle("Small", parent=ss["Normal"], fontSize=8, textColor=GRAY))
        story: list = []

        # Cover. Paragraph text is parsed as markup, so the user-supplied
        # address is escaped ('&' and '<' are legal in e-mail local parts).
        story.append(Paragraph("Datenauskunft gemaess Art. 15 DSGVO", ss["Title2"]))
        story.append(Paragraph(f"Betroffene Person: {html.escape(email, quote=False)}", ss["Body2"]))
        story.append(Paragraph(f"Erstellt am: {data['export_date'][:10]}", ss["Small"]))
        story.append(Spacer(1, 8 * mm))

        tbl_style = TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), LIGHT_PURPLE),
            ("TEXTCOLOR", (0, 0), (-1, 0), PURPLE),
            ("FONTSIZE", (0, 0), (-1, -1), 8),
            ("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey),
            ("VALIGN", (0, 0), (-1, -1), "TOP"),
            ("TOPPADDING", (0, 0), (-1, -1), 3),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 3),
        ])

        # Section 1: Banner Consents
        consents = data.get("banner_consents", [])
        story.append(Paragraph(f"1. Banner-Consents ({len(consents)})", ss["Section"]))
        if consents:
            rows = [["Site", "Kategorien", "IP-Hash", "Erstellt", "Aktualisiert"]]
            for c in consents:
                ip_hash = c.get("ip_hash")
                rows.append([
                    str(c.get("site_id", "")),
                    self._join_categories(c.get("categories")),
                    (str(ip_hash)[:12] + "...") if ip_hash else "-",
                    self._short(c.get("created_at"), 10),
                    self._short(c.get("updated_at"), 10),
                ])
            t = Table(rows, colWidths=[30 * mm, 40 * mm, 30 * mm, 25 * mm, 25 * mm])
            t.setStyle(tbl_style)
            story.append(t)
        else:
            story.append(Paragraph("Keine Banner-Consents gespeichert.", ss["Body2"]))

        # Section 2: Einwilligungen
        einw = data.get("einwilligungen", [])
        story.append(Paragraph(f"2. Einwilligungen ({len(einw)})", ss["Section"]))
        if einw:
            rows = [["Datenpunkt", "Status", "Erteilt am", "Widerrufen am", "IP-Adresse"]]
            for e in einw:
                rows.append([
                    str(e.get("data_point_id", "")),
                    "Erteilt" if e.get("granted") else "Widerrufen",
                    self._short(e.get("granted_at"), 10),
                    self._short(e.get("revoked_at"), 10),
                    self._short(e.get("ip_address"), 15),
                ])
            t = Table(rows, colWidths=[35 * mm, 25 * mm, 25 * mm, 25 * mm, 35 * mm])
            t.setStyle(tbl_style)
            story.append(t)
        else:
            story.append(Paragraph("Keine Einwilligungen gespeichert.", ss["Body2"]))

        # Section 3: Audit Trail
        trail = data.get("consent_audit_trail", [])
        story.append(Paragraph(f"3. Consent-Audit-Trail ({len(trail)})", ss["Section"]))
        if trail:
            rows = [["Aktion", "Kategorien", "Datum"]]
            for a in trail[:50]:  # Limit to 50 for PDF
                rows.append([
                    str(a.get("action", "")),
                    self._join_categories(a.get("categories")),
                    self._short(a.get("created_at"), 19),
                ])
            t = Table(rows, colWidths=[40 * mm, 60 * mm, 45 * mm])
            t.setStyle(tbl_style)
            story.append(t)
            if len(trail) > 50:
                story.append(Paragraph(f"... und {len(trail) - 50} weitere Eintraege (im JSON-Export enthalten)", ss["Small"]))
        else:
            story.append(Paragraph("Kein Audit-Trail vorhanden.", ss["Body2"]))

        # Section 4: DSR Requests
        dsrs = data.get("dsr_requests", [])
        story.append(Paragraph(f"4. Bisherige DSR-Anfragen ({len(dsrs)})", ss["Section"]))
        if dsrs:
            rows = [["Typ", "Status", "Eingegangen", "Abgeschlossen"]]
            for d in dsrs:
                rows.append([
                    str(d.get("request_type", "")),
                    str(d.get("status", "")),
                    self._short(d.get("received_at"), 10),
                    self._short(d.get("completed_at"), 10),
                ])
            t = Table(rows, colWidths=[35 * mm, 30 * mm, 35 * mm, 35 * mm])
            t.setStyle(tbl_style)
            story.append(t)
        else:
            # Same empty-state handling as the three sections above.
            story.append(Paragraph("Keine DSR-Anfragen gespeichert.", ss["Body2"]))

        # Footer
        story.append(Spacer(1, 15 * mm))
        story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK | Art. 15 DSGVO Datenauskunft", ss["Small"]))

        doc.build(story)
        return buf.getvalue(), self._filename(email, "pdf")
@@ -0,0 +1,122 @@
"""
Email Template Delivery Service — the missing integration layer.
Combines: template loading → published version → variable rendering → SMTP → audit log.
Used by DSR workflow, document reviews, and other modules that need to send
templated emails.
"""
import logging
import uuid
from typing import Any, Optional
from sqlalchemy.orm import Session
from compliance.db.email_template_models import (
EmailSendLogDB,
EmailTemplateDB,
EmailTemplateVersionDB,
)
logger = logging.getLogger(__name__)
def _render(html: str, variables: dict[str, str]) -> str:
"""Replace {{variable}} placeholders with values."""
result = html
for key, value in variables.items():
result = result.replace(f"{{{{{key}}}}}", str(value))
return result
class EmailDeliveryService:
    """Load template → render → send via SMTP → log."""

    def __init__(self, db: Session) -> None:
        self.db = db

    def get_published_version(
        self, tenant_id: str, template_type: str,
    ) -> Optional[EmailTemplateVersionDB]:
        """Get the latest published version of a template by type.

        Returns ``None`` when the tenant has no template of this type or the
        template has no version with status ``published``.
        """
        tid = uuid.UUID(tenant_id)
        template = (
            self.db.query(EmailTemplateDB)
            .filter(EmailTemplateDB.tenant_id == tid, EmailTemplateDB.template_type == template_type)
            .first()
        )
        if not template:
            return None
        return (
            self.db.query(EmailTemplateVersionDB)
            .filter(
                EmailTemplateVersionDB.template_id == template.id,
                EmailTemplateVersionDB.status == "published",
            )
            .order_by(EmailTemplateVersionDB.created_at.desc())
            .first()
        )

    def send(
        self,
        tenant_id: str,
        template_type: str,
        recipient: str,
        variables: dict[str, str],
        fallback_subject: Optional[str] = None,
        fallback_html: Optional[str] = None,
    ) -> dict[str, Any]:
        """Send a templated email. Falls back to inline HTML if no published template.

        Args:
            tenant_id: Tenant UUID string.
            template_type: E.g. 'dsr_receipt', 'dsr_completion'.
            recipient: Email address.
            variables: Dict of {{key}}: value for rendering.
            fallback_subject: Subject if no template found.
            fallback_html: HTML body if no template found.

        Returns:
            A dict with ``success`` plus details of the attempt. When neither
            a published template nor a complete fallback exists, nothing is
            sent and ``{"success": False, "error": ...}`` is returned.
        """
        from compliance.services.smtp_sender import send_email

        tid = uuid.UUID(tenant_id)
        version = self.get_published_version(tenant_id, template_type)

        if version:
            subject = _render(version.subject, variables)
            body_html = _render(version.body_html, variables)
            version_id = version.id
        elif fallback_subject and fallback_html:
            subject = _render(fallback_subject, variables)
            body_html = _render(fallback_html, variables)
            version_id = None
        else:
            logger.warning("No published template for '%s' and no fallback provided", template_type)
            return {"success": False, "error": f"No template for {template_type}"}

        result = send_email(recipient=recipient, subject=subject, body_html=body_html)

        # Audit log (best-effort: a logging failure must not fail the send)
        try:
            log = EmailSendLogDB(
                tenant_id=tid,
                template_type=template_type,
                version_id=version_id,
                recipient=recipient,
                subject=subject,
                status=result.get("status", "unknown"),
                variables=variables,
                error_message=result.get("error"),
            )
            self.db.add(log)
            self.db.commit()
        except Exception as e:
            logger.warning("Failed to log email send: %s", e)
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; callers keep using this session afterwards.
            try:
                self.db.rollback()
            except Exception:
                logger.debug("Rollback after failed email audit log also failed", exc_info=True)

        return {
            "success": result.get("status") == "sent",
            "template_type": template_type,
            "recipient": recipient,
            "subject": subject,
            "used_template": version is not None,
            "status": result.get("status"),
        }
@@ -0,0 +1,179 @@
"""
Intake Extractor — derives UCCA intake flags from DETECTED SERVICES,
not from website text content.
The actual data processing happens through APIs, scripts, and cookies —
NOT through visible text on the page. A news website reporting about
healthcare does NOT process health data.
Flags are derived deterministically from:
1. Which third-party services are embedded (Google Analytics → tracking)
2. Which payment providers are used (Stripe → payment_data)
3. Which CDN/fonts are loaded (Google Fonts → cross_border_transfer)
"""
import logging
logger = logging.getLogger(__name__)
# Service category → intake flags mapping.
# Category-level table: every detected service whose "category" equals one of
# these keys contributes the flags listed for it. Per-service additions live
# in SPECIFIC_SERVICE_FLAGS below and are applied on top of these.
SERVICE_TO_FLAGS: dict[str, dict[str, bool]] = {
    # Tracking & Analytics → personal_data + tracking
    "tracking": {
        "personal_data": True,
        "tracking": True,
    },
    # Marketing → personal_data + tracking + marketing + third_party_sharing
    "marketing": {
        "personal_data": True,
        "tracking": True,
        "marketing": True,
        "third_party_sharing": True,
    },
    # Heatmap/Session Recording → personal_data + tracking + profiling
    "heatmap": {
        "personal_data": True,
        "tracking": True,
        "profiling": True,
    },
    # Payment → personal_data + payment_data
    "payment": {
        "personal_data": True,
        "payment_data": True,
    },
    # Chatbot → personal_data + customer_data (user sends messages)
    "chatbot": {
        "personal_data": True,
        "customer_data": True,
    },
    # CRM → personal_data + customer_data + profiling
    "crm": {
        "personal_data": True,
        "customer_data": True,
        "profiling": True,
    },
    # CDN → personal_data only. cross_border_transfer is NOT set here;
    # extract_intake_flags_from_services() adds it (together with
    # third_party_sharing) when the service's "eu_adequate" value is falsy.
    "cdn": {
        "personal_data": True,
    },
}
# Specific services with special flags.
# Keyed by the detected service's "id"; merged in addition to the category
# flags above. Only True values have an effect when merged.
SPECIFIC_SERVICE_FLAGS: dict[str, dict[str, bool]] = {
    "klarna": {"automated_decisions": True, "payment_data": True},
    "paypal": {"cross_border_transfer": True, "payment_data": True},
    "stripe": {"cross_border_transfer": True, "payment_data": True},
    "google_analytics": {"cross_border_transfer": True, "tracking": True},
    "facebook_pixel": {"cross_border_transfer": True, "marketing": True, "profiling": True},
    "hotjar": {"profiling": True, "tracking": True},
    "ms_clarity": {"cross_border_transfer": True, "profiling": True},
    "tiktok_pixel": {"cross_border_transfer": True, "marketing": True},
    "intercom": {"cross_border_transfer": True, "ai_usage": True},
}
def extract_intake_flags_from_services(detected_services: list[dict]) -> dict:
    """Derive intake flags from detected third-party services.

    The result is computed purely from the service metadata: each service
    contributes the flags of its ``category`` (``SERVICE_TO_FLAGS``), the
    flags of its ``id`` (``SPECIFIC_SERVICE_FLAGS``) and, when
    ``eu_adequate`` is falsy, ``cross_border_transfer`` plus
    ``third_party_sharing``. No text analysis or LLM is involved.

    Args:
        detected_services: Service metadata dicts; the keys ``category``,
            ``id`` and ``eu_adequate`` are read, all of them optional.

    Returns:
        Dict of flag name -> bool. Every known flag is present; flags only
        ever switch from False to True.
    """
    known_flags = (
        "personal_data",
        "customer_data",
        "payment_data",
        "location_data",
        "biometric_data",
        "minor_data",
        "health_data",
        "marketing",
        "profiling",
        "automated_decisions",
        "third_party_sharing",
        "cross_border_transfer",
        "tracking",
        "ai_usage",
    )
    flags = dict.fromkeys(known_flags, False)

    for service in detected_services:
        implied_by_category = SERVICE_TO_FLAGS.get(service.get("category", "other"), {})
        implied_by_service = SPECIFIC_SERVICE_FLAGS.get(service.get("id", ""), {})
        # Category flags first, then the service-specific additions.
        for implied in (implied_by_category, implied_by_service):
            flags.update({name: True for name, enabled in implied.items() if enabled})

        # A service without EU adequacy implies a third-country transfer.
        if not service.get("eu_adequate", True):
            flags.update(cross_border_transfer=True, third_party_sharing=True)

    # Any detected service means personal data is processed (IP at minimum).
    if detected_services:
        flags["personal_data"] = True

    active = {name: state for name, state in flags.items() if state}
    logger.info("Intake flags from %d services: %s", len(detected_services), active)
    return flags
# Retained only so that legacy callers keep working.
async def extract_intake_flags(text: str) -> dict:
    """DEPRECATED — use extract_intake_flags_from_services() instead.

    The *text* argument is ignored. The function logs a deprecation warning
    and returns a fixed minimal flag set.
    """
    deprecation_notice = (
        "extract_intake_flags(text) called — DEPRECATED. "
        "Use extract_intake_flags_from_services(detected_services) instead."
    )
    logger.warning(deprecation_notice)
    # A website exists, so personal data (IP) is assumed; nothing else is.
    minimal_flags = {"personal_data": True, "tracking": False}
    return minimal_flags
def flags_to_ucca_intake(flags: dict) -> dict:
    """Translate extracted intake flags into the nested UCCA intake payload.

    Missing flags count as False. Fields that have no corresponding flag
    (images, audio, employee_data, customer_support) are always False.
    """

    def flag(name: str):
        # Raw flag value, defaulting to False when the key is absent.
        return flags.get(name, False)

    if flags.get("automated_decisions"):
        automation_level = "fully_automated"
    elif flags.get("ai_usage"):
        automation_level = "partially_automated"
    else:
        automation_level = "manual"

    hosting_region = "non_eu" if flags.get("cross_border_transfer") else "eu"

    data_types = {
        "personal_data": flag("personal_data"),
        "customer_data": flag("customer_data"),
        "location_data": flag("location_data"),
        "biometric_data": flag("biometric_data"),
        "minor_data": flag("minor_data"),
        "images": False,
        "audio": False,
        "financial_data": flag("payment_data"),
        "employee_data": False,
        # Health or biometric data both count as Art. 9 special categories.
        "article_9_data": flag("health_data") or flag("biometric_data"),
    }
    purpose = {
        "marketing": flag("marketing"),
        "analytics": flag("tracking"),
        "profiling": flag("profiling"),
        "automation": flag("ai_usage"),
        "customer_support": False,
        "evaluation_scoring": flag("automated_decisions"),
        "decision_making": flag("automated_decisions"),
    }
    outputs = {
        "recommendations_to_users": flag("profiling"),
        "data_export": flag("cross_border_transfer"),
        "legal_effects": flag("automated_decisions"),
    }

    return {
        "data_types": data_types,
        "purpose": purpose,
        "automation": automation_level,
        "outputs": outputs,
        "hosting": {
            "region": hosting_region,
        },
    }
@@ -0,0 +1,152 @@
"""
Control Relevance Filter — filters out controls that are not relevant
for the analyzed document based on keyword matching.
Prevents false positives like C_TRANSPARENCY being recommended when
no AI usage is evident.
"""
import logging
import re
logger = logging.getLogger(__name__)
# Top controls with their relevance conditions.
# A control is only relevant if ANY keyword from 'requires_any' matches the text.
# If 'requires_any' is empty, the control is always relevant.
#
# Keywords are matched by filter_controls() as plain substrings of the
# lower-cased source text, so all keywords must be lower case.
# NOTE(review): substring matching means short keywords also hit inside
# longer words (e.g. "usa" inside "zusammen", "meta" inside "metadaten",
# "rating" inside "operating") — confirm this is acceptable.
CONTROL_RELEVANCE: dict[str, dict] = {
    "C_TRANSPARENCY": {
        # NOTE(review): the transparency obligations are Art. 50 in the
        # adopted AI Act (Regulation (EU) 2024/1689); "Art. 52" looks like
        # the draft numbering — confirm before changing the user-facing text.
        "description": "KI-Transparenz-Hinweis (Art. 52 AI Act)",
        "requires_any": [
            "künstliche intelligenz", "kuenstliche intelligenz",
            "artificial intelligence", "machine learning", "maschinelles lernen",
            "ki-gestützt", "ki-gestuetzt", "ai-powered", "ai system",
            "chatbot", "neural", "deep learning", "algorithmus", "algorithmen",
            "automatisierte entscheidung", "automated decision",
        ],
        "reason": "Nur relevant wenn KI/ML tatsaechlich eingesetzt wird",
    },
    "C_DSFA_REQUIRED": {
        "description": "Datenschutz-Folgenabschaetzung durchfuehren",
        "requires_any": [
            "gesundheit", "biometrisch", "genetisch", "health", "biometric",
            "scoring", "profiling", "systematisch", "umfangreich",
            "videoüberwachung", "videoueberwachung", "kamera",
            "minderjährig", "minderjaehrig", "kinder",
        ],
        "reason": "Nur bei hohem Risiko (Art. 9 Daten, Profiling, Ueberwachung)",
    },
    "C_ART22_INFO": {
        "description": "Info ueber automatisierte Einzelentscheidung (Art. 22 DSGVO)",
        "requires_any": [
            "automatisierte entscheidung", "automated decision", "scoring",
            "bonitaet", "kredit", "rating", "algorithmische entscheidung",
            "profiling", "klarna", "ratenzahlung",
        ],
        "reason": "Nur bei automatisierten Einzelentscheidungen mit Rechtswirkung",
    },
    "C_DPO_REQUIRED": {
        "description": "Datenschutzbeauftragten bestellen",
        "requires_any": [],  # Always relevant — empty means no filter
        "reason": "Generell relevant fuer Unternehmen",
    },
    "C_EXPLICIT_CONSENT": {
        "description": "Explizite Einwilligung einholen",
        "requires_any": [
            "cookie", "tracking", "analytics", "pixel", "marketing",
            "werbung", "newsletter", "remarketing", "retargeting",
            "einwilligung", "consent", "opt-in",
        ],
        "reason": "Nur bei Tracking/Marketing das Einwilligung erfordert",
    },
    "C_CHILD_PROTECTION": {
        # Fixed typo in the description ("Minderdjaehrige").
        "description": "Besonderer Schutz fuer Minderjaehrige",
        "requires_any": [
            "kinder", "minderjährig", "minderjaehrig", "jugend",
            "under 16", "unter 16", "schüler", "schueler", "child",
        ],
        "reason": "Nur wenn Daten von Minderjaehrigen verarbeitet werden",
    },
    "C_THIRD_COUNTRY_SAFEGUARDS": {
        "description": "Drittlandtransfer absichern (Art. 44-49 DSGVO)",
        "requires_any": [
            # "drittst" and "third countr" are deliberate prefixes.
            "usa", "united states", "drittland", "drittst", "third countr",
            "standardvertragsklausel", "sccs", "binding corporate",
            "angemessenheitsbeschluss", "adequacy",
            "google", "meta", "facebook", "amazon", "microsoft", "apple",
            "cloudflare", "stripe", "paypal",
        ],
        "reason": "Nur bei Datentransfer in Drittlaender",
    },
}
def filter_controls(
    controls: list[str],
    source_text: str,
    intake_flags: dict | None = None,
) -> list[str]:
    """Drop controls that are not relevant to the analyzed text.

    A control with an entry in ``CONTROL_RELEVANCE`` is kept when its
    keyword list is empty, when any keyword occurs as a substring of the
    lower-cased *source_text*, or — as a fallback — when *intake_flags*
    mark it as relevant via ``_check_flags``. Controls without a
    recognisable ``[C_XXX]`` prefix or without a rule are always kept.

    Args:
        controls: Control strings such as ``"[C_TRANSPARENCY] Nutzer informieren..."``.
        source_text: Text the controls were derived from.
        intake_flags: Optional flags used when no keyword matches.

    Returns:
        The kept controls in their original order; an empty/None input is
        returned as-is.
    """
    if not controls:
        return controls

    haystack = source_text.lower()
    kept: list[str] = []
    dropped: list[tuple[str, str]] = []

    for entry in controls:
        cid = _extract_control_id(entry)
        rule = CONTROL_RELEVANCE.get(cid) if cid else None
        if rule is None:
            # Unknown control — keep it (don't filter what we don't understand)
            kept.append(entry)
            continue

        required = rule["requires_any"]
        # An empty keyword list means "no filter"; otherwise any hit suffices.
        matched_by_text = not required or any(keyword in haystack for keyword in required)
        if matched_by_text or (intake_flags and _check_flags(cid, intake_flags)):
            kept.append(entry)
        else:
            dropped.append((cid, rule["reason"]))

    if dropped:
        logger.info(
            "Relevance filter removed %d controls: %s",
            len(dropped),
            ", ".join(f"{cid} ({reason})" for cid, reason in dropped),
        )
    return kept
def _extract_control_id(control: str) -> str | None:
"""Extract control ID from '[C_XXX] description' format."""
match = re.match(r"\[([A-Z_0-9]+)\]", control)
return match.group(1) if match else None
def _check_flags(control_id: str, flags: dict) -> bool:
"""Check if intake flags make a control relevant."""
flag_map = {
"C_TRANSPARENCY": flags.get("ai_usage", False),
"C_DSFA_REQUIRED": flags.get("health_data", False) or flags.get("biometric_data", False),
"C_ART22_INFO": flags.get("automated_decisions", False),
"C_EXPLICIT_CONSENT": flags.get("tracking", False) or flags.get("marketing", False),
"C_CHILD_PROTECTION": flags.get("minor_data", False),
"C_THIRD_COUNTRY_SAFEGUARDS": flags.get("cross_border_transfer", False),
}
return flag_map.get(control_id, False)
@@ -0,0 +1,209 @@
"""
TCF 2.2 TC String Encoder — generates IAB Transparency & Consent strings.
Implements the TC String v2.2 format per IAB specification.
The TC String is a base64url-encoded bitfield containing:
- CMP metadata (ID, version, screen, consent language)
- Purpose consents (12 standard IAB purposes)
- Vendor consents (per IAB vendor ID)
- Legitimate interest signals
Reference: https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework
NOTE: This is a simplified encoder for CMP integration. For full GVL
(Global Vendor List) support, integrate with the IAB GVL API.
"""
import base64
import math
from datetime import datetime, timezone
from typing import Any
# IAB TCF purposes, keyed by purpose ID, with English and German display names.
# Returned to API consumers by TCFEncoderService.get_purposes().
# NOTE(review): the published TCF v2.2 policy appears to define 11 purposes
# with partly different wording (e.g. purpose 2 "Use limited data to select
# advertising"); entry 12 and the names below should be checked against the
# current IAB policy / Global Vendor List — TODO confirm.
IAB_PURPOSES = {
    1: {"name": "Store and/or access information on a device", "name_de": "Informationen auf Geraet speichern/abrufen"},
    2: {"name": "Select basic ads", "name_de": "Einfache Anzeigen auswaehlen"},
    3: {"name": "Create a personalised ads profile", "name_de": "Personalisiertes Anzeigenprofil erstellen"},
    4: {"name": "Select personalised ads", "name_de": "Personalisierte Anzeigen auswaehlen"},
    5: {"name": "Create a personalised content profile", "name_de": "Personalisiertes Inhaltsprofil erstellen"},
    6: {"name": "Select personalised content", "name_de": "Personalisierte Inhalte auswaehlen"},
    7: {"name": "Measure ad performance", "name_de": "Anzeigen-Leistung messen"},
    8: {"name": "Measure content performance", "name_de": "Inhalte-Leistung messen"},
    9: {"name": "Apply market research to generate audience insights", "name_de": "Marktforschung fuer Zielgruppen"},
    10: {"name": "Develop and improve products", "name_de": "Produkte entwickeln und verbessern"},
    11: {"name": "Use limited data to select content", "name_de": "Eingeschraenkte Daten fuer Inhalte nutzen"},
    12: {"name": "Use limited data to select ads", "name_de": "Eingeschraenkte Daten fuer Anzeigen nutzen"},
}
# IAB Special Features, keyed by special-feature ID.
# Returned by TCFEncoderService.get_special_features().
IAB_SPECIAL_FEATURES = {
    1: {"name": "Use precise geolocation data", "name_de": "Praezise Standortdaten verwenden"},
    2: {"name": "Actively scan device characteristics for identification", "name_de": "Geraetemerkmale aktiv scannen"},
}
# Category-to-Purpose mapping (how our banner categories map to IAB purposes).
# Used by TCFEncoderService.encode_from_categories(): every purpose ID listed
# for a selected category is encoded as consented. Unknown categories map to
# no purposes.
CATEGORY_PURPOSE_MAP = {
    "necessary": [],  # No consent needed
    "functional": [1, 11],  # Device access + limited data for content
    "statistics": [1, 7, 8, 9, 10],  # Device access + measurement + research
    "marketing": [1, 2, 3, 4, 5, 6, 7, 12],  # Most purposes
}
def _int_to_bits(value: int, length: int) -> str:
"""Convert integer to fixed-length bit string."""
return bin(value)[2:].zfill(length)
def _datetime_to_deciseconds(dt: datetime) -> int:
"""Convert datetime to deciseconds since epoch (IAB format)."""
epoch = datetime(2000, 1, 1, tzinfo=timezone.utc)
return int((dt - epoch).total_seconds() * 10)
def _bits_to_base64url(bits: str) -> str:
"""Convert bit string to base64url encoding (TC String format)."""
# Pad to multiple of 8
padding = (8 - len(bits) % 8) % 8
bits += "0" * padding
# Convert to bytes
byte_array = bytearray()
for i in range(0, len(bits), 8):
byte_array.append(int(bits[i:i+8], 2))
# Base64url encode (no padding)
return base64.urlsafe_b64encode(bytes(byte_array)).rstrip(b"=").decode("ascii")
class TCFEncoderService:
    """Generates TC Strings (core segment) per the IAB TCF v2 bit layout.

    Vendor sections are always written as bitfields (no range encoding) and
    no publisher restrictions are emitted.
    """

    def __init__(
        self,
        cmp_id: int = 1,
        cmp_version: int = 1,
        consent_screen: int = 1,
        consent_language: str = "DE",
        publisher_cc: str = "DE",
    ):
        """Store the CMP metadata written into every TC String.

        Args:
            cmp_id: CMP ID (12-bit field).
            cmp_version: CMP version (12-bit field).
            consent_screen: Screen number of the consent UI (6-bit field).
            consent_language: Two-letter language code of the consent UI.
            publisher_cc: Two-letter country code of the publisher; the
                default "DE" is the value that used to be hard-coded.
        """
        self.cmp_id = cmp_id
        self.cmp_version = cmp_version
        self.consent_screen = consent_screen
        self.consent_language = consent_language
        self.publisher_cc = publisher_cc

    @staticmethod
    def _uint(value: int, length: int, field_name: str) -> str:
        """Encode *value* as an unsigned field, rejecting values that do not fit."""
        if value < 0 or value >> length:
            raise ValueError(f"{field_name}={value} does not fit into {length} bits")
        return _int_to_bits(value, length)

    @staticmethod
    def _letters_to_bits(code: str, field_name: str) -> str:
        """Encode a two-letter code as 2 × 6 bits (A=0 … Z=25)."""
        # Only the first two characters are used, so "de-DE" still encodes "DE".
        letters = code.upper()[:2]
        if len(letters) != 2 or not all("A" <= ch <= "Z" for ch in letters):
            raise ValueError(f"{field_name} must start with two letters A-Z, got {code!r}")
        return "".join(_int_to_bits(ord(ch) - ord("A"), 6) for ch in letters)

    @staticmethod
    def _flag_bits(flags: dict[int, bool], width: int) -> str:
        """Return one bit per ID 1..width; IDs missing from *flags* are 0."""
        return "".join("1" if flags.get(i, False) else "0" for i in range(1, width + 1))

    @classmethod
    def _vendor_section(cls, vendors: dict[int, bool], field_name: str) -> str:
        """Encode a vendor section as MaxVendorId (16) + IsRangeEncoding=0 + bitfield."""
        # MaxVendorId is the highest key, whether its value is True or False.
        max_vendor = max(0, max(vendors, default=0))
        return (
            cls._uint(max_vendor, 16, field_name)
            + "0"  # IsRangeEncoding = 0 (bitfield)
            + cls._flag_bits(vendors, max_vendor)
        )

    def encode(
        self,
        purpose_consents: dict[int, bool],
        vendor_consents: dict[int, bool],
        purpose_li: dict[int, bool] | None = None,
        special_features: dict[int, bool] | None = None,
        vendor_li: dict[int, bool] | None = None,
    ) -> str:
        """Generate a TC String from consent decisions.

        Args:
            purpose_consents: {purpose_id: True/False}; IDs 1-24 are encoded.
            vendor_consents: {vendor_id: True/False} for IAB vendor IDs.
            purpose_li: Legitimate interest signals per purpose (IDs 1-24).
            special_features: Special feature opt-ins (IDs 1-12).
            vendor_li: Vendor legitimate-interest signals. When omitted, the
                vendor consent signals are reused for this section (the
                previous, simplified behavior).

        Returns:
            Base64url-encoded TC String

        Raises:
            ValueError: If a numeric field does not fit its bit width or a
                language/country code does not start with two letters A-Z.
        """
        # NOTE(review): the TCF spec may require Created/LastUpdated to be
        # truncated to day precision — TODO confirm; unchanged here.
        created = _datetime_to_deciseconds(datetime.now(timezone.utc))
        vendor_interest = vendor_consents if vendor_li is None else vendor_li

        segments = [
            # Core TC String v2 fields
            self._uint(2, 6, "Version"),
            self._uint(created, 36, "Created"),
            self._uint(created, 36, "LastUpdated"),
            self._uint(self.cmp_id, 12, "CmpId"),
            self._uint(self.cmp_version, 12, "CmpVersion"),
            self._uint(self.consent_screen, 6, "ConsentScreen"),
            # ConsentLanguage (12 bits = 2 × 6-bit letters)
            self._letters_to_bits(self.consent_language, "consent_language"),
            # VendorListVersion (12 bits) — 0 because no GVL is fetched
            self._uint(0, 12, "VendorListVersion"),
            # TcfPolicyVersion (6 bits) = 4 for TCF 2.2
            self._uint(4, 6, "TcfPolicyVersion"),
            "1",  # IsServiceSpecific
            "0",  # UseNonStandardTexts
            self._flag_bits(special_features or {}, 12),  # SpecialFeatureOptIns
            self._flag_bits(purpose_consents, 24),  # PurposesConsent
            self._flag_bits(purpose_li or {}, 24),  # PurposesLITransparency
            "0",  # PurposeOneTreatment
            self._letters_to_bits(self.publisher_cc, "publisher_cc"),  # PublisherCC
            self._vendor_section(vendor_consents, "MaxVendorId (consent)"),
            self._vendor_section(vendor_interest, "MaxVendorId (legitimate interest)"),
            # NumPubRestrictions (12 bits): the core string ends with the
            # publisher restrictions section; 0 = no restriction records.
            self._uint(0, 12, "NumPubRestrictions"),
        ]
        return _bits_to_base64url("".join(segments))

    def encode_from_categories(
        self,
        categories: list[str],
        vendor_consents: dict[int, bool] | None = None,
    ) -> str:
        """Generate TC String from banner category selections.

        Maps our banner categories (necessary, statistics, marketing, functional)
        to IAB purposes via CATEGORY_PURPOSE_MAP and generates the TC String.
        Unknown categories contribute no purposes.
        """
        purpose_consents: dict[int, bool] = {}
        for cat in categories:
            for purpose_id in CATEGORY_PURPOSE_MAP.get(cat, []):
                purpose_consents[purpose_id] = True
        return self.encode(
            purpose_consents=purpose_consents,
            vendor_consents=vendor_consents or {},
        )

    @staticmethod
    def get_purposes() -> list[dict[str, Any]]:
        """Return every entry of IAB_PURPOSES as {id, name, name_de}."""
        return [
            {"id": pid, "name": info["name"], "name_de": info["name_de"]}
            for pid, info in IAB_PURPOSES.items()
        ]

    @staticmethod
    def get_special_features() -> list[dict[str, Any]]:
        """Return every entry of IAB_SPECIAL_FEATURES as {id, name, name_de}."""
        return [
            {"id": fid, "name": info["name"], "name_de": info["name_de"]}
            for fid, info in IAB_SPECIAL_FEATURES.items()
        ]

    @staticmethod
    def get_category_purpose_map() -> dict[str, list[int]]:
        """Return the module-level CATEGORY_PURPOSE_MAP (the shared object, not a copy)."""
        return CATEGORY_PURPOSE_MAP
@@ -0,0 +1,159 @@
"""
Training Link Service — bridges document review approvals with the Academy.
After a document is approved, checks which roles need training on that
document type and identifies gaps (missing/overdue assignments).
Gracefully handles missing training tables (Go service not migrated yet).
"""
import logging
from typing import Any
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
class TrainingLinkService:
    """Links document approvals to training requirements.

    All lookups are best effort: a failing query is logged, the session is
    rolled back (as ``_training_tables_exist`` always did) and an empty
    result is returned instead of raising.
    """

    def __init__(self, db: Session) -> None:
        self.db = db

    def _rollback_quietly(self) -> None:
        """Roll back the session after a failed statement; never raises."""
        try:
            self.db.rollback()
        except Exception:
            logger.debug("Rollback after failed training query also failed", exc_info=True)

    def _training_tables_exist(self) -> bool:
        """Check if the Go-managed training tables exist."""
        try:
            # LIMIT 0: only proves the table can be queried, fetches no rows.
            self.db.execute(text("SELECT 1 FROM training_modules LIMIT 0"))
            return True
        except Exception:
            self._rollback_quietly()
            return False

    def get_role_codes_for_document(self, tenant_id: str, document_type: str) -> list[dict]:
        """Map document type → org roles → training role codes.

        Returns:
            One dict per mapping row with ``role_key`` and
            ``training_role_code`` (None when the role has no training
            mapping); an empty list when the query fails.
        """
        try:
            # The tenant condition must be parenthesized: AND binds tighter
            # than OR, so without the parentheses the document-type filter
            # would only apply to the '__default__' rows.
            q = text("""
                SELECT m.role_key, t.training_role_code
                FROM compliance_document_role_mapping m
                LEFT JOIN compliance_role_training_mapping t
                    ON t.org_role_key = m.role_key
                    AND (t.tenant_id = :tid OR t.tenant_id = '__default__')
                WHERE (m.tenant_id = :tid OR m.tenant_id = '__default__')
                    AND m.document_type = :dt
            """)
            rows = self.db.execute(q, {"tid": tenant_id, "dt": document_type}).fetchall()
            return [{"role_key": r.role_key, "training_role_code": r.training_role_code} for r in rows]
        except Exception as e:
            logger.warning("Failed to get role codes: %s", e)
            self._rollback_quietly()
            return []

    def get_training_requirements(self, tenant_id: str, document_type: str) -> dict[str, Any]:
        """Get training modules required for roles associated with a document type.

        Returns:
            A dict with ``academy_available`` and ``requirements`` plus,
            depending on the outcome, ``message``, ``total`` or ``error``.
        """
        if not self._training_tables_exist():
            return {
                "academy_available": False,
                "message": "Academy noch nicht eingerichtet. Training-Module werden nach Aktivierung automatisch verknuepft.",
                "requirements": [],
            }

        role_mappings = self.get_role_codes_for_document(tenant_id, document_type)
        if not role_mappings:
            return {"academy_available": True, "message": "Keine Rollen-Zuordnung fuer diesen Dokumenttyp.", "requirements": []}

        # De-duplicate while keeping order; duplicates add nothing to IN (...).
        role_codes = list(dict.fromkeys(
            r["training_role_code"] for r in role_mappings if r.get("training_role_code")
        ))
        if not role_codes:
            return {"academy_available": True, "message": "Keine Training-Codes konfiguriert.", "requirements": []}

        try:
            # Only the generated placeholder names are interpolated into the
            # SQL; the role codes themselves are passed as bound parameters.
            placeholders = ",".join(f":rc{i}" for i in range(len(role_codes)))
            params: dict[str, Any] = {"tid": tenant_id}
            for i, rc in enumerate(role_codes):
                params[f"rc{i}"] = rc
            q = text(f"""
                SELECT tm.role_code, m.module_code, m.title, m.description,
                       m.frequency_type, m.duration_minutes, tm.is_mandatory
                FROM training_matrix tm
                JOIN training_modules m ON m.id = tm.module_id
                WHERE tm.tenant_id = :tid AND tm.role_code IN ({placeholders})
                    AND m.is_active = TRUE
                ORDER BY tm.role_code, m.sort_order
            """)
            rows = self.db.execute(q, params).fetchall()
            reqs = [dict(r._mapping) for r in rows]
            return {"academy_available": True, "requirements": reqs, "total": len(reqs)}
        except Exception as e:
            logger.warning("Failed to query training requirements: %s", e)
            self._rollback_quietly()
            return {"academy_available": True, "requirements": [], "error": str(e)}

    def check_training_gaps(
        self, tenant_id: str, document_type: str, project_id: str | None = None,
    ) -> dict[str, Any]:
        """Check which persons assigned to roles have outstanding training.

        For every role mapped to *document_type* that has a training role
        code, the assigned person is looked up and each mandatory, active
        module is reported as a gap unless the person's latest assignment
        has status "completed" or "passed".

        Returns:
            A dict with ``academy_available``, ``gaps`` and ``total_gaps``.
        """
        if not self._training_tables_exist():
            return {"academy_available": False, "gaps": [], "total_gaps": 0}

        role_mappings = self.get_role_codes_for_document(tenant_id, document_type)
        if not role_mappings:
            return {"academy_available": True, "gaps": [], "total_gaps": 0}

        gaps = []
        for rm in role_mappings:
            role_key = rm["role_key"]
            role_code = rm.get("training_role_code")
            if not role_code:
                continue

            # Get person assigned to this role. The WHERE clause is built
            # from fixed fragments only; all values are bound parameters.
            where = "tenant_id = :tid AND role_key = :rk"
            params: dict[str, Any] = {"tid": tenant_id, "rk": role_key}
            if project_id:
                where += " AND (project_id = :pid OR project_id IS NULL)"
                params["pid"] = project_id
            # NOTE(review): LIMIT 1 without ORDER BY — if both a
            # project-specific and a project-less row exist, which one is
            # returned is up to the database. Confirm the intended preference.
            try:
                person = self.db.execute(text(
                    f"SELECT person_name, person_email, role_label FROM compliance_org_roles WHERE {where} LIMIT 1"
                ), params).fetchone()
            except Exception as e:
                logger.warning("Failed to look up person for role '%s': %s", role_key, e)
                self._rollback_quietly()
                continue
            if not person or not person.person_name:
                continue

            # Get required modules for this role code
            try:
                modules = self.db.execute(text("""
                    SELECT m.id, m.module_code, m.title FROM training_matrix tm
                    JOIN training_modules m ON m.id = tm.module_id
                    WHERE tm.tenant_id = :tid AND tm.role_code = :rc AND m.is_active = TRUE AND tm.is_mandatory = TRUE
                """), {"tid": tenant_id, "rc": role_code}).fetchall()
            except Exception as e:
                logger.warning("Failed to load mandatory modules for role code '%s': %s", role_code, e)
                self._rollback_quietly()
                continue

            for mod in modules:
                # Check if assignment exists and is completed
                try:
                    assignment = self.db.execute(text("""
                        SELECT status, progress_percent FROM training_assignments
                        WHERE tenant_id = :tid AND module_id = :mid AND user_email = :email
                        ORDER BY created_at DESC LIMIT 1
                    """), {"tid": tenant_id, "mid": mod.id, "email": person.person_email}).fetchone()
                except Exception as e:
                    # As before, an unreadable assignment is reported as a
                    # not-started gap rather than hidden.
                    logger.warning("Failed to load assignment for module '%s': %s", mod.module_code, e)
                    self._rollback_quietly()
                    assignment = None

                if not assignment or assignment.status not in ("completed", "passed"):
                    gaps.append({
                        "person_name": person.person_name,
                        "person_email": person.person_email,
                        "role": person.role_label,
                        "role_key": role_key,
                        "module_code": mod.module_code,
                        "module_title": mod.title,
                        "status": assignment.status if assignment else "nicht_begonnen",
                        "progress": assignment.progress_percent if assignment else 0,
                    })

        return {"academy_available": True, "gaps": gaps, "total_gaps": len(gaps)}
@@ -0,0 +1,148 @@
"""
Website Compliance Checks — checks public website for consumer protection
compliance (§312k BGB, §5 TMG, Art. 13 DSGVO, Cookie-Banner).
Extracted from agent_analyze_routes.py to keep route files slim.
"""
import re
import httpx
class FollowUpQuestion:
    """A question to put to the user when a check cannot be decided from HTML alone.

    Plain attribute container: ``id`` identifies the question, ``question``
    is the text to ask, ``legal_basis`` and ``severity`` describe the
    underlying requirement, and ``finding_if_no`` is the finding text
    associated with a negative answer.
    """

    def __init__(self, id: str, question: str, legal_basis: str, severity: str, finding_if_no: str):
        # Attributes are set left to right, i.e. in parameter order.
        (
            self.id,
            self.question,
            self.legal_basis,
            self.severity,
            self.finding_if_no,
        ) = (id, question, legal_basis, severity, finding_if_no)
async def check_website_compliance(
    client: httpx.AsyncClient, url: str, html: str,
) -> tuple[list[str], list[FollowUpQuestion]]:
    """Scan public website for consumer protection compliance.

    Runs four heuristic checks on the lower-cased HTML: cancellation button
    (§312k BGB, e-commerce/subscription sites only), imprint link, privacy
    policy link and cookie consent banner.

    Args:
        client: HTTP client used to probe well-known cancellation URLs.
        url: URL of the analyzed page; its host is used for the probes.
        html: Raw HTML of the page.

    Returns:
        ``(findings, follow_ups)``: finding texts for detected gaps and
        follow-up questions for points that cannot be decided from the
        public page alone.
    """
    findings: list[str] = []
    follow_ups: list[FollowUpQuestion] = []
    html_lower = html.lower()
    base_domain = re.sub(r"https?://([^/]+).*", r"\1", url)

    # E-Commerce detection — §312k only applies to sites with online contracts
    # NOTE(review): these are unanchored substring patterns; e.g. r"abo" also
    # matches "about" and r"order" matches "border", so most pages will be
    # classified as e-commerce. Confirm whether that is intended.
    ecommerce_indicators = [
        r"warenkorb", r"cart", r"shop", r"bestell", r"order",
        r"checkout", r"kasse", r"kaufen", r"add.?to.?cart",
        r"stripe|paypal|klarna|mollie|adyen",
        r"abo", r"mitgliedschaft", r"subscription", r"premium",
    ]
    is_ecommerce = any(re.search(p, html_lower) for p in ecommerce_indicators)

    # --- §312k BGB: cancellation button (ONLY for e-commerce/subscription sites) ---
    # The whole check, including the network probes, is skipped for other
    # sites: its result would never be reported for them.
    if is_ecommerce:
        cancel_patterns = [
            r'href="[^"]*(?:kuendig|kündig|cancel|vertrag.?beenden|abo.?beenden|mitgliedschaft.?beenden)[^"]*"',
            r'(?:kündigen|kuendigen|vertrag beenden|abo beenden|mitgliedschaft kündigen)',
        ]
        has_cancel_link = any(re.search(p, html_lower) for p in cancel_patterns)

        if not has_cancel_link:
            cancel_urls_to_probe = [
                f"https://{base_domain}/kuendigen",
                f"https://{base_domain}/cancel",
                f"https://{base_domain}/vertrag-kuendigen",
                f"https://{base_domain}/abo-kuendigen",
                f"https://{base_domain}/account/cancel",
            ]
            for probe_url in cancel_urls_to_probe:
                try:
                    probe = await client.head(probe_url, follow_redirects=True, timeout=5.0)
                    if probe.status_code < 400:
                        has_cancel_link = True
                        break
                except Exception:
                    # Best effort: an unreachable probe URL just counts as "not found".
                    continue

        if not has_cancel_link:
            findings.append(
                "[§312k BGB] Kein oeffentlich sichtbarer Kuendigungsbutton gefunden. "
                "Seit 01.07.2022 muessen online geschlossene Vertraege mit max. 2 Klicks kuendbar sein."
            )
            follow_ups.append(FollowUpQuestion(
                id="cancel_button_312k",
                question="Koennen Sie nach Login im Kundenbereich innerhalb von 2 Klicks Ihren Vertrag kuendigen?",
                legal_basis="§ 312k BGB (Kuendigungsbutton), Omnibus-Richtlinie (EU) 2019/2161",
                severity="high",
                finding_if_no=(
                    "[§312k BGB] VERSTOSS: Kein funktionaler Kuendigungsbutton vorhanden. "
                    "Der Anbieter ist verpflichtet, einen leicht auffindbaren Kuendigungsbutton "
                    "bereitzustellen (max. 2 Klicks). Ein Zwang zur telefonischen Kuendigung "
                    "oder Kuendigung per Brief ist rechtswidrig."
                ),
            ))

    # --- Imprint obligation (§5 TMG / §18 MStV) ---
    imprint_patterns = [
        r'href="[^"]*(?:impressum|imprint|legal.?notice|about.?us/legal)[^"]*"',
        r'>impressum<',
    ]
    has_imprint = any(re.search(p, html_lower) for p in imprint_patterns)
    if not has_imprint:
        findings.append(
            "[§5 TMG] Kein Impressum-Link auf der Seite gefunden. "
            "Geschaeftsmaessige Online-Dienste muessen ein leicht erreichbares Impressum bereitstellen."
        )

    # --- Privacy policy linked? ---
    privacy_patterns = [
        r'href="[^"]*(?:datenschutz|privacy|dsgvo)[^"]*"',
        r'>datenschutz<',
    ]
    has_privacy = any(re.search(p, html_lower) for p in privacy_patterns)
    if not has_privacy:
        findings.append(
            "[Art. 13 DSGVO] Kein Link zur Datenschutzerklaerung gefunden. "
            "Nutzer muessen ueber die Verarbeitung personenbezogener Daten informiert werden."
        )

    # --- Cookie consent banner ---
    cookie_patterns = [
        r'(?:cookie.?consent|cookie.?banner|consent.?manager|didomi|cookiebot|onetrust|usercentrics)',
        r'(?:gdpr|dsgvo).?(?:consent|einwilligung)',
    ]
    has_cookie_consent = any(re.search(p, html_lower) for p in cookie_patterns)
    if not has_cookie_consent:
        # Absence in the static HTML is not proof, so this is only a follow-up.
        follow_ups.append(FollowUpQuestion(
            id="cookie_consent",
            question="Wird beim ersten Besuch der Website ein Cookie-Consent-Banner angezeigt?",
            legal_basis="§ 25 TDDDG (ehem. TTDSG), Art. 5(3) ePrivacy-Richtlinie",
            severity="medium",
            finding_if_no=(
                "[§25 TDDDG] Kein Cookie-Consent-Banner erkannt. "
                "Vor dem Setzen nicht-essentieller Cookies ist eine Einwilligung erforderlich."
            ),
        ))

    return findings, follow_ups
def to_string_list(items: list) -> list[str]:
    """Flatten a list of dicts and/or arbitrary values into display strings.

    Dicts are rendered as ``"[code] description"``. The description is the
    first present key of ``description``, ``name``, ``code`` (else the
    dict's ``str()``); the code is ``code``, falling back to ``id``. With an
    empty code only the description is returned. Anything that is not a
    dict is passed through ``str()``. ``None`` or an empty list gives ``[]``.
    """

    def _stringify(entry) -> str:
        if not isinstance(entry, dict):
            return str(entry)

        for key in ("description", "name", "code"):
            if key in entry:
                label = entry[key]
                break
        else:
            label = str(entry)

        tag = entry["code"] if "code" in entry else entry.get("id", "")
        return f"[{tag}] {label}" if tag else str(label)

    return [_stringify(entry) for entry in (items or [])]
def risk_to_escalation(risk_level: str) -> str:
    """Translate a UCCA risk level into an escalation level.

    The comparison is case-insensitive. An empty, missing or unknown risk
    level maps to "E0".
    """
    normalized = risk_level.upper() if risk_level else ""
    escalation_by_risk = (
        ("MINIMAL", "E0"),
        ("LIMITED", "E1"),
        ("HIGH", "E2"),
        ("UNACCEPTABLE", "E3"),
    )
    for level, escalation in escalation_by_risk:
        if normalized == level:
            return escalation
    return "E0"
@@ -40,107 +40,8 @@ class ScanResult:
missing_pages: dict = field(default_factory=dict) # url -> status_code
# ── Service Registry ──────────────────────────────────────────────────────────
# Each entry: regex pattern -> service metadata
#
# Keys are regular-expression source strings; values are metadata dicts that
# all carry the same fields:
#   id               – stable machine identifier of the service
#   name             – human-readable service name
#   category         – one of the groupings used below (tracking, marketing,
#                      cdn, chatbot, payment, other)
#   provider         – company operating the service
#   country          – provider location code (or "EU/Self", "EU")
#   eu_adequate      – bool flag stored per entry
#   requires_consent – bool flag stored per entry
#   legal_ref        – free-text legal reference (German, emitted at runtime)
#
# NOTE(review): several patterns contain uppercase literals (UA-, G-,
# _hjSettings, tidioChatApi). They cannot match lowercased input unless the
# caller passes re.IGNORECASE; with re.IGNORECASE, G-\w{5,} would also match
# strings such as "g-recaptcha". Confirm how callers apply these patterns.
# NOTE(review): the name SERVICE_REGISTRY is re-bound by the import that
# directly follows this literal — confirm which definition is meant to win.
SERVICE_REGISTRY: dict[str, dict] = {
# --- Tracking & Analytics ---
r"google.?analytics|gtag\(|UA-\d+|G-\w{5,}": {
"id": "google_analytics", "name": "Google Analytics", "category": "tracking",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
r"googletagmanager|gtm\.js": {
"id": "google_tag_manager", "name": "Google Tag Manager", "category": "tracking",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO",
},
r"facebook\.net/.*fbevents|fbq\(": {
"id": "facebook_pixel", "name": "Meta/Facebook Pixel", "category": "marketing",
"provider": "Meta Platforms", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
r"hotjar\.com|_hjSettings": {
"id": "hotjar", "name": "Hotjar", "category": "tracking",
"provider": "Hotjar Ltd", "country": "MT", "eu_adequate": True,
"requires_consent": True, "legal_ref": "§25 TDDDG (Session Recording)",
},
r"clarity\.ms": {
"id": "ms_clarity", "name": "Microsoft Clarity", "category": "tracking",
"provider": "Microsoft", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "§25 TDDDG (Session Replay), Art. 44 DSGVO",
},
r"matomo|piwik": {
"id": "matomo", "name": "Matomo", "category": "tracking",
"provider": "InnoCraft/Self-hosted", "country": "EU/Self", "eu_adequate": True,
"requires_consent": False, "legal_ref": "Cookieless moeglich, §25 TDDDG",
},
r"plausible\.io": {
"id": "plausible", "name": "Plausible Analytics", "category": "tracking",
"provider": "Plausible Insights", "country": "EE", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU-Anbieter, cookieless",
},
# --- CDN & Fonts ---
r"fonts\.googleapis\.com|fonts\.gstatic\.com": {
"id": "google_fonts", "name": "Google Fonts (remote)", "category": "cdn",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "LG Muenchen I, Az. 3 O 17493/20",
},
r"cdn\.cloudflare\.com|cdnjs\.cloudflare\.com": {
"id": "cloudflare_cdn", "name": "Cloudflare CDN", "category": "cdn",
"provider": "Cloudflare Inc", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 44-49 DSGVO, berechtigtes Interesse",
},
# --- Chatbots ---
r"widget\.intercom\.io|intercomcdn": {
"id": "intercom", "name": "Intercom", "category": "chatbot",
"provider": "Intercom Inc", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, KI-gestuetzt",
},
r"tidio\.co|tidioChatApi": {
"id": "tidio", "name": "Tidio Chat", "category": "chatbot",
"provider": "Tidio LLC", "country": "PL", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU-Anbieter",
},
r"zendesk\.com/embeddable|zdassets": {
"id": "zendesk", "name": "Zendesk", "category": "chatbot",
"provider": "Zendesk Inc", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO",
},
# --- Payment ---
r"js\.stripe\.com|stripe\.com/v3": {
"id": "stripe", "name": "Stripe", "category": "payment",
"provider": "Stripe Inc", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 6(1)(b) Vertragserfuellung, SCCs",
},
r"paypal\.com/sdk|paypalobjects": {
"id": "paypal", "name": "PayPal", "category": "payment",
"provider": "PayPal Holdings", "country": "US", "eu_adequate": False,
"requires_consent": False, "legal_ref": "Art. 6(1)(b) Vertragserfuellung",
},
r"klarna\.com|klarna-payments": {
"id": "klarna", "name": "Klarna", "category": "payment",
"provider": "Klarna AB", "country": "SE", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU, aber Art. 22 DSGVO bei Bonitaetspruefung!",
},
# --- Captcha ---
r"recaptcha|grecaptcha": {
"id": "recaptcha", "name": "Google reCAPTCHA", "category": "other",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, §25 TDDDG",
},
# --- Video ---
r"youtube\.com/embed|youtube-nocookie|ytimg": {
"id": "youtube", "name": "YouTube", "category": "other",
"provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "Art. 44-49 DSGVO, 2-Klick empfohlen",
},
# --- Consent Management ---
# A single catch-all entry: any of the listed CMP vendor names maps to the
# same generic "cmp" record rather than one record per vendor.
r"didomi|cookiebot|onetrust|usercentrics|consentmanager|quantcast": {
"id": "cmp", "name": "Consent Management Platform", "category": "other",
"provider": "Various", "country": "EU", "eu_adequate": True,
"requires_consent": False, "legal_ref": "CMP vorhanden — gut",
},
}
# ── Service Registry (imported from master) ──────────────────────────────────
from compliance.services.service_registry import SERVICE_REGISTRY # noqa: E402
AI_TEXT_PATTERNS = [
r"k(?:ue|ü)nstliche.?intelligenz",
@@ -157,9 +58,13 @@ AI_TEXT_PATTERNS = [
# (regex, label) pairs used to discover legal/footer pages from anchor hrefs.
# Group 1 of every regex captures the href value; the label names the kind of
# page the link points to.
#
# The older, narrower "datenschutz" entry has been dropped: the entry kept
# below matches every href the old one did and carries the same label, so
# keeping both only made each privacy link match twice.
FOOTER_LINK_PATTERNS = [
    (r'href="([^"]*(?:impressum|imprint|legal-notice)[^"]*)"', "impressum"),
    (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
    (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
    (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
    # Deep DSE links (regional pages, sub-pages, service marks)
    (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
    # Navigation links often contain DSB/privacy sub-pages
    (r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"),
]
@@ -183,15 +88,46 @@ async def scan_website(base_url: str) -> ScanResult:
href = match.group(1)
if href.startswith("/"):
href = urljoin(origin, href)
if href.startswith(origin):
if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href):
page_urls.add(href)
# 3. Scan all pages (max 10)
for url in list(page_urls)[:10]:
html = start_html if url == origin else await _fetch_page(client, url, result)
if html:
# 3. Scan all pages in PARALLEL (max 10)
import asyncio
other_urls = [u for u in list(page_urls)[:10] if u != origin]
fetch_tasks = [_fetch_page(client, u, result) for u in other_urls]
other_htmls = await asyncio.gather(*fetch_tasks, return_exceptions=True)
# Process start page
_detect_services(start_html, origin, result)
_detect_ai_mentions(start_html, origin, result)
# Process other pages + discover DSE-internal links
dse_internal_urls = set()
for url, html in zip(other_urls, other_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
_detect_ai_mentions(html, url, result)
# If this is a DSE page, find links within it (SAME DOMAIN only)
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
for pattern, _ in FOOTER_LINK_PATTERNS:
for match in re.finditer(pattern, html, re.IGNORECASE):
href = match.group(1)
if href.startswith("/"):
href = urljoin(origin, href)
# IMPORTANT: Only follow links on the SAME domain
# External links (etracker.com, google.de) must NOT be scanned
if href.startswith(origin) and href not in page_urls:
dse_internal_urls.add(href)
# 4. Follow DSE-internal links (additional pages linked from privacy policy)
if dse_internal_urls:
extra_urls = [u for u in list(dse_internal_urls)[:5] if u not in page_urls]
if extra_urls:
extra_tasks = [_fetch_page(client, u, result) for u in extra_urls]
extra_htmls = await asyncio.gather(*extra_tasks, return_exceptions=True)
for url, html in zip(extra_urls, extra_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
# Deduplicate services
seen = set()