Files
breakpilot-compliance/backend-compliance/compliance/api/saving_scan_routes.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

197 lines
6.7 KiB
Python

"""
Saving-Scan-Funnel Endpoint — Marketing-Lead → Compliance-Check.
Externes Form (https://breakpilot.ai/savings-scan) postet hier:
POST /api/compliance/agent/saving-scan/start
Body: {"url": "...", "email": "..."}
Server-side:
1. Validierung URL + Email (E-Mail-Regex, URL-Schema).
2. Rate-Limit: max 1 vollstaendiger Scan / Domain / 24h
(saving_scan_allowed aus compliance_user_agent).
3. Lead persistieren (saving_scan_leads in Sidecar-SQLite) — fuer
spaeteren Report-Versand + Sales-Follow-Up.
4. Compliance-Check starten mit Auto-Discovery (DocumentInput leer
ausser Homepage). Der bestehende Worker laeuft TDM-Check, dann
Discovery, dann Pruefung.
5. check_id zurueck — Frontend pollt /compliance-check/<check_id>.
"""
from __future__ import annotations
import logging
import os
import re
import sqlite3
import uuid as _uuid
from datetime import datetime, timezone
from pathlib import Path
import asyncio
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from compliance.services.compliance_user_agent import (
base_domain_of, saving_scan_allowed,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Every route declared in this module is mounted under /compliance/agent.
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# Path of the SQLite file used for the leads table; can be overridden with the
# COMPLIANCE_AUDIT_DB environment variable.
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
# Pragmatic shape check for e-mail addresses: local part, "@", host, a dot and
# an alphabetic TLD of at least two letters. No whitespace is tolerated.
_EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
# Accepts only http:// or https:// URLs whose host consists of letters, digits,
# dots and hyphens, optionally followed by a path. An explicit port
# ("host:8443") or a query string directly after the host does not match.
_URL_RE = re.compile(r"^https?://[A-Za-z0-9.-]+(/.*)?$")
class SavingScanRequest(BaseModel):
    """Request body accepted by the saving-scan start endpoint."""

    # Website to scan; only the length (4-400 characters) is constrained here.
    url: str = Field(..., min_length=4, max_length=400)
    # Contact address of the lead; only the length (5-200 characters) is
    # constrained here.
    email: str = Field(..., min_length=5, max_length=200)
    # NOTE(review): the default is True, so a request that omits this field is
    # treated as having given marketing consent — confirm this is intended.
    consent: bool = Field(
        True, description="Marketing-Consent fuer Sales-Follow-Up — "
        "muss True sein laut Form-Checkbox.",
    )
class SavingScanResponse(BaseModel):
    """Response body returned by the saving-scan start endpoint."""

    # Identifier of the started compliance check.
    check_id: str
    # Job state reported at response time.
    status: str
    # Optional human-readable note; defaults to the empty string.
    message: str = ""
def _ensure_leads_table() -> None:
    """Create the ``saving_scan_leads`` table and its indexes if missing.

    The parent directory of ``DB_PATH`` is created first so that SQLite can
    create the database file. All statements use ``IF NOT EXISTS``, so the
    function can be called before every access.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` only scopes the transaction (commit/rollback); it does
        # not close the connection, hence the explicit ``finally`` below.
        with conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS saving_scan_leads (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    ts TEXT NOT NULL,
                    email TEXT NOT NULL,
                    url TEXT NOT NULL,
                    base_domain TEXT NOT NULL,
                    check_id TEXT,
                    consent INTEGER NOT NULL,
                    source TEXT
                );
                CREATE INDEX IF NOT EXISTS idx_leads_domain ON saving_scan_leads(base_domain, ts);
                CREATE INDEX IF NOT EXISTS idx_leads_email ON saving_scan_leads(email, ts);
            """)
    finally:
        conn.close()
def _persist_lead(email: str, url: str, check_id: str, consent: bool) -> None:
    """Insert one lead row into ``saving_scan_leads`` (best effort).

    Any failure is logged as a warning and swallowed, so a database problem
    never prevents the scan itself from starting.

    Args:
        email: Address of the lead; stored lower-cased and stripped.
        url: URL exactly as submitted.
        check_id: Identifier of the compliance check started for this lead.
        consent: Consent flag; stored as 1 or 0.
    """
    try:
        _ensure_leads_table()
        row = (
            datetime.now(timezone.utc).isoformat(),
            email.lower().strip(),
            url,
            base_domain_of(url),
            check_id,
            1 if consent else 0,
            "saving_scan_form",
        )
        conn = sqlite3.connect(DB_PATH)
        try:
            # ``with conn`` commits on success and rolls back on error, but it
            # does not close the connection — that happens in ``finally``.
            with conn:
                conn.execute(
                    "INSERT INTO saving_scan_leads "
                    "(ts, email, url, base_domain, check_id, consent, source) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?)",
                    row,
                )
        finally:
            conn.close()
    except Exception as e:
        # Deliberately broad: lead persistence must not break the request.
        logger.warning("persist lead failed: %s", e)
def _normalize_url(url: str) -> str:
"""Strip path → behaupt nur Homepage, der Discover findet den Rest."""
if "://" not in url:
url = "https://" + url
from urllib.parse import urlparse
p = urlparse(url)
return f"{p.scheme}://{p.netloc}/"
# Strong references to in-flight scan tasks. The event loop itself only keeps
# weak references, so a fire-and-forget task could otherwise be garbage
# collected before it has finished.
_background_scan_tasks: set = set()


@router.post("/saving-scan/start", response_model=SavingScanResponse)
async def start_saving_scan(req: SavingScanRequest) -> SavingScanResponse:
    """Start a compliance check from the marketing-funnel form.

    The request is validated, the rate limit is consulted via
    ``saving_scan_allowed``, a job entry is registered, the lead is persisted
    and the check is scheduled as a background task.

    Args:
        req: Submitted URL, e-mail address and consent flag.

    Returns:
        The new ``check_id`` with status ``"running"``.

    Raises:
        HTTPException: 400 for an invalid e-mail or URL, missing consent or an
            undeterminable domain; 429 when ``saving_scan_allowed`` rejects
            the URL.
    """
    if not _EMAIL_RE.match(req.email):
        raise HTTPException(400, "Ungueltige E-Mail-Adresse.")
    if not _URL_RE.match(req.url):
        raise HTTPException(400, "URL muss mit http:// oder https:// beginnen.")
    if not req.consent:
        raise HTTPException(400, "Marketing-Consent erforderlich.")

    domain = base_domain_of(req.url)
    if not domain:
        raise HTTPException(400, "Konnte Domain nicht ermitteln.")

    allowed, wait_s = saving_scan_allowed(req.url)
    if not allowed:
        # The numeric type of wait_s is not visible from this module; coerce
        # to int so the message never renders as "3.0h 12.0min".
        hours, remainder = divmod(int(wait_s or 0), 3600)
        minutes = remainder // 60
        raise HTTPException(
            429,
            f"Fuer '{domain}' wurde in den letzten 24h bereits ein Scan "
            f"durchgefuehrt. Bitte in {hours}h {minutes}min "
            f"erneut versuchen.",
        )

    # Lazy import to avoid circular dependency at module load.
    from compliance.api.agent_compliance_check_routes import (
        DocumentInput,
        ComplianceCheckRequest,
        _run_compliance_check,
        _compliance_check_jobs,
    )

    homepage = _normalize_url(req.url)
    check_id = str(_uuid.uuid4())[:8]
    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Saving-Scan gestartet — Auto-Discovery laeuft...",
        "progress_pct": 0,
        "result": None,
        "error": "",
    }

    # Single "other" entry forces auto-discovery to fill in the rest.
    docs = [DocumentInput(doc_type="other", url=homepage)]
    check_req = ComplianceCheckRequest(
        documents=docs, recipient=req.email.lower().strip(),
    )
    _persist_lead(req.email, req.url, check_id, req.consent)

    # Keep the task alive until it completes, then drop the reference.
    task = asyncio.create_task(_run_compliance_check(check_id, check_req))
    _background_scan_tasks.add(task)
    task.add_done_callback(_background_scan_tasks.discard)

    # Only the first three characters of the address are written to the log.
    logger.info("saving-scan start: check_id=%s domain=%s email=%s",
                check_id, domain, req.email[:3] + "***")
    return SavingScanResponse(
        check_id=check_id,
        status="running",
        message=f"Scan gestartet fuer {domain}. Bericht in ~3-5 Minuten.",
    )
@router.get("/saving-scan/lead-count")
def saving_scan_lead_count() -> dict:
    """Return lead statistics for the sales dashboard.

    Returns:
        ``{"total_leads", "last_24h", "top_domains"}`` on success, where
        ``top_domains`` lists up to ten ``{"domain", "scans"}`` entries ordered
        by scan count. On any failure ``{"error": <message>}`` is returned
        instead, with the message truncated to 200 characters.
    """
    try:
        _ensure_leads_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            total = conn.execute(
                "SELECT COUNT(*) FROM saving_scan_leads",
            ).fetchone()[0]
            # ts is stored as ISO-8601 with a "T" separator and UTC offset,
            # while datetime('now', ...) yields "YYYY-MM-DD HH:MM:SS". Comparing
            # the raw strings sorts "T" after " ", which would count every row
            # from the cutoff's calendar day. Normalising ts through datetime()
            # makes both sides the same format.
            last_24h = conn.execute(
                "SELECT COUNT(*) FROM saving_scan_leads "
                "WHERE datetime(ts) > datetime('now', '-1 day')",
            ).fetchone()[0]
            top_domains = conn.execute(
                "SELECT base_domain, COUNT(*) AS n FROM saving_scan_leads "
                "GROUP BY base_domain ORDER BY n DESC LIMIT 10",
            ).fetchall()
        finally:
            # The sqlite3 context manager does not close connections, so do it
            # explicitly.
            conn.close()
        return {
            "total_leads": total,
            "last_24h": last_24h,
            "top_domains": [{"domain": d, "scans": n} for d, n in top_domains],
        }
    except Exception as e:
        # Diagnostics endpoint: report the problem in the payload instead of
        # failing the request, but leave a trace in the log.
        logger.warning("saving-scan lead-count failed: %s", e)
        return {"error": str(e)[:200]}