""" Saving-Scan-Funnel Endpoint — Marketing-Lead → Compliance-Check. Externes Form (https://breakpilot.ai/savings-scan) postet hier: POST /api/compliance/agent/saving-scan/start Body: {"url": "...", "email": "..."} Server-side: 1. Validierung URL + Email (E-Mail-Regex, URL-Schema). 2. Rate-Limit: max 1 vollstaendiger Scan / Domain / 24h (saving_scan_allowed aus compliance_user_agent). 3. Lead persistieren (saving_scan_leads in Sidecar-SQLite) — fuer spaeteren Report-Versand + Sales-Follow-Up. 4. Compliance-Check starten mit Auto-Discovery (DocumentInput leer ausser Homepage). Der bestehende Worker laeuft TDM-Check, dann Discovery, dann Pruefung. 5. check_id zurueck — Frontend pollt /compliance-check/. """ from __future__ import annotations import logging import os import re import sqlite3 import uuid as _uuid from datetime import datetime, timezone from pathlib import Path import asyncio from fastapi import APIRouter, HTTPException from pydantic import BaseModel, Field from compliance.services.compliance_user_agent import ( base_domain_of, saving_scan_allowed, ) logger = logging.getLogger(__name__) router = APIRouter(prefix="/compliance/agent", tags=["agent"]) DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db") _EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$") _URL_RE = re.compile(r"^https?://[A-Za-z0-9.-]+(/.*)?$") class SavingScanRequest(BaseModel): url: str = Field(..., min_length=4, max_length=400) email: str = Field(..., min_length=5, max_length=200) consent: bool = Field( True, description="Marketing-Consent fuer Sales-Follow-Up — " "muss True sein laut Form-Checkbox.", ) class SavingScanResponse(BaseModel): check_id: str status: str message: str = "" def _ensure_leads_table() -> None: Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True) with sqlite3.connect(DB_PATH) as conn: conn.executescript(""" CREATE TABLE IF NOT EXISTS saving_scan_leads ( id INTEGER PRIMARY KEY AUTOINCREMENT, ts TEXT NOT NULL, email TEXT NOT NULL, url TEXT NOT NULL, base_domain TEXT NOT NULL, check_id TEXT, consent INTEGER NOT NULL, source TEXT ); CREATE INDEX IF NOT EXISTS idx_leads_domain ON saving_scan_leads(base_domain, ts); CREATE INDEX IF NOT EXISTS idx_leads_email ON saving_scan_leads(email, ts); """) def _persist_lead(email: str, url: str, check_id: str, consent: bool) -> None: try: _ensure_leads_table() with sqlite3.connect(DB_PATH) as conn: conn.execute( "INSERT INTO saving_scan_leads " "(ts, email, url, base_domain, check_id, consent, source) " "VALUES (?, ?, ?, ?, ?, ?, ?)", ( datetime.now(timezone.utc).isoformat(), email.lower().strip(), url, base_domain_of(url), check_id, 1 if consent else 0, "saving_scan_form", ), ) conn.commit() except Exception as e: logger.warning("persist lead failed: %s", e) def _normalize_url(url: str) -> str: """Strip path → behaupt nur Homepage, der Discover findet den Rest.""" if "://" not in url: url = "https://" + url from urllib.parse import urlparse p = urlparse(url) return f"{p.scheme}://{p.netloc}/" @router.post("/saving-scan/start", response_model=SavingScanResponse) async def start_saving_scan(req: SavingScanRequest) -> SavingScanResponse: """Trigger compliance check from the marketing-funnel form.""" if not _EMAIL_RE.match(req.email): raise HTTPException(400, "Ungueltige E-Mail-Adresse.") if not _URL_RE.match(req.url): raise HTTPException(400, "URL muss mit http:// oder https:// beginnen.") if not req.consent: raise HTTPException(400, "Marketing-Consent erforderlich.") domain = base_domain_of(req.url) if not domain: raise HTTPException(400, "Konnte Domain nicht ermitteln.") allowed, wait_s = saving_scan_allowed(req.url) if not allowed: raise HTTPException( 429, f"Fuer '{domain}' wurde in den letzten 24h bereits ein Scan " f"durchgefuehrt. Bitte in {wait_s // 3600}h {wait_s % 3600 // 60}min " f"erneut versuchen.", ) # Lazy import to avoid circular dependency at module load. from compliance.api.agent_compliance_check_routes import ( DocumentInput, ComplianceCheckRequest, _run_compliance_check, _compliance_check_jobs, ) homepage = _normalize_url(req.url) check_id = str(_uuid.uuid4())[:8] _compliance_check_jobs[check_id] = { "status": "running", "progress": "Saving-Scan gestartet — Auto-Discovery laeuft...", "progress_pct": 0, "result": None, "error": "", } # Single "other" entry forces auto-discovery to fill in the rest. docs = [DocumentInput(doc_type="other", url=homepage)] check_req = ComplianceCheckRequest( documents=docs, recipient=req.email.lower().strip(), ) _persist_lead(req.email, req.url, check_id, req.consent) asyncio.create_task(_run_compliance_check(check_id, check_req)) logger.info("saving-scan start: check_id=%s domain=%s email=%s", check_id, domain, req.email[:3] + "***") return SavingScanResponse( check_id=check_id, status="running", message=f"Scan gestartet fuer {domain}. Bericht in ~3-5 Minuten.", ) @router.get("/saving-scan/lead-count") def saving_scan_lead_count() -> dict: """Diagnostik fuer das Sales-Dashboard.""" try: _ensure_leads_table() with sqlite3.connect(DB_PATH) as conn: total = conn.execute( "SELECT COUNT(*) FROM saving_scan_leads", ).fetchone()[0] last_24h = conn.execute( "SELECT COUNT(*) FROM saving_scan_leads " "WHERE ts > datetime('now', '-1 day')", ).fetchone()[0] top_domains = conn.execute( "SELECT base_domain, COUNT(*) AS n FROM saving_scan_leads " "GROUP BY base_domain ORDER BY n DESC LIMIT 10", ).fetchall() return { "total_leads": total, "last_24h": last_24h, "top_domains": [{"domain": d, "scans": n} for d, n in top_domains], } except Exception as e: return {"error": str(e)[:200]}