diff --git a/admin-compliance/app/api/sdk/v1/agent/scan/route.ts b/admin-compliance/app/api/sdk/v1/agent/scan/route.ts index 5f7d6aa..ff3e2de 100644 --- a/admin-compliance/app/api/sdk/v1/agent/scan/route.ts +++ b/admin-compliance/app/api/sdk/v1/agent/scan/route.ts @@ -1,6 +1,8 @@ /** - * Agent Scan API Proxy - * POST /api/sdk/v1/agent/scan → backend-compliance /api/compliance/agent/scan + * Agent Scan API Proxy — async scan with polling + * + * POST /api/sdk/v1/agent/scan → starts scan, returns scan_id + * GET /api/sdk/v1/agent/scan?scan_id=xxx → poll status/results */ import { NextRequest, NextResponse } from 'next/server' @@ -11,11 +13,12 @@ export async function POST(request: NextRequest) { try { const body = await request.text() + // Start async scan — returns immediately with scan_id const response = await fetch(`${BACKEND_URL}/api/compliance/agent/scan`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body, - signal: AbortSignal.timeout(600000), // 10 min — 50 pages + 20 DSI docs + LLM corrections + signal: AbortSignal.timeout(30000), // 30s — just needs to start the job }) if (!response.ok) { @@ -31,7 +34,36 @@ export async function POST(request: NextRequest) { } catch (error) { console.error('Agent scan proxy error:', error) return NextResponse.json( - { error: 'Scan fehlgeschlagen oder Timeout' }, + { error: 'Scan konnte nicht gestartet werden' }, + { status: 503 } + ) + } +} + +export async function GET(request: NextRequest) { + const scanId = request.nextUrl.searchParams.get('scan_id') + if (!scanId) { + return NextResponse.json({ error: 'scan_id parameter required' }, { status: 400 }) + } + + try { + const response = await fetch( + `${BACKEND_URL}/api/compliance/agent/scan/${scanId}`, + { signal: AbortSignal.timeout(10000) } + ) + + if (!response.ok) { + return NextResponse.json( + { error: `Backend: ${response.status}` }, + { status: response.status } + ) + } + + const data = await response.json() + return NextResponse.json(data) + } 
catch (error) { + return NextResponse.json( + { error: 'Status-Abfrage fehlgeschlagen' }, { status: 503 } ) } diff --git a/admin-compliance/app/sdk/agent/page.tsx b/admin-compliance/app/sdk/agent/page.tsx index cb36083..58a8c8e 100644 --- a/admin-compliance/app/sdk/agent/page.tsx +++ b/admin-compliance/app/sdk/agent/page.tsx @@ -40,13 +40,42 @@ export default function AgentPage() { setScanError(null) setScanData(null) try { - const res = await fetch('/api/sdk/v1/agent/scan', { + // Step 1: Start async scan + const startRes = await fetch('/api/sdk/v1/agent/scan', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ url: url.trim(), mode }), }) - if (!res.ok) throw new Error(`Scan fehlgeschlagen: ${res.status}`) - setScanData(await res.json()) + if (!startRes.ok) throw new Error(`Scan konnte nicht gestartet werden: ${startRes.status}`) + const { scan_id } = await startRes.json() + if (!scan_id) throw new Error('Keine Scan-ID erhalten') + + // Step 2: Poll for results + let attempts = 0 + const maxAttempts = 120 // 10 min at 5s intervals + while (attempts < maxAttempts) { + await new Promise(r => setTimeout(r, 5000)) + const pollRes = await fetch(`/api/sdk/v1/agent/scan?scan_id=${scan_id}`) + if (!pollRes.ok) { attempts++; continue } + const status = await pollRes.json() + + // Update progress message + if (status.progress) { + setScanError(null) + // Stash progress in scanData as a placeholder; overwritten by the real result on completion + setScanData({ _progress: status.progress } as any) + } + + if (status.status === 'completed' && status.result) { + setScanData(status.result) + break + } + if (status.status === 'failed') { + throw new Error(status.error || 'Scan fehlgeschlagen') + } + attempts++ + } + if (attempts >= maxAttempts) throw new Error('Scan-Timeout (10 Minuten)') } catch (e) { setScanError(e instanceof Error ?
e.message : 'Unbekannter Fehler') } finally { diff --git a/backend-compliance/compliance/api/agent_scan_helpers.py b/backend-compliance/compliance/api/agent_scan_helpers.py index 5adcb43..98d534f 100644 --- a/backend-compliance/compliance/api/agent_scan_helpers.py +++ b/backend-compliance/compliance/api/agent_scan_helpers.py @@ -107,3 +107,41 @@ def build_scan_summary( ]) return "\n".join(parts) + + +async def fetch_dse_text(url: str, scanned_pages: list[str]) -> str: + """Find and fetch the privacy policy page text.""" + dse_url = None + for page in scanned_pages: + if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE): + dse_url = page + break + if not dse_url: + dse_url = url + try: + async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: + resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"}) + html = resp.text + clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) + clean = re.sub(r"<[^>]+>", " ", clean) + clean = re.sub(r"\s+", " ", clean).strip() + return clean[:8000] + except Exception: + return "" + + +async def fetch_dse_html(url: str, scanned_pages: list[str]) -> str: + """Fetch the raw HTML of the privacy policy page.""" + dse_url = None + for page in scanned_pages: + if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE): + dse_url = page + break + if not dse_url: + dse_url = url + try: + async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: + resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"}) + return resp.text + except Exception: + return "" diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py index 2e877ce..23952fa 100644 --- a/backend-compliance/compliance/api/agent_scan_routes.py +++ b/backend-compliance/compliance/api/agent_scan_routes.py @@ -23,7 +23,9 @@ from
compliance.services.mandatory_content_checker import ( check_mandatory_documents, check_dse_mandatory_content, MandatoryFinding, ) from compliance.services.legal_basis_validator import validate_legal_bases -from compliance.api.agent_scan_helpers import add_corrections, build_scan_summary +from compliance.api.agent_scan_helpers import ( + add_corrections, build_scan_summary, fetch_dse_text, fetch_dse_html, +) logger = logging.getLogger(__name__) @@ -106,11 +108,76 @@ class ScanResponse(BaseModel): scanned_at: str -@router.post("/scan", response_model=ScanResponse) -async def scan_website_endpoint(req: ScanRequest): - """Deep website scan: multi-page crawl + SOLL/IST service comparison.""" - is_live = req.mode == "post_launch" +import asyncio +import uuid as _uuid +# In-memory scan job store (survives until container restart) +_scan_jobs: dict[str, dict] = {} + + +class ScanStartResponse(BaseModel): + scan_id: str + status: str = "running" + message: str = "" + + +class ScanStatusResponse(BaseModel): + scan_id: str + status: str # "running", "completed", "failed" + progress: str = "" + result: ScanResponse | None = None + error: str = "" + + +@router.post("/scan") +async def scan_website_endpoint(req: ScanRequest): + """Start async website scan. Returns scan_id immediately. + Poll GET /scan/{scan_id} for status and results.""" + scan_id = str(_uuid.uuid4())[:8] + _scan_jobs[scan_id] = {"status": "running", "progress": "Scan gestartet...", "result": None, "error": ""} + + # Launch scan in background; keep a task reference so it is not garbage-collected mid-run + _scan_jobs[scan_id]["task"] = asyncio.create_task(_run_scan(scan_id, req)) + + return ScanStartResponse(scan_id=scan_id, status="running", message="Scan gestartet. Ergebnisse unter GET /scan/{scan_id}") + + +@router.get("/scan/{scan_id}") +async def get_scan_status(scan_id: str): + """Poll scan status.
Returns result when completed.""" + job = _scan_jobs.get(scan_id) + if not job: + return {"scan_id": scan_id, "status": "not_found", "error": "Scan-ID nicht gefunden"} + return ScanStatusResponse( + scan_id=scan_id, + status=job["status"], + progress=job.get("progress", ""), + result=job.get("result"), + error=job.get("error", ""), + ) + + +async def _run_scan(scan_id: str, req: ScanRequest): + """Background scan task — updates _scan_jobs with progress.""" + try: + result = await _execute_scan(req, scan_id) + _scan_jobs[scan_id]["status"] = "completed" + _scan_jobs[scan_id]["result"] = result + _scan_jobs[scan_id]["progress"] = "Fertig" + except Exception as e: + logger.error("Scan %s failed: %s", scan_id, e) + _scan_jobs[scan_id]["status"] = "failed" + _scan_jobs[scan_id]["error"] = str(e)[:500] + + +async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse: + """Execute the full scan pipeline (called as background task).""" + is_live = req.mode == "post_launch" + def _progress(msg: str): + if scan_id and scan_id in _scan_jobs: + _scan_jobs[scan_id]["progress"] = msg + + _progress("Schritt 1/7: Website wird gescannt...") # Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx playwright_htmls: dict[str, str] = {} try: @@ -153,14 +220,15 @@ async def scan_website_endpoint(req: ScanRequest): logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services)) + _progress(f"Schritt 2/7: Rechtliche Dokumente suchen... 
({len(scan.pages_scanned)} Seiten gescannt)") # Step 1b: DSI Discovery — find all legal documents on the website discovered_docs: list[DiscoveredDocument] = [] dsi_findings: list[ScanFinding] = [] try: - async with httpx.AsyncClient(timeout=180.0) as dsi_client: + async with httpx.AsyncClient(timeout=300.0) as dsi_client: dsi_resp = await dsi_client.post( "http://bp-compliance-consent-tester:8094/dsi-discovery", - json={"url": req.url, "max_documents": 20}, + json={"url": req.url, "max_documents": 30}, ) if dsi_resp.status_code == 200: dsi_data = dsi_resp.json() @@ -198,8 +266,9 @@ async def scan_website_endpoint(req: ScanRequest): code=df["code"], severity=df["severity"], text=df["text"], )) except Exception as e: - logger.warning("DSI discovery failed: %s", e) + logger.warning("DSI discovery failed: %s %s", type(e).__name__, e) + _progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)") # Step 2: Fetch privacy policy text # Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback dse_text = "" @@ -223,7 +292,7 @@ async def scan_website_endpoint(req: ScanRequest): except Exception: pass if not dse_text: - dse_text = await _fetch_dse_text(req.url, scan.pages_scanned) + dse_text = await fetch_dse_text(req.url, scan.pages_scanned) # Step 3: Extract services mentioned in DSE via LLM + text fallback dse_services = await extract_dse_services(dse_text) if dse_text else [] @@ -248,10 +317,11 @@ async def scan_website_endpoint(req: ScanRequest): dse_html = html break if not dse_html: - dse_html = await _fetch_dse_html(req.url, scan.pages_scanned) + dse_html = await fetch_dse_html(req.url, scan.pages_scanned) dse_sections = parse_dse(dse_html, req.url) if dse_html else [] logger.info("Parsed %d DSE sections", len(dse_sections)) + _progress("Schritt 4/7: SOLL/IST Vergleich...") # Step 5: SOLL/IST comparison detected_dicts = [_service_to_dict(s) for s in scan.detected_services] comparison = 
compare_services(detected_dicts, dse_services) @@ -290,13 +360,16 @@ # Step 8c: Add DSI document findings findings.extend(dsi_findings) + _progress(f"Schritt 5/7: Korrekturen generieren... ({len(findings)} Findings)") # Step 9: Generate corrections for pre-launch mode if not is_live and findings: await add_corrections(findings, dse_text) + _progress("Schritt 6/7: Report erstellen...") # Step 7: Build summary summary = build_scan_summary(req.url, scan, comparison, findings, is_live, discovered_docs) + _progress("Schritt 7/7: E-Mail senden...") # Step 8: Send notification mode_label = "INTERNE PRUEFUNG" if not is_live else "LIVE-WEBSITE" email_result = send_email( @@ -322,46 +395,6 @@ ) - -async def _fetch_dse_text(url: str, scanned_pages: list[str]) -> str: - """Find and fetch the privacy policy page text.""" - import re - # Find DSE URL from scanned pages - dse_url = None - for page in scanned_pages: - if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE): - dse_url = page - break - if not dse_url: - dse_url = url # Fallback to provided URL - - try: - async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: - resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"}) - html = resp.text - clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) - clean = re.sub(r"<[^>]+>", " ", clean) - clean = re.sub(r"\s+", " ", clean).strip() - return clean[:4000] - except Exception: - return "" - - -async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str: - """Fetch the raw HTML of the privacy policy page (for structured parsing).""" - import re - dse_url = None - for page in scanned_pages: - if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE): - dse_url = page - break - if not dse_url: - dse_url = url - try: - async with httpx.AsyncClient(timeout=15.0,
follow_redirects=True) as client: - resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"}) - return resp.text - except Exception: - return "" def _service_to_dict(svc: DetectedService) -> dict: