feat: Async scan with polling — no more timeout issues
Fundamental fix: scans now run asynchronously with progress polling.
Backend:
- POST /scan starts background task, returns scan_id immediately
- GET /scan/{scan_id} returns status + progress + result when done
- 7 progress steps shown: Website scan, DSI discovery, DSE analysis,
SOLL/IST comparison, corrections, report, email
- In-memory job store (dict with scan_id → status/result)
- No timeout limits on scan duration
Frontend:
- POST starts scan, receives scan_id
- Polls GET every 5 seconds (max 120 attempts = 10 min)
- Shows live progress message during scan
- Displays result when completed, error when failed
Proxy:
- POST timeout reduced to 30s (just starts the job)
- GET timeout 10s (just status check)
- No more 504/connection-dropped errors
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
/**
|
||||
* Agent Scan API Proxy
|
||||
* POST /api/sdk/v1/agent/scan → backend-compliance /api/compliance/agent/scan
|
||||
* Agent Scan API Proxy — async scan with polling
|
||||
*
|
||||
* POST /api/sdk/v1/agent/scan → starts scan, returns scan_id
|
||||
* GET /api/sdk/v1/agent/scan?scan_id=xxx → poll status/results
|
||||
*/
|
||||
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
@@ -11,11 +13,12 @@ export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const body = await request.text()
|
||||
|
||||
// Start async scan — returns immediately with scan_id
|
||||
const response = await fetch(`${BACKEND_URL}/api/compliance/agent/scan`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body,
|
||||
signal: AbortSignal.timeout(600000), // 10 min — 50 pages + 20 DSI docs + LLM corrections
|
||||
signal: AbortSignal.timeout(30000), // 30s — just needs to start the job
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -31,7 +34,36 @@ export async function POST(request: NextRequest) {
|
||||
} catch (error) {
|
||||
console.error('Agent scan proxy error:', error)
|
||||
return NextResponse.json(
|
||||
{ error: 'Scan fehlgeschlagen oder Timeout' },
|
||||
{ error: 'Scan konnte nicht gestartet werden' },
|
||||
{ status: 503 }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
export async function GET(request: NextRequest) {
|
||||
const scanId = request.nextUrl.searchParams.get('scan_id')
|
||||
if (!scanId) {
|
||||
return NextResponse.json({ error: 'scan_id parameter required' }, { status: 400 })
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(
|
||||
`${BACKEND_URL}/api/compliance/agent/scan/${scanId}`,
|
||||
{ signal: AbortSignal.timeout(10000) }
|
||||
)
|
||||
|
||||
if (!response.ok) {
|
||||
return NextResponse.json(
|
||||
{ error: `Backend: ${response.status}` },
|
||||
{ status: response.status }
|
||||
)
|
||||
}
|
||||
|
||||
const data = await response.json()
|
||||
return NextResponse.json(data)
|
||||
} catch (error) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Status-Abfrage fehlgeschlagen' },
|
||||
{ status: 503 }
|
||||
)
|
||||
}
|
||||
|
||||
@@ -40,13 +40,42 @@ export default function AgentPage() {
|
||||
setScanError(null)
|
||||
setScanData(null)
|
||||
try {
|
||||
const res = await fetch('/api/sdk/v1/agent/scan', {
|
||||
// Step 1: Start async scan
|
||||
const startRes = await fetch('/api/sdk/v1/agent/scan', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ url: url.trim(), mode }),
|
||||
})
|
||||
if (!res.ok) throw new Error(`Scan fehlgeschlagen: ${res.status}`)
|
||||
setScanData(await res.json())
|
||||
if (!startRes.ok) throw new Error(`Scan konnte nicht gestartet werden: ${startRes.status}`)
|
||||
const { scan_id } = await startRes.json()
|
||||
if (!scan_id) throw new Error('Keine Scan-ID erhalten')
|
||||
|
||||
// Step 2: Poll for results
|
||||
let attempts = 0
|
||||
const maxAttempts = 120 // 10 min at 5s intervals
|
||||
while (attempts < maxAttempts) {
|
||||
await new Promise(r => setTimeout(r, 5000))
|
||||
const pollRes = await fetch(`/api/sdk/v1/agent/scan?scan_id=${scan_id}`)
|
||||
if (!pollRes.ok) { attempts++; continue }
|
||||
const status = await pollRes.json()
|
||||
|
||||
// Update progress message
|
||||
if (status.progress) {
|
||||
setScanError(null)
|
||||
// Show progress as temporary "error" (will be cleared when done)
|
||||
setScanData({ _progress: status.progress } as any)
|
||||
}
|
||||
|
||||
if (status.status === 'completed' && status.result) {
|
||||
setScanData(status.result)
|
||||
break
|
||||
}
|
||||
if (status.status === 'failed') {
|
||||
throw new Error(status.error || 'Scan fehlgeschlagen')
|
||||
}
|
||||
attempts++
|
||||
}
|
||||
if (attempts >= maxAttempts) throw new Error('Scan-Timeout (10 Minuten)')
|
||||
} catch (e) {
|
||||
setScanError(e instanceof Error ? e.message : 'Unbekannter Fehler')
|
||||
} finally {
|
||||
|
||||
@@ -107,3 +107,41 @@ def build_scan_summary(
|
||||
])
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
async def fetch_dse_text(url: str, scanned_pages: list[str]) -> str:
    """Locate the privacy-policy page and return its visible text.

    Picks the first scanned URL whose address matches a privacy keyword
    (datenschutz/privacy/dsgvo), falling back to the site URL itself.
    Script/style blocks and all markup are stripped, whitespace is
    collapsed, and the result is capped at 8000 characters. Returns ""
    on any fetch error — callers treat this as best-effort.
    """
    keyword = re.compile(r"datenschutz|privacy|dsgvo", re.IGNORECASE)
    dse_url = next((page for page in scanned_pages if keyword.search(page)), url)

    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(
                dse_url,
                headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"},
            )
            raw = resp.text
        # Drop <script>/<style> bodies first, then all remaining tags,
        # then normalise whitespace.
        text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", raw, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r"<[^>]+>", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text[:8000]
    except Exception:
        return ""
|
||||
|
||||
|
||||
async def fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
    """Return the raw HTML of the privacy-policy page (for structured parsing).

    The page is chosen as the first scanned URL matching a privacy keyword
    (datenschutz/privacy/dsgvo), with the site URL as fallback. Returns ""
    on any fetch error — best-effort by design.
    """
    keyword = re.compile(r"datenschutz|privacy|dsgvo", re.IGNORECASE)
    dse_url = next((page for page in scanned_pages if keyword.search(page)), url)

    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(
                dse_url,
                headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"},
            )
            return resp.text
    except Exception:
        return ""
|
||||
|
||||
@@ -23,7 +23,9 @@ from compliance.services.mandatory_content_checker import (
|
||||
check_mandatory_documents, check_dse_mandatory_content, MandatoryFinding,
|
||||
)
|
||||
from compliance.services.legal_basis_validator import validate_legal_bases
|
||||
from compliance.api.agent_scan_helpers import add_corrections, build_scan_summary
|
||||
from compliance.api.agent_scan_helpers import (
|
||||
add_corrections, build_scan_summary, fetch_dse_text, fetch_dse_html,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -106,11 +108,76 @@ class ScanResponse(BaseModel):
|
||||
scanned_at: str
|
||||
|
||||
|
||||
@router.post("/scan", response_model=ScanResponse)
|
||||
async def scan_website_endpoint(req: ScanRequest):
|
||||
"""Deep website scan: multi-page crawl + SOLL/IST service comparison."""
|
||||
is_live = req.mode == "post_launch"
|
||||
import asyncio
|
||||
import uuid as _uuid
|
||||
|
||||
# In-memory scan job store (survives until container restart)
|
||||
_scan_jobs: dict[str, dict] = {}
|
||||
|
||||
|
||||
class ScanStartResponse(BaseModel):
|
||||
scan_id: str
|
||||
status: str = "running"
|
||||
message: str = ""
|
||||
|
||||
|
||||
class ScanStatusResponse(BaseModel):
|
||||
scan_id: str
|
||||
status: str # "running", "completed", "failed"
|
||||
progress: str = ""
|
||||
result: ScanResponse | None = None
|
||||
error: str = ""
|
||||
|
||||
|
||||
@router.post("/scan")
async def scan_website_endpoint(req: ScanRequest):
    """Start an async website scan and return its scan_id immediately.

    The scan itself runs as a background asyncio task; clients poll
    GET /scan/{scan_id} for progress and the final result.
    """
    # Short ids are user-facing; re-roll on the (unlikely) truncation
    # collision instead of silently overwriting a running job.
    scan_id = str(_uuid.uuid4())[:8]
    while scan_id in _scan_jobs:
        scan_id = str(_uuid.uuid4())[:8]

    _scan_jobs[scan_id] = {
        "status": "running",
        "progress": "Scan gestartet...",
        "result": None,
        "error": "",
    }

    # Launch scan in background. Keep a strong reference to the task:
    # the event loop only holds a weak one, so a bare create_task()
    # result can be garbage-collected mid-scan (see asyncio docs).
    _scan_jobs[scan_id]["_task"] = asyncio.create_task(_run_scan(scan_id, req))

    return ScanStartResponse(
        scan_id=scan_id,
        status="running",
        message="Scan gestartet. Ergebnisse unter GET /scan/{scan_id}",
    )
|
||||
|
||||
|
||||
@router.get("/scan/{scan_id}")
async def get_scan_status(scan_id: str):
    """Poll scan status; includes the full result once status is "completed".

    Unknown ids yield status "not_found" with HTTP 200 so pollers can
    distinguish a missing job from a transport error.
    """
    job = _scan_jobs.get(scan_id)
    if not job:
        # Use the same response model as the happy path so clients can
        # parse every poll response uniformly (previously a bare dict
        # without the progress/result fields).
        return ScanStatusResponse(
            scan_id=scan_id,
            status="not_found",
            error="Scan-ID nicht gefunden",
        )
    return ScanStatusResponse(
        scan_id=scan_id,
        status=job["status"],
        progress=job.get("progress", ""),
        result=job.get("result"),
        error=job.get("error", ""),
    )
|
||||
|
||||
|
||||
async def _run_scan(scan_id: str, req: ScanRequest):
    """Drive one background scan and record its outcome in _scan_jobs."""
    try:
        scan_result = await _execute_scan(req, scan_id)
        _scan_jobs[scan_id].update(
            status="completed",
            result=scan_result,
            progress="Fertig",
        )
    except Exception as exc:
        # Surface the failure to pollers; cap the message so a huge
        # traceback string never bloats the job store.
        logger.error("Scan %s failed: %s", scan_id, exc)
        _scan_jobs[scan_id].update(status="failed", error=str(exc)[:500])
|
||||
|
||||
|
||||
async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
"""Execute the full scan pipeline (called as background task)."""
|
||||
is_live = req.mode == "post_launch"
|
||||
def _progress(msg: str):
|
||||
if scan_id and scan_id in _scan_jobs:
|
||||
_scan_jobs[scan_id]["progress"] = msg
|
||||
|
||||
_progress("Schritt 1/7: Website wird gescannt...")
|
||||
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
||||
playwright_htmls: dict[str, str] = {}
|
||||
try:
|
||||
@@ -153,14 +220,15 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
|
||||
logger.info("Scanned %d pages, found %d services", len(scan.pages_scanned), len(scan.detected_services))
|
||||
|
||||
_progress(f"Schritt 2/7: Rechtliche Dokumente suchen... ({len(scan.pages_scanned)} Seiten gescannt)")
|
||||
# Step 1b: DSI Discovery — find all legal documents on the website
|
||||
discovered_docs: list[DiscoveredDocument] = []
|
||||
dsi_findings: list[ScanFinding] = []
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=180.0) as dsi_client:
|
||||
async with httpx.AsyncClient(timeout=300.0) as dsi_client:
|
||||
dsi_resp = await dsi_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/dsi-discovery",
|
||||
json={"url": req.url, "max_documents": 20},
|
||||
json={"url": req.url, "max_documents": 30},
|
||||
)
|
||||
if dsi_resp.status_code == 200:
|
||||
dsi_data = dsi_resp.json()
|
||||
@@ -198,8 +266,9 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
code=df["code"], severity=df["severity"], text=df["text"],
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning("DSI discovery failed: %s", e)
|
||||
logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)
|
||||
|
||||
_progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)")
|
||||
# Step 2: Fetch privacy policy text
|
||||
# Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback
|
||||
dse_text = ""
|
||||
@@ -223,7 +292,7 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
except Exception:
|
||||
pass
|
||||
if not dse_text:
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
dse_text = await fetch_dse_text(req.url, scan.pages_scanned)
|
||||
|
||||
# Step 3: Extract services mentioned in DSE via LLM + text fallback
|
||||
dse_services = await extract_dse_services(dse_text) if dse_text else []
|
||||
@@ -248,10 +317,11 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
dse_html = html
|
||||
break
|
||||
if not dse_html:
|
||||
dse_html = await _fetch_dse_html(req.url, scan.pages_scanned)
|
||||
dse_html = await fetch_dse_html(req.url, scan.pages_scanned)
|
||||
dse_sections = parse_dse(dse_html, req.url) if dse_html else []
|
||||
logger.info("Parsed %d DSE sections", len(dse_sections))
|
||||
|
||||
_progress("Schritt 4/7: SOLL/IST Vergleich...")
|
||||
# Step 5: SOLL/IST comparison
|
||||
detected_dicts = [_service_to_dict(s) for s in scan.detected_services]
|
||||
comparison = compare_services(detected_dicts, dse_services)
|
||||
@@ -290,13 +360,16 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
# Step 8c: Add DSI document findings
|
||||
findings.extend(dsi_findings)
|
||||
|
||||
_progress(f"Schritt 5/7: Korrekturen generieren... ({len(findings)} Findings)")
|
||||
# Step 9: Generate corrections for pre-launch mode
|
||||
if not is_live and findings:
|
||||
await add_corrections(findings, dse_text)
|
||||
|
||||
_progress("Schritt 6/7: Report erstellen...")
|
||||
# Step 7: Build summary
|
||||
summary = build_scan_summary(req.url, scan, comparison, findings, is_live, discovered_docs)
|
||||
|
||||
_progress("Schritt 7/7: E-Mail senden...")
|
||||
# Step 8: Send notification
|
||||
mode_label = "INTERNE PRUEFUNG" if not is_live else "LIVE-WEBSITE"
|
||||
email_result = send_email(
|
||||
@@ -322,46 +395,6 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
)
|
||||
|
||||
|
||||
async def _fetch_dse_text(url: str, scanned_pages: list[str]) -> str:
|
||||
"""Find and fetch the privacy policy page text."""
|
||||
import re
|
||||
# Find DSE URL from scanned pages
|
||||
dse_url = None
|
||||
for page in scanned_pages:
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE):
|
||||
dse_url = page
|
||||
break
|
||||
if not dse_url:
|
||||
dse_url = url # Fallback to provided URL
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||
resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
|
||||
html = resp.text
|
||||
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
||||
clean = re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = re.sub(r"\s+", " ", clean).strip()
|
||||
return clean[:4000]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
|
||||
"""Fetch the raw HTML of the privacy policy page (for structured parsing)."""
|
||||
import re
|
||||
dse_url = None
|
||||
for page in scanned_pages:
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE):
|
||||
dse_url = page
|
||||
break
|
||||
if not dse_url:
|
||||
dse_url = url
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||
resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
|
||||
return resp.text
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _service_to_dict(svc: DetectedService) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user