feat: Async scan with polling — no more timeout issues
Fundamental fix: scans now run asynchronously with progress polling.
Backend:
- POST /scan starts background task, returns scan_id immediately
- GET /scan/{scan_id} returns status + progress + result when done
- 7 progress steps shown: Website scan, DSI discovery, DSE analysis,
SOLL/IST comparison, corrections, report, email
- In-memory job store (dict with scan_id → status/result)
- No timeout limits on scan duration
Frontend:
- POST starts scan, receives scan_id
- Polls GET every 5 seconds (max 120 attempts = 10 min)
- Shows live progress message during scan
- Displays result when completed, error when failed
Proxy:
- POST timeout reduced to 30s (just starts the job)
- GET timeout 10s (just status check)
- No more 504/connection-dropped errors
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -107,3 +107,41 @@ def build_scan_summary(
|
||||
])
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
async def fetch_dse_text(url: str, scanned_pages: list[str]) -> str:
|
||||
"""Find and fetch the privacy policy page text."""
|
||||
dse_url = None
|
||||
for page in scanned_pages:
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE):
|
||||
dse_url = page
|
||||
break
|
||||
if not dse_url:
|
||||
dse_url = url
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||
resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
|
||||
html = resp.text
|
||||
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
||||
clean = re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = re.sub(r"\s+", " ", clean).strip()
|
||||
return clean[:8000]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
async def fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
    """Fetch the raw HTML of the privacy policy page.

    Picks the first scanned page whose URL looks like a privacy policy
    (matches "datenschutz", "privacy" or "dsgvo", case-insensitive) and
    falls back to *url* itself when none matches.

    Args:
        url: Fallback URL used when no privacy-policy-looking page is found.
        scanned_pages: Candidate page URLs collected during the site scan.

    Returns:
        The response body as text, or "" when the request fails.
    """
    dse_pattern = re.compile(r"datenschutz|privacy|dsgvo", re.IGNORECASE)
    # First privacy-looking URL wins; fall back to the site root.
    target = next((page for page in scanned_pages if dse_pattern.search(page)), url)
    headers = {"User-Agent": "BreakPilot-Compliance-Agent/1.0"}
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            response = await client.get(target, headers=headers)
            return response.text
    except Exception:
        # Best-effort: any network/HTTP failure yields an empty document.
        return ""
|
||||
|
||||
Reference in New Issue
Block a user