feat: Async scan with polling — no more timeout issues

Fundamental fix: scans now run asynchronously with progress polling.

Backend:
- POST /scan starts background task, returns scan_id immediately
- GET /scan/{scan_id} returns status + progress + result when done
- 7 progress steps shown: Website scan, DSI discovery, DSE analysis,
  SOLL/IST comparison, corrections, report, email
- In-memory job store (dict with scan_id → status/result)
- No timeout limits on scan duration

Frontend:
- POST starts scan, receives scan_id
- Polls GET every 5 seconds (max 120 attempts = 10 min)
- Shows live progress message during scan
- Displays result when completed, error when failed

Proxy:
- POST timeout reduced to 30s (just starts the job)
- GET timeout 10s (just status check)
- No more 504/connection-dropped errors

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-05 07:30:09 +02:00
parent d7b287889e
commit cb607bf228
4 changed files with 189 additions and 57 deletions
@@ -107,3 +107,41 @@ def build_scan_summary(
])
return "\n".join(parts)
async def fetch_dse_text(url: str, scanned_pages: list[str]) -> str:
    """Find and fetch the privacy policy page text.

    Picks the first URL in *scanned_pages* that looks privacy-related
    (matches "datenschutz", "privacy", or "dsgvo", case-insensitive) and
    falls back to *url* itself when none matches.  The page's HTML is
    reduced to plain text and truncated to 8000 characters.

    Returns an empty string on any network or HTTP error — best effort
    by design; callers treat "" as "no DSE text available".
    """
    # Local import: the original code used `html` as a local variable name,
    # so importing the symbol directly avoids any shadowing of the module.
    from html import unescape

    dse_url = None
    for page in scanned_pages:
        if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE):
            dse_url = page
            break
    if not dse_url:
        dse_url = url
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
            # Bug fix: without this check, a 404/500 error page body was
            # returned as if it were the privacy policy text.
            resp.raise_for_status()
            raw = resp.text
            # Drop script/style bodies and HTML comments before stripping tags,
            # so their contents never leak into the extracted text.
            clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", raw, flags=re.DOTALL | re.IGNORECASE)
            clean = re.sub(r"<!--.*?-->", "", clean, flags=re.DOTALL)
            clean = re.sub(r"<[^>]+>", " ", clean)
            # Decode entities (&amp;, &uuml;, ...) so downstream analysis
            # sees real characters instead of HTML escapes.
            clean = unescape(clean)
            clean = re.sub(r"\s+", " ", clean).strip()
            return clean[:8000]
    except Exception:
        return ""
async def fetch_dse_html(url: str, scanned_pages: list[str]) -> str:
    """Fetch the raw HTML of the privacy policy page.

    Uses the same URL-selection heuristic as the text variant: the first
    scanned page whose URL mentions "datenschutz", "privacy", or "dsgvo"
    (case-insensitive), falling back to *url* itself.

    Returns an empty string on any network or HTTP error — best effort
    by design; callers treat "" as "no DSE HTML available".
    """
    dse_url = None
    for page in scanned_pages:
        if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE):
            dse_url = page
            break
    if not dse_url:
        dse_url = url
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"})
            # Bug fix: without this check, a 404/500 error page was returned
            # as if it were the real DSE HTML.
            resp.raise_for_status()
            return resp.text
    except Exception:
        return ""