fix: Scan quality — raise page limit, use full DSI text for checks
Bug 1: max_pages was hardcoded to 15 in backend call — raised to 50
Bug 2: DSI documents were checked against text_preview (500 chars) — now
checked against full_text (10,000 chars) for Art. 13 mandatory field checks
Bug 3: DSE text was not found when Playwright missed the DSE page — now falls
back to the DSI Discovery full_text as a second source
Bug 4: Backend timeout 120s too short for 50 pages — raised to 300s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -114,10 +114,10 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
||||
playwright_htmls: dict[str, str] = {}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as pw_client:
|
||||
async with httpx.AsyncClient(timeout=300.0) as pw_client:
|
||||
pw_resp = await pw_client.post(
|
||||
"http://bp-compliance-consent-tester:8094/website-scan",
|
||||
json={"url": req.url, "max_pages": 15, "click_nav": True},
|
||||
json={"url": req.url, "max_pages": 50, "click_nav": True},
|
||||
)
|
||||
if pw_resp.status_code == 200:
|
||||
pw_data = pw_resp.json()
|
||||
@@ -172,8 +172,9 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
)
|
||||
for doc in dsi_data.get("documents", []):
|
||||
doc_type = classify_document_type(doc["title"], doc["url"])
|
||||
doc_text = doc.get("full_text", "") or doc.get("text_preview", "")
|
||||
doc_findings = check_document_completeness(
|
||||
doc.get("text_preview", ""), doc_type, doc["title"], doc["url"],
|
||||
doc_text, doc_type, doc["title"], doc["url"],
|
||||
)
|
||||
# Count completeness
|
||||
score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
|
||||
@@ -199,16 +200,28 @@ async def scan_website_endpoint(req: ScanRequest):
|
||||
except Exception as e:
|
||||
logger.warning("DSI discovery failed: %s", e)
|
||||
|
||||
# Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
|
||||
# Step 2: Fetch privacy policy text
|
||||
# Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback
|
||||
dse_text = ""
|
||||
for page_url, html in playwright_htmls.items():
|
||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||
import re as _re
|
||||
clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
|
||||
clean = _re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = _re.sub(r"\s+", " ", clean).strip()
|
||||
dse_text = clean[:4000]
|
||||
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
||||
clean = re.sub(r"<[^>]+>", " ", clean)
|
||||
clean = re.sub(r"\s+", " ", clean).strip()
|
||||
dse_text = clean[:8000]
|
||||
break
|
||||
# Fallback: use DSI discovery texts (combined from all DSE documents found)
|
||||
if not dse_text and discovered_docs:
|
||||
try:
|
||||
dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {}
|
||||
for doc in dsi_data_local.get("documents", []):
|
||||
if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \
|
||||
"datenschutz" in doc.get("title", "").lower():
|
||||
ft = doc.get("full_text", "")
|
||||
if ft and len(ft) > len(dse_text):
|
||||
dse_text = ft[:8000]
|
||||
except Exception:
|
||||
pass
|
||||
if not dse_text:
|
||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user