From a3f7fb93f434b67ceb9a0599ee3669816c211d81 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 4 May 2026 23:51:03 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Scan=20quality=20=E2=80=94=20raise=20pag?= =?UTF-8?q?e=20limit,=20use=20full=20DSI=20text=20for=20checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 1: max_pages was hardcoded to 15 in backend call — raised to 50 Bug 2: DSI documents checked against text_preview (500 chars) — now uses full_text (10,000 chars) for Art. 13 mandatory field checks Bug 3: DSE text not found when Playwright misses DSE page — now falls back to DSI Discovery full_text as second source Bug 4: Backend timeout 120s too short for 50 pages — raised to 300s Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_scan_routes.py | 31 +++++++++++++------ consent-tester/main.py | 2 ++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py index 1a541ed..2e877ce 100644 --- a/backend-compliance/compliance/api/agent_scan_routes.py +++ b/backend-compliance/compliance/api/agent_scan_routes.py @@ -114,10 +114,10 @@ async def scan_website_endpoint(req: ScanRequest): # Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx playwright_htmls: dict[str, str] = {} try: - async with httpx.AsyncClient(timeout=120.0) as pw_client: + async with httpx.AsyncClient(timeout=300.0) as pw_client: pw_resp = await pw_client.post( "http://bp-compliance-consent-tester:8094/website-scan", - json={"url": req.url, "max_pages": 15, "click_nav": True}, + json={"url": req.url, "max_pages": 50, "click_nav": True}, ) if pw_resp.status_code == 200: pw_data = pw_resp.json() @@ -172,8 +172,9 @@ async def scan_website_endpoint(req: ScanRequest): ) for doc in dsi_data.get("documents", []): doc_type = classify_document_type(doc["title"], doc["url"]) + doc_text = doc.get("full_text", "") or doc.get("text_preview", "") doc_findings = check_document_completeness( - doc.get("text_preview", ""), doc_type, doc["title"], doc["url"], + doc_text, doc_type, doc["title"], doc["url"], ) # Count completeness score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None) @@ -199,16 +200,28 @@ async def scan_website_endpoint(req: ScanRequest): except Exception as e: logger.warning("DSI discovery failed: %s", e) - # Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx) + # Step 2: Fetch privacy policy text + # Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback dse_text = "" for page_url, html in playwright_htmls.items(): if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE): - import re as _re - clean = _re.sub(r"<(script|style)[^>]*>.*?", "", html, flags=_re.DOTALL | _re.IGNORECASE) - clean = _re.sub(r"<[^>]+>", " ", clean) - clean = _re.sub(r"\s+", " ", clean).strip() - dse_text = clean[:4000] + clean = re.sub(r"<(script|style)[^>]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) + clean = re.sub(r"<[^>]+>", " ", clean) + clean = re.sub(r"\s+", " ", clean).strip() + dse_text = clean[:8000] break + # Fallback: use DSI discovery texts (combined from all DSE documents found) + if not dse_text and discovered_docs: + try: + dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {} + for doc in dsi_data_local.get("documents", []): + if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \ + "datenschutz" in doc.get("title", "").lower(): + ft = doc.get("full_text", "") + if ft and len(ft) > len(dse_text): + dse_text = ft[:8000] + except Exception: + pass if not dse_text: dse_text = await _fetch_dse_text(req.url, scan.pages_scanned) diff --git a/consent-tester/main.py b/consent-tester/main.py index 6a79ac5..f06cd2f 100644 --- a/consent-tester/main.py +++ b/consent-tester/main.py @@ -260,6 +260,7 @@ class DSIDocumentInfo(BaseModel): doc_type: str = "" word_count: int = 0 text_preview: str = "" + full_text: str = "" class DSIDiscoveryResponse(BaseModel): @@ -311,6 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest): doc_type=d.doc_type, word_count=d.word_count, text_preview=d.text[:500] if d.text else "", + full_text=d.text[:10000] if d.text else "", ) for d in result.documents ],