From e494cf62bb43388d30fd28f1031cc726b7cb654f Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Tue, 5 May 2026 13:10:59 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Increase=20page=20load=20timeouts=20?=
 =?UTF-8?q?=E2=80=94=20IHK=20site=20needs=20>30s=20for=20networkidle?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Initial page.goto timeout: 30s → 60s (IHK loads many JS resources)
- Per-page navigation timeout: 20s → 45s (heavy JS sites)
- Reduced fixed post-navigation waits from 3s+1s back to 2s+0.5s
  (the longer goto timeout now covers slow loads)
- Playwright scanner page timeout: 20s → 45s

Root cause: The IHK website runs heavy JavaScript that takes >30s to reach
the 'networkidle' state, so the initial page.goto timed out and DSI
discovery failed at its first step.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 consent-tester/services/dsi_discovery.py      | 12 ++++++------
 consent-tester/services/playwright_scanner.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index 9f71b62..f1d34f3 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -217,7 +217,7 @@ async def discover_dsi_documents(
 
     try:
         # Step 1: Load the page
-        await page.goto(url, wait_until="networkidle", timeout=30000)
+        await page.goto(url, wait_until="networkidle", timeout=60000)
         await page.wait_for_timeout(2000)
 
         # Step 2: Find DSI links in current page
@@ -290,11 +290,11 @@ async def discover_dsi_documents(
                     continue
 
                 # Navigate to page — wait for JS to load content
-                resp = await page.goto(href, wait_until="networkidle", timeout=20000)
+                resp = await page.goto(href, wait_until="networkidle", timeout=45000)
                 if resp and resp.status < 400:
-                    await page.wait_for_timeout(3000)  # Extra wait for JS content loading
+                    await page.wait_for_timeout(2000)
                     await _expand_all_interactive(page)
-                    await page.wait_for_timeout(1000)
+                    await page.wait_for_timeout(500)
 
                     # Extract text — try specific content areas, fall back to full body
                     text = await page.evaluate("""
@@ -333,14 +333,14 @@ async def discover_dsi_documents(
                             pending_links.append(nl)
 
                 # Navigate back for next link
-                await page.goto(url, wait_until="networkidle", timeout=20000)
+                await page.goto(url, wait_until="networkidle", timeout=45000)
                 await page.wait_for_timeout(500)
                 await _expand_all_interactive(page)
 
             except Exception as e:
                 result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                 try:
-                    await page.goto(url, wait_until="networkidle", timeout=20000)
+                    await page.goto(url, wait_until="networkidle", timeout=45000)
                 except Exception:
                     pass
 
diff --git a/consent-tester/services/playwright_scanner.py b/consent-tester/services/playwright_scanner.py
index 49635d2..2aef6ef 100644
--- a/consent-tester/services/playwright_scanner.py
+++ b/consent-tester/services/playwright_scanner.py
@@ -157,7 +157,7 @@ async def _visit_page(page: Page, url: str, result: PlaywrightScanResult) -> Sca
     """Visit a page and capture its rendered HTML."""
     sp = ScannedPage(url=url, status=0)
     try:
-        response = await page.goto(url, wait_until="networkidle", timeout=20000)
+        response = await page.goto(url, wait_until="networkidle", timeout=45000)
         sp.status = response.status if response else 0
         await page.wait_for_timeout(2000)