fix: Increase page load timeouts — IHK site needs >30s for networkidle

- Initial page.goto timeout: 30s → 60s (IHK loads many JS resources)
- Per-page navigation timeout: 20s → 45s (heavy JS sites)
- Reduced extra wait from 3s+1s back to 2s+0.5s (goto timeout handles slow loads)
- Playwright scanner page timeout: 20s → 45s

Root cause: IHK website has heavy JavaScript that takes >30s to reach
'networkidle' state, causing DSI discovery to fail immediately.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-05 13:10:59 +02:00
parent d547e63663
commit e494cf62bb
2 changed files with 7 additions and 7 deletions
+6 -6
View File
@@ -217,7 +217,7 @@ async def discover_dsi_documents(
try:
# Step 1: Load the page
await page.goto(url, wait_until="networkidle", timeout=30000)
await page.goto(url, wait_until="networkidle", timeout=60000)
await page.wait_for_timeout(2000)
# Step 2: Find DSI links in current page
@@ -290,11 +290,11 @@ async def discover_dsi_documents(
continue
# Navigate to page — wait for JS to load content
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
resp = await page.goto(href, wait_until="networkidle", timeout=45000)
if resp and resp.status < 400:
await page.wait_for_timeout(3000) # Extra wait for JS content loading
await page.wait_for_timeout(2000)
await _expand_all_interactive(page)
await page.wait_for_timeout(1000)
await page.wait_for_timeout(500)
# Extract text — try specific content areas, fall back to full body
text = await page.evaluate("""
@@ -333,14 +333,14 @@ async def discover_dsi_documents(
pending_links.append(nl)
# Navigate back for next link
await page.goto(url, wait_until="networkidle", timeout=20000)
await page.goto(url, wait_until="networkidle", timeout=45000)
await page.wait_for_timeout(500)
await _expand_all_interactive(page)
except Exception as e:
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
try:
await page.goto(url, wait_until="networkidle", timeout=20000)
await page.goto(url, wait_until="networkidle", timeout=45000)
except Exception:
pass
@@ -157,7 +157,7 @@ async def _visit_page(page: Page, url: str, result: PlaywrightScanResult) -> ScannedPage:
"""Visit a page and capture its rendered HTML."""
sp = ScannedPage(url=url, status=0)
try:
response = await page.goto(url, wait_until="networkidle", timeout=20000)
response = await page.goto(url, wait_until="networkidle", timeout=45000)
sp.status = response.status if response else 0
await page.wait_for_timeout(2000)