fix: Increase page load timeouts — IHK site needs >30s for networkidle

- Initial page.goto timeout: 30s → 60s (IHK loads many JS resources)
- Per-page navigation timeout: 20s → 45s (heavy JS sites)
- Reduced extra wait from 3s+1s back to 2s+0.5s (goto timeout handles slow loads)
- Playwright scanner page timeout: 20s → 45s

Root cause: IHK website has heavy JavaScript that takes >30s to reach
'networkidle' state, causing DSI discovery to fail immediately.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-05 13:10:59 +02:00
parent d547e63663
commit e494cf62bb
2 changed files with 7 additions and 7 deletions
+6 -6
View File
@@ -217,7 +217,7 @@ async def discover_dsi_documents(
try:
# Step 1: Load the page
await page.goto(url, wait_until="networkidle", timeout=30000)
await page.goto(url, wait_until="networkidle", timeout=60000)
await page.wait_for_timeout(2000)
# Step 2: Find DSI links in current page
@@ -290,11 +290,11 @@ async def discover_dsi_documents(
continue
# Navigate to page — wait for JS to load content
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
resp = await page.goto(href, wait_until="networkidle", timeout=45000)
if resp and resp.status < 400:
await page.wait_for_timeout(3000) # Extra wait for JS content loading
await page.wait_for_timeout(2000)
await _expand_all_interactive(page)
await page.wait_for_timeout(1000)
await page.wait_for_timeout(500)
# Extract text — try specific content areas, fall back to full body
text = await page.evaluate("""
@@ -333,14 +333,14 @@ async def discover_dsi_documents(
pending_links.append(nl)
# Navigate back for next link
await page.goto(url, wait_until="networkidle", timeout=20000)
await page.goto(url, wait_until="networkidle", timeout=45000)
await page.wait_for_timeout(500)
await _expand_all_interactive(page)
except Exception as e:
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
try:
await page.goto(url, wait_until="networkidle", timeout=20000)
await page.goto(url, wait_until="networkidle", timeout=45000)
except Exception:
pass
@@ -157,7 +157,7 @@ async def _visit_page(page: Page, url: str, result: PlaywrightScanResult) -> ScannedPage:
"""Visit a page and capture its rendered HTML."""
sp = ScannedPage(url=url, status=0)
try:
response = await page.goto(url, wait_until="networkidle", timeout=20000)
response = await page.goto(url, wait_until="networkidle", timeout=45000)
sp.status = response.status if response else 0
await page.wait_for_timeout(2000)