fix: Increase page load timeouts — IHK site needs >30s for networkidle
- Initial page.goto timeout: 30s → 60s (IHK loads many JS resources)
- Per-page navigation timeout: 20s → 45s (heavy JS sites)
- Reduced extra wait from 3s+1s back to 2s+0.5s (goto timeout handles slow loads)
- Playwright scanner page timeout: 20s → 45s

Root cause: the IHK website runs heavy JavaScript that takes >30s to reach the
'networkidle' state, causing DSI discovery to fail immediately.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
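For reference, a minimal sketch of the navigation pattern these timeouts apply to, written against the real Playwright async API. The constant names and the 'domcontentloaded' fallback are illustrative assumptions and are not part of this commit:

# Sketch only: constants and the fallback are assumptions, not part of the diff below.
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

INITIAL_GOTO_TIMEOUT_MS = 60_000   # initial page load (was 30_000)
PER_PAGE_GOTO_TIMEOUT_MS = 45_000  # per-link navigation (was 20_000)
EXTRA_WAIT_MS = 2_000              # short settle wait after navigation

async def load_js_heavy_page(page: Page, url: str, timeout_ms: int) -> None:
    """Navigate and give a JS-heavy site time to settle."""
    try:
        # 'networkidle' can take >30s on the IHK site, hence the larger budget.
        await page.goto(url, wait_until="networkidle", timeout=timeout_ms)
    except PlaywrightTimeoutError:
        # Hypothetical fallback: settle for DOM-ready if the network never goes idle.
        await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
    await page.wait_for_timeout(EXTRA_WAIT_MS)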
@@ -217,7 +217,7 @@ async def discover_dsi_documents(
     try:
         # Step 1: Load the page
-        await page.goto(url, wait_until="networkidle", timeout=30000)
+        await page.goto(url, wait_until="networkidle", timeout=60000)
         await page.wait_for_timeout(2000)

         # Step 2: Find DSI links in current page
@@ -290,11 +290,11 @@ async def discover_dsi_documents(
                 continue

             # Navigate to page — wait for JS to load content
-            resp = await page.goto(href, wait_until="networkidle", timeout=20000)
+            resp = await page.goto(href, wait_until="networkidle", timeout=45000)
             if resp and resp.status < 400:
-                await page.wait_for_timeout(3000)  # Extra wait for JS content loading
+                await page.wait_for_timeout(2000)
                 await _expand_all_interactive(page)
-                await page.wait_for_timeout(1000)
+                await page.wait_for_timeout(500)

                 # Extract text — try specific content areas, fall back to full body
                 text = await page.evaluate("""
@@ -333,14 +333,14 @@ async def discover_dsi_documents(
                         pending_links.append(nl)

                 # Navigate back for next link
-                await page.goto(url, wait_until="networkidle", timeout=20000)
+                await page.goto(url, wait_until="networkidle", timeout=45000)
                 await page.wait_for_timeout(500)
                 await _expand_all_interactive(page)

             except Exception as e:
                 result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                 try:
-                    await page.goto(url, wait_until="networkidle", timeout=20000)
+                    await page.goto(url, wait_until="networkidle", timeout=45000)
                 except Exception:
                     pass

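In the discovery loop above, the error path re-navigates to the listing URL so the next link can still be processed. A small helper along these lines (hypothetical, not part of this commit) would keep that recovery logic in one place:

# Sketch only: hypothetical helper, not part of this commit.
async def _return_to_listing(page, url: str, timeout_ms: int = 45_000) -> bool:
    """Best-effort navigation back to the listing page after a failed link visit."""
    try:
        await page.goto(url, wait_until="networkidle", timeout=timeout_ms)
        return True
    except Exception:
        # Swallow the error; the caller has already recorded the original failure.
        return False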
@@ -157,7 +157,7 @@ async def _visit_page(page: Page, url: str, result: PlaywrightScanResult) -> Sca
     """Visit a page and capture its rendered HTML."""
     sp = ScannedPage(url=url, status=0)
     try:
-        response = await page.goto(url, wait_until="networkidle", timeout=20000)
+        response = await page.goto(url, wait_until="networkidle", timeout=45000)
         sp.status = response.status if response else 0
         await page.wait_for_timeout(2000)
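The same 45s navigation timeout now appears at several call sites across both files. An alternative not taken in this commit is to set Playwright's default navigation timeout once per page, so individual goto calls no longer need an explicit timeout argument; the setup below is an assumed sketch, not project code:

# Sketch only: assumed setup code, not part of this commit.
from playwright.async_api import async_playwright

async def open_scanner_page(headless: bool = True):
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(headless=headless)
    page = await browser.new_page()
    # Applies to every subsequent navigation on this page.
    page.set_default_navigation_timeout(45_000)
    return pw, browser, page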