fix: Exhaustive crawl — no arbitrary page/document limits

Both scanners now crawl until no unvisited links remain; the page/document counters and the timeouts remain only as safety limits:

playwright_scanner.py:
- Default max_pages raised from 15 to 50
- Added 3-minute timeout (timeout_seconds, default 180) as safety net
- Recursive link discovery on EVERY visited page (not just DSE pages)
- Stops when: all discovered links have been visited OR max_pages is reached OR the timeout expires

dsi_discovery.py:
- Default max_documents raised from 30 to 100
- Added 5-minute timeout as safety net
- Recursive: on each visited page, searches for MORE DSI links
- Processes ALL discovered links exhaustively
- Stops when: no pending links remain OR max_documents is reached OR the timeout expires

The scanners now behave like a real user: they follow every relevant
link they find, and on each new page they look for more links.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-04 22:21:16 +02:00
parent c284cefada
commit 6da9972ef4
2 changed files with 48 additions and 18 deletions
+21 -10
View File
@@ -61,10 +61,18 @@ class PlaywrightScanResult:
async def scan_website_playwright(
base_url: str,
max_pages: int = 15,
max_pages: int = 50,
click_nav: bool = True,
timeout_seconds: int = 180,
) -> PlaywrightScanResult:
"""Scan website using Playwright — discovers pages via JS navigation."""
"""Scan website using Playwright — discovers pages via JS navigation.
Exhaustively crawls until no new relevant links found, up to max_pages
(default 50) or timeout (default 3 min) as safety limits.
"""
import time as _time
deadline = _time.time() + timeout_seconds
result = PlaywrightScanResult()
parsed = urlparse(base_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
@@ -105,8 +113,12 @@ async def scan_website_playwright(
if link not in visited and link not in to_visit:
to_visit.append(link)
# Phase 2: Visit discovered pages (up to max_pages)
for url in to_visit[:max_pages]:
# Phase 2: Visit discovered pages exhaustively (until done or timeout)
visit_idx = 0
while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline:
url = to_visit[visit_idx]
visit_idx += 1
if url in visited:
continue
if SKIP_PATTERNS.search(url):
@@ -115,13 +127,12 @@ async def scan_website_playwright(
continue
visited.add(url)
await _visit_page(page, url, result)
sp = await _visit_page(page, url, result)
# On DSE pages, discover additional links
current_url = page.url
if re.search(r"datenschutz|privacy|dsgvo", current_url, re.IGNORECASE):
dse_links = await _discover_nav_links(page, origin)
for link in dse_links:
# On every visited page, discover more links (recursive crawl)
if sp and sp.html:
new_links = await _discover_nav_links(page, origin)
for link in new_links:
if link not in visited and link not in to_visit and link.startswith(origin):
to_visit.append(link)