fix: Exhaustive crawl — no arbitrary page/document limits
Both scanners now search until done, not until a counter runs out:

playwright_scanner.py:
- Default max_pages raised from 15 to 50
- Added 3-minute timeout as safety net
- Recursive link discovery on EVERY visited page (not just DSE pages)
- Stops when: all links visited OR max_pages OR timeout

dsi_discovery.py:
- Default max_documents raised from 30 to 100
- Added 5-minute timeout as safety net
- Recursive: on each visited page, searches for MORE DSI links
- Processes ALL discovered links exhaustively
- Stops when: no more pending links OR max_documents OR timeout

The scanners now behave like a real user: they follow every relevant link
they find, and on each new page they look for more links.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
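Both scanners share the same loop shape: a pending queue seeded with discovered links, a wall-clock deadline, and a visited set, where every processed page may push new links back onto the queue. A minimal sketch of that pattern follows; names like crawl_exhaustively and fetch_links are illustrative only, not the scanners' real API.

    import time
    from typing import Callable, Iterable

    def crawl_exhaustively(
        start_url: str,
        fetch_links: Callable[[str], Iterable[str]],
        max_pages: int = 50,
        timeout_seconds: int = 180,
    ) -> set[str]:
        """Sketch of the crawl loop described above; not the actual scanner code."""
        deadline = time.time() + timeout_seconds   # safety net, not a target
        pending = [start_url]                      # work queue of links to visit
        visited: set[str] = set()

        while pending and len(visited) < max_pages and time.time() < deadline:
            url = pending.pop(0)
            if url in visited:
                continue
            visited.add(url)
            # Recursive discovery: every visited page can contribute new links.
            for link in fetch_links(url):
                if link not in visited and link not in pending:
                    pending.append(link)
        return visited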
dsi_discovery.py

@@ -203,12 +203,18 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool:
 async def discover_dsi_documents(
     page: Page,
     url: str,
-    max_documents: int = 30,
+    max_documents: int = 100,
+    timeout_seconds: int = 300,
 ) -> DSIDiscoveryResult:
     """Discover all privacy/data protection documents on a website.
 
     Works generically regardless of website technology, structure, or language.
+    Searches exhaustively until no new documents are found — no arbitrary page limit.
+    Stops when: all discovered links have been visited OR timeout reached.
     """
+    import time
+    deadline = time.time() + timeout_seconds
+
     result = DSIDiscoveryResult(base_url=url)
     base_domain = urlparse(url).netloc
     seen_urls: set[str] = set()
@@ -251,8 +257,15 @@ async def discover_dsi_documents(
         )
         result.documents.append(doc)
 
-    # Step 5: Follow each DSI link and extract content
-    for link_info in links[:max_documents]:
+    # Step 5: Follow each DSI link and extract content.
+    # Exhaustive: processes ALL found links. On each visited page,
+    # searches for MORE links (recursive discovery). Stops only when
+    # all links visited or timeout reached.
+    pending_links = list(links)
+    pages_to_revisit: list[str] = []  # Pages where we found docs — may have more links
+
+    while pending_links and time.time() < deadline and len(result.documents) < max_documents:
+        link_info = pending_links.pop(0)
         href = link_info["href"]
         if href in seen_urls:
             continue
@@ -275,7 +288,6 @@ async def discover_dsi_documents(
             ))
             continue
 
-        # Navigate to the link and extract text
         try:
             is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
             if is_anchor:
@@ -295,13 +307,14 @@ async def discover_dsi_documents(
                 ))
                 continue
 
             # External or same-domain page
+            # Navigate to page
             resp = await page.goto(href, wait_until="networkidle", timeout=20000)
             if resp and resp.status < 400:
-                await page.wait_for_timeout(2000)
-                await _expand_all_interactive(page)  # Expand accordions on target page too
+                await _expand_all_interactive(page)
+                await page.wait_for_timeout(500)
 
                 # Extract text
                 text = await page.evaluate("""
                     () => {
                         const main = document.querySelector('main, article, [role="main"], .content, #content');
@@ -316,9 +329,15 @@ async def discover_dsi_documents(
                     text=text[:50000], word_count=len(text.split()),
                 ))
 
-                # Navigate back to source page for next link
+                # Recursive: search THIS page for more DSI links
+                new_links = await _find_dsi_links(page, base_domain)
+                for nl in new_links:
+                    if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+                        pending_links.append(nl)
+
+                # Navigate back for next link
                 await page.goto(url, wait_until="networkidle", timeout=20000)
-                await page.wait_for_timeout(1000)
+                await page.wait_for_timeout(500)
                 await _expand_all_interactive(page)
 
         except Exception as e:
playwright_scanner.py

@@ -61,10 +61,18 @@ class PlaywrightScanResult:
 
 async def scan_website_playwright(
     base_url: str,
-    max_pages: int = 15,
+    max_pages: int = 50,
     click_nav: bool = True,
+    timeout_seconds: int = 180,
 ) -> PlaywrightScanResult:
-    """Scan website using Playwright — discovers pages via JS navigation."""
+    """Scan website using Playwright — discovers pages via JS navigation.
+
+    Exhaustively crawls until no new relevant links found, up to max_pages
+    (default 50) or timeout (default 3 min) as safety limits.
+    """
+    import time as _time
+    deadline = _time.time() + timeout_seconds
+
     result = PlaywrightScanResult()
     parsed = urlparse(base_url)
     origin = f"{parsed.scheme}://{parsed.netloc}"
@@ -105,8 +113,12 @@ async def scan_website_playwright(
             if link not in visited and link not in to_visit:
                 to_visit.append(link)
 
-    # Phase 2: Visit discovered pages (up to max_pages)
-    for url in to_visit[:max_pages]:
+    # Phase 2: Visit discovered pages exhaustively (until done or timeout)
+    visit_idx = 0
+    while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline:
+        url = to_visit[visit_idx]
+        visit_idx += 1
+
         if url in visited:
             continue
         if SKIP_PATTERNS.search(url):
@@ -115,13 +127,12 @@ async def scan_website_playwright(
             continue
 
         visited.add(url)
-        await _visit_page(page, url, result)
+        sp = await _visit_page(page, url, result)
 
-        # On DSE pages, discover additional links
-        current_url = page.url
-        if re.search(r"datenschutz|privacy|dsgvo", current_url, re.IGNORECASE):
-            dse_links = await _discover_nav_links(page, origin)
-            for link in dse_links:
+        # On every visited page, discover more links (recursive crawl)
+        if sp and sp.html:
+            new_links = await _discover_nav_links(page, origin)
+            for link in new_links:
                 if link not in visited and link not in to_visit and link.startswith(origin):
                     to_visit.append(link)
 
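With the new defaults a caller only picks the entry URL; max_pages/max_documents and the timeouts act purely as safety nets. A rough usage sketch, assuming the standard Playwright async API and that the two modules are importable as dsi_discovery and playwright_scanner (the actual package paths are not visible in this diff):

    import asyncio
    from playwright.async_api import async_playwright

    from dsi_discovery import discover_dsi_documents        # assumed import path
    from playwright_scanner import scan_website_playwright  # assumed import path

    async def main() -> None:
        # Crawls until no new relevant links remain, or a safety limit trips.
        scan_result = await scan_website_playwright(
            "https://example.com", max_pages=50, click_nav=True, timeout_seconds=180
        )

        # discover_dsi_documents expects an already-open Playwright page.
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            discovery = await discover_dsi_documents(
                page, "https://example.com", max_documents=100, timeout_seconds=300
            )
            await browser.close()

    asyncio.run(main())

The return values are the PlaywrightScanResult and DSIDiscoveryResult objects shown in the diff above.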