From 6da9972ef47a49b6e209b41ba231688d93fc5aec Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 4 May 2026 22:21:16 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Exhaustive=20crawl=20=E2=80=94=20no=20ar?=
 =?UTF-8?q?bitrary=20page/document=20limits?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both scanners now search until they are done, not until a counter runs out:

playwright_scanner.py:
- Default max_pages raised from 15 to 50
- Added 3-minute timeout as safety net
- Recursive link discovery on EVERY visited page (not just DSE pages)
- Stops when: all links visited OR max_pages OR timeout

dsi_discovery.py:
- Default max_documents raised from 30 to 100
- Added 5-minute timeout as safety net
- Recursive: on each visited page, searches for MORE DSI links
- Processes ALL discovered links exhaustively
- Stops when: no more pending links OR max_documents OR timeout

The scanners now behave like a real user: they follow every relevant
link they find, and on each new page they look for more links.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 consent-tester/services/dsi_discovery.py      | 35 ++++++++++++++-----
 consent-tester/services/playwright_scanner.py | 31 ++++++++++------
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index 8f686db..d86e3d4 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -203,12 +203,18 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool:
 async def discover_dsi_documents(
     page: Page,
     url: str,
-    max_documents: int = 30,
+    max_documents: int = 100,
+    timeout_seconds: int = 300,
 ) -> DSIDiscoveryResult:
     """Discover all privacy/data protection documents on a website.
 
     Works generically regardless of website technology, structure, or language.
+    Searches exhaustively rather than stopping at an arbitrary page count: it stops
+    when all discovered links are visited, max_documents is hit, or the timeout expires.
     """
+    import time
+    deadline = time.time() + timeout_seconds
+
     result = DSIDiscoveryResult(base_url=url)
     base_domain = urlparse(url).netloc
     seen_urls: set[str] = set()
@@ -251,8 +257,15 @@ async def discover_dsi_documents(
         )
         result.documents.append(doc)
 
-    # Step 5: Follow each DSI link and extract content
-    for link_info in links[:max_documents]:
+    # Step 5: Follow each DSI link and extract content.
+    # Exhaustive: every pending link is processed, and each visited page
+    # is searched for more DSI links (recursive discovery). Stops when all
+    # links are visited, max_documents is reached, or the timeout expires.
+    pending_links = list(links)
+    pages_to_revisit: list[str] = []  # Pages where we found docs — may have more links
+
+    while pending_links and time.time() < deadline and len(result.documents) < max_documents:
+        link_info = pending_links.pop(0)
         href = link_info["href"]
         if href in seen_urls:
             continue
@@ -275,7 +288,6 @@ async def discover_dsi_documents(
             ))
             continue
 
-        # Navigate to the link and extract text
         try:
             is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
             if is_anchor:
@@ -295,13 +307,14 @@ async def discover_dsi_documents(
                 ))
                 continue
 
-            # External or same-domain page
+            # Navigate to page
             resp = await page.goto(href, wait_until="networkidle", timeout=20000)
             if resp and resp.status < 400:
                 await page.wait_for_timeout(2000)
-                await _expand_all_interactive(page)  # Expand accordions on target page too
+                await _expand_all_interactive(page)
                 await page.wait_for_timeout(500)
 
+                # Extract text
                 text = await page.evaluate("""
                     () => {
                         const main = document.querySelector('main, article, [role="main"], .content, #content');
@@ -316,9 +329,15 @@ async def discover_dsi_documents(
                     text=text[:50000],
                     word_count=len(text.split()),
                 ))
-            # Navigate back to source page for next link
+                # Recursive: search THIS page for more DSI links
+                new_links = await _find_dsi_links(page, base_domain)
+                for nl in new_links:
+                    if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+                        pending_links.append(nl)
+
+            # Navigate back for next link
             await page.goto(url, wait_until="networkidle", timeout=20000)
-            await page.wait_for_timeout(1000)
+            await page.wait_for_timeout(500)
             await _expand_all_interactive(page)
 
         except Exception as e:
diff --git a/consent-tester/services/playwright_scanner.py b/consent-tester/services/playwright_scanner.py
index 5fdbd5c..49635d2 100644
--- a/consent-tester/services/playwright_scanner.py
+++ b/consent-tester/services/playwright_scanner.py
@@ -61,10 +61,18 @@ class PlaywrightScanResult:
 
 async def scan_website_playwright(
     base_url: str,
-    max_pages: int = 15,
+    max_pages: int = 50,
     click_nav: bool = True,
+    timeout_seconds: int = 180,
 ) -> PlaywrightScanResult:
-    """Scan website using Playwright — discovers pages via JS navigation."""
+    """Scan website using Playwright — discovers pages via JS navigation.
+
+    Exhaustively crawls until no new relevant links are found, with max_pages
+    (default 50) and a timeout (default 3 minutes) as safety limits.
+ """ + import time as _time + deadline = _time.time() + timeout_seconds + result = PlaywrightScanResult() parsed = urlparse(base_url) origin = f"{parsed.scheme}://{parsed.netloc}" @@ -105,8 +113,12 @@ async def scan_website_playwright( if link not in visited and link not in to_visit: to_visit.append(link) - # Phase 2: Visit discovered pages (up to max_pages) - for url in to_visit[:max_pages]: + # Phase 2: Visit discovered pages exhaustively (until done or timeout) + visit_idx = 0 + while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline: + url = to_visit[visit_idx] + visit_idx += 1 + if url in visited: continue if SKIP_PATTERNS.search(url): @@ -115,13 +127,12 @@ async def scan_website_playwright( continue visited.add(url) - await _visit_page(page, url, result) + sp = await _visit_page(page, url, result) - # On DSE pages, discover additional links - current_url = page.url - if re.search(r"datenschutz|privacy|dsgvo", current_url, re.IGNORECASE): - dse_links = await _discover_nav_links(page, origin) - for link in dse_links: + # On every visited page, discover more links (recursive crawl) + if sp and sp.html: + new_links = await _discover_nav_links(page, origin) + for link in new_links: if link not in visited and link not in to_visit and link.startswith(origin): to_visit.append(link)