diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index 8f686db..d86e3d4 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -203,12 +203,18 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool: async def discover_dsi_documents( page: Page, url: str, - max_documents: int = 30, + max_documents: int = 100, + timeout_seconds: int = 300, ) -> DSIDiscoveryResult: """Discover all privacy/data protection documents on a website. Works generically regardless of website technology, structure, or language. + Searches exhaustively until no new documents are found, up to max_documents. + Stops when: all discovered links are visited, max_documents is reached, OR the timeout expires. """ + import time + deadline = time.time() + timeout_seconds + result = DSIDiscoveryResult(base_url=url) base_domain = urlparse(url).netloc seen_urls: set[str] = set() @@ -251,8 +257,15 @@ async def discover_dsi_documents( ) result.documents.append(doc) - # Step 5: Follow each DSI link and extract content - for link_info in links[:max_documents]: + # Step 5: Follow each DSI link and extract content. + # Exhaustive: processes ALL found links. On each visited page, + # searches for MORE links (recursive discovery). Stops when all + # links are visited, max_documents is reached, or timeout expires.
+ pending_links = list(links) + pages_to_revisit: list[str] = [] # Pages where we found docs — may have more links + + while pending_links and time.time() < deadline and len(result.documents) < max_documents: + link_info = pending_links.pop(0) href = link_info["href"] if href in seen_urls: continue @@ -275,7 +288,6 @@ async def discover_dsi_documents( )) continue - # Navigate to the link and extract text try: is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0] if is_anchor: @@ -295,13 +307,14 @@ async def discover_dsi_documents( )) continue - # External or same-domain page + # Navigate to page resp = await page.goto(href, wait_until="networkidle", timeout=20000) if resp and resp.status < 400: await page.wait_for_timeout(2000) - await _expand_all_interactive(page) # Expand accordions on target page too + await _expand_all_interactive(page) await page.wait_for_timeout(500) + # Extract text text = await page.evaluate(""" () => { const main = document.querySelector('main, article, [role="main"], .content, #content'); @@ -316,9 +329,15 @@ async def discover_dsi_documents( text=text[:50000], word_count=len(text.split()), )) - # Navigate back to source page for next link + # Recursive: search THIS page for more DSI links + new_links = await _find_dsi_links(page, base_domain) + for nl in new_links: + if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]: + pending_links.append(nl) + + # Navigate back for next link await page.goto(url, wait_until="networkidle", timeout=20000) - await page.wait_for_timeout(1000) + await page.wait_for_timeout(500) await _expand_all_interactive(page) except Exception as e: diff --git a/consent-tester/services/playwright_scanner.py b/consent-tester/services/playwright_scanner.py index 5fdbd5c..49635d2 100644 --- a/consent-tester/services/playwright_scanner.py +++ b/consent-tester/services/playwright_scanner.py @@ -61,10 +61,18 @@ class PlaywrightScanResult: async def scan_website_playwright( 
base_url: str, - max_pages: int = 15, + max_pages: int = 50, click_nav: bool = True, + timeout_seconds: int = 180, ) -> PlaywrightScanResult: - """Scan website using Playwright — discovers pages via JS navigation.""" + """Scan website using Playwright — discovers pages via JS navigation. + + Exhaustively crawls until no new relevant links found, up to max_pages + (default 50) or timeout (default 3 min) as safety limits. + """ + import time as _time + deadline = _time.time() + timeout_seconds + result = PlaywrightScanResult() parsed = urlparse(base_url) origin = f"{parsed.scheme}://{parsed.netloc}" @@ -105,8 +113,12 @@ async def scan_website_playwright( if link not in visited and link not in to_visit: to_visit.append(link) - # Phase 2: Visit discovered pages (up to max_pages) - for url in to_visit[:max_pages]: + # Phase 2: Visit discovered pages exhaustively (until queue exhausted, max_pages reached, or timeout) + visit_idx = 0 + while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline: + url = to_visit[visit_idx] + visit_idx += 1 + if url in visited: continue if SKIP_PATTERNS.search(url): @@ -115,13 +127,12 @@ async def scan_website_playwright( continue visited.add(url) - await _visit_page(page, url, result) + sp = await _visit_page(page, url, result) - # On DSE pages, discover additional links - current_url = page.url - if re.search(r"datenschutz|privacy|dsgvo", current_url, re.IGNORECASE): - dse_links = await _discover_nav_links(page, origin) - for link in dse_links: + # On every visited page, discover more links (recursive crawl) + if sp and sp.html: + new_links = await _discover_nav_links(page, origin) + for link in new_links: if link not in visited and link not in to_visit and link.startswith(origin): to_visit.append(link)