From 6da9972ef47a49b6e209b41ba231688d93fc5aec Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 4 May 2026 22:21:16 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Exhaustive=20crawl=20=E2=80=94=20no=20ar?=
 =?UTF-8?q?bitrary=20page/document=20limits?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both scanners now search until they are done, not until a counter runs out:

playwright_scanner.py:
- Default max_pages raised from 15 to 50
- Added 3-minute timeout as safety net
- Recursive link discovery on EVERY visited page (not just DSE pages)
- Stops when: all links visited OR max_pages OR timeout

dsi_discovery.py:
- Default max_documents raised from 30 to 100
- Added 5-minute timeout as safety net
- Recursive: on each visited page, searches for MORE DSI links
- Processes ALL discovered links exhaustively
- Stops when: no more pending links OR max_documents OR timeout

The scanners now behave like a real user: they follow every relevant
link they find, and on each new page they look for more links.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 consent-tester/services/dsi_discovery.py      | 35 ++++++++++++++-----
 consent-tester/services/playwright_scanner.py | 31 ++++++++++------
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index 8f686db..d86e3d4 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -203,12 +203,18 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool:
 async def discover_dsi_documents(
     page: Page,
     url: str,
-    max_documents: int = 30,
+    max_documents: int = 100,
+    timeout_seconds: int = 300,
 ) -> DSIDiscoveryResult:
     """Discover all privacy/data protection documents on a website.
 
     Works generically regardless of website technology, structure, or language.
+    Searches exhaustively rather than stopping at an arbitrary page count: it stops
+    when all discovered links are visited, max_documents is hit, or the timeout expires.
     """
+    import time
+    deadline = time.time() + timeout_seconds
+
     result = DSIDiscoveryResult(base_url=url)
     base_domain = urlparse(url).netloc
     seen_urls: set[str] = set()
@@ -251,8 +257,15 @@ async def discover_dsi_documents(
         )
         result.documents.append(doc)
 
-    # Step 5: Follow each DSI link and extract content
-    for link_info in links[:max_documents]:
+    # Step 5: Follow each DSI link and extract content.
+    # Exhaustive: every pending link is processed, and each visited page
+    # is searched for more DSI links (recursive discovery). Stops when all
+    # links are visited, max_documents is reached, or the timeout expires.
+    pending_links = list(links)
+    pages_to_revisit: list[str] = []  # Pages where we found docs — may have more links
+
+    while pending_links and time.time() < deadline and len(result.documents) < max_documents:
+        link_info = pending_links.pop(0)
         href = link_info["href"]
         if href in seen_urls:
             continue
@@ -275,7 +288,6 @@ async def discover_dsi_documents(
             ))
             continue
 
-        # Navigate to the link and extract text
         try:
             is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
             if is_anchor:
@@ -295,13 +307,14 @@ async def discover_dsi_documents(
                 ))
                 continue
 
-            # External or same-domain page
+            # Navigate to page
             resp = await page.goto(href, wait_until="networkidle", timeout=20000)
             if resp and resp.status < 400:
                 await page.wait_for_timeout(2000)
-                await _expand_all_interactive(page)  # Expand accordions on target page too
+                await _expand_all_interactive(page)
                 await page.wait_for_timeout(500)
 
+                # Extract text
                 text = await page.evaluate("""
                     () => {
                         const main = document.querySelector('main, article, [role="main"], .content, #content');
@@ -316,9 +329,15 @@ async def discover_dsi_documents(
                     text=text[:50000],
                     word_count=len(text.split()),
                 ))
-            # Navigate back to source page for next link
+                # Recursive: search THIS page for more DSI links
+                new_links = await _find_dsi_links(page, base_domain)
+                for nl in new_links:
+                    if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+                        pending_links.append(nl)
+
+            # Navigate back for next link
             await page.goto(url, wait_until="networkidle", timeout=20000)
-            await page.wait_for_timeout(1000)
+            await page.wait_for_timeout(500)
             await _expand_all_interactive(page)
 
         except Exception as e:
diff --git a/consent-tester/services/playwright_scanner.py b/consent-tester/services/playwright_scanner.py
index 5fdbd5c..49635d2 100644
--- a/consent-tester/services/playwright_scanner.py
+++ b/consent-tester/services/playwright_scanner.py
@@ -61,10 +61,18 @@ class PlaywrightScanResult:
 
 async def scan_website_playwright(
     base_url: str,
-    max_pages: int = 15,
+    max_pages: int = 50,
     click_nav: bool = True,
+    timeout_seconds: int = 180,
 ) -> PlaywrightScanResult:
-    """Scan website using Playwright — discovers pages via JS navigation."""
+    """Scan website using Playwright — discovers pages via JS navigation.
+
+    Exhaustively crawls until no new relevant links are found, with max_pages
+    (default 50) and a timeout (default 3 minutes) as safety limits.
+ """ + import time as _time + deadline = _time.time() + timeout_seconds + result = PlaywrightScanResult() parsed = urlparse(base_url) origin = f"{parsed.scheme}://{parsed.netloc}" @@ -105,8 +113,12 @@ async def scan_website_playwright( if link not in visited and link not in to_visit: to_visit.append(link) - # Phase 2: Visit discovered pages (up to max_pages) - for url in to_visit[:max_pages]: + # Phase 2: Visit discovered pages exhaustively (until done or timeout) + visit_idx = 0 + while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline: + url = to_visit[visit_idx] + visit_idx += 1 + if url in visited: continue if SKIP_PATTERNS.search(url): @@ -115,13 +127,12 @@ async def scan_website_playwright( continue visited.add(url) - await _visit_page(page, url, result) + sp = await _visit_page(page, url, result) - # On DSE pages, discover additional links - current_url = page.url - if re.search(r"datenschutz|privacy|dsgvo", current_url, re.IGNORECASE): - dse_links = await _discover_nav_links(page, origin) - for link in dse_links: + # On every visited page, discover more links (recursive crawl) + if sp and sp.html: + new_links = await _discover_nav_links(page, origin) + for link in new_links: if link not in visited and link not in to_visit and link.startswith(origin): to_visit.append(link)