fix: Exhaustive crawl — follow all discovered links, with raised safety limits instead of tight page/document caps
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 14s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m37s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 38s
CI / test-python-backend (push) Successful in 36s
CI / test-python-document-crawler (push) Successful in 24s
CI / test-python-dsms-gateway (push) Successful in 21s
CI / validate-canonical-controls (push) Successful in 15s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 14s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m37s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 38s
CI / test-python-backend (push) Successful in 36s
CI / test-python-document-crawler (push) Successful in 24s
CI / test-python-dsms-gateway (push) Successful in 21s
CI / validate-canonical-controls (push) Successful in 15s
Both scanners now search until all discovered links are exhausted, bounded only by raised safety limits and timeouts rather than tight counters: playwright_scanner.py: - Default max_pages raised from 15 to 50 (safety cap, rarely hit) - Added 3-minute timeout as safety net - Recursive link discovery on EVERY visited page (not just DSE pages) - Stops when: all links visited OR max_pages OR timeout dsi_discovery.py: - Default max_documents raised from 30 to 100 (safety cap, rarely hit) - Added 5-minute timeout as safety net - Recursive: on each visited page, searches for MORE DSI links - Processes ALL discovered links exhaustively - Stops when: no more pending links OR max_documents OR timeout The scanners now behave like a real user: they follow every relevant link they find, and on each new page they look for more links. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -203,12 +203,18 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
||||
async def discover_dsi_documents(
|
||||
page: Page,
|
||||
url: str,
|
||||
max_documents: int = 30,
|
||||
max_documents: int = 100,
|
||||
timeout_seconds: int = 300,
|
||||
) -> DSIDiscoveryResult:
|
||||
"""Discover all privacy/data protection documents on a website.
|
||||
|
||||
Works generically regardless of website technology, structure, or language.
|
||||
Searches exhaustively until no new documents are found — no arbitrary page limit.
|
||||
Stops when: all discovered links have been visited OR timeout reached.
|
||||
"""
|
||||
import time
|
||||
deadline = time.time() + timeout_seconds
|
||||
|
||||
result = DSIDiscoveryResult(base_url=url)
|
||||
base_domain = urlparse(url).netloc
|
||||
seen_urls: set[str] = set()
|
||||
@@ -251,8 +257,15 @@ async def discover_dsi_documents(
|
||||
)
|
||||
result.documents.append(doc)
|
||||
|
||||
# Step 5: Follow each DSI link and extract content
|
||||
for link_info in links[:max_documents]:
|
||||
# Step 5: Follow each DSI link and extract content.
|
||||
# Exhaustive: processes ALL found links. On each visited page,
|
||||
# searches for MORE links (recursive discovery). Stops only when
|
||||
# all links visited or timeout reached.
|
||||
pending_links = list(links)
|
||||
pages_to_revisit: list[str] = [] # Pages where we found docs — may have more links
|
||||
|
||||
while pending_links and time.time() < deadline and len(result.documents) < max_documents:
|
||||
link_info = pending_links.pop(0)
|
||||
href = link_info["href"]
|
||||
if href in seen_urls:
|
||||
continue
|
||||
@@ -275,7 +288,6 @@ async def discover_dsi_documents(
|
||||
))
|
||||
continue
|
||||
|
||||
# Navigate to the link and extract text
|
||||
try:
|
||||
is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
|
||||
if is_anchor:
|
||||
@@ -295,13 +307,14 @@ async def discover_dsi_documents(
|
||||
))
|
||||
continue
|
||||
|
||||
# External or same-domain page
|
||||
# Navigate to page
|
||||
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
|
||||
if resp and resp.status < 400:
|
||||
await page.wait_for_timeout(2000)
|
||||
await _expand_all_interactive(page) # Expand accordions on target page too
|
||||
await _expand_all_interactive(page)
|
||||
await page.wait_for_timeout(500)
|
||||
|
||||
# Extract text
|
||||
text = await page.evaluate("""
|
||||
() => {
|
||||
const main = document.querySelector('main, article, [role="main"], .content, #content');
|
||||
@@ -316,9 +329,15 @@ async def discover_dsi_documents(
|
||||
text=text[:50000], word_count=len(text.split()),
|
||||
))
|
||||
|
||||
# Navigate back to source page for next link
|
||||
# Recursive: search THIS page for more DSI links
|
||||
new_links = await _find_dsi_links(page, base_domain)
|
||||
for nl in new_links:
|
||||
if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
|
||||
pending_links.append(nl)
|
||||
|
||||
# Navigate back for next link
|
||||
await page.goto(url, wait_until="networkidle", timeout=20000)
|
||||
await page.wait_for_timeout(1000)
|
||||
await page.wait_for_timeout(500)
|
||||
await _expand_all_interactive(page)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user