""" Playwright Website Scanner — browser-based page discovery and scanning. Unlike httpx (curl-like), this uses a real browser that: - Executes JavaScript (finds dynamically loaded content) - Clicks navigation menus (discovers hidden sub-pages) - Renders SPAs (React, Angular, Vue) - Sees what the user sees Replaces the httpx-based scanner for comprehensive website analysis. """ import logging import re from dataclasses import dataclass, field from urllib.parse import urljoin, urlparse from playwright.async_api import async_playwright, Page, BrowserContext logger = logging.getLogger(__name__) USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) # Patterns for discovering important pages NAV_LINK_KEYWORDS = [ "datenschutz", "privacy", "dsgvo", "impressum", "imprint", "legal", "agb", "terms", "nutzungsbedingung", "cookie", "kontakt", "contact", "ueber-uns", "about", "service", ] # Skip these URL patterns (not HTML pages) SKIP_PATTERNS = re.compile( r"\.(css|js|png|jpg|jpeg|gif|svg|pdf|zip|xml|json|woff|woff2|ttf|eot|ico)(\?|#|$)", re.IGNORECASE, ) @dataclass class ScannedPage: url: str status: int html: str = "" title: str = "" error: str = "" @dataclass class PlaywrightScanResult: pages: list[ScannedPage] = field(default_factory=list) discovered_urls: list[str] = field(default_factory=list) external_scripts: list[str] = field(default_factory=list) all_cookies: list[str] = field(default_factory=list) async def scan_website_playwright( base_url: str, max_pages: int = 50, click_nav: bool = True, timeout_seconds: int = 180, ) -> PlaywrightScanResult: """Scan website using Playwright — discovers pages via JS navigation. Exhaustively crawls until no new relevant links found, up to max_pages (default 50) or timeout (default 3 min) as safety limits. 
""" import time as _time deadline = _time.time() + timeout_seconds result = PlaywrightScanResult() parsed = urlparse(base_url) origin = f"{parsed.scheme}://{parsed.netloc}" visited: set[str] = set() to_visit: list[str] = [base_url] # Also add common paths to probe if base_url != origin: to_visit.append(origin) async with async_playwright() as p: browser = await p.chromium.launch( headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"], ) context = await browser.new_context(user_agent=USER_AGENT) try: # Phase 1: Load start page and discover navigation links page = await context.new_page() scripts_collected: list[str] = [] page.on("request", lambda req: _collect_external(req, scripts_collected, origin)) start_page = await _visit_page(page, base_url, result) visited.add(base_url) if start_page and start_page.html: # Extract links from rendered HTML (after JS execution) nav_links = await _discover_nav_links(page, origin) for link in nav_links: if link not in visited and link not in to_visit: to_visit.append(link) # Click navigation menus to find hidden links if click_nav: menu_links = await _click_navigation_menus(page, origin) for link in menu_links: if link not in visited and link not in to_visit: to_visit.append(link) # Phase 2: Visit discovered pages exhaustively (until done or timeout) visit_idx = 0 while visit_idx < len(to_visit) and len(visited) < max_pages and _time.time() < deadline: url = to_visit[visit_idx] visit_idx += 1 if url in visited: continue if SKIP_PATTERNS.search(url): continue if not url.startswith(origin): continue visited.add(url) sp = await _visit_page(page, url, result) # On every visited page, discover more links (recursive crawl) if sp and sp.html: new_links = await _discover_nav_links(page, origin) for link in new_links: if link not in visited and link not in to_visit and link.startswith(origin): to_visit.append(link) # Collect cookies cookies = await context.cookies() result.all_cookies = sorted(set(c.get("name", "") for c in cookies)) result.external_scripts = list(set(scripts_collected)) result.discovered_urls = [p.url for p in result.pages] except Exception as e: logger.error("Playwright scan failed: %s", e) finally: await context.close() await browser.close() logger.info("Playwright scan: %d pages visited, %d scripts found", len(result.pages), len(result.external_scripts)) return result async def _visit_page(page: Page, url: str, result: PlaywrightScanResult) -> ScannedPage | None: """Visit a page and capture its rendered HTML.""" sp = ScannedPage(url=url, status=0) try: response = await page.goto(url, wait_until="networkidle", timeout=20000) sp.status = response.status if response else 0 await page.wait_for_timeout(2000) if sp.status < 400: sp.html = await page.content() sp.title = await page.title() else: sp.error = f"HTTP {sp.status}" except Exception as e: sp.status = 0 sp.error = str(e)[:100] logger.warning("Failed to visit %s: %s", url, sp.error) result.pages.append(sp) return sp if sp.status < 400 and sp.html else None async def _discover_nav_links(page: Page, origin: str) -> list[str]: """Extract all navigation links from the rendered page.""" links = set() try: # Get all hrefs from the rendered DOM all_hrefs = await page.evaluate(""" () => [...document.querySelectorAll('a[href]')] .map(a => a.href) .filter(h => h.startsWith('http')) """) for href in (all_hrefs or []): href_clean = href.split("#")[0].split("?")[0] # Strip anchors and params if not href_clean.startswith(origin): continue if SKIP_PATTERNS.search(href_clean): continue # Prioritize 
            href_lower = href_clean.lower()
            if any(kw in href_lower for kw in NAV_LINK_KEYWORDS):
                links.add(href_clean)
    except Exception as e:
        logger.warning("Link discovery failed: %s", e)
    return sorted(links)[:20]  # Cap at 20


async def _click_navigation_menus(page: Page, origin: str) -> list[str]:
    """Click expandable navigation menus to discover hidden links."""
    links = set()
    try:
        # Find and click common menu toggles
        menu_selectors = [
            'button[aria-expanded="false"]',
            '[class*="dropdown"] > a',
            '[class*="menu-toggle"]',
            '[class*="nav-toggle"]',
            'details:not([open]) > summary',
            '[class*="accordion"] > button',
            'nav button',
        ]
        for selector in menu_selectors:
            try:
                elements = page.locator(selector)
                count = await elements.count()
                for i in range(min(count, 10)):  # Max 10 menus
                    try:
                        await elements.nth(i).click(timeout=2000)
                        await page.wait_for_timeout(500)
                    except Exception:
                        continue
            except Exception:
                continue

        # After clicking, collect newly visible links
        new_hrefs = await page.evaluate("""
            () => [...document.querySelectorAll('a[href]')]
                .filter(a => {
                    const rect = a.getBoundingClientRect();
                    return rect.width > 0 && rect.height > 0;
                })
                .map(a => a.href)
                .filter(h => h.startsWith('http'))
        """)
        for href in (new_hrefs or []):
            href_clean = href.split("#")[0].split("?")[0]
            if href_clean.startswith(origin) and not SKIP_PATTERNS.search(href_clean):
                href_lower = href_clean.lower()
                if any(kw in href_lower for kw in NAV_LINK_KEYWORDS):
                    links.add(href_clean)
    except Exception as e:
        logger.warning("Menu click failed: %s", e)
    return sorted(links)[:10]


def _collect_external(request, scripts: list[str], origin: str):
    """Collect external script/resource URLs."""
    url = request.url
    if request.resource_type in ("script", "image") and not url.startswith(origin):
        domain = url.split("/")[2] if len(url.split("/")) > 2 else url
        if domain not in [s.split("/")[2] if len(s.split("/")) > 2 else s for s in scripts]:
            scripts.append(url)
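

# Minimal usage sketch, assuming Playwright and its Chromium browser are installed
# (`pip install playwright && playwright install chromium`). The default URL below is
# a placeholder for illustration only, not something this module depends on.
if __name__ == "__main__":
    import asyncio
    import sys

    async def _demo(url: str) -> None:
        # Run a small scan and print what the browser-based crawl discovered.
        result = await scan_website_playwright(url, max_pages=10)
        for scanned in result.pages:
            print(scanned.status, scanned.url, scanned.title)
        print("Cookies:", result.all_cookies)
        print("External scripts:", result.external_scripts)

    # Pass a start URL on the command line, or fall back to the placeholder.
    asyncio.run(_demo(sys.argv[1] if len(sys.argv) > 1 else "https://example.com"))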