"""Full-page screenshot mit Timestamp-Overlay. Macht ein vollständiges Screenshot einer URL (z.B. Cookie-Richtlinie), mit eingebrannter Timestamp + URL fuer juristische Beweiskraft. Akzeptiert das Banner zuvor (sonst wuerde Banner-Overlay die Tabelle verdecken) und klappt Accordions auf. Returnt PNG bytes + Metadaten. """ from __future__ import annotations import logging from datetime import datetime, timezone from playwright.async_api import async_playwright logger = logging.getLogger(__name__) _USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) _TIMESTAMP_BANNER_JS = r"""(meta) => { // Einbrenn-Banner ans Seitenkopf: ohne in den Original-Inhalt einzugreifen, // damit die Beweiskraft erhalten bleibt (nur Overlay-Header). const bar = document.createElement('div'); bar.setAttribute('id', '__bp_evidence_bar__'); bar.style.cssText = ( 'position:relative;background:#0f172a;color:#fff;' 'padding:10px 18px;font:600 13px/1.4 -apple-system,' 'BlinkMacSystemFont,sans-serif;border-bottom:3px solid #0ea5e9;' 'z-index:2147483647;box-sizing:border-box;width:100%' ); bar.innerHTML = ( '
BreakPilot Compliance-Audit · ' + meta.url + '
' + '
' + 'Erfasst: ' + meta.ts + ' UTC · Scan-ID ' + meta.check_id + '
' ); document.body.insertBefore(bar, document.body.firstChild); }""" _EXPAND_ALL_JS = r"""() => { // Click everything that looks expandable so cookie-table-rows nested // in accordions become visible in the full-page screenshot. let n = 0; const triggers = document.querySelectorAll( '[aria-expanded="false"], summary, ' + 'details:not([open]), ' + 'button[class*="expand" i], button[class*="accordion" i], ' + 'button[class*="toggle" i], [role="button"][class*="expand" i]' ); for (const t of triggers) { try { t.click(); if (t.open !== undefined) t.open = true; n++; } catch(e){} } return n; }""" _DISMISS_BANNER_JS = r"""() => { // Click any "Accept all" / "Alle akzeptieren" / "Akzeptieren" button so // the consent overlay disappears and we can capture the page content. // We accept rather than reject because rejecting often LEAVES the banner // in place ("you must consent to continue"), blocking the screenshot. function walk(root) { if (!root || !root.querySelectorAll) return false; const buttons = root.querySelectorAll( 'button, [role="button"], a, [class*="accept" i]' ); for (const b of buttons) { const t = (b.textContent || '').trim().toLowerCase(); if (!t || t.length > 40) continue; if (t === 'alle akzeptieren' || t === 'akzeptieren' || t === 'accept all' || t === 'agree' || t === 'einverstanden' || t === 'i agree' || t === 'zustimmen' || t === 'ok' || t === 'alle cookies akzeptieren' || t === 'alle annehmen') { try { b.click(); return true; } catch(e){} } } const all = root.querySelectorAll('*'); for (const el of all) if (el.shadowRoot && walk(el.shadowRoot)) return true; return false; } return walk(document); }""" async def capture_page_overlapping_slices( url: str, check_id: str = "", viewport_h: int = 1024, overlap_px: int = 200, timeout_ms: int = 30000, max_slices: int = 40, ) -> dict: """Lückenlose Beweiskette: scrollt die Seite in viewport-grossen Schritten und macht pro Schritt ein eigenes Screenshot. Jeder Schritt ueberlappt mit dem vorherigen um `overlap_px` Pixel — so erscheint jeder Cookie in mind. einem Bild, an Slice-Grenzen sogar in zweien. Tesseract-Dedup nach Cookie-Name eliminiert Doppel. Vorteil ggue. full_page=True: - Beweiskette VERIFIZIERBAR (Overlap dokumentiert Lueckenfreiheit) - Tesseract pro Slice schneller + parallel ausfuehrbar - Pro Slice eigener Timestamp + Sequenz-Nummer in der Mail-ZIP Returns dict: slices: [{idx, ts, png_b64, top_y, bot_y, sha256}, ...] total_height_px width_px url (final after redirect) accepted_banner, expanded """ import base64 as _b64 import hashlib out: dict = { "slices": [], "total_height_px": 0, "width_px": 0, "url": url, "accepted_banner": False, "expanded": 0, } ts_base = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") async with async_playwright() as p: browser = await p.chromium.launch( headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"], ) ctx = await browser.new_context( user_agent=_USER_AGENT, viewport={"width": 1280, "height": viewport_h}, locale="de-DE", timezone_id="Europe/Berlin", ) page = await ctx.new_page() try: await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms) await page.wait_for_timeout(3500) try: out["accepted_banner"] = bool(await page.evaluate(_DISMISS_BANNER_JS)) if out["accepted_banner"]: await page.wait_for_timeout(1500) except Exception: pass try: out["expanded"] = int(await page.evaluate(_EXPAND_ALL_JS) or 0) if out["expanded"]: await page.wait_for_timeout(1500) except Exception: pass out["url"] = page.url # Inject timestamp banner so the FIRST slice carries it. try: await page.evaluate(_TIMESTAMP_BANNER_JS, { "url": out["url"], "ts": ts_base, "check_id": check_id or "—", }) except Exception: pass await page.wait_for_timeout(500) # Measure total scroll height + width dims = await page.evaluate( "() => ({w: document.documentElement.scrollWidth, " "h: document.documentElement.scrollHeight})" ) total_h = int(dims.get("h") or 0) out["total_height_px"] = total_h out["width_px"] = int(dims.get("w") or 0) # Calculate scroll-step: viewport_h minus overlap. Each slice # contains overlap_px pixels of the PREVIOUS slice's bottom. step = max(1, viewport_h - overlap_px) scroll_y = 0 idx = 0 while scroll_y < total_h and idx < max_slices: # Scroll to position. Wait for any lazy content to render. await page.evaluate(f"window.scrollTo(0, {scroll_y})") await page.wait_for_timeout(400) png = await page.screenshot( full_page=False, type="png", timeout=timeout_ms, ) ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") top_y = scroll_y bot_y = min(scroll_y + viewport_h, total_h) sha = hashlib.sha256(png).hexdigest()[:16] out["slices"].append({ "idx": idx, "ts": ts, "top_y": top_y, "bot_y": bot_y, "sha256": sha, "png_b64": _b64.b64encode(png).decode("ascii"), "png_size": len(png), }) scroll_y += step idx += 1 logger.info( "Overlapping screenshots: %d slices for %s (total_h=%d, " "viewport=%d, overlap=%d)", len(out["slices"]), out["url"], total_h, viewport_h, overlap_px, ) finally: await ctx.close() await browser.close() return out async def capture_page_evidence( url: str, check_id: str = "", timeout_ms: int = 30000, max_height_px: int = 30000, ) -> dict: """Capture a full-page screenshot of `url` with embedded timestamp. Returns dict: png_bytes: bytes captured_at: ISO timestamp url: final URL after redirects accepted_banner: bool expanded: int — accordion-clicks performed height_px, width_px """ out: dict = { "png_bytes": b"", "captured_at": "", "url": url, "accepted_banner": False, "expanded": 0, "height_px": 0, "width_px": 0, } ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") async with async_playwright() as p: browser = await p.chromium.launch( headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"], ) ctx = await browser.new_context( user_agent=_USER_AGENT, viewport={"width": 1280, "height": 1024}, locale="de-DE", timezone_id="Europe/Berlin", ) page = await ctx.new_page() try: await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms) await page.wait_for_timeout(3500) # Step 1: dismiss banner (accept) so we see the policy content try: out["accepted_banner"] = bool(await page.evaluate(_DISMISS_BANNER_JS)) if out["accepted_banner"]: await page.wait_for_timeout(1500) except Exception as e: logger.debug("dismiss-banner failed: %s", e) # Step 2: expand accordions / details try: out["expanded"] = int(await page.evaluate(_EXPAND_ALL_JS) or 0) if out["expanded"]: await page.wait_for_timeout(1500) except Exception as e: logger.debug("expand-all failed: %s", e) out["url"] = page.url # Step 3: inject timestamp banner for evidence try: await page.evaluate(_TIMESTAMP_BANNER_JS, { "url": out["url"], "ts": ts, "check_id": check_id or "—", }) except Exception as e: logger.debug("timestamp-inject failed: %s", e) # Step 4: capture full-page screenshot. Cap height for sanity. dims = await page.evaluate( "() => ({w: document.documentElement.scrollWidth, " "h: document.documentElement.scrollHeight})" ) out["width_px"] = int(dims.get("w") or 0) out["height_px"] = min(int(dims.get("h") or 0), max_height_px) # If page is too tall, scroll-into-view to anchor a screenshot region png = await page.screenshot( full_page=True, type="png", timeout=timeout_ms, ) out["png_bytes"] = png out["captured_at"] = ts logger.info( "Evidence screenshot captured: %s (%dx%d, %d bytes, accepted=%s, expanded=%d)", out["url"], out["width_px"], out["height_px"], len(png), out["accepted_banner"], out["expanded"], ) finally: await ctx.close() await browser.close() return out