breakpilot-compliance/consent-tester/services/page_screenshot.py

"""Full-page screenshot mit Timestamp-Overlay.

Takes a full-page screenshot of a URL (e.g. a cookie policy) with the
timestamp and URL burned into the image for legal evidentiary value. The
consent banner is accepted first (otherwise the banner overlay would hide
the table) and accordions are expanded.

Returns the PNG bytes plus metadata.
"""

from __future__ import annotations

import logging
from datetime import datetime, timezone

from playwright.async_api import async_playwright

logger = logging.getLogger(__name__)


_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

_TIMESTAMP_BANNER_JS = r"""(meta) => {
    // Einbrenn-Banner ans Seitenkopf: ohne in den Original-Inhalt einzugreifen,
    // damit die Beweiskraft erhalten bleibt (nur Overlay-Header).
    const bar = document.createElement('div');
    bar.setAttribute('id', '__bp_evidence_bar__');
    bar.style.cssText = (
        'position:relative;background:#0f172a;color:#fff;'
        'padding:10px 18px;font:600 13px/1.4 -apple-system,'
        'BlinkMacSystemFont,sans-serif;border-bottom:3px solid #0ea5e9;'
        'z-index:2147483647;box-sizing:border-box;width:100%'
    );
    bar.innerHTML = (
        '<div>BreakPilot Compliance-Audit · ' + meta.url + '</div>' +
        '<div style="font-weight:400;opacity:0.8;font-size:11px;margin-top:2px">' +
        'Erfasst: ' + meta.ts + ' UTC · Scan-ID ' + meta.check_id +
        '</div>'
    );
    document.body.insertBefore(bar, document.body.firstChild);
}"""


_EXPAND_ALL_JS = r"""() => {
    // Click everything that looks expandable so cookie-table-rows nested
    // in accordions become visible in the full-page screenshot.
    let n = 0;
    const triggers = document.querySelectorAll(
        '[aria-expanded="false"], summary, ' +
        'details:not([open]), ' +
        'button[class*="expand" i], button[class*="accordion" i], ' +
        'button[class*="toggle" i], [role="button"][class*="expand" i]'
    );
    for (const t of triggers) {
        try { t.click(); if (t.open !== undefined) t.open = true; n++; } catch(e){}
    }
    return n;
}"""


# JavaScript evaluated in the page (no arguments) to accept a consent banner.
# `walk` scans a root for `button`, `[role="button"]`, `a` and elements whose
# class contains "accept"; it clicks the first one whose trimmed, lower-cased
# textContent is non-empty, at most 40 characters long and exactly equal to
# one of the listed German/English accept labels. If nothing matches in a
# root, it recurses into every descendant's `shadowRoot` (when present).
# The script returns true as soon as one click succeeded, otherwise false.
_DISMISS_BANNER_JS = r"""() => {
    // Click any "Accept all" / "Alle akzeptieren" / "Akzeptieren" button so
    // the consent overlay disappears and we can capture the page content.
    // We accept rather than reject because rejecting often LEAVES the banner
    // in place ("you must consent to continue"), blocking the screenshot.
    function walk(root) {
        if (!root || !root.querySelectorAll) return false;
        const buttons = root.querySelectorAll(
            'button, [role="button"], a, [class*="accept" i]'
        );
        for (const b of buttons) {
            const t = (b.textContent || '').trim().toLowerCase();
            if (!t || t.length > 40) continue;
            if (t === 'alle akzeptieren' || t === 'akzeptieren' ||
                t === 'accept all' || t === 'agree' || t === 'einverstanden' ||
                t === 'i agree' || t === 'zustimmen' || t === 'ok' ||
                t === 'alle cookies akzeptieren' || t === 'alle annehmen') {
                try { b.click(); return true; } catch(e){}
            }
        }
        const all = root.querySelectorAll('*');
        for (const el of all) if (el.shadowRoot && walk(el.shadowRoot)) return true;
        return false;
    }
    return walk(document);
}"""


async def capture_page_evidence(
    url: str,
    check_id: str = "",
    timeout_ms: int = 30000,
    max_height_px: int = 30000,
) -> dict:
    """Capture a full-page PNG screenshot of `url` with an evidence banner.

    Launches headless Chromium, loads the page, tries to accept a consent
    banner, expands collapsed sections, injects a timestamp/URL/scan-id bar
    and takes the screenshot. The banner, expand and inject steps are
    best-effort: a failure there is logged and the capture continues.

    Args:
        url: Page to load.
        check_id: Scan identifier shown in the banner; "—" is shown when empty.
        timeout_ms: Timeout passed to navigation and to the screenshot call.
        max_height_px: Maximum height of the captured image. Pages taller
            than this are clipped to the top `max_height_px` pixels so the
            image matches the reported `height_px`. A value <= 0 disables
            the cap.

    Returns:
        dict with keys:
            png_bytes: bytes of the PNG image
            captured_at: UTC timestamp formatted ``YYYY-MM-DD HH:MM:SS``,
                taken when the call starts (same value as in the banner)
            url: page URL after navigation (redirects followed)
            accepted_banner: bool, whether a consent button was clicked
            expanded: int, number of expand actions performed
            height_px, width_px: dimensions of the captured area

    Raises:
        Errors from browser launch, navigation, the dimension query or the
        screenshot call are not caught here and propagate to the caller.
    """
    viewport = {"width": 1280, "height": 1024}
    out: dict = {
        "png_bytes": b"",
        "captured_at": "",
        "url": url,
        "accepted_banner": False,
        "expanded": 0,
        "height_px": 0,
        "width_px": 0,
    }
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        # Nested try/finally: the browser is closed even when creating or
        # closing the context fails.
        try:
            ctx = await browser.new_context(
                user_agent=_USER_AGENT,
                viewport=viewport,
                locale="de-DE",
                timezone_id="Europe/Berlin",
            )
            try:
                page = await ctx.new_page()
                await page.goto(
                    url, wait_until="domcontentloaded", timeout=timeout_ms,
                )
                # Give consent scripts time to render their banner.
                await page.wait_for_timeout(3500)
                # Step 1: dismiss banner (accept) so we see the policy content
                try:
                    out["accepted_banner"] = bool(
                        await page.evaluate(_DISMISS_BANNER_JS)
                    )
                    if out["accepted_banner"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("dismiss-banner failed: %s", e)
                # Step 2: expand accordions / details
                try:
                    out["expanded"] = int(
                        await page.evaluate(_EXPAND_ALL_JS) or 0
                    )
                    if out["expanded"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("expand-all failed: %s", e)
                out["url"] = page.url
                # Step 3: inject timestamp banner for evidence
                try:
                    await page.evaluate(_TIMESTAMP_BANNER_JS, {
                        "url": out["url"],
                        "ts": ts,
                        "check_id": check_id or "—",
                    })
                except Exception as e:
                    # Warning, not debug: the capture continues but the
                    # image will lack its evidence stamp.
                    logger.warning("timestamp-inject failed: %s", e)
                # Step 4: measure the document and capture it.
                dims = await page.evaluate(
                    "() => ({w: document.documentElement.scrollWidth, "
                    "h: document.documentElement.scrollHeight})"
                )
                page_height = int(dims.get("h") or 0)
                out["width_px"] = int(dims.get("w") or 0)
                shot_kwargs: dict = {
                    "full_page": True,
                    "type": "png",
                    "timeout": timeout_ms,
                }
                if 0 < max_height_px < page_height:
                    # Enforce the cap on the image itself, not only on the
                    # reported height: clip to the top of the document.
                    out["height_px"] = max_height_px
                    shot_kwargs["clip"] = {
                        "x": 0,
                        "y": 0,
                        "width": out["width_px"] or viewport["width"],
                        "height": max_height_px,
                    }
                else:
                    out["height_px"] = page_height
                png = await page.screenshot(**shot_kwargs)
                out["png_bytes"] = png
                out["captured_at"] = ts
                logger.info(
                    "Evidence screenshot captured: %s (%dx%d, %d bytes, "
                    "accepted=%s, expanded=%d)",
                    out["url"], out["width_px"], out["height_px"],
                    len(png), out["accepted_banner"], out["expanded"],
                )
            finally:
                await ctx.close()
        finally:
            await browser.close()
    return out