efeef73f90
Statt EIN full-page screenshot: full-page wird per PIL in viewport-grosse Slices geschnitten, jede ueberlappt die vorherige um overlap_px Pixel. Jeder Cookie erscheint in mind. einer Slice, an Slice-Grenzen sogar in zwei → Dedup nach Name eliminiert die Doppel. Warum nicht direkt scroll-based slicing in Playwright? VW's Cookie-Page nutzt scroll-snap / fixed-position — alle viewport-shots kamen identisch zurueck (Header-Overlay). PIL-cut auf dem full-page PNG bypasst das Problem voellig. VW smoke-test (32 slices): per-slice: [0, 0, 2, 5, 5, 3, 4, 7, 4, 3, 4, 5, ...] 103 raw cookies → 79 unique nach dedup 14 vendor records (Google 9, Adobe-Familie 17, etc.) Jeder Slice hat eigenen Timestamp + SHA256 → ZIP-Anhang fuer juristische Beweiskette. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
295 lines
11 KiB
Python
295 lines
11 KiB
Python
"""Evidence screenshots of a web page with a timestamp overlay.

Captures a URL (for example a cookie policy) either as one full-page
screenshot or as a sequence of overlapping viewport-sized slices. A banner
carrying the capture timestamp and the URL is burned into the page before
capturing, for evidentiary purposes. The consent banner is accepted first
(otherwise its overlay would cover the table) and accordions are expanded.

Returns the PNG data together with capture metadata.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
from playwright.async_api import async_playwright
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# User-Agent string handed to every Playwright browser context created in this
# module; it identifies the headless browser as desktop Chrome 120 on macOS.
_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
|
|
|
|
# JavaScript function evaluated in the page with one argument
# ``meta = {url, ts, check_id}``. It prepends an evidence banner to <body>.
#
# Two fixes compared with the previous version:
#   * The style string was written as adjacent string literals without ``+``
#     (Python-style implicit concatenation). That is a JavaScript syntax error,
#     so the evaluation always threw and the banner was never injected.
#   * URL and scan id were concatenated into markup. Query strings such as
#     ``?a=1&copy=2`` were entity-decoded and angle brackets were interpreted
#     as HTML. The values are now assigned as plain text nodes.
_TIMESTAMP_BANNER_JS = r"""(meta) => {
  // Evidence banner at the top of the page: an extra header element is added
  // without modifying the original content, so the evidentiary value of the
  // captured page is preserved.
  const bar = document.createElement('div');
  bar.setAttribute('id', '__bp_evidence_bar__');
  bar.style.cssText =
    'position:relative;background:#0f172a;color:#fff;' +
    'padding:10px 18px;font:600 13px/1.4 -apple-system,' +
    'BlinkMacSystemFont,sans-serif;border-bottom:3px solid #0ea5e9;' +
    'z-index:2147483647;box-sizing:border-box;width:100%';

  // First row: product label plus the captured URL, as literal text.
  const title = document.createElement('div');
  title.textContent = 'BreakPilot Compliance-Audit · ' + meta.url;

  // Second row: capture time and scan id, as literal text.
  const detail = document.createElement('div');
  detail.style.cssText =
    'font-weight:400;opacity:0.8;font-size:11px;margin-top:2px';
  detail.textContent =
    'Erfasst: ' + meta.ts + ' UTC · Scan-ID ' + meta.check_id;

  bar.appendChild(title);
  bar.appendChild(detail);
  document.body.insertBefore(bar, document.body.firstChild);
}"""
|
|
|
|
|
|
# JavaScript function evaluated in the page (no arguments). It expands
# collapsed content so that rows nested in accordions are visible in the
# screenshot, and returns the number of elements it acted on.
#
# Fixes compared with the previous version:
#   * A closed <details> was opened via ``open = true`` and then its <summary>
#     (matched by the same selector, later in document order) was clicked,
#     which toggled it shut again. Summaries of already-open <details> were
#     clicked too, closing them. <details> are now opened by property only and
#     anything inside a ``details > summary`` is never clicked.
#   * Class-matched buttons that already report ``aria-expanded="true"`` are
#     skipped instead of being collapsed by the click.
_EXPAND_ALL_JS = r"""() => {
  let n = 0;

  // Native disclosure widgets: set the property directly, no click needed.
  for (const d of document.querySelectorAll('details:not([open])')) {
    try { d.open = true; n++; } catch (e) {}
  }

  // Script-driven accordions: click everything that looks expandable.
  const triggers = document.querySelectorAll(
    '[aria-expanded="false"], ' +
    'button[class*="expand" i], button[class*="accordion" i], ' +
    'button[class*="toggle" i], [role="button"][class*="expand" i]'
  );
  for (const t of triggers) {
    // A click inside a <summary> would toggle the <details> opened above.
    if (t.closest('details > summary')) continue;
    // Already expanded: a click would collapse it.
    if (t.getAttribute('aria-expanded') === 'true') continue;
    try { t.click(); n++; } catch (e) {}
  }
  return n;
}"""
|
|
|
|
|
|
# JavaScript function evaluated in the page (no arguments). It clicks the
# first control whose text is one of a fixed set of "accept" labels, descending
# into open shadow roots, and returns true when a click was issued.
#
# Fixes compared with the previous version:
#   * Label text is whitespace-normalised, so a label broken across lines or
#     padded with non-breaking spaces still matches.
#   * Rendered controls are preferred. Previously a hidden element with a
#     matching label that came first in document order ended the search before
#     the visible consent button was reached. If no rendered match exists, the
#     old behaviour (click the first match of any kind) is kept as a fallback.
_DISMISS_BANNER_JS = r"""() => {
  // Click an "Accept all" / "Alle akzeptieren" / "Akzeptieren" style button so
  // the consent overlay disappears and the page content can be captured.
  // Accepting rather than rejecting is deliberate: rejecting often LEAVES the
  // banner in place ("you must consent to continue"), blocking the screenshot.
  const ACCEPT_LABELS = new Set([
    'alle akzeptieren', 'akzeptieren', 'accept all', 'agree',
    'einverstanden', 'i agree', 'zustimmen', 'ok',
    'alle cookies akzeptieren', 'alle annehmen',
  ]);

  function isRendered(el) {
    return typeof el.getClientRects === 'function' &&
      el.getClientRects().length > 0;
  }

  function walk(root, renderedOnly) {
    if (!root || !root.querySelectorAll) return false;
    const candidates = root.querySelectorAll(
      'button, [role="button"], a, [class*="accept" i]'
    );
    for (const el of candidates) {
      const label = (el.textContent || '')
        .replace(/\s+/g, ' ').trim().toLowerCase();
      if (!ACCEPT_LABELS.has(label)) continue;
      if (renderedOnly && !isRendered(el)) continue;
      try { el.click(); return true; } catch (e) {}
    }
    // Consent managers frequently live inside shadow DOM.
    for (const el of root.querySelectorAll('*')) {
      if (el.shadowRoot && walk(el.shadowRoot, renderedOnly)) return true;
    }
    return false;
  }

  // Pass 1: rendered controls only. Pass 2: any matching control.
  return walk(document, true) || walk(document, false);
}"""
|
|
|
|
|
|
async def capture_page_overlapping_slices(
    url: str,
    check_id: str = "",
    viewport_h: int = 1024,
    overlap_px: int = 200,
    timeout_ms: int = 30000,
    max_slices: int = 40,
) -> dict:
    """Capture `url` as a sequence of overlapping viewport screenshots.

    The page is loaded in headless Chromium, the consent banner is accepted,
    expandable sections are opened and the evidence banner is injected. The
    page is then scrolled in steps of ``viewport_h - overlap_px`` pixels and
    one viewport screenshot is taken per step, so consecutive slices share
    ``overlap_px`` pixels. Content at a slice boundary therefore appears in
    two images; de-duplication is left to the caller.

    Args:
        url: Page to capture.
        check_id: Scan identifier shown in the evidence banner ("—" if empty).
        viewport_h: Viewport height in pixels (the width is fixed at 1280).
        overlap_px: Pixels shared between consecutive slices. Negative values
            are treated as 0 so that slices can never leave a gap.
        timeout_ms: Timeout for navigation and for each screenshot.
        max_slices: Upper bound on the number of slices taken.

    Returns:
        dict with the keys
            slices: list of dicts ``{idx, ts, top_y, bot_y, sha256, png_b64,
                png_size}``. ``top_y``/``bot_y`` are the document offsets the
                browser actually scrolled to (the browser clamps the last
                scroll position at the page bottom). ``sha256`` holds the
                first 16 hex characters of the SHA-256 digest of the PNG
                bytes. ``ts`` is the UTC time right after the screenshot.
            total_height_px, width_px: document scroll size measured once
                before slicing.
            url: final URL after redirects.
            accepted_banner: whether a consent button was clicked.
            expanded: count reported by the expand script.

    Raises:
        Whatever Playwright raises for navigation or screenshot failures; the
        three in-page scripts are best-effort and only logged on failure.
    """
    import base64 as _b64
    import hashlib

    out: dict = {
        "slices": [],
        "total_height_px": 0,
        "width_px": 0,
        "url": url,
        "accepted_banner": False,
        "expanded": 0,
    }
    # Each slice repeats `overlap_px` pixels from the bottom of the previous
    # one. Clamping keeps the step within 1..viewport_h for positive viewports.
    step = max(1, viewport_h - max(0, overlap_px))

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        # Nested try/finally: the browser is closed even if creating or
        # closing the context fails.
        try:
            ctx = await browser.new_context(
                user_agent=_USER_AGENT,
                viewport={"width": 1280, "height": viewport_h},
                locale="de-DE", timezone_id="Europe/Berlin",
            )
            try:
                page = await ctx.new_page()
                await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                await page.wait_for_timeout(3500)

                # Best-effort steps: a failure must not abort the capture, but
                # it is logged instead of being silently discarded.
                try:
                    out["accepted_banner"] = bool(await page.evaluate(_DISMISS_BANNER_JS))
                    if out["accepted_banner"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("dismiss-banner failed: %s", e)
                try:
                    out["expanded"] = int(await page.evaluate(_EXPAND_ALL_JS) or 0)
                    if out["expanded"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("expand-all failed: %s", e)
                out["url"] = page.url

                # Inject the timestamp banner so the FIRST slice carries it.
                # The timestamp is taken here, not before navigation, so it
                # reflects the moment of capture.
                ts_base = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
                try:
                    await page.evaluate(_TIMESTAMP_BANNER_JS, {
                        "url": out["url"], "ts": ts_base, "check_id": check_id or "—",
                    })
                except Exception as e:
                    logger.warning("timestamp-inject failed: %s", e)
                await page.wait_for_timeout(500)

                # Measure total scroll height + width.
                dims = await page.evaluate(
                    "() => ({w: document.documentElement.scrollWidth, "
                    "h: document.documentElement.scrollHeight})"
                )
                total_h = int(dims.get("h") or 0)
                out["total_height_px"] = total_h
                out["width_px"] = int(dims.get("w") or 0)

                reached_bottom = total_h <= 0
                scroll_y = 0
                idx = 0
                while scroll_y < total_h and idx < max_slices:
                    # Scroll to position, then give lazy content time to render.
                    await page.evaluate("(y) => window.scrollTo(0, y)", scroll_y)
                    await page.wait_for_timeout(400)
                    # The browser clamps the offset at the page bottom, so
                    # record where the viewport really is, not the request.
                    top_y = int(
                        await page.evaluate("() => Math.round(window.scrollY)") or 0
                    )
                    png = await page.screenshot(
                        full_page=False, type="png", timeout=timeout_ms,
                    )
                    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
                    out["slices"].append({
                        "idx": idx,
                        "ts": ts,
                        "top_y": top_y,
                        "bot_y": min(top_y + viewport_h, total_h),
                        # Truncated digest (16 hex chars), kept for callers.
                        "sha256": hashlib.sha256(png).hexdigest()[:16],
                        "png_b64": _b64.b64encode(png).decode("ascii"),
                        "png_size": len(png),
                    })
                    idx += 1
                    # Once a slice covers the page bottom, further scroll
                    # targets clamp to the same position and only duplicate it.
                    if scroll_y + viewport_h >= total_h:
                        reached_bottom = True
                        break
                    scroll_y += step

                if not reached_bottom:
                    logger.warning(
                        "Overlapping screenshots: stopped after %d slices before "
                        "reaching page bottom for %s (total_h=%d)",
                        len(out["slices"]), out["url"], total_h,
                    )
                logger.info(
                    "Overlapping screenshots: %d slices for %s (total_h=%d, "
                    "viewport=%d, overlap=%d)",
                    len(out["slices"]), out["url"], total_h, viewport_h, overlap_px,
                )
            finally:
                await ctx.close()
        finally:
            await browser.close()
    return out
|
|
|
|
|
|
async def capture_page_evidence(
    url: str,
    check_id: str = "",
    timeout_ms: int = 30000,
    max_height_px: int = 30000,
) -> dict:
    """Capture a full-page screenshot of `url` with an embedded timestamp.

    The page is loaded in headless Chromium (1280x1024 viewport), the consent
    banner is accepted, expandable sections are opened, the evidence banner is
    injected and one full-page PNG is taken.

    Args:
        url: Page to capture.
        check_id: Scan identifier shown in the evidence banner ("—" if empty).
        timeout_ms: Timeout for navigation and for the screenshot.
        max_height_px: Maximum image height. A taller page is clipped to this
            height from the top, so that `height_px` matches the PNG.

    Returns:
        dict with the keys
            png_bytes: PNG image data.
            captured_at: UTC timestamp, formatted ``YYYY-MM-DD HH:MM:SS``,
                taken immediately before the banner is injected.
            url: final URL after redirects.
            accepted_banner: whether a consent button was clicked.
            expanded: count reported by the expand script.
            height_px, width_px: document scroll size, height capped at
                `max_height_px`.
            truncated: True when the page was taller than `max_height_px`
                and the image was clipped.

    Raises:
        Whatever Playwright raises for navigation or screenshot failures; the
        three in-page scripts are best-effort and only logged on failure.
    """
    out: dict = {
        "png_bytes": b"",
        "captured_at": "",
        "url": url,
        "accepted_banner": False,
        "expanded": 0,
        "height_px": 0,
        "width_px": 0,
        "truncated": False,
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        # Nested try/finally: the browser is closed even if creating or
        # closing the context fails.
        try:
            ctx = await browser.new_context(
                user_agent=_USER_AGENT,
                viewport={"width": 1280, "height": 1024},
                locale="de-DE",
                timezone_id="Europe/Berlin",
            )
            try:
                page = await ctx.new_page()
                await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                await page.wait_for_timeout(3500)

                # Step 1: dismiss banner (accept) so we see the policy content.
                try:
                    out["accepted_banner"] = bool(await page.evaluate(_DISMISS_BANNER_JS))
                    if out["accepted_banner"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("dismiss-banner failed: %s", e)

                # Step 2: expand accordions / details.
                try:
                    out["expanded"] = int(await page.evaluate(_EXPAND_ALL_JS) or 0)
                    if out["expanded"]:
                        await page.wait_for_timeout(1500)
                except Exception as e:
                    logger.debug("expand-all failed: %s", e)
                out["url"] = page.url

                # Step 3: inject timestamp banner for evidence. The timestamp
                # is taken here, not before navigation, so it reflects the
                # moment of capture.
                ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
                try:
                    await page.evaluate(_TIMESTAMP_BANNER_JS, {
                        "url": out["url"], "ts": ts, "check_id": check_id or "—",
                    })
                except Exception as e:
                    # Warning, not debug: without the banner the image carries
                    # no embedded timestamp.
                    logger.warning("timestamp-inject failed: %s", e)

                # Step 4: capture full-page screenshot, capped in height.
                dims = await page.evaluate(
                    "() => ({w: document.documentElement.scrollWidth, "
                    "h: document.documentElement.scrollHeight})"
                )
                full_w = int(dims.get("w") or 0)
                full_h = int(dims.get("h") or 0)
                out["width_px"] = full_w
                out["height_px"] = min(full_h, max_height_px)

                shot_kwargs: dict = {
                    "full_page": True, "type": "png", "timeout": timeout_ms,
                }
                if 0 < max_height_px < full_h and full_w > 0:
                    # Clip the full-page capture so the PNG really has the
                    # reported (capped) height.
                    shot_kwargs["clip"] = {
                        "x": 0, "y": 0, "width": full_w, "height": max_height_px,
                    }
                    out["truncated"] = True
                    logger.warning(
                        "Evidence screenshot truncated: page height %d exceeds cap %d (%s)",
                        full_h, max_height_px, out["url"],
                    )
                png = await page.screenshot(**shot_kwargs)
                out["png_bytes"] = png
                out["captured_at"] = ts
                logger.info(
                    "Evidence screenshot captured: %s (%dx%d, %d bytes, accepted=%s, expanded=%d)",
                    out["url"], out["width_px"], out["height_px"],
                    len(png), out["accepted_banner"], out["expanded"],
                )
            finally:
                await ctx.close()
        finally:
            await browser.close()
    return out
|