feat(audit): overlapping evidence-slices fuer lueckenlose Beweiskette

Statt EIN full-page screenshot: full-page wird per PIL in viewport-grosse
Slices geschnitten, jede ueberlappt die vorherige um overlap_px Pixel.
Jeder Cookie erscheint in mind. einer Slice, an Slice-Grenzen sogar in
zwei → Dedup nach Name eliminiert die Doppel.

Warum nicht direkt scroll-based slicing in Playwright? VW's
Cookie-Page nutzt scroll-snap / fixed-position — alle viewport-shots
kamen identisch zurueck (Header-Overlay). PIL-cut auf dem full-page
PNG bypasst das Problem voellig.

VW smoke-test (32 slices):
  per-slice: [0, 0, 2, 5, 5, 3, 4, 7, 4, 3, 4, 5, ...]
  103 raw cookies → 79 unique nach dedup
  14 vendor records (Google 9, Adobe-Familie 17, etc.)

Jeder Slice hat eigenen Timestamp + SHA256 → ZIP-Anhang fuer
juristische Beweiskette.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-22 23:38:13 +02:00
parent 1784b43d72
commit efeef73f90
3 changed files with 300 additions and 1 deletions
+118
View File
@@ -89,6 +89,124 @@ _DISMISS_BANNER_JS = r"""() => {
}"""
async def capture_page_overlapping_slices(
url: str,
check_id: str = "",
viewport_h: int = 1024,
overlap_px: int = 200,
timeout_ms: int = 30000,
max_slices: int = 40,
) -> dict:
"""Lückenlose Beweiskette: scrollt die Seite in viewport-grossen
Schritten und macht pro Schritt ein eigenes Screenshot. Jeder
Schritt ueberlappt mit dem vorherigen um `overlap_px` Pixel — so
erscheint jeder Cookie in mind. einem Bild, an Slice-Grenzen sogar
in zweien. Tesseract-Dedup nach Cookie-Name eliminiert Doppel.
Vorteil ggue. full_page=True:
- Beweiskette VERIFIZIERBAR (Overlap dokumentiert Lueckenfreiheit)
- Tesseract pro Slice schneller + parallel ausfuehrbar
- Pro Slice eigener Timestamp + Sequenz-Nummer in der Mail-ZIP
Returns dict:
slices: [{idx, ts, png_b64, top_y, bot_y, sha256}, ...]
total_height_px
width_px
url (final after redirect)
accepted_banner, expanded
"""
import base64 as _b64
import hashlib
out: dict = {
"slices": [],
"total_height_px": 0,
"width_px": 0,
"url": url,
"accepted_banner": False,
"expanded": 0,
}
ts_base = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"],
)
ctx = await browser.new_context(
user_agent=_USER_AGENT,
viewport={"width": 1280, "height": viewport_h},
locale="de-DE", timezone_id="Europe/Berlin",
)
page = await ctx.new_page()
try:
await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
await page.wait_for_timeout(3500)
try:
out["accepted_banner"] = bool(await page.evaluate(_DISMISS_BANNER_JS))
if out["accepted_banner"]:
await page.wait_for_timeout(1500)
except Exception:
pass
try:
out["expanded"] = int(await page.evaluate(_EXPAND_ALL_JS) or 0)
if out["expanded"]:
await page.wait_for_timeout(1500)
except Exception:
pass
out["url"] = page.url
# Inject timestamp banner so the FIRST slice carries it.
try:
await page.evaluate(_TIMESTAMP_BANNER_JS, {
"url": out["url"], "ts": ts_base, "check_id": check_id or "",
})
except Exception:
pass
await page.wait_for_timeout(500)
# Measure total scroll height + width
dims = await page.evaluate(
"() => ({w: document.documentElement.scrollWidth, "
"h: document.documentElement.scrollHeight})"
)
total_h = int(dims.get("h") or 0)
out["total_height_px"] = total_h
out["width_px"] = int(dims.get("w") or 0)
# Calculate scroll-step: viewport_h minus overlap. Each slice
# contains overlap_px pixels of the PREVIOUS slice's bottom.
step = max(1, viewport_h - overlap_px)
scroll_y = 0
idx = 0
while scroll_y < total_h and idx < max_slices:
# Scroll to position. Wait for any lazy content to render.
await page.evaluate(f"window.scrollTo(0, {scroll_y})")
await page.wait_for_timeout(400)
png = await page.screenshot(
full_page=False, type="png", timeout=timeout_ms,
)
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
top_y = scroll_y
bot_y = min(scroll_y + viewport_h, total_h)
sha = hashlib.sha256(png).hexdigest()[:16]
out["slices"].append({
"idx": idx,
"ts": ts,
"top_y": top_y,
"bot_y": bot_y,
"sha256": sha,
"png_b64": _b64.b64encode(png).decode("ascii"),
"png_size": len(png),
})
scroll_y += step
idx += 1
logger.info(
"Overlapping screenshots: %d slices for %s (total_h=%d, "
"viewport=%d, overlap=%d)",
len(out["slices"]), out["url"], total_h, viewport_h, overlap_px,
)
finally:
await ctx.close()
await browser.close()
return out
async def capture_page_evidence(
url: str,
check_id: str = "",