feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:

A — _find_dsi_links durchdringt jetzt rekursiv das Shadow-DOM (Web-Components
    wie Usercentrics/Mercedes); versteckte (display:none) Links werden erfasst
    + als Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
    auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
    Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
    versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
    archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
    Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
    URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.

Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-09 12:33:34 +02:00
parent b1357915ae
commit 08c08fcba2
7 changed files with 487 additions and 41 deletions
@@ -54,6 +54,17 @@ async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
docs = payload.get("documents", [])
cmp_payloads = payload.get("cmp_payloads") or []
cmp_cookie_text = payload.get("cmp_cookie_text") or ""
coverage = payload.get("coverage") or {}
if coverage:
logger.info(
"Crawl-Coverage %s: %d Interaktions-Runden, "
"%d Elemente expandiert, %d Shadow-Links, "
"%d versteckte Links",
url, coverage.get("interaction_rounds", 0),
coverage.get("elements_expanded", 0),
coverage.get("shadow_links_found", 0),
coverage.get("hidden_links_found", 0),
)
# D — wenn der consent-tester HTML-Tabellen aus dem DOM
# extrahiert hat, in die cmp_payloads als "generic_table"
# einschleusen damit das Backend sie via cookies_table_parser
@@ -0,0 +1,89 @@
"""Wayback-CDX-Enumeration — listet ALLE je archivierten URLs einer Domain.
Anders als die per-Slug-Wayback-Pruefung (legacy_url_discovery._wayback_check)
holen wir hier die KOMPLETTE History-Liste der Domain ueber die CDX-API. So
finden wir Orphan-/Legacy-Seiten, die nie im Slug-Raster standen und heute
nicht mehr verlinkt sind, aber per Direkt-URL noch erreichbar — genau der Fall
"www.xyz.com/datenschutz existierte mal, wurde nie entfernt".
Best-effort: jede Exception → leere Liste, blockiert die uebrige Discovery nie.
"""
from __future__ import annotations
import logging
from urllib.parse import urlparse
import httpx
logger = logging.getLogger(__name__)

# Wayback CDX search endpoint. Queried over HTTPS so the enumerated domain is
# not sent in clear text and the returned URL list cannot be altered in
# transit before it is handed on to the rest of the discovery.
_CDX_API = "https://web.archive.org/cdx/search/cdx"

# Non-HTML assets that are of no interest when looking for legal content.
_ASSET_SUFFIXES = (
    ".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".woff", ".woff2", ".ttf", ".eot", ".webp", ".mp4", ".webm",
    ".zip", ".map", ".json", ".xml", ".rss", ".txt", ".csv",
)
def _parse_cdx_rows(rows: list) -> list[tuple[str, str]]:
    """Turn a CDX JSON payload into ``(url, timestamp)`` pairs.

    The payload is an array of arrays whose first row is the header
    (``["original", "timestamp", "statuscode"]``) and is therefore skipped.
    Rows pointing at static assets are dropped, and URLs that differ only by
    their fragment are collapsed onto the first row seen for them.

    Anything that is not a list with at least one data row yields ``[]``.
    """
    if not isinstance(rows, list) or len(rows) < 2:
        return []

    # Keyed by the fragment-less URL; insertion order keeps the input order.
    first_seen: dict[str, tuple[str, str]] = {}
    for entry in rows[1:]:
        if not (isinstance(entry, (list, tuple)) and entry):
            continue
        original = str(entry[0]).strip()
        if not original:
            continue

        # The asset check looks at the path only: no query, no fragment.
        bare_path = original.lower().partition("?")[0].partition("#")[0]
        if bare_path.endswith(_ASSET_SUFFIXES):
            continue

        dedup_key = original.partition("#")[0]
        if dedup_key not in first_seen:
            stamp = str(entry[1]).strip() if len(entry) > 1 else ""
            first_seen[dedup_key] = (original, stamp)

    return list(first_seen.values())
async def cdx_enumerate(origin: str, limit: int = 2000) -> list[tuple[str, str]]:
    """Return ``(url, wayback_timestamp)`` for the archived URLs of a domain.

    The CDX query uses ``collapse=urlkey`` (one row per URL) and
    ``filter=statuscode:200`` (successfully archived captures only). The
    timestamp is returned alongside each URL so callers can reuse it as the
    legacy age without a second Wayback request per URL.

    Best-effort: an empty host, a non-200 answer or any exception while
    talking to the API results in an empty list.
    """
    host = urlparse(origin).netloc
    if not host:
        # No scheme means urlparse yields no netloc; strip prefixes by hand.
        host = origin.replace("https://", "").replace("http://", "")
    if not host:
        return []

    query = {
        "url": f"{host}*",
        "output": "json",
        "collapse": "urlkey",
        "fl": "original,timestamp,statuscode",
        "filter": "statuscode:200",
        "limit": str(limit),
    }
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(_CDX_API, params=query)
            if response.status_code != 200:
                return []
            rows = response.json() or []
    except Exception as exc:
        logger.info("CDX enumerate failed for %s: %s", host, exc)
        return []
    return _parse_cdx_rows(rows)
@@ -29,6 +29,8 @@ from urllib.parse import urljoin, urlparse
import httpx
from compliance.services.legacy_url_cdx import cdx_enumerate
logger = logging.getLogger(__name__)
@@ -239,13 +241,24 @@ async def discover_legacy_urls(state: dict) -> dict:
return {"candidates": [], "skipped": "no_origin"}
candidates: set[str] = set()
# A.1 Sitemap
# A.1 Sitemap + A.3 Slug-Permutations
for o in list(origins)[:2]:
sitemap_urls = await _fetch_sitemap_urls(o)
candidates.update(_filter_legal_urls(sitemap_urls))
# A.3 Slug-Permutations
candidates.update(_build_slug_candidates(o))
# A.5 Wayback-CDX: alle je archivierten URLs der Domain → faengt
# Orphans, die nie im Slug-Raster standen. (url, cdx_timestamp); der
# timestamp dient als Legacy-Alter (kein zweiter Wayback-Call noetig).
cdx_pairs: list[tuple[str, str]] = []
for o in list(origins)[:2]:
cdx_pairs.extend(await cdx_enumerate(o))
cdx_legal_urls = set(_filter_legal_urls([u for u, _ in cdx_pairs]))
cdx_legal = [
(u, ts) for (u, ts) in cdx_pairs
if u in cdx_legal_urls and u not in candidates
][:100]
# Cap to avoid explosion
cands = list(candidates)[:60]
@@ -264,12 +277,32 @@ async def discover_legacy_urls(state: dict) -> dict:
"age_months": age,
"in_footer": in_footer,
"recommendation": _recommend(status, age, False, in_footer),
"via": "sitemap/slug",
}
results = await asyncio.gather(
*[_check(u) for u in cands], return_exceptions=True,
# CDX-Kandidaten: nur Liveness pruefen (Archiv-Stand kennen wir schon).
async def _check_cdx(url: str, ts: str) -> dict:
status, lm = await _probe_alive(url)
age = _months_since(ts)
in_footer = url.split("#")[0].split("?")[0] in footer_urls
return {
"url": url,
"status": status,
"last_modified": lm,
"wayback_snapshot": "",
"wayback_timestamp": ts,
"age_months": age,
"in_footer": in_footer,
"recommendation": _recommend(status, age, False, in_footer),
"via": "wayback-cdx",
}
gathered = await asyncio.gather(
*[_check(u) for u in cands],
*[_check_cdx(u, ts) for u, ts in cdx_legal],
return_exceptions=True,
)
results = [r for r in results if isinstance(r, dict)]
results = [r for r in gathered if isinstance(r, dict)]
# Filter: only show interesting ones (≥200 reachable + legacy-relevant)
interesting: list[dict] = []
@@ -297,5 +330,6 @@ async def discover_legacy_urls(state: dict) -> dict:
"candidates": interesting,
"probed": len(results),
"filtered_kept": len(interesting),
"cdx_candidates": len(cdx_legal),
"origins": list(origins),
}
@@ -0,0 +1,110 @@
"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C)."""
from __future__ import annotations
import asyncio
from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate
def _run(coro):
    """Run the coroutine *coro* to completion and return its result.

    Uses ``asyncio.run`` so every call gets a fresh event loop. The previous
    ``asyncio.get_event_loop().run_until_complete(...)`` is deprecated when no
    loop is running and raises ``RuntimeError`` once the current loop has been
    cleared (for example by an earlier ``asyncio.run`` in the same test
    session), which made these tests depend on execution order.
    """
    return asyncio.run(coro)
# ── Pure: _parse_cdx_rows ───────────────────────────────────────────
def test_parse_cdx_rows_drops_assets_and_dedups():
    """Asset rows are dropped and a repeated URL keeps only its first row."""
    header = ["original", "timestamp", "statuscode"]
    captures = [
        ("http://x.com/datenschutz", "20190101"),
        ("http://x.com/datenschutz", "20200101"),  # duplicate URL
        ("http://x.com/style.css", "20200101"),  # asset
        ("http://x.com/app.js", "20200101"),  # asset
        ("http://x.com/impressum", "20180101"),
    ]
    rows = [header] + [[url, stamp, "200"] for url, stamp in captures]

    parsed = _parse_cdx_rows(rows)

    assert [url for url, _ in parsed] == [
        "http://x.com/datenschutz",
        "http://x.com/impressum",
    ]
    # The timestamp of the FIRST row for a URL is the one that survives.
    assert parsed[0] == ("http://x.com/datenschutz", "20190101")
def test_parse_cdx_rows_empty_or_header_only():
    """Empty, header-only and non-list payloads all parse to an empty list."""
    degenerate_payloads = (
        [],
        [["original", "timestamp"]],
        "garbage",
    )
    for payload in degenerate_payloads:
        assert _parse_cdx_rows(payload) == []  # type: ignore[arg-type]
# ── cdx_enumerate mit gemocktem httpx ───────────────────────────────
class _FakeResp:
    """Minimal stand-in for an HTTP response: a status code plus canned JSON."""

    def __init__(self, status_code, json_data):
        self._payload = json_data
        self.status_code = status_code

    def json(self):
        """Return the JSON data handed to the constructor, unchanged."""
        return self._payload
class _FakeClient:
    """Async-context-manager stub whose ``get`` always returns one response."""

    def __init__(self, resp):
        self._canned = resp

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        # False: never suppress an exception raised inside the ``async with``.
        return False

    async def get(self, *args, **kwargs):
        """Ignore all request arguments and hand back the canned response."""
        return self._canned
def _patch_httpx(monkeypatch, resp):
    """Replace ``httpx.AsyncClient`` in the CDX module with a fake client.

    Whatever arguments the code under test passes to the client constructor
    are ignored; every constructed client answers ``get`` with *resp*.
    """

    def _client_factory(*args, **kwargs):
        return _FakeClient(resp)

    target = "compliance.services.legacy_url_cdx.httpx.AsyncClient"
    monkeypatch.setattr(target, _client_factory)
def test_cdx_enumerate_returns_parsed_pairs(monkeypatch):
    """A 200 answer is parsed: the page URL is kept, the asset URL is dropped."""
    page_url = "http://x.com/datenschutz"
    asset_url = "http://x.com/logo.png"
    payload = [
        ["original", "timestamp", "statuscode"],
        [page_url, "20190101120000", "200"],
        [asset_url, "20200101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(200, payload))

    pairs = _run(cdx_enumerate("https://x.com"))
    found = {url for url, _ in pairs}

    assert page_url in found
    assert asset_url not in found  # asset dropped
def test_cdx_enumerate_non_200_returns_empty(monkeypatch):
    """A non-200 answer from the CDX API yields an empty result."""
    unavailable = _FakeResp(503, [])
    _patch_httpx(monkeypatch, unavailable)

    result = _run(cdx_enumerate("https://x.com"))

    assert result == []
def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch):
    """An empty origin has no host to query and yields an empty result."""
    ok_but_empty = _FakeResp(200, [])
    _patch_httpx(monkeypatch, ok_but_empty)

    result = _run(cdx_enumerate(""))

    assert result == []
# ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ────────
def test_cdx_orphan_survives_legal_filter():
    """The actual orphan case: a CDX-only legal page passes the legal filter.

    CDX reports /datenschutz (no longer linked anywhere); the legal filter
    keeps it, while an ordinary product page is filtered out.
    """
    from compliance.services.legacy_url_discovery import _filter_legal_urls

    legal_page = "http://x.com/datenschutz"
    product_page = "http://x.com/products/widget"
    cdx_payload = [
        ["original", "timestamp", "statuscode"],
        [legal_page, "20190101", "200"],
        [product_page, "20200101", "200"],
    ]

    archived_urls = [url for url, _ in _parse_cdx_rows(cdx_payload)]
    kept = _filter_legal_urls(archived_urls)

    assert legal_page in kept
    assert product_page not in kept