feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans

Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten: A — _find_dsi_links pierct jetzt Shadow-DOM (Web-Components wie Usercentrics/Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als Coverage-Metadatum geflaggt. B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren; Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/versteckte Links) → Response + Backend-Log. C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar. Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py (Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 12:33:34 +02:00
parent b1357915ae
commit 08c08fcba2
7 changed files with 487 additions and 41 deletions
@@ -0,0 +1,89 @@
+"""Wayback-CDX-Enumeration — listet ALLE je archivierten URLs einer Domain.
+
+Anders als die per-Slug-Wayback-Pruefung (legacy_url_discovery._wayback_check)
+holen wir hier die KOMPLETTE History-Liste der Domain ueber die CDX-API. So
+finden wir Orphan-/Legacy-Seiten, die nie im Slug-Raster standen und heute
+nicht mehr verlinkt sind, aber per Direkt-URL noch erreichbar — genau der Fall
+"www.xyz.com/datenschutz existierte mal, wurde nie entfernt".
+
+Best-effort: jede Exception → leere Liste, blockiert die uebrige Discovery nie.
+"""
+
+from __future__ import annotations
+
+import logging
+from urllib.parse import urlparse
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# HTTPS so the query and the returned URL list cannot be altered in transit.
+_CDX_API = "https://web.archive.org/cdx/search/cdx"
+# Suffixes of non-HTML assets that are irrelevant for legal content.
+_ASSET_SUFFIXES = (
+    ".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
+    ".woff", ".woff2", ".ttf", ".eot", ".webp", ".mp4", ".webm",
+    ".zip", ".map", ".json", ".xml", ".rss", ".txt", ".csv",
+)
+
+
+def _parse_cdx_rows(rows: list) -> list[tuple[str, str]]:
+    """Convert a CDX JSON payload into de-duplicated (url, timestamp) pairs.
+
+    The payload is a list of rows; rows[0] is the column header and is
+    skipped. Rows that are empty or not a list/tuple, rows with a blank
+    URL, and URLs ending in one of `_ASSET_SUFFIXES` are dropped. Only the
+    first row per URL (fragment ignored) is kept, in input order. A row
+    without a second column gets an empty timestamp.
+    """
+    if not (isinstance(rows, list) and len(rows) >= 2):
+        return []
+    first_hit: dict[str, tuple[str, str]] = {}
+    for entry in rows[1:]:
+        if not (isinstance(entry, (list, tuple)) and entry):
+            continue
+        address = str(entry[0]).strip()
+        if not address:
+            continue
+        # The suffix test ignores case, query string and fragment.
+        bare = address.lower().partition("?")[0].partition("#")[0]
+        if bare.endswith(_ASSET_SUFFIXES):
+            continue
+        stamp = "" if len(entry) < 2 else str(entry[1]).strip()
+        # setdefault leaves an earlier row for the same key untouched.
+        dedup_key = address.partition("#")[0]
+        first_hit.setdefault(dedup_key, (address, stamp))
+    return list(first_hit.values())
+
+
+async def cdx_enumerate(origin: str, limit: int = 2000) -> list[tuple[str, str]]:
+    """Return (url, wayback_timestamp) pairs for the archived URLs of a domain.
+
+    The CDX query sets `collapse=urlkey` and `filter=statuscode:200`; the
+    JSON answer is handed to `_parse_cdx_rows`. A non-200 answer or any
+    raised exception yields an empty list instead of propagating.
+    """
+    host = urlparse(origin).netloc
+    if not host:  # empty netloc: fall back to stripping a literal scheme prefix
+        host = origin.replace("https://", "").replace("http://", "")
+    if not host:
+        return []
+    query = {
+        "url": f"{host}*",
+        "output": "json",
+        "collapse": "urlkey",
+        "fl": "original,timestamp,statuscode",
+        "filter": "statuscode:200",
+        "limit": str(limit),
+    }
+    # Best-effort: every failure inside the try degrades to an empty result.
+    try:
+        async with httpx.AsyncClient(timeout=15.0) as client:
+            response = await client.get(_CDX_API, params=query)
+            succeeded = response.status_code == 200
+            payload = (response.json() or []) if succeeded else None
+    except Exception as exc:
+        logger.info("CDX enumerate failed for %s: %s", host, exc)
+        return []
+    return [] if payload is None else _parse_cdx_rows(payload)
@@ -29,6 +29,8 @@ from urllib.parse import urljoin, urlparse

 import httpx

+from compliance.services.legacy_url_cdx import cdx_enumerate
+
 logger = logging.getLogger(__name__)


@@ -239,13 +241,24 @@ async def discover_legacy_urls(state: dict) -> dict:
        return {"candidates": [], "skipped": "no_origin"}

    candidates: set[str] = set()
-    # A.1 Sitemap
+    # A.1 Sitemap + A.3 Slug-Permutations
    for o in list(origins)[:2]:
        sitemap_urls = await _fetch_sitemap_urls(o)
        candidates.update(_filter_legal_urls(sitemap_urls))
-        # A.3 Slug-Permutations
        candidates.update(_build_slug_candidates(o))

+    # A.5 Wayback-CDX: alle je archivierten URLs der Domain → faengt
+    # Orphans, die nie im Slug-Raster standen. (url, cdx_timestamp); der
+    # timestamp dient als Legacy-Alter (kein zweiter Wayback-Call noetig).
+    cdx_pairs: list[tuple[str, str]] = []
+    for o in list(origins)[:2]:
+        cdx_pairs.extend(await cdx_enumerate(o))
+    cdx_legal_urls = set(_filter_legal_urls([u for u, _ in cdx_pairs]))
+    cdx_legal = [
+        (u, ts) for (u, ts) in cdx_pairs
+        if u in cdx_legal_urls and u not in candidates
+    ][:100]
+
    # Cap to avoid explosion
    cands = list(candidates)[:60]

@@ -264,12 +277,32 @@ async def discover_legacy_urls(state: dict) -> dict:
            "age_months": age,
            "in_footer": in_footer,
            "recommendation": _recommend(status, age, False, in_footer),
+            "via": "sitemap/slug",
        }

-    results = await asyncio.gather(
-        *[_check(u) for u in cands], return_exceptions=True,
+    # CDX candidates: only probe liveness; the archive timestamp is already known.
+    async def _check_cdx(url: str, ts: str) -> dict:
+        status, lm = await _probe_alive(url)
+        age = _months_since(ts)  # NOTE(review): ts comes from the CDX listing — confirm _months_since accepts that format
+        in_footer = url.split("#")[0].split("?")[0] in footer_urls  # compared without fragment and query string
+        return {
+            "url": url,
+            "status": status,
+            "last_modified": lm,
+            "wayback_snapshot": "",  # left empty for CDX candidates
+            "wayback_timestamp": ts,
+            "age_months": age,
+            "in_footer": in_footer,
+            "recommendation": _recommend(status, age, False, in_footer),
+            "via": "wayback-cdx",  # marks the record as coming from the CDX listing
+        }
+
+    gathered = await asyncio.gather(
+        *[_check(u) for u in cands],
+        *[_check_cdx(u, ts) for u, ts in cdx_legal],
+        return_exceptions=True,
    )
-    results = [r for r in results if isinstance(r, dict)]
+    results = [r for r in gathered if isinstance(r, dict)]

    # Filter: only show interesting ones (≥200 reachable + legacy-relevant)
    interesting: list[dict] = []
@@ -297,5 +330,6 @@ async def discover_legacy_urls(state: dict) -> dict:
        "candidates": interesting,
        "probed": len(results),
        "filtered_kept": len(interesting),
+        "cdx_candidates": len(cdx_legal),
        "origins": list(origins),
    }