feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans

Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten: A — _find_dsi_links durchdringt jetzt Shadow-DOM (Web-Components wie Usercentrics/Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als Coverage-Metadatum geflaggt. B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren; Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/versteckte Links) → Response + Backend-Log. C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar. Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py (Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 12:33:34 +02:00
parent b1357915ae
commit 08c08fcba2
7 changed files with 487 additions and 41 deletions
@@ -0,0 +1,110 @@
+"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C)."""
+
+from __future__ import annotations
+
+import asyncio
+
+from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate
+
+
+def _run(coro):
+    """Run the coroutine *coro* to completion and return its result.
+
+    Uses ``asyncio.run`` so every call gets a fresh event loop that is closed
+    afterwards. Calling ``asyncio.get_event_loop()`` when no loop is current
+    is deprecated and raises ``RuntimeError`` on recent Python versions.
+    """
+    return asyncio.run(coro)
+
+
+# ── Pure: _parse_cdx_rows ───────────────────────────────────────────
+
+
+def test_parse_cdx_rows_drops_assets_and_dedups():
+    """Asset rows and repeated URLs are dropped; the first-listed timestamp wins."""
+    header = ["original", "timestamp", "statuscode"]
+    cdx_rows = [
+        header,
+        ["http://x.com/datenschutz", "20190101", "200"],
+        ["http://x.com/datenschutz", "20200101", "200"],  # duplicate URL
+        ["http://x.com/style.css", "20200101", "200"],  # asset
+        ["http://x.com/app.js", "20200101", "200"],  # asset
+        ["http://x.com/impressum", "20180101", "200"],
+    ]
+    parsed = _parse_cdx_rows(cdx_rows)
+    found = [url for url, _ts in parsed]
+    assert found == ["http://x.com/datenschutz", "http://x.com/impressum"]
+    # The timestamp of the first-listed (here also the oldest) snapshot is kept.
+    assert parsed[0] == ("http://x.com/datenschutz", "20190101")
+
+
+def test_parse_cdx_rows_empty_or_header_only():
+    """Empty input, a lone header row and a non-list value all parse to ``[]``."""
+    degenerate_inputs = ([], [["original", "timestamp"]], "garbage")
+    for rows in degenerate_inputs:
+        assert _parse_cdx_rows(rows) == []  # type: ignore[arg-type]
+
+
+# ── cdx_enumerate mit gemocktem httpx ───────────────────────────────
+
+
+class _FakeResp:
+    """Minimal stand-in for an HTTP response: a status code plus a canned JSON payload."""
+
+    def __init__(self, status_code, json_data):
+        self.status_code, self._json = status_code, json_data
+
+    def json(self):
+        """Return the payload handed to the constructor, unchanged."""
+        return self._json
+
+
+class _FakeClient:
+    """Async-context-manager test double whose ``get`` always returns one fixed response."""
+
+    def __init__(self, resp):
+        self._resp = resp
+
+    async def __aenter__(self):
+        # Entering the context yields the fake itself.
+        return self
+
+    async def __aexit__(self, *exc_info):
+        # A falsy result means exceptions raised inside the block are not suppressed.
+        return False
+
+    async def get(self, *args, **kwargs):
+        # Any arguments are accepted and ignored.
+        return self._resp
+
+
+def _patch_httpx(monkeypatch, resp):
+    """Swap ``httpx.AsyncClient`` (as reached via the module under test) for a fake.
+
+    Every construction of the client, whatever its arguments, produces a
+    ``_FakeClient`` that serves *resp*.
+    """
+
+    def _client_factory(*args, **kwargs):
+        return _FakeClient(resp)
+
+    target = "compliance.services.legacy_url_cdx.httpx.AsyncClient"
+    monkeypatch.setattr(target, _client_factory)
+
+
+def test_cdx_enumerate_returns_parsed_pairs(monkeypatch):
+    """A 200 response is parsed into pairs, with asset URLs filtered out."""
+    payload = [
+        ["original", "timestamp", "statuscode"],
+        ["http://x.com/datenschutz", "20190101120000", "200"],
+        ["http://x.com/logo.png", "20200101", "200"],
+    ]
+    _patch_httpx(monkeypatch, _FakeResp(200, payload))
+    pairs = _run(cdx_enumerate("https://x.com"))
+    found = [url for url, _ts in pairs]
+    assert "http://x.com/datenschutz" in found
+    assert "http://x.com/logo.png" not in found  # asset dropped
+
+
+def test_cdx_enumerate_non_200_returns_empty(monkeypatch):
+    """A 503 from the mocked client yields an empty result."""
+    unavailable = _FakeResp(503, [])
+    _patch_httpx(monkeypatch, unavailable)
+    result = _run(cdx_enumerate("https://x.com"))
+    assert result == []
+
+
+def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch):
+    """An empty URL string yields an empty result."""
+    ok_but_empty = _FakeResp(200, [])
+    _patch_httpx(monkeypatch, ok_but_empty)
+    result = _run(cdx_enumerate(""))
+    assert result == []
+
+
+# ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ────────
+
+
+def test_cdx_orphan_survives_legal_filter():
+    """The actual orphan case: a /datenschutz URL found via CDX (no longer
+    linked) is kept by the legal filter, while product pages are dropped."""
+    from compliance.services.legacy_url_discovery import _filter_legal_urls
+    cdx_rows = [
+        ["original", "timestamp", "statuscode"],
+        ["http://x.com/datenschutz", "20190101", "200"],
+        ["http://x.com/products/widget", "20200101", "200"],
+    ]
+    candidate_urls = [url for url, _ts in _parse_cdx_rows(cdx_rows)]
+    kept = _filter_legal_urls(candidate_urls)
+    assert "http://x.com/datenschutz" in kept
+    assert "http://x.com/products/widget" not in kept