08c08fcba2
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:
A — _find_dsi_links durchdringt jetzt das Shadow-DOM (Web-Components wie
Usercentrics/Mercedes) rekursiv; versteckte (display:none) Links werden
erfasst und als Coverage-Metadatum markiert.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.
Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
111 lines
3.6 KiB
Python
111 lines
3.6 KiB
Python
"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
|
|
from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate
|
|
|
|
|
|
def _run(coro):
    """Run *coro* to completion on a private event loop and return its result.

    A dedicated loop is created and closed for every call instead of using
    ``asyncio.get_event_loop()``: obtaining a loop that way when none is set
    relies on implicit loop creation, which is deprecated and raises
    ``RuntimeError`` on recent Python versions. The global "current loop"
    is deliberately left untouched so other test modules are unaffected.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        # Always release the loop's resources, even if the coroutine raised.
        loop.close()
|
|
|
|
|
|
# ── Pure: _parse_cdx_rows ───────────────────────────────────────────
|
|
|
|
|
|
def test_parse_cdx_rows_drops_assets_and_dedups():
    """Asset captures and repeated URLs must not appear in the parsed output."""
    header = ["original", "timestamp", "statuscode"]
    captures = [
        ["http://x.com/datenschutz", "20190101", "200"],
        ["http://x.com/datenschutz", "20200101", "200"],  # duplicate URL
        ["http://x.com/style.css", "20200101", "200"],  # asset
        ["http://x.com/app.js", "20200101", "200"],  # asset
        ["http://x.com/impressum", "20180101", "200"],
    ]

    parsed = _parse_cdx_rows([header, *captures])

    assert [url for url, _ in parsed] == [
        "http://x.com/datenschutz",
        "http://x.com/impressum",
    ]
    # The timestamp of the FIRST (oldest) snapshot of a URL is the one kept.
    assert parsed[0] == ("http://x.com/datenschutz", "20190101")
|
|
|
|
|
|
def test_parse_cdx_rows_empty_or_header_only():
    """Empty, header-only and non-list inputs all parse to an empty list."""
    header_only = [["original", "timestamp"]]

    assert _parse_cdx_rows([]) == []
    assert _parse_cdx_rows(header_only) == []
    # A plain string is not a row list at all; it must be tolerated, not raise.
    assert _parse_cdx_rows("garbage") == []  # type: ignore[arg-type]
|
|
|
|
|
|
# ── cdx_enumerate mit gemocktem httpx ───────────────────────────────
|
|
|
|
|
|
class _FakeResp:
    """Minimal stand-in for an HTTP response object.

    Exposes only what these tests need: a ``status_code`` attribute and a
    ``json()`` method returning a canned payload.
    """

    def __init__(self, status_code, json_data):
        self._payload = json_data
        self.status_code = status_code

    def json(self):
        """Return the payload handed to the constructor, unchanged."""
        return self._payload
|
|
|
|
|
|
class _FakeClient:
    """Async-context-manager stub that answers every ``get`` with one response."""

    def __init__(self, resp):
        self._resp = resp

    async def __aenter__(self):
        # ``async with`` binds the client itself, as a real client would.
        return self

    async def __aexit__(self, *exc_info):
        # Returning False means exceptions raised inside the block propagate.
        return False

    async def get(self, *args, **kwargs):
        """Ignore all request arguments and return the canned response."""
        return self._resp
|
|
|
|
|
|
def _patch_httpx(monkeypatch, resp):
    """Swap ``httpx.AsyncClient`` in ``legacy_url_cdx`` for a stub factory.

    Whatever arguments are passed to ``AsyncClient(...)`` are ignored; each
    construction yields a new ``_FakeClient`` that serves *resp*.
    """

    def _make_client(*args, **kwargs):
        return _FakeClient(resp)

    target = "compliance.services.legacy_url_cdx.httpx.AsyncClient"
    monkeypatch.setattr(target, _make_client)
|
|
|
|
|
|
def test_cdx_enumerate_returns_parsed_pairs(monkeypatch):
    """A 200 response yields the page URL while the asset capture is dropped."""
    payload = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101120000", "200"],
        ["http://x.com/logo.png", "20200101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(200, payload))

    pairs = _run(cdx_enumerate("https://x.com"))
    found = [url for url, _ in pairs]

    assert "http://x.com/datenschutz" in found
    assert "http://x.com/logo.png" not in found  # asset was dropped
|
|
|
|
|
|
def test_cdx_enumerate_non_200_returns_empty(monkeypatch):
    """A non-200 CDX response must produce an empty result.

    The fake body deliberately contains a row that would be returned if it
    were parsed. With an empty body the assertion would hold even if the
    status code were never checked, so the test would prove nothing.
    """
    rows = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(503, rows))

    assert _run(cdx_enumerate("https://x.com")) == []
|
|
|
|
|
|
def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch):
    """An input without a host must produce an empty result.

    The stubbed response is a successful one with a parseable row, so the
    assertion only holds if the missing host itself leads to the empty
    result; an empty stub body would pass regardless of that handling.
    """
    rows = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(200, rows))

    assert _run(cdx_enumerate("")) == []
|
|
|
|
|
|
# ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ────────
|
|
|
|
|
|
def test_cdx_orphan_survives_legal_filter():
    """Cover the orphan case across CDX parsing and the legal filter.

    CDX reports /datenschutz (no longer linked anywhere); the legal filter
    keeps that page while the product page is discarded.
    """
    from compliance.services.legacy_url_discovery import _filter_legal_urls

    cdx_rows = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101", "200"],
        ["http://x.com/products/widget", "20200101", "200"],
    ]

    candidate_urls = [url for url, _ in _parse_cdx_rows(cdx_rows)]
    kept = _filter_legal_urls(candidate_urls)

    assert "http://x.com/datenschutz" in kept
    assert "http://x.com/products/widget" not in kept
|