"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C).""" from __future__ import annotations import asyncio from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate def _run(coro): return asyncio.get_event_loop().run_until_complete(coro) # ── Pure: _parse_cdx_rows ─────────────────────────────────────────── def test_parse_cdx_rows_drops_assets_and_dedups(): rows = [ ["original", "timestamp", "statuscode"], # Header ["http://x.com/datenschutz", "20190101", "200"], ["http://x.com/datenschutz", "20200101", "200"], # Duplikat ["http://x.com/style.css", "20200101", "200"], # Asset ["http://x.com/app.js", "20200101", "200"], # Asset ["http://x.com/impressum", "20180101", "200"], ] out = _parse_cdx_rows(rows) urls = [u for u, _ in out] assert urls == ["http://x.com/datenschutz", "http://x.com/impressum"] # timestamp des ERSTEN (ältesten) Snapshots bleibt erhalten assert out[0] == ("http://x.com/datenschutz", "20190101") def test_parse_cdx_rows_empty_or_header_only(): assert _parse_cdx_rows([]) == [] assert _parse_cdx_rows([["original", "timestamp"]]) == [] assert _parse_cdx_rows("garbage") == [] # type: ignore[arg-type] # ── cdx_enumerate mit gemocktem httpx ─────────────────────────────── class _FakeResp: def __init__(self, status_code, json_data): self.status_code = status_code self._json = json_data def json(self): return self._json class _FakeClient: def __init__(self, resp): self._resp = resp async def __aenter__(self): return self async def __aexit__(self, *a): return False async def get(self, *a, **kw): return self._resp def _patch_httpx(monkeypatch, resp): monkeypatch.setattr( "compliance.services.legacy_url_cdx.httpx.AsyncClient", lambda *a, **kw: _FakeClient(resp), ) def test_cdx_enumerate_returns_parsed_pairs(monkeypatch): rows = [ ["original", "timestamp", "statuscode"], ["http://x.com/datenschutz", "20190101120000", "200"], ["http://x.com/logo.png", "20200101", "200"], ] _patch_httpx(monkeypatch, _FakeResp(200, rows)) out = _run(cdx_enumerate("https://x.com")) urls = [u for u, _ in out] assert "http://x.com/datenschutz" in urls assert "http://x.com/logo.png" not in urls # Asset gedroppt def test_cdx_enumerate_non_200_returns_empty(monkeypatch): _patch_httpx(monkeypatch, _FakeResp(503, [])) assert _run(cdx_enumerate("https://x.com")) == [] def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch): _patch_httpx(monkeypatch, _FakeResp(200, [])) assert _run(cdx_enumerate("")) == [] # ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ──────── def test_cdx_orphan_survives_legal_filter(): """Der eigentliche Orphan-Fall: CDX findet /datenschutz (nicht mehr verlinkt), der Legal-Filter behält sie, Produktseiten fallen raus.""" from compliance.services.legacy_url_discovery import _filter_legal_urls rows = [ ["original", "timestamp", "statuscode"], ["http://x.com/datenschutz", "20190101", "200"], ["http://x.com/products/widget", "20200101", "200"], ] pairs = _parse_cdx_rows(rows) legal = _filter_legal_urls([u for u, _ in pairs]) assert "http://x.com/datenschutz" in legal assert "http://x.com/products/widget" not in legal