# breakpilot-compliance/backend-compliance/compliance/services/legacy_url_cdx.py

"""Wayback-CDX-Enumeration — listet ALLE je archivierten URLs einer Domain.

Anders als die per-Slug-Wayback-Pruefung (legacy_url_discovery._wayback_check)
holen wir hier die KOMPLETTE History-Liste der Domain ueber die CDX-API. So
finden wir Orphan-/Legacy-Seiten, die nie im Slug-Raster standen und heute
nicht mehr verlinkt sind, aber per Direkt-URL noch erreichbar — genau der Fall
"www.xyz.com/datenschutz existierte mal, wurde nie entfernt".

Best-effort: jede Exception → leere Liste, blockiert die uebrige Discovery nie.
"""

from __future__ import annotations

import logging
from urllib.parse import urlparse

import httpx

logger = logging.getLogger(__name__)

# Wayback Machine CDX search endpoint. HTTPS is used so that the queried
# domain and the returned URL list are neither exposed to nor alterable by
# parties on the network path.
_CDX_API = "https://web.archive.org/cdx/search/cdx"

# Suffixes of non-HTML assets that are irrelevant for legal content.
# All entries are lower-case and kept in a tuple so they can be passed
# directly to ``str.endswith``.
_ASSET_SUFFIXES = (
    ".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".woff", ".woff2", ".ttf", ".eot", ".webp", ".mp4", ".webm",
    ".zip", ".map", ".json", ".xml", ".rss", ".txt", ".csv",
)


def _parse_cdx_rows(rows: list) -> list[tuple[str, str]]:
    """Convert a CDX JSON payload into ``(url, timestamp)`` pairs.

    The payload is expected to be a list of rows whose first row is the
    header (``["original", "timestamp", "statuscode"]``) and is skipped.
    Anything that is not a list with at least two rows yields ``[]``.

    Per row:
      * rows that are not a non-empty list/tuple are ignored;
      * the URL is ``str(row[0])`` with surrounding whitespace removed,
        blank URLs are ignored;
      * URLs whose lower-cased path (query and fragment cut off) ends in
        one of ``_ASSET_SUFFIXES`` are dropped;
      * URLs are de-duplicated on their fragment-less form; the first
        occurrence wins and keeps its original spelling (fragment included);
      * the timestamp is ``str(row[1])`` stripped, or ``""`` when the row
        has no second column.

    Returns the surviving pairs in their original order.
    """
    if not (isinstance(rows, list) and len(rows) >= 2):
        return []

    # dict keeps insertion order, so it doubles as "seen" set and result list.
    pairs_by_key: dict[str, tuple[str, str]] = {}
    for entry in rows[1:]:  # index 0 is the header row
        if not (isinstance(entry, (list, tuple)) and entry):
            continue

        original = str(entry[0]).strip()
        if not original:
            continue

        # Compare only the path part against the asset suffixes.
        bare_path = original.lower().partition("?")[0].partition("#")[0]
        if bare_path.endswith(_ASSET_SUFFIXES):
            continue

        # Fragments do not identify a separate document.
        dedup_key = original.partition("#")[0]
        if dedup_key in pairs_by_key:
            continue

        captured_at = str(entry[1]).strip() if len(entry) > 1 else ""
        pairs_by_key[dedup_key] = (original, captured_at)

    return list(pairs_by_key.values())


async def cdx_enumerate(origin: str, limit: int = 2000) -> list[tuple[str, str]]:
    """Return ``(url, wayback_timestamp)`` for the archived URLs of a domain.

    Sends one prefix query (``<netloc>*``) to the Wayback CDX API.
    ``collapse=urlkey`` asks for one row per URL and
    ``filter=statuscode:200`` restricts the result to successfully archived
    captures. The timestamp is returned alongside each URL so callers can
    reuse it without a second Wayback request per URL.

    Best-effort: a blank origin, a transport error, a non-200 response or an
    unparsable body all result in an empty list (failures are logged at INFO
    level); nothing is raised for these cases.

    Args:
        origin: Site origin or bare host, e.g. ``"https://example.com"`` or
            ``"example.com"``. Surrounding whitespace is ignored.
        limit: Maximum number of rows requested from the CDX API.

    Returns:
        List of ``(url, timestamp)`` pairs as produced by
        ``_parse_cdx_rows``; empty on any failure.
    """
    # Padding would otherwise end up inside the CDX "url" parameter.
    origin = origin.strip()
    # Bare hosts have no netloc for urlparse; fall back to stripping the scheme.
    netloc = urlparse(origin).netloc or origin.replace("https://", "").replace(
        "http://", "",
    )
    if not netloc:
        return []
    params = {
        "url": f"{netloc}*",
        "output": "json",
        "collapse": "urlkey",
        "fl": "original,timestamp,statuscode",
        "filter": "statuscode:200",
        "limit": str(limit),
    }
    try:
        async with httpx.AsyncClient(timeout=15.0) as c:
            r = await c.get(_CDX_API, params=params)
            if r.status_code != 200:
                # Make rate limits / outages visible instead of looking
                # like "no archived URLs".
                logger.info(
                    "CDX enumerate for %s returned HTTP %s",
                    netloc, r.status_code,
                )
                return []
            rows = r.json() or []
    except Exception as e:
        # Deliberately broad: this lookup must never block the remaining
        # discovery steps.
        logger.info("CDX enumerate failed for %s: %s", netloc, e)
        return []
    return _parse_cdx_rows(rows)