breakpilot-compliance/backend-compliance/compliance/services/legacy_url_discovery.py

"""Legacy-URL-Discovery — systematische Suche nach veralteten DSE-/
Impressum-/Cookie-/AGB-URLs auf einer Domain.

Strategie aus 5 Quellen (A.4 läuft außerhalb dieses Moduls):
  A.1 Sitemap-Parser  — /sitemap.xml, /sitemap_index.xml, /sitemap-de.xml,
                         /sitemap-legal.xml, /sitemap-pages.xml
  A.2 Wayback Machine — archive.org/wayback/available für jeden Kandidaten,
                         der heute 200 liefert; URLs, die vor ≥18 Monaten
                         archiviert wurden = Legacy-Verdacht
  A.3 Slug-Permutations — bekannte Slug-Familie × Locale/Brand-Parameter
  A.4 Banner-Modal-Links — Playwright öffnet Cookie-Einstellungen-Modal
                            und sammelt alle Links (Plan A.4 wird via
                            consent-tester aufgerufen, hier nur Schema)
  A.5 Wayback-CDX — alle je archivierten URLs der Domain (über
                     legacy_url_cdx.cdx_enumerate); fängt Orphans, die
                     nie im Slug-Raster standen

Output: Liste von Legacy-Kandidaten mit Status, last_modified, found_via,
recommended_action ("Redirect 301", "Offline nehmen", "Belassen — aktuell").

Best-Effort: jede Quelle catched eigene Exceptions — eine ausgefallene
Sitemap blockiert nicht Wayback.
"""

from __future__ import annotations

import asyncio
import html
import logging
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

import httpx

from compliance.services.legacy_url_cdx import cdx_enumerate

logger = logging.getLogger(__name__)


# Canonical German/English slug family per document type. Each of these
# paths is looked up on every origin — even when discovery already has
# it — as an independent verification.
#
# The slugs serve two purposes in this module: _build_slug_candidates()
# turns them into concrete URLs to probe, and _filter_legal_urls() uses
# them as case-insensitive substring keywords.
_SLUG_FAMILY: dict[str, tuple[str, ...]] = {
    # Privacy policy ("DSE").
    "dse": (
        "datenschutz", "datenschutzerklaerung", "datenschutzerklärung",
        "datenschutzhinweise", "datenschutzhinweis",
        "privacy", "privacy-policy", "privacy-notice",
        "datenschutz-online", "dse",
    ),
    # Imprint / legal notice.
    "impressum": (
        "impressum", "imprint", "legal-notice", "site-notice",
        "anbieterkennzeichnung",
    ),
    # Cookie policy.
    "cookie": (
        "cookie-richtlinie", "cookies", "cookie-policy",
        "cookie-erklaerung", "cookieerklaerung", "cookie-hinweise",
    ),
    # General terms and conditions ("AGB").
    "agb": (
        "agb", "allgemeine-geschaeftsbedingungen",
        "geschaeftsbedingungen", "terms-and-conditions",
        "general-terms-of-business",
    ),
    # Terms of use.
    "nutzungsbedingungen": (
        "nutzungsbedingungen", "terms-of-use", "terms-of-service",
        "nutzungsordnung",
    ),
    # Withdrawal / cancellation notice.
    "widerruf": (
        "widerruf", "widerrufsbelehrung",
        "widerrufsbelehrung-privatkunden", "cancellation",
    ),
}


# Path prefixes placed between the origin and each slug when building
# probe URLs; the empty string yields the root-level path.
_LANG_PREFIXES = ("", "/de", "/de_de", "/de-de", "/germany", "/en")
# Query-string suffixes appended to each slug URL. The empty entry is
# skipped by _build_slug_candidates(), which always adds the bare URL itself.
_BRAND_PARAMS = ("", "?brand=", "?lang=de", "?locale=de_DE")


# Minimum snapshot age (in months) at which a reachable URL that is not
# linked from the footer is reported as suspected legacy by _recommend().
_LEGACY_AGE_MONTHS_THRESHOLD = 18  # older = suspected legacy


# Raw content of every <loc> element. DOTALL because generators may put
# the URL on its own line between the tags.
_SITEMAP_LOC_RE = re.compile(r"<loc>(.*?)</loc>", re.DOTALL)
_CDATA_OPEN = "<![CDATA["
_CDATA_CLOSE = "]]>"


def _extract_sitemap_locs(xml_text: str) -> list[str]:
    """Return the decoded value of every ``<loc>`` element in *xml_text*.

    Sitemap values are XML text, so ``&amp;`` and friends must be
    unescaped before the value can be used as a URL. CDATA-wrapped values
    are literal and are only unwrapped. Empty values and values that still
    contain markup are dropped.
    """
    locs: list[str] = []
    for raw in _SITEMAP_LOC_RE.findall(xml_text):
        value = raw.strip()
        if value.startswith(_CDATA_OPEN) and value.endswith(_CDATA_CLOSE):
            value = value[len(_CDATA_OPEN):-len(_CDATA_CLOSE)].strip()
        else:
            value = html.unescape(value)
        if value and "<" not in value:
            locs.append(value)
    return locs


async def _get_sitemap_locs(client, url: str) -> list[str]:
    """Fetch *url* and return its ``<loc>`` values; empty list on any failure."""
    try:
        resp = await client.get(url)
        if resp.status_code != 200:
            return []
        return _extract_sitemap_locs(resp.text)
    except Exception as exc:
        # Best-effort: one broken sitemap must not abort the others.
        logger.debug("sitemap request failed for %s: %s", url, exc)
        return []


async def _fetch_sitemap_urls(origin: str) -> list[str]:
    """A.1 — collect page URLs from sitemap.xml and its common variants.

    Each well-known sitemap location below *origin* is fetched. ``<loc>``
    entries ending in ``.xml`` are treated as nested sitemaps and fetched
    once (one level deep only); all other entries are returned as page URLs.

    Args:
        origin: Scheme and host without trailing slash, e.g.
            ``https://example.com``.

    Returns:
        De-duplicated page URLs in no particular order. Empty when no
        sitemap could be read; this function never raises.
    """
    candidates = (
        f"{origin}/sitemap.xml",
        f"{origin}/sitemap_index.xml",
        f"{origin}/sitemap-de.xml",
        f"{origin}/sitemap-legal.xml",
        f"{origin}/sitemap-pages.xml",
    )
    page_urls: set[str] = set()
    # Several index files often reference the same child sitemap; fetch
    # each child only once.
    fetched_nested: set[str] = set()
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as c:
            for url in candidates:
                for loc in await _get_sitemap_locs(c, url):
                    if not loc.endswith(".xml"):
                        page_urls.add(loc)
                        continue
                    if loc in fetched_nested:
                        continue
                    fetched_nested.add(loc)
                    page_urls.update(
                        nested for nested in await _get_sitemap_locs(c, loc)
                        if not nested.endswith(".xml")
                    )
    except Exception as e:
        logger.info("sitemap fetch failed for %s: %s", origin, e)
    return list(page_urls)


async def _wayback_check(url: str) -> dict | None:
    """A.2 — ask the Wayback Machine availability API about *url*.

    The request pins ``timestamp=20200101`` and reads the ``closest``
    snapshot from the response, so the result is the snapshot the API
    considers closest to that date — not necessarily the newest one.

    Returns:
        ``{"snapshot_url", "timestamp", "status"}`` for an available
        snapshot; ``None`` on a non-200 answer, when no snapshot is
        available, or when the request or parsing fails.
    """
    query = {"url": url, "timestamp": "20200101"}
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://archive.org/wayback/available",
                params=query,
            )
            if response.status_code == 200:
                payload = response.json() or {}
                snapshots = payload.get("archived_snapshots") or {}
                closest = snapshots.get("closest") or {}
                if closest.get("available"):
                    return {
                        "snapshot_url": closest.get("url"),
                        "timestamp": closest.get("timestamp", ""),
                        "status": closest.get("status"),
                    }
    except Exception:
        # Best-effort source: any failure counts as "no snapshot known".
        return None
    return None


def _months_since(timestamp_yyyymmdd: str) -> int | None:
    """Wayback-Timestamp Format: YYYYMMDDHHMMSS."""
    if not timestamp_yyyymmdd or len(timestamp_yyyymmdd) < 6:
        return None
    try:
        snap = datetime.strptime(timestamp_yyyymmdd[:6], "%Y%m").replace(
            tzinfo=timezone.utc,
        )
        now = datetime.now(timezone.utc)
        delta = (now.year - snap.year) * 12 + (now.month - snap.month)
        return max(0, delta)
    except Exception:
        return None


async def _probe_alive(url: str) -> tuple[int, str]:
    """Probe *url* once without following redirects.

    A HEAD request is sent first; if the server answers 405, the probe is
    repeated with GET.

    Returns:
        ``(status_code, last_modified)`` where *last_modified* is the
        ``Last-Modified`` response header or ``""`` when absent. Any
        exception yields ``(0, "")``.
    """
    client_options = {"timeout": 6.0, "follow_redirects": False}
    try:
        async with httpx.AsyncClient(**client_options) as session:
            response = await session.head(url)
            if response.status_code == 405:
                response = await session.get(url)
            last_modified = response.headers.get("last-modified", "")
            return response.status_code, last_modified
    except Exception:
        # Status 0 is the module-wide marker for "not reachable".
        return 0, ""


def _build_slug_candidates(origin: str) -> list[str]:
    """A.3 — build every slug × language-prefix × query-suffix URL for *origin*.

    For each slug in ``_SLUG_FAMILY`` and each prefix in ``_LANG_PREFIXES``
    the bare URL is produced, plus one variant per non-empty entry of
    ``_BRAND_PARAMS``.

    Returns:
        De-duplicated candidate URLs in no particular order.
    """

    def _collapse_slashes(raw: str) -> str:
        # Collapse doubled slashes, then restore the "//" after the scheme
        # that the collapse removed.
        collapsed = raw.replace("//", "/")
        collapsed = collapsed.replace("https:/", "https://")
        return collapsed.replace("http:/", "http://")

    query_suffixes = [suffix for suffix in _BRAND_PARAMS if suffix]
    urls: set[str] = set()
    for family in _SLUG_FAMILY.values():
        for prefix in _LANG_PREFIXES:
            for slug in family:
                plain = _collapse_slashes(f"{origin}{prefix}/{slug}")
                urls.add(plain)
                urls.update(plain + suffix for suffix in query_suffixes)
    return list(urls)


def _filter_legal_urls(urls: list[str]) -> list[str]:
    """Keep the compliance-relevant URLs, preserving input order.

    A URL is kept when its lower-cased text contains any slug from
    ``_SLUG_FAMILY`` as a substring.

    NOTE(review): the match runs over the whole URL (host and query
    included), so short slugs such as "dse" or "agb" can also hit
    unrelated URLs — confirm whether that recall/precision trade-off is
    intended.
    """
    needles = [
        slug.lower()
        for family in _SLUG_FAMILY.values()
        for slug in family
    ]

    def _mentions_legal_slug(url: str) -> bool:
        lowered = url.lower()
        return any(needle in lowered for needle in needles)

    return [url for url in urls if _mentions_legal_slug(url)]


def _recommend(status: int, age_months: int | None,
                in_sitemap: bool, in_footer: bool) -> str:
    if status == 404 or status == 410:
        return "URL veraltet (404/410) — Backlinks prüfen, ggf. 301 setzen"
    if status == 0:
        return "Nicht erreichbar — manuell prüfen"
    if status in (301, 302, 303, 307, 308):
        return "Bereits redirected — behalten"
    if status == 200:
        if age_months is None:
            return "Erreichbar, kein Wayback-Stand — Inhalt manuell prüfen"
        if age_months >= _LEGACY_AGE_MONTHS_THRESHOLD and not in_footer:
            return (
                f"Legacy-Verdacht ({age_months} Monate altes Wayback, "
                "nicht im Footer verlinkt) — 301-Redirect auf aktuelle "
                "Version setzen ODER offline nehmen"
            )
        if age_months >= 36 and in_footer:
            return (
                f"Reachable + im Footer, aber Wayback {age_months} Monate "
                "alt — manuell prüfen ob Inhalt noch aktuell"
            )
        return "Aktuell, kein Handlungsbedarf"
    return f"HTTP {status} — manuell prüfen"


# Upper bounds that keep one discovery run small and predictable.
_MAX_ORIGINS = 2
_MAX_PROBE_CANDIDATES = 60
_MAX_CDX_CANDIDATES = 100


def _footer_key(url: str) -> str:
    """Normalize *url* for footer matching: no fragment, query or trailing slash."""
    return url.split("#")[0].split("?")[0].rstrip("/")


def _prioritize_candidates(sitemap_hits: set[str], slug_guesses: set[str],
                           limit: int) -> list[str]:
    """Return at most *limit* candidates in a deterministic priority order.

    Sitemap hits come first because the site itself lists them. Guessed
    slug URLs follow: bare paths before query variants, shallow paths
    before deep ones.
    """
    confirmed = sorted(sitemap_hits)
    guessed = sorted(
        slug_guesses - sitemap_hits,
        key=lambda u: ("?" in u, u.count("/"), u),
    )
    return (confirmed + guessed)[:limit]


async def discover_legacy_urls(state: dict) -> dict:
    """Run the in-module discovery sources and consolidate their findings.

    Sources: sitemap parsing (A.1), slug permutations (A.3), the Wayback
    availability check for reachable candidates (A.2) and the Wayback CDX
    enumeration (A.5).

    Args:
        state: Mapping whose optional ``"doc_entries"`` list holds dicts
            with a ``"url"`` key. Those URLs define the origins to scan
            and are treated as the currently linked ("footer") documents.

    Returns:
        ``{"candidates": [], "skipped": "no_origin"}`` when no absolute
        URL is present. Otherwise a dict with ``candidates`` (the filtered,
        sorted result rows), ``probed``, ``filtered_kept``,
        ``cdx_candidates`` and ``origins``.
    """
    doc_entries = state.get("doc_entries") or []
    origins: set[str] = set()
    footer_urls: set[str] = set()
    for entry in doc_entries:
        url = (entry.get("url") or "").strip()
        if url and "://" in url:
            parsed = urlparse(url)
            origins.add(f"{parsed.scheme}://{parsed.netloc}")
            footer_urls.add(_footer_key(url))
    if not origins:
        return {"candidates": [], "skipped": "no_origin"}

    # Sorted so that the same input always scans the same origins.
    probe_origins = sorted(origins)[:_MAX_ORIGINS]

    # A.1 sitemap + A.3 slug permutations, kept apart so that the cap
    # below can favour URLs the site actually lists.
    sitemap_hits: set[str] = set()
    slug_guesses: set[str] = set()
    for origin in probe_origins:
        sitemap_urls = await _fetch_sitemap_urls(origin)
        sitemap_hits.update(_filter_legal_urls(sitemap_urls))
        slug_guesses.update(_build_slug_candidates(origin))
    known = sitemap_hits | slug_guesses

    # A.5 Wayback CDX: every URL of the domain that was ever archived —
    # catches orphans that never matched the slug grid. The CDX timestamp
    # doubles as the legacy age, so no second Wayback call is needed.
    cdx_pairs: list[tuple[str, str]] = []
    for origin in probe_origins:
        cdx_pairs.extend(await cdx_enumerate(origin))
    cdx_legal_urls = set(_filter_legal_urls([u for u, _ in cdx_pairs]))
    cdx_legal = [
        (u, ts) for (u, ts) in cdx_pairs
        if u in cdx_legal_urls and u not in known
    ][:_MAX_CDX_CANDIDATES]

    # Cap to avoid an explosion of probes; the order is deterministic.
    cands = _prioritize_candidates(
        sitemap_hits, slug_guesses, _MAX_PROBE_CANDIDATES,
    )

    def _row(url: str, status: int, last_modified: str,
             snapshot: str, timestamp: str, via: str) -> dict:
        """Build one result row shared by both check variants."""
        age = _months_since(timestamp)
        in_footer = _footer_key(url) in footer_urls
        return {
            "url": url,
            "status": status,
            "last_modified": last_modified,
            "wayback_snapshot": snapshot,
            "wayback_timestamp": timestamp,
            "age_months": age,
            "in_footer": in_footer,
            "recommendation": _recommend(
                status, age, url in sitemap_hits, in_footer,
            ),
            "via": via,
        }

    # Sitemap/slug candidates: liveness first, Wayback only for live URLs.
    async def _check(url: str) -> dict:
        status, lm = await _probe_alive(url)
        wb = await _wayback_check(url) if status == 200 else None
        return _row(
            url, status, lm,
            wb.get("snapshot_url") if wb else "",
            wb.get("timestamp", "") if wb else "",
            "sitemap/slug",
        )

    # CDX candidates: liveness only (the archive state is already known).
    async def _check_cdx(url: str, ts: str) -> dict:
        status, lm = await _probe_alive(url)
        return _row(url, status, lm, "", ts, "wayback-cdx")

    gathered = await asyncio.gather(
        *[_check(u) for u in cands],
        *[_check_cdx(u, ts) for u, ts in cdx_legal],
        return_exceptions=True,
    )
    results: list[dict] = []
    for item in gathered:
        if isinstance(item, dict):
            results.append(item)
        else:
            # Best-effort: a failed check is skipped, but leaves a trace.
            logger.debug("legacy-url check failed: %r", item)

    # Keep only the rows worth reporting.
    interesting: list[dict] = []
    for r in results:
        if r["status"] == 0:
            continue  # unreachable, nothing to report
        # 404/410 only matter when the footer links there (broken link).
        if r["status"] in (404, 410) and not r["in_footer"]:
            continue
        # 200 + linked in footer + recent snapshot means all is fine.
        if (r["status"] == 200 and r["in_footer"]
                and r["age_months"] is not None
                and r["age_months"] < _LEGACY_AGE_MONTHS_THRESHOLD):
            continue
        interesting.append(r)
    # Suspected legacy first, then dead URLs, then the rest; within each
    # group the oldest snapshots come first.
    interesting.sort(
        key=lambda r: (
            0 if "Legacy-Verdacht" in r["recommendation"] else
            1 if "veraltet" in r["recommendation"] else 2,
            -(r.get("age_months") or 0),
        ),
    )
    return {
        "candidates": interesting,
        "probed": len(results),
        "filtered_kept": len(interesting),
        "cdx_candidates": len(cdx_legal),
        "origins": sorted(origins),
    }