# breakpilot-compliance/backend-compliance/compliance/services/url_slug_drift_check.py

"""B16 — Footer label vs. URL slug drift detector.

Detects: common footer labels (e.g. "Cookie-Richtlinie", "AGB",
"Datenschutzerklärung") exist as a bookmark/SEO expectation, but the
corresponding standard slug answers with 404 on the site.
In reality the document is served under a different slug.

GT anchor: Elli URL-STRUCTURE-001:
  Footer label "Cookie-Richtlinie" → /cookie-richtlinie 404
  Footer label "AGB"               → /agb 404
  Actual: /de/cookies, /de/nutzungsbedingungen.

Heuristic:
  1. Derive the origin (and a leading language prefix, if any) from
     each discovered document URL.
  2. Per doc_type, probe a small list of canonical standard slugs
     (HEAD, falling back to GET on 405), 2 s timeout each.
  3. If the document was discovered under a known slug BUT at least one
     equivalent standard slug answers 404/410 → one LOW finding per
     doc_type listing the broken slugs.

Severity: LOW (SEO/bookmark breakage, no legal hard fail).
"""

from __future__ import annotations

import logging
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

import httpx

logger = logging.getLogger(__name__)


# Canonical DE/EN standard slugs per doc_type (no leading "/").
# Order: the most common German slug first, then synonyms, then English.
_CANONICAL_SLUGS: dict[str, tuple[str, ...]] = {
    "dse": (
        "datenschutz",
        "datenschutzerklaerung",
        "datenschutzerklärung",
        "privacy",
        "privacy-policy",
    ),
    "impressum": (
        "impressum",
        "imprint",
        "legal-notice",
    ),
    "cookie": (
        "cookie-richtlinie",
        "cookies",
        "cookie-policy",
    ),
    "agb": (
        "agb",
        "allgemeine-geschaeftsbedingungen",
        "geschaeftsbedingungen",
        "terms-and-conditions",
    ),
    "nutzungsbedingungen": (
        "nutzungsbedingungen",
        "terms-of-use",
        "terms-of-service",
    ),
    "widerruf": (
        "widerrufsbelehrung",
        "widerruf",
        "cancellation",
    ),
}


# Kill switch (default: probing enabled). Read once when the module is
# imported, so URL_SLUG_PROBE_DISABLED must be set before the import.
# Accepted truthy spellings (case-insensitive): 1, true, yes, on.
_DISABLED = os.getenv("URL_SLUG_PROBE_DISABLED", "").lower() in {
    "1",
    "true",
    "yes",
    "on",
}


def _strip_path_slug(url: str) -> str:
    """Return the lower-cased final path segment of *url*.

    Leading and trailing slashes of the path are ignored. An empty string
    is returned when *url* is falsy, has no path segment, or cannot be
    parsed.
    """
    if not url:
        return ""
    try:
        trimmed = (urlparse(url).path or "").strip("/")
    except Exception:
        # Best effort: an unparsable URL simply has no slug.
        return ""
    # rpartition yields the whole string when there is no "/" left, and
    # "" for an empty path, so no separate empty check is needed.
    return trimmed.rpartition("/")[2].lower()


def _is_locale_segment(segment: str) -> bool:
    """Return True if *segment* is shaped like ``xx`` or ``xx-yy``/``xx_yy``.

    Only ASCII letters are accepted. This is a shape check, not a lookup
    against a language list, so any two-letter directory still matches.
    """
    if not segment.isascii():
        return False
    if len(segment) == 2:
        return segment.isalpha()
    if len(segment) == 5:
        return (
            segment[:2].isalpha()
            and segment[2] in "-_"
            and segment[3:].isalpha()
        )
    return False


def _origin_and_prefix(url: str) -> tuple[str, str]:
    """Return ``(origin, language_prefix)`` for *url*.

    The pair lets the caller rebuild alternative URLs at the same scope
    as the discovered one. ``language_prefix`` is ``"/<segment>"`` or
    ``""``.

    Example: 'https://www.elli.eco/de/cookies' → ('https://www.elli.eco', '/de')

    ``("", "")`` is returned when *url* cannot be parsed or lacks a scheme
    or host, so callers can skip it with a plain truthiness check on the
    origin.
    """
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            # Without scheme and host no absolute probe URL can be built.
            return "", ""
        origin = f"{parsed.scheme}://{parsed.netloc}"
        segments = [s for s in (parsed.path or "/").split("/") if s]
    except (ValueError, TypeError, AttributeError):
        return "", ""
    # The first segment counts as a language prefix only if something
    # follows it (otherwise it is the document slug itself, e.g. "/terms")
    # and it is shaped like a locale (de, en, fr, de-de, en-us).
    if len(segments) >= 2 and _is_locale_segment(segments[0]):
        return origin, f"/{segments[0]}"
    return origin, ""


def _head_status(url: str, timeout_s: float = 2.0) -> int:
    """Return the HTTP status code for *url*, or 0 if the request fails.

    A HEAD request is sent first without following redirects. If the
    server answers 405, the same URL is fetched with GET and that status
    is returned instead. Any exception is mapped to 0.
    """
    try:
        client = httpx.Client(timeout=timeout_s, follow_redirects=False)
        with client:
            response = client.head(url)
            if response.status_code != 405:
                return response.status_code
            # Some servers reject HEAD with 405, so retry with GET.
            return client.get(url).status_code
    except Exception:
        return 0


def check_url_slug_drift(state: dict) -> list[dict]:
    """Probe canonical alternative slugs for each discovered document.

    Reads ``state["doc_entries"]``. An entry is used when its ``doc_type``
    is a key of ``_CANONICAL_SLUGS``, it has a URL with a path slug, and
    its stripped text is at least 400 characters long. For every such
    doc_type the canonical slugs other than the discovered one are probed
    at the same origin and language prefix. At most 18 probes run in
    total, on up to 6 worker threads.

    Returns one LOW finding per doc_type that has at least one slug
    answering 404 or 410. The finding lists every broken slug in
    ``alt_slugs_404``; the evidence and action texts name the first
    three. Returns ``[]`` when probing is disabled or nothing qualifies.
    """
    if _DISABLED:
        return []

    # doc_type -> (document URL, its last path segment); a later entry of
    # the same doc_type replaces an earlier one.
    located: dict[str, tuple[str, str]] = {}
    for entry in state.get("doc_entries") or []:
        kind = (entry.get("doc_type") or "").lower()
        if kind not in _CANONICAL_SLUGS:
            continue
        doc_url = (entry.get("url") or "").strip()
        body_len = len((entry.get("text") or "").strip())
        if doc_url and body_len >= 400:
            own_slug = _strip_path_slug(doc_url)
            if own_slug:
                located[kind] = (doc_url, own_slug)
    if not located:
        return []

    # Probe plan of (doc_type, alternative slug, probe URL), skipping the
    # slug under which the document was actually found.
    plan: list[tuple[str, str, str]] = []
    for kind, (doc_url, own_slug) in located.items():
        origin, prefix = _origin_and_prefix(doc_url)
        if origin:
            plan.extend(
                (kind, candidate, f"{origin}{prefix}/{candidate}")
                for candidate in _CANONICAL_SLUGS[kind]
                if candidate.lower() != own_slug
            )
    # Cap the number of requests to keep network noise bounded.
    plan = plan[:18]
    if not plan:
        return []

    targets = [target for _, _, target in plan]
    with ThreadPoolExecutor(max_workers=6) as pool:
        # map() preserves input order, so statuses line up with plan.
        statuses = list(pool.map(_head_status, targets))

    # Collect the dead alternatives per doc_type so that each document
    # yields a single finding.
    dead: dict[str, list[tuple[str, str]]] = {}
    for (kind, candidate, target), status in zip(plan, statuses):
        if status in (404, 410):
            dead.setdefault(kind, []).append((candidate, target))

    findings: list[dict] = []
    for kind, misses in dead.items():
        doc_url = located[kind][0]
        shown = misses[:3]
        slug_list = ", ".join(slug for slug, _ in shown)
        url_list = ", ".join(target for _, target in shown)
        findings.append({
            "check_id": "URL-SLUG-DRIFT-001",
            "severity": "LOW",
            "severity_reason": "seo_bookmark_break",
            "doc_type": kind,
            "title": (
                f"Externe Bookmarks / SEO-Erwartung für {kind} brechen "
                f"({len(misses)} Standard-Slug(s) 404)"
            ),
            "norm": (
                "Kein juristischer Pflichttatbestand — Best-Practice "
                "(SEO, externe Verlinkungen, Footer-Label-Konsistenz)"
            ),
            "evidence": (
                f"Doc ist erreichbar unter '{doc_url}'. "
                f"Standard-Slug(s) {slug_list} liefern 404/410 "
                f"({url_list})."
            ),
            "action": (
                f"Redirects einrichten von {slug_list} nach "
                f"'{doc_url}' — damit externe Bookmarks, "
                "alte Footer-Labels und Google-Treffer nicht brechen."
            ),
            "alt_slugs_404": [slug for slug, _ in misses],
        })

    if findings:
        logger.info("B16 url-slug-drift: %d finding(s)", len(findings))
    return findings