feat(b16): Footer-Label-vs-URL-Slug-Drift-Check (GT URL-STRUCTURE-001)

Erkennt: Gängige Footer-Labels bzw. Bookmark- und SEO-Erwartungs-Slugs (z.B. "Cookie-Richtlinie", "AGB", "Datenschutzerklärung") liefern 404, während das Doc tatsächlich unter einem abweichenden Slug ausgeliefert wird.

GT-Anker (Elli URL-STRUCTURE-001):
  Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404
  Real: /de/cookies → externe Bookmarks und Google-Treffer brechen.

Heuristik:
- Aus auto-discovered URLs Origin + Sprach-Prefix extrahieren (z.B. /de, /de-de)
- Pro doc_type 2-4 kanonische Standard-Slugs probieren (parallel via ThreadPoolExecutor, 2 s Timeout, HEAD → GET-Fallback bei 405)
- Wenn ein alternativer Slug 404/410 liefert → ein LOW-Finding pro doc_type
- Probe-Cap auf 18 Requests gesamt (Network-Noise-Schutz)
- Abschaltbar via URL_SLUG_PROBE_DISABLED=1

Severity: LOW (Best-Practice, kein juristisches Hardfail).

Tests: 13/13 grün (Strip-Helper 4 + Origin-Helper 3 + Check-Pfade 6 inkl. mocked _head_status).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 00:23:25 +02:00
parent b0b7f80914
commit 65e8bb9d42
5 changed files with 415 additions and 0 deletions
@@ -54,6 +54,8 @@ def compose_v2(state: dict) -> str:
        state.get("retention_conflict_html", ""),
        # B15 AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
        state.get("ai_legal_basis_html", ""),
+        # B16 Footer-Label-vs-URL-Slug-Drift (SEO / Bookmarks)
+        state.get("url_slug_drift_html", ""),
        # Browser-Matrix (Stage 1.c)
        state.get("browser_matrix_html", ""),
        # All legacy build_*_html() wrapped in V2 sections — preserves
@@ -0,0 +1,206 @@
+"""B16 — footer-label vs. URL-slug drift detector.
+
+Detects the case where common footer labels (e.g. "Cookie-Richtlinie",
+"AGB", "Datenschutzerklärung") exist as a bookmark/SEO expectation, but
+the matching standard slug answers 404 on the site while the document
+is actually served under a different slug.
+
+GT anchor: Elli URL-STRUCTURE-001:
+  footer label "Cookie-Richtlinie" → /cookie-richtlinie 404
+  footer label "AGB"               → /agb 404
+  real: /de/cookies, /de/nutzungsbedingungen.
+
+Heuristic:
+  1. Take the origin and an optional leading language prefix from each
+     discovered document URL.
+  2. Per doc_type, probe the canonical standard slugs other than the
+     discovered one (HEAD, with a GET fallback on 405; 2 s timeout each;
+     at most 18 probes in total).
+  3. If at least one of those slugs answers 404 or 410, emit one LOW
+     finding per doc_type that lists the dead slugs.
+
+Severity: LOW (SEO/bookmark breakage, not a hard legal failure).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
+
+import httpx
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+# Canonical DE/EN standard slugs per doc_type (without a leading "/").
+# Order: the most common German slug first, then synonyms, then English.
+# Only doc_types listed here are ever probed by check_url_slug_drift().
+_CANONICAL_SLUGS: dict[str, tuple[str, ...]] = {
+    "dse": (
+        "datenschutz", "datenschutzerklaerung", "datenschutzerklärung",
+        "privacy", "privacy-policy",
+    ),
+    "impressum": (
+        "impressum", "imprint", "legal-notice",
+    ),
+    "cookie": (
+        "cookie-richtlinie", "cookies", "cookie-policy",
+    ),
+    "agb": (
+        "agb", "allgemeine-geschaeftsbedingungen",
+        "geschaeftsbedingungen", "terms-and-conditions",
+    ),
+    "nutzungsbedingungen": (
+        "nutzungsbedingungen", "terms-of-use", "terms-of-service",
+    ),
+    "widerruf": (
+        "widerrufsbelehrung", "widerruf", "cancellation",
+    ),
+}
+
+
+# Configuration switch: probing is ON by default and is turned off when
+# the environment variable URL_SLUG_PROBE_DISABLED is set to one of
+# 1/true/yes/on (case-insensitive). The variable is read once, when this
+# module is imported, so later changes to the environment have no effect.
+_DISABLED = os.environ.get("URL_SLUG_PROBE_DISABLED", "").lower() in (
+    "1", "true", "yes", "on",
+)
+
+
+def _strip_path_slug(url: str) -> str:
+    """Return the lower-cased final path segment of *url*.
+
+    Leading and trailing slashes of the path are ignored; query string
+    and fragment are not part of the path and therefore never appear in
+    the result. An empty string is returned for a falsy *url*, for a URL
+    whose path is empty or only slashes, and for anything that cannot be
+    parsed.
+    """
+    if not url:
+        return ""
+    try:
+        trimmed = (urlparse(url).path or "").strip("/")
+    except Exception:
+        # Best-effort helper: an unparsable value simply has no slug.
+        return ""
+    if not trimmed:
+        return ""
+    # Only the part after the last "/" is the slug.
+    return trimmed.rsplit("/", 1)[-1].lower()
+
+
+def _is_locale_segment(segment: str) -> bool:
+    """Tell whether *segment* has the shape of a locale path prefix.
+
+    Accepted shapes are two ASCII letters ("de", "EN") or two letters, a
+    "-" or "_" separator and two more letters ("de-de", "en_US").
+    """
+    lowered = segment.lower()
+    if len(lowered) == 2:
+        letters = lowered
+    elif len(lowered) == 5 and lowered[2] in "-_":
+        letters = lowered[:2] + lowered[3:]
+    else:
+        return False
+    return letters.isascii() and letters.isalpha()
+
+
+def _origin_and_prefix(url: str) -> tuple[str, str]:
+    """Return ``(origin, language_prefix)`` for *url*.
+
+    The pair lets the caller rebuild alternative URLs at the same scope
+    as the discovered one. ``language_prefix`` is ``"/<segment>"`` when
+    the first path segment looks like a locale code, otherwise ``""``.
+
+    Example: 'https://www.elli.eco/de/cookies' → ('https://www.elli.eco', '/de')
+
+    ``("", "")`` is returned when *url* cannot be parsed or lacks a
+    scheme or host, so callers can skip it with a plain truthiness test
+    on the origin.
+    """
+    try:
+        parsed = urlparse(url)
+        # A relative or empty URL would otherwise produce the truthy but
+        # useless origin "://".
+        if not parsed.scheme or not parsed.netloc:
+            return "", ""
+        origin = f"{parsed.scheme}://{parsed.netloc}"
+        segments = [s for s in (parsed.path or "").split("/") if s]
+        # Only a locale-shaped first segment counts as language prefix;
+        # a bare length check would also match ordinary five-letter
+        # directories such as "legal" or "terms".
+        if segments and _is_locale_segment(segments[0]):
+            return origin, f"/{segments[0]}"
+        return origin, ""
+    except Exception:
+        # Best-effort helper: never let a malformed URL abort the check.
+        return "", ""
+
+
+def _head_status(url: str, timeout_s: float = 2.0) -> int:
+    """Fetch *url* and return its HTTP status code.
+
+    A HEAD request is sent first; if the server answers 405, the same
+    URL is requested again with GET and that status is returned instead.
+    Redirects are not followed, so a 3xx answer is reported as such.
+    Any exception raised while requesting yields ``0``.
+    """
+    try:
+        with httpx.Client(timeout=timeout_s, follow_redirects=False) as client:
+            response = client.head(url)
+            if response.status_code != 405:
+                return response.status_code
+            # HEAD was rejected as "method not allowed" — retry with GET.
+            return client.get(url).status_code
+    except Exception:
+        return 0
+
+
+def check_url_slug_drift(state: dict) -> list[dict]:
+    """Report canonical slugs that are dead while the document lives elsewhere.
+
+    For every entry in ``state["doc_entries"]`` whose ``doc_type`` is a
+    key of ``_CANONICAL_SLUGS``, that has a URL and at least 400
+    characters of stripped text, the other canonical slugs of that
+    doc_type are probed at the same origin and language prefix. Slugs
+    answering 404 or 410 are collected, and one LOW finding per doc_type
+    is returned listing them.
+
+    Returns an empty list when probing is disabled via ``_DISABLED``,
+    when no usable document was discovered, or when nothing is dead.
+    """
+    if _DISABLED:
+        return []
+
+    # doc_type -> (document URL, its own slug). A later entry of the same
+    # doc_type replaces an earlier one.
+    discovered: dict[str, tuple[str, str]] = {}
+    for entry in state.get("doc_entries") or []:
+        doc_type = (entry.get("doc_type") or "").lower()
+        if doc_type not in _CANONICAL_SLUGS:
+            continue
+        doc_url = (entry.get("url") or "").strip()
+        body = (entry.get("text") or "").strip()
+        if not doc_url or len(body) < 400:
+            continue
+        own_slug = _strip_path_slug(doc_url)
+        if own_slug:
+            discovered[doc_type] = (doc_url, own_slug)
+
+    if not discovered:
+        return []
+
+    # Probe plan: (doc_type, candidate slug, candidate URL) for every
+    # canonical slug that differs from the one the document already uses.
+    probes: list[tuple[str, str, str]] = []
+    for doc_type, (doc_url, own_slug) in discovered.items():
+        origin, prefix = _origin_and_prefix(doc_url)
+        if not origin:
+            continue
+        probes.extend(
+            (doc_type, candidate, f"{origin}{prefix}/{candidate}")
+            for candidate in _CANONICAL_SLUGS[doc_type]
+            if candidate.lower() != own_slug
+        )
+
+    # Hard cap on the number of requests to keep network noise bounded.
+    del probes[18:]
+    if not probes:
+        return []
+
+    # map() yields results in input order, so statuses line up with probes.
+    with ThreadPoolExecutor(max_workers=6) as pool:
+        statuses = list(pool.map(_head_status, [target for _, _, target in probes]))
+
+    # doc_type -> [(dead slug, dead URL), ...] in probe order.
+    dead_by_type: dict[str, list[tuple[str, str]]] = {}
+    for (doc_type, candidate, target), status in zip(probes, statuses):
+        if status in (404, 410):
+            dead_by_type.setdefault(doc_type, []).append((candidate, target))
+
+    findings: list[dict] = []
+    for doc_type, dead in dead_by_type.items():
+        doc_url = discovered[doc_type][0]
+        # The texts mention at most three examples; the full list of dead
+        # slugs is carried in "alt_slugs_404".
+        shown = dead[:3]
+        slug_list = ", ".join(slug for slug, _ in shown)
+        url_list = ", ".join(link for _, link in shown)
+        title = (
+            f"Externe Bookmarks / SEO-Erwartung für {doc_type} brechen "
+            f"({len(dead)} Standard-Slug(s) 404)"
+        )
+        evidence = (
+            f"Doc ist erreichbar unter '{doc_url}'. "
+            f"Standard-Slug(s) {slug_list} liefern 404/410 "
+            f"({url_list})."
+        )
+        action = (
+            f"Redirects einrichten von {slug_list} nach "
+            f"'{doc_url}' — damit externe Bookmarks, "
+            "alte Footer-Labels und Google-Treffer nicht brechen."
+        )
+        findings.append({
+            "check_id": "URL-SLUG-DRIFT-001",
+            "severity": "LOW",
+            "severity_reason": "seo_bookmark_break",
+            "doc_type": doc_type,
+            "title": title,
+            "norm": (
+                "Kein juristischer Pflichttatbestand — Best-Practice "
+                "(SEO, externe Verlinkungen, Footer-Label-Konsistenz)"
+            ),
+            "evidence": evidence,
+            "action": action,
+            "alt_slugs_404": [slug for slug, _ in dead],
+        })
+
+    if findings:
+        logger.info("B16 url-slug-drift: %d finding(s)", len(findings))
+    return findings