Files
breakpilot-compliance/backend-compliance/compliance/services/url_slug_drift_check.py
T
Benjamin Admin 65e8bb9d42 feat(b16): Footer-Label-vs-URL-Slug-Drift-Check (GT URL-STRUCTURE-001)
Erkennt: gängige Footer-Labels / Bookmark- + SEO-Erwartungs-Slugs
(z.B. "Cookie-Richtlinie", "AGB", "Datenschutzerklärung") liefern
404, während das Doc tatsächlich unter einem abweichenden Slug
ausgeliefert wird.

GT-Anker (Elli URL-STRUCTURE-001):
  Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404
  Real: /de/cookies
  → externe Bookmarks und Google-Treffer brechen.

Heuristik:
  - Aus auto-discovered URLs Origin + Sprach-Prefix extrahieren
    (z.B. /de, /de-de)
  - Pro doc_type 2-4 kanonische Standard-Slugs probieren (parallel
    via ThreadPoolExecutor, 2s Timeout, HEAD → GET fallback bei 405)
  - Wenn alternative Slug 404/410 → LOW Finding pro doc_type
  - Probe-Cap auf 18 Requests gesamt (Network-Noise-Schutz)
  - Abschaltbar via URL_SLUG_PROBE_DISABLED=1

Severity: LOW (Best-Practice, kein juristisches Hardfail).

Tests: 13/13 grün (Strip-Helper 4 + Origin-Helper 3 + Check-Pfade 6
inkl. mocked _head_status).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 00:23:25 +02:00

207 lines
6.9 KiB
Python

"""B16 — Footer-Label-vs-URL-Slug-Drift-Detector.
Erkennt: gängige Footer-Labels (z.B. "Cookie-Richtlinie", "AGB",
"Datenschutzerklärung") existieren als Bookmark-/SEO-Erwartung,
aber auf der Site antwortet der entsprechende Standard-Slug mit 404.
Real wird das Doc unter einem abweichenden Slug ausgeliefert.
GT-Anker: Elli URL-STRUCTURE-001:
Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404
Footer-Label "AGB" → /agb 404
Real: /de/cookies, /de/nutzungsbedingungen.
Heuristik:
1. Aus den discovered URLs den Base-Host extrahieren.
2. Pro doc_type eine kleine Liste kanonischer Standard-Slugs prüfen
(HEAD oder GET), je 2 s Timeout.
3. Wenn discovered Slug bekannt ist, ABER mindestens ein
gleichwertiger Standard-Slug 404/410 ergibt → ein LOW Finding pro
doc_type (mit der Liste der betroffenen alt-Slugs).
Severity: LOW (SEO/Bookmark-Bruch, kein juristisches Hardfail).
"""
from __future__ import annotations
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import httpx
logger = logging.getLogger(__name__)
# Kanonische DE/EN Standard-Slugs pro doc_type (ohne führenden /).
# Reihenfolge: erst der häufigste deutsche, dann Synonyme, dann EN.
_CANONICAL_SLUGS: dict[str, tuple[str, ...]] = {
"dse": (
"datenschutz", "datenschutzerklaerung", "datenschutzerklärung",
"privacy", "privacy-policy",
),
"impressum": (
"impressum", "imprint", "legal-notice",
),
"cookie": (
"cookie-richtlinie", "cookies", "cookie-policy",
),
"agb": (
"agb", "allgemeine-geschaeftsbedingungen",
"geschaeftsbedingungen", "terms-and-conditions",
),
"nutzungsbedingungen": (
"nutzungsbedingungen", "terms-of-use", "terms-of-service",
),
"widerruf": (
"widerrufsbelehrung", "widerruf", "cancellation",
),
}
# Konfigurations-Schalter (default: AN; lässt sich pro Run abschalten).
_DISABLED = os.environ.get("URL_SLUG_PROBE_DISABLED", "").lower() in (
"1", "true", "yes", "on",
)
def _strip_path_slug(url: str) -> str:
"""Return the LAST path-segment of a URL (without trailing /)."""
if not url:
return ""
try:
p = urlparse(url)
path = (p.path or "").strip("/")
if not path:
return ""
return path.split("/")[-1].lower()
except Exception:
return ""
def _origin_and_prefix(url: str) -> tuple[str, str]:
"""Return (origin, language-prefix-or-empty) so we can rebuild
alternative URLs at the same scope as the discovered one.
Example: 'https://www.elli.eco/de/cookies' → ('https://www.elli.eco', '/de')
"""
try:
p = urlparse(url)
origin = f"{p.scheme}://{p.netloc}"
path = p.path or "/"
parts = [s for s in path.split("/") if s]
# Heuristik: ein 2-3 Zeichen-Pfad-Segment ganz vorn wird als
# Sprach-Prefix interpretiert (de, en, fr, de-de, en-us).
if parts and (len(parts[0]) == 2 or len(parts[0]) == 5):
return origin, f"/{parts[0]}"
return origin, ""
except Exception:
return "", ""
def _head_status(url: str, timeout_s: float = 2.0) -> int:
    """Fetch *url* and return its HTTP status code.

    A HEAD request is sent first; if the server answers 405 the same URL is
    requested again with GET. Redirects are not followed, so a 3xx status
    is returned as-is.

    Args:
        url: Absolute URL to probe.
        timeout_s: Timeout in seconds handed to the httpx client.

    Returns:
        The status code of the last response, or ``0`` if any exception
        occurred (network error, timeout, invalid URL, ...).
    """
    try:
        with httpx.Client(timeout=timeout_s, follow_redirects=False) as client:
            head_response = client.head(url)
            if head_response.status_code != 405:
                return head_response.status_code
            # Some servers reject HEAD with 405 — retry with GET.
            return client.get(url).status_code
    except Exception:
        return 0
def check_url_slug_drift(state: dict) -> list[dict]:
    """Probe canonical alternative slugs for each discovered document.

    For every entry in ``state["doc_entries"]`` whose ``doc_type`` is known
    to ``_CANONICAL_SLUGS``, that has a ``url`` and at least 400 characters
    of stripped ``text``, the canonical slugs other than the discovered one
    are requested at the same origin and language prefix. If several
    entries share a doc_type, the last one wins.

    Args:
        state: Mapping that is read for the key ``"doc_entries"`` (a list
            of dicts with the keys ``doc_type``, ``url`` and ``text``).
            Items that are not dicts are skipped.

    Returns:
        At most ONE finding dict per doc_type, emitted when at least one
        alternative slug answered 404 or 410. All such slugs are listed in
        ``alt_slugs_404``; the human-readable texts mention the first
        three. Other statuses, including ``0`` for network errors, do not
        produce a finding. Returns ``[]`` when the check is disabled via
        ``_DISABLED`` or there is nothing to probe.
    """
    if _DISABLED:
        return []

    min_text_len = 400   # shorter texts are not treated as a real document
    max_probes = 18      # cap to keep network noise bounded
    max_workers = 6
    max_listed = 3       # slugs/URLs spelled out in evidence and action

    # Build {doc_type: (discovered_url, discovered_slug)}.
    discovered: dict[str, tuple[str, str]] = {}
    for entry in state.get("doc_entries") or []:
        if not isinstance(entry, dict):
            # Tolerate malformed entries instead of crashing on .get().
            continue
        dt = (entry.get("doc_type") or "").lower()
        if dt not in _CANONICAL_SLUGS:
            continue
        url = (entry.get("url") or "").strip()
        text_len = len((entry.get("text") or "").strip())
        if not url or text_len < min_text_len:
            continue
        slug = _strip_path_slug(url)
        if not slug:
            continue
        discovered[dt] = (url, slug)
    if not discovered:
        return []

    # Probe plan: for each doc_type, every canonical slug OTHER than the
    # one the document was actually discovered under.
    probes: list[tuple[str, str, str]] = []  # (doc_type, alt_slug, url)
    for dt, (url, slug) in discovered.items():
        origin, prefix = _origin_and_prefix(url)
        if not origin:
            continue
        probes.extend(
            (dt, alt, f"{origin}{prefix}/{alt}")
            for alt in _CANONICAL_SLUGS[dt]
            if alt.lower() != slug
        )
    if len(probes) > max_probes:
        # The cap cuts from the end, so later doc_types may go unprobed.
        logger.debug(
            "B16 url-slug-drift: probe plan capped at %d of %d",
            max_probes, len(probes),
        )
        probes = probes[:max_probes]
    if not probes:
        return []

    def _do_probe(item: tuple[str, str, str]) -> tuple[str, str, str, int]:
        dt, alt, u = item
        return dt, alt, u, _head_status(u)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        results = list(ex.map(_do_probe, probes))

    # Group the broken alternatives by doc_type so that ONE finding per
    # document carries the whole list of 404/410 slugs.
    per_dt: dict[str, list[tuple[str, str]]] = {}
    for dt, alt, u, status in results:
        if status in (404, 410):
            per_dt.setdefault(dt, []).append((alt, u))

    findings: list[dict] = []
    for dt, alts in per_dt.items():
        discovered_url, _ = discovered[dt]
        listed = alts[:max_listed]
        broken_urls = ", ".join(u for _, u in listed)
        broken_slugs = ", ".join(s for s, _ in listed)
        findings.append({
            "check_id": "URL-SLUG-DRIFT-001",
            "severity": "LOW",
            "severity_reason": "seo_bookmark_break",
            "doc_type": dt,
            "title": (
                f"Externe Bookmarks / SEO-Erwartung für {dt} brechen "
                f"({len(alts)} Standard-Slug(s) 404)"
            ),
            "norm": (
                "Kein juristischer Pflichttatbestand — Best-Practice "
                "(SEO, externe Verlinkungen, Footer-Label-Konsistenz)"
            ),
            "evidence": (
                f"Doc ist erreichbar unter '{discovered_url}'. "
                f"Standard-Slug(s) {broken_slugs} liefern 404/410 "
                f"({broken_urls})."
            ),
            "action": (
                f"Redirects einrichten von {broken_slugs} nach "
                f"'{discovered_url}' — damit externe Bookmarks, "
                "alte Footer-Labels und Google-Treffer nicht brechen."
            ),
            "alt_slugs_404": [s for s, _ in alts],
        })
    if findings:
        logger.info("B16 url-slug-drift: %d finding(s)", len(findings))
    return findings