"""B16 — Footer-Label-vs-URL-Slug-Drift-Detector. Erkennt: gängige Footer-Labels (z.B. "Cookie-Richtlinie", "AGB", "Datenschutzerklärung") existieren als Bookmark-/SEO-Erwartung, aber auf der Site antwortet der entsprechende Standard-Slug mit 404. Real wird das Doc unter einem abweichenden Slug ausgeliefert. GT-Anker: Elli URL-STRUCTURE-001: Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404 Footer-Label "AGB" → /agb 404 Real: /de/cookies, /de/nutzungsbedingungen. Heuristik: 1. Aus den discovered URLs den Base-Host extrahieren. 2. Pro doc_type eine kleine Liste kanonischer Standard-Slugs prüfen (HEAD oder GET), je 2 s Timeout. 3. Wenn discovered Slug bekannt ist, ABER mindestens ein gleichwertiger Standard-Slug 404 ergibt → LOW Finding pro alt-Slug. Severity: LOW (SEO/Bookmark-Bruch, kein juristisches Hardfail). """ from __future__ import annotations import logging import os from concurrent.futures import ThreadPoolExecutor from urllib.parse import urlparse import httpx logger = logging.getLogger(__name__) # Kanonische DE/EN Standard-Slugs pro doc_type (ohne führenden /). # Reihenfolge: erst der häufigste deutsche, dann Synonyme, dann EN. _CANONICAL_SLUGS: dict[str, tuple[str, ...]] = { "dse": ( "datenschutz", "datenschutzerklaerung", "datenschutzerklärung", "privacy", "privacy-policy", ), "impressum": ( "impressum", "imprint", "legal-notice", ), "cookie": ( "cookie-richtlinie", "cookies", "cookie-policy", ), "agb": ( "agb", "allgemeine-geschaeftsbedingungen", "geschaeftsbedingungen", "terms-and-conditions", ), "nutzungsbedingungen": ( "nutzungsbedingungen", "terms-of-use", "terms-of-service", ), "widerruf": ( "widerrufsbelehrung", "widerruf", "cancellation", ), } # Konfigurations-Schalter (default: AN; lässt sich pro Run abschalten). 
# Read once at import time: URL_SLUG_PROBE_DISABLED must be set before this
# module is first imported for the switch to take effect.
_DISABLED = os.environ.get("URL_SLUG_PROBE_DISABLED", "").lower() in (
    "1", "true", "yes", "on",
)

# Entries with less stripped text than this are not treated as a really
# discovered document and are never probed.
_MIN_DOC_TEXT_LEN = 400

# Upper bound on HTTP probes per run, to keep network noise bounded.
_MAX_PROBES = 18

# How many broken slugs / URLs are spelled out in a finding's text fields.
_MAX_LISTED_ALTS = 3

# HEAD answers meaning "method not supported" rather than "not found".
_HEAD_UNSUPPORTED = (405, 501)

# Statuses that count as "the standard slug is gone".
_GONE_STATUSES = (404, 410)


def _strip_path_slug(url: str) -> str:
    """Return the last path segment of *url*, percent-decoded and lower-cased.

    Trailing slashes are ignored. Returns ``""`` for empty / non-string
    input, for URLs without a path, and for URLs ``urlparse`` rejects.

    The segment is percent-decoded so that an encoded slug such as
    ``datenschutzerkl%C3%A4rung`` compares equal to the canonical
    ``datenschutzerklärung``.
    """
    # Local import: keeps this block independent of the file's import header.
    from urllib.parse import unquote

    if not url or not isinstance(url, str):
        return ""
    try:
        path = urlparse(url).path or ""
    except ValueError:
        # urlparse raises ValueError for e.g. malformed IPv6 hosts.
        return ""
    last_segment = path.strip("/").split("/")[-1]
    return unquote(last_segment).lower()


def _is_locale_segment(segment: str) -> bool:
    """Return True if *segment* looks like a locale (``de``, ``de-de``, ``en_US``)."""
    seg = segment.lower()
    if not seg.isascii():
        return False
    if len(seg) == 2:
        return seg.isalpha()
    if len(seg) == 5:
        # Two letters, a separator, two letters — rejects words like "legal".
        return seg[2] in "-_" and seg[:2].isalpha() and seg[3:].isalpha()
    return False


def _origin_and_prefix(url: str) -> tuple[str, str]:
    """Return ``(origin, language_prefix)`` for rebuilding alternative URLs.

    The prefix is ``"/<first-segment>"`` when the first path segment looks
    like a locale, otherwise ``""``, so that alternative slugs are probed at
    the same scope as the discovered document.

    Example:
        'https://www.elli.eco/de/cookies' → ('https://www.elli.eco', '/de')

    Returns ``("", "")`` when *url* is not an absolute http(s) URL.
    """
    if not url or not isinstance(url, str):
        return "", ""
    try:
        parsed = urlparse(url)
    except ValueError:
        return "", ""
    # Without scheme and host there is nothing to probe; returning an empty
    # origin lets the caller skip the entry instead of probing "://…".
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return "", ""

    origin = f"{parsed.scheme}://{parsed.netloc}"
    segments = [s for s in (parsed.path or "/").split("/") if s]
    # Heuristic: a leading locale-shaped segment (de, en, de-de, en-us) is a
    # language prefix. A sole segment is the document slug itself, never a
    # prefix, hence the len > 1 requirement.
    if len(segments) > 1 and _is_locale_segment(segments[0]):
        return origin, f"/{segments[0]}"
    return origin, ""


def _head_status(url: str, timeout_s: float = 2.0) -> int:
    """Return the HTTP status code for *url* (``0`` on any request error).

    Issues a HEAD request without following redirects; if the server
    answers that HEAD is unsupported (405 / 501), retries with GET.
    """
    try:
        with httpx.Client(timeout=timeout_s, follow_redirects=False) as client:
            response = client.head(url)
            # Some servers reject HEAD — fall back to GET.
            if response.status_code in _HEAD_UNSUPPORTED:
                response = client.get(url)
            return response.status_code
    except Exception:
        # Deliberately broad: this runs inside a worker thread and a failed
        # probe must never abort the whole check. Keep a trace for debugging.
        logger.debug("B16 url-slug-drift: probe failed for %s", url, exc_info=True)
        return 0


def check_url_slug_drift(state: dict) -> list[dict]:
    """Probe canonical alternative slugs for each discovered document.

    Reads ``state["doc_entries"]``; for every entry it uses the keys
    ``doc_type``, ``url`` and ``text``. Entries whose doc_type is not in
    ``_CANONICAL_SLUGS``, that have no URL / slug, or whose stripped text is
    shorter than ``_MIN_DOC_TEXT_LEN`` are ignored. If several entries share
    a doc_type, the last one wins.

    Returns:
        One LOW finding per doc_type for which at least one canonical
        alternative slug answered 404 or 410 while the document itself was
        discovered under a different slug. Empty list when the check is
        disabled, nothing was discovered, or nothing is broken.
    """
    if _DISABLED:
        return []

    doc_entries = (state or {}).get("doc_entries") or []

    # {doc_type: (discovered_url, discovered_slug)} for discovered docs
    # with enough text.
    discovered: dict[str, tuple[str, str]] = {}
    for entry in doc_entries:
        if not isinstance(entry, dict):
            continue
        dt = (entry.get("doc_type") or "").lower()
        if dt not in _CANONICAL_SLUGS:
            continue
        url = (entry.get("url") or "").strip()
        text_len = len((entry.get("text") or "").strip())
        if not url or text_len < _MIN_DOC_TEXT_LEN:
            continue
        slug = _strip_path_slug(url)
        if not slug:
            continue
        discovered[dt] = (url, slug)
    if not discovered:
        return []

    # Probe plan: for each doc_type, every canonical slug OTHER than the one
    # the document was already discovered under.
    probes: list[tuple[str, str, str]] = []  # (doc_type, alt_slug, url)
    for dt, (url, slug) in discovered.items():
        origin, prefix = _origin_and_prefix(url)
        if not origin:
            continue
        probes.extend(
            (dt, alt, f"{origin}{prefix}/{alt}")
            for alt in _CANONICAL_SLUGS[dt]
            if alt.lower() != slug
        )
    # Cap to keep network noise bounded.
    probes = probes[:_MAX_PROBES]
    if not probes:
        return []

    def _do_probe(item: tuple[str, str, str]) -> tuple[str, str, str, int]:
        dt, alt, probe_url = item
        return dt, alt, probe_url, _head_status(probe_url)

    with ThreadPoolExecutor(max_workers=6) as ex:
        results = list(ex.map(_do_probe, probes))

    # Group by doc_type so ONE finding per document lists all its dead slugs.
    per_dt: dict[str, list[tuple[str, str]]] = {}
    for dt, alt, probe_url, status in results:
        if status in _GONE_STATUSES:
            per_dt.setdefault(dt, []).append((alt, probe_url))

    findings: list[dict] = []
    for dt, alts in per_dt.items():
        discovered_url = discovered[dt][0]
        listed = alts[:_MAX_LISTED_ALTS]
        broken_urls = ", ".join(u for _, u in listed)
        broken_slugs = ", ".join(s for s, _ in listed)
        findings.append({
            "check_id": "URL-SLUG-DRIFT-001",
            "severity": "LOW",
            "severity_reason": "seo_bookmark_break",
            "doc_type": dt,
            "title": (
                f"Externe Bookmarks / SEO-Erwartung für {dt} brechen "
                f"({len(alts)} Standard-Slug(s) 404)"
            ),
            "norm": (
                "Kein juristischer Pflichttatbestand — Best-Practice "
                "(SEO, externe Verlinkungen, Footer-Label-Konsistenz)"
            ),
            "evidence": (
                f"Doc ist erreichbar unter '{discovered_url}'. "
                f"Standard-Slug(s) {broken_slugs} liefern 404/410 "
                f"({broken_urls})."
            ),
            "action": (
                f"Redirects einrichten von {broken_slugs} nach "
                f"'{discovered_url}' — damit externe Bookmarks, "
                "alte Footer-Labels und Google-Treffer nicht brechen."
            ),
            "alt_slugs_404": [s for s, _ in alts],
        })

    if findings:
        logger.info("B16 url-slug-drift: %d finding(s)", len(findings))
    return findings