diff --git a/backend-compliance/compliance/api/agent_check/_b16_wiring.py b/backend-compliance/compliance/api/agent_check/_b16_wiring.py new file mode 100644 index 00000000..b11ef600 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_b16_wiring.py @@ -0,0 +1,66 @@ +"""B16 wiring — Footer-Label-vs-URL-Slug-Drift-Detector. + +Hängt sich an `state["extra_findings"]` an und rendert einen V2-Block +(`url_slug_drift_html`). +""" + +from __future__ import annotations + +import html +import logging + +from compliance.services.url_slug_drift_check import check_url_slug_drift + +logger = logging.getLogger(__name__) + + +def run_b16(state: dict) -> None: + new = check_url_slug_drift(state) + if not new: + return + extras = state.get("extra_findings") or [] + extras.extend(new) + state["extra_findings"] = extras + state["url_slug_drift_html"] = _render(new) + logger.info("B16 url-slug-drift: %d finding(s)", len(new)) + + +def _render(findings: list[dict]) -> str: + cards = [] + for f in findings: + sev = (f.get("severity") or "").upper() + color = "#64748b" if sev == "LOW" else "#f59e0b" + alts = f.get("alt_slugs_404") or [] + alts_html = "" + if alts: + alts_html = ( + "
" + f"404-Slugs: {html.escape(', '.join(alts))}
" + ) + cards.append( + f"
" + f"
" + f"{sev} · {html.escape(f.get('check_id') or '')}
" + f"
" + f"{html.escape(f.get('title') or '')}
" + f"
" + f"{html.escape(f.get('norm') or '')}
" + f"{alts_html}" + f"
" + f"{html.escape(f.get('evidence') or '')}
" + f"
" + f"→ Empfehlung: " + f"{html.escape(f.get('action') or '')}
" + "
" + ) + return ( + "
" + "

" + "🔗 Standard-Slug-Brüche (SEO / Bookmarks)" + "

" + + "".join(cards) + + "
" + ) diff --git a/backend-compliance/compliance/api/agent_check/_orchestrator.py b/backend-compliance/compliance/api/agent_check/_orchestrator.py index de219e12..235492d2 100644 --- a/backend-compliance/compliance/api/agent_check/_orchestrator.py +++ b/backend-compliance/compliance/api/agent_check/_orchestrator.py @@ -26,6 +26,7 @@ from ._b12_wiring import run_b12 from ._b13_wiring import run_b13 from ._b14_wiring import run_b14 from ._b15_wiring import run_b15 +from ._b16_wiring import run_b16 from ._constants import _compliance_check_jobs from ._phase_a_resolve import run_phase_a from ._phase_b_profile_check import run_phase_b @@ -76,6 +77,7 @@ async def run_compliance_check(check_id: str, req) -> None: run_b13(state) # Widerrufsbelehrung-Reachability (B2C-Pflicht) run_b14(state) # Widersprüchliche Speicherdauer im selben Doc run_b15(state) # AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f) + run_b16(state) # Footer-Label-vs-URL-Slug-Drift # Phase D-3 top/mid/bot: Step 5 HTML blocks await run_phase_d3_top(state) await run_phase_d3_mid(state) diff --git a/backend-compliance/compliance/services/mail_render_v2/_compose.py b/backend-compliance/compliance/services/mail_render_v2/_compose.py index 083ae470..ba7baafb 100644 --- a/backend-compliance/compliance/services/mail_render_v2/_compose.py +++ b/backend-compliance/compliance/services/mail_render_v2/_compose.py @@ -54,6 +54,8 @@ def compose_v2(state: dict) -> str: state.get("retention_conflict_html", ""), # B15 AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f) state.get("ai_legal_basis_html", ""), + # B16 Footer-Label-vs-URL-Slug-Drift (SEO / Bookmarks) + state.get("url_slug_drift_html", ""), # Browser-Matrix (Stage 1.c) state.get("browser_matrix_html", ""), # All legacy build_*_html() wrapped in V2 sections — preserves diff --git a/backend-compliance/compliance/services/url_slug_drift_check.py b/backend-compliance/compliance/services/url_slug_drift_check.py new file mode 100644 index 00000000..1435e5a9 --- /dev/null +++ b/backend-compliance/compliance/services/url_slug_drift_check.py @@ -0,0 +1,206 @@ +"""B16 — Footer-Label-vs-URL-Slug-Drift-Detector. + +Erkennt: gängige Footer-Labels (z.B. "Cookie-Richtlinie", "AGB", +"Datenschutzerklärung") existieren als Bookmark-/SEO-Erwartung, +aber auf der Site antwortet der entsprechende Standard-Slug mit 404. +Real wird das Doc unter einem abweichenden Slug ausgeliefert. + +GT-Anker: Elli URL-STRUCTURE-001: + Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404 + Footer-Label "AGB" → /agb 404 + Real: /de/cookies, /de/nutzungsbedingungen. + +Heuristik: + 1. Aus den discovered URLs den Base-Host extrahieren. + 2. Pro doc_type eine kleine Liste kanonischer Standard-Slugs prüfen + (HEAD oder GET), je 2 s Timeout. + 3. Wenn discovered Slug bekannt ist, ABER mindestens ein + gleichwertiger Standard-Slug 404 ergibt → LOW Finding pro alt-Slug. + +Severity: LOW (SEO/Bookmark-Bruch, kein juristisches Hardfail). +""" + +from __future__ import annotations + +import logging +import os +from concurrent.futures import ThreadPoolExecutor +from urllib.parse import urlparse + +import httpx + +logger = logging.getLogger(__name__) + + +# Kanonische DE/EN Standard-Slugs pro doc_type (ohne führenden /). +# Reihenfolge: erst der häufigste deutsche, dann Synonyme, dann EN. +_CANONICAL_SLUGS: dict[str, tuple[str, ...]] = { + "dse": ( + "datenschutz", "datenschutzerklaerung", "datenschutzerklärung", + "privacy", "privacy-policy", + ), + "impressum": ( + "impressum", "imprint", "legal-notice", + ), + "cookie": ( + "cookie-richtlinie", "cookies", "cookie-policy", + ), + "agb": ( + "agb", "allgemeine-geschaeftsbedingungen", + "geschaeftsbedingungen", "terms-and-conditions", + ), + "nutzungsbedingungen": ( + "nutzungsbedingungen", "terms-of-use", "terms-of-service", + ), + "widerruf": ( + "widerrufsbelehrung", "widerruf", "cancellation", + ), +} + + +# Konfigurations-Schalter (default: AN; lässt sich pro Run abschalten). +_DISABLED = os.environ.get("URL_SLUG_PROBE_DISABLED", "").lower() in ( + "1", "true", "yes", "on", +) + + +def _strip_path_slug(url: str) -> str: + """Return the LAST path-segment of a URL (without trailing /).""" + if not url: + return "" + try: + p = urlparse(url) + path = (p.path or "").strip("/") + if not path: + return "" + return path.split("/")[-1].lower() + except Exception: + return "" + + +def _origin_and_prefix(url: str) -> tuple[str, str]: + """Return (origin, language-prefix-or-empty) so we can rebuild + alternative URLs at the same scope as the discovered one. + + Example: 'https://www.elli.eco/de/cookies' → ('https://www.elli.eco', '/de') + """ + try: + p = urlparse(url) + origin = f"{p.scheme}://{p.netloc}" + path = p.path or "/" + parts = [s for s in path.split("/") if s] + # Heuristik: ein 2-3 Zeichen-Pfad-Segment ganz vorn wird als + # Sprach-Prefix interpretiert (de, en, fr, de-de, en-us). + if parts and (len(parts[0]) == 2 or len(parts[0]) == 5): + return origin, f"/{parts[0]}" + return origin, "" + except Exception: + return "", "" + + +def _head_status(url: str, timeout_s: float = 2.0) -> int: + """Return HTTP status code (0 on network error).""" + try: + with httpx.Client(timeout=timeout_s, follow_redirects=False) as c: + r = c.head(url) + # Some servers reject HEAD with 405 — fall back to GET. + if r.status_code == 405: + r = c.get(url) + return r.status_code + except Exception: + return 0 + + +def check_url_slug_drift(state: dict) -> list[dict]: + """Probe canonical alternative slugs per discovered doc; emit a LOW + finding per slug that 404s while the doc is reachable under a + different slug.""" + if _DISABLED: + return [] + doc_entries = state.get("doc_entries") or [] + # Build {doc_type: (discovered_url, discovered_slug)} for + # auto-discovered docs with non-empty text. + discovered: dict[str, tuple[str, str]] = {} + for e in doc_entries: + dt = (e.get("doc_type") or "").lower() + if dt not in _CANONICAL_SLUGS: + continue + url = (e.get("url") or "").strip() + text_len = len((e.get("text") or "").strip()) + if not url or text_len < 400: + continue + slug = _strip_path_slug(url) + if not slug: + continue + discovered[dt] = (url, slug) + + if not discovered: + return [] + + # Build probe-plan: for each doc_type, probe the canonical slugs + # OTHER than the one that's already discovered. + probes: list[tuple[str, str, str]] = [] # (doc_type, alt_slug, url) + for dt, (url, slug) in discovered.items(): + origin, prefix = _origin_and_prefix(url) + if not origin: + continue + for alt in _CANONICAL_SLUGS[dt]: + if alt.lower() == slug: + continue + probes.append((dt, alt, f"{origin}{prefix}/{alt}")) + + # Cap to keep network noise bounded. + probes = probes[:18] + if not probes: + return [] + + def _do_probe(item: tuple[str, str, str]) -> tuple[str, str, str, int]: + dt, alt, u = item + return dt, alt, u, _head_status(u) + + results: list[tuple[str, str, str, int]] = [] + with ThreadPoolExecutor(max_workers=6) as ex: + results = list(ex.map(_do_probe, probes)) + + findings: list[dict] = [] + # Group by doc_type so we can emit ONE finding per doc with the + # list of 404-alts. + per_dt: dict[str, list[tuple[str, str]]] = {} + for dt, alt, u, status in results: + if status == 404 or status == 410: + per_dt.setdefault(dt, []).append((alt, u)) + + for dt, alts in per_dt.items(): + if not alts: + continue + discovered_url, discovered_slug = discovered[dt] + broken_urls = ", ".join(u for _, u in alts[:3]) + broken_slugs = ", ".join(s for s, _ in alts[:3]) + findings.append({ + "check_id": "URL-SLUG-DRIFT-001", + "severity": "LOW", + "severity_reason": "seo_bookmark_break", + "doc_type": dt, + "title": ( + f"Externe Bookmarks / SEO-Erwartung für {dt} brechen " + f"({len(alts)} Standard-Slug(s) 404)" + ), + "norm": ( + "Kein juristischer Pflichttatbestand — Best-Practice " + "(SEO, externe Verlinkungen, Footer-Label-Konsistenz)" + ), + "evidence": ( + f"Doc ist erreichbar unter '{discovered_url}'. " + f"Standard-Slug(s) {broken_slugs} liefern 404/410 " + f"({broken_urls})." + ), + "action": ( + f"Redirects einrichten von {broken_slugs} nach " + f"'{discovered_url}' — damit externe Bookmarks, " + "alte Footer-Labels und Google-Treffer nicht brechen." + ), + "alt_slugs_404": [s for s, _ in alts], + }) + if findings: + logger.info("B16 url-slug-drift: %d finding(s)", len(findings)) + return findings diff --git a/backend-compliance/tests/test_url_slug_drift_check.py b/backend-compliance/tests/test_url_slug_drift_check.py new file mode 100644 index 00000000..55d4d4af --- /dev/null +++ b/backend-compliance/tests/test_url_slug_drift_check.py @@ -0,0 +1,139 @@ +"""Tests for B16 URL-Slug-Drift-Detector (GT URL-STRUCTURE-001).""" + +from unittest.mock import patch + +from compliance.services.url_slug_drift_check import ( + _origin_and_prefix, + _strip_path_slug, + check_url_slug_drift, +) + + +class TestStripPathSlug: + def test_simple(self): + assert _strip_path_slug("https://x.de/cookies") == "cookies" + + def test_with_lang_prefix(self): + assert _strip_path_slug("https://x.de/de/cookies") == "cookies" + + def test_trailing_slash(self): + assert _strip_path_slug("https://x.de/cookies/") == "cookies" + + def test_empty(self): + assert _strip_path_slug("") == "" + + +class TestOriginAndPrefix: + def test_lang_prefix(self): + assert _origin_and_prefix("https://www.elli.eco/de/cookies") == ( + "https://www.elli.eco", "/de", + ) + + def test_no_lang_prefix(self): + assert _origin_and_prefix("https://x.de/cookies") == ( + "https://x.de", "", + ) + + def test_locale_prefix(self): + assert _origin_and_prefix("https://x.de/de-de/cookies") == ( + "https://x.de", "/de-de", + ) + + +class TestCheckURLSlugDrift: + def test_no_docs_no_findings(self): + assert check_url_slug_drift({"doc_entries": []}) == [] + + def test_disabled_via_env_no_findings(self, monkeypatch): + monkeypatch.setenv("URL_SLUG_PROBE_DISABLED", "1") + # Reload module-level _DISABLED flag + import importlib + + from compliance.services import url_slug_drift_check + importlib.reload(url_slug_drift_check) + result = url_slug_drift_check.check_url_slug_drift({ + "doc_entries": [{ + "doc_type": "cookie", + "url": "https://x.de/de/cookies", + "text": "x" * 500, + }] + }) + assert result == [] + # Restore + monkeypatch.delenv("URL_SLUG_PROBE_DISABLED") + importlib.reload(url_slug_drift_check) + + def test_all_alternatives_200_no_finding(self): + with patch( + "compliance.services.url_slug_drift_check._head_status", + return_value=200, + ): + result = check_url_slug_drift({ + "doc_entries": [{ + "doc_type": "cookie", + "url": "https://x.de/de/cookies", + "text": "x" * 500, + }] + }) + assert result == [] + + def test_alternative_404_emits_finding(self): + with patch( + "compliance.services.url_slug_drift_check._head_status", + return_value=404, + ): + result = check_url_slug_drift({ + "doc_entries": [{ + "doc_type": "cookie", + "url": "https://x.de/de/cookies", + "text": "x" * 500, + }] + }) + assert len(result) == 1 + f = result[0] + assert f["check_id"] == "URL-SLUG-DRIFT-001" + assert f["severity"] == "LOW" + assert f["doc_type"] == "cookie" + assert "cookie-richtlinie" in f["alt_slugs_404"] + + def test_short_text_skipped(self): + # text < 400 chars → not counted as reachable doc + with patch( + "compliance.services.url_slug_drift_check._head_status", + return_value=404, + ): + result = check_url_slug_drift({ + "doc_entries": [{ + "doc_type": "cookie", + "url": "https://x.de/de/cookies", + "text": "x" * 50, + }] + }) + assert result == [] + + def test_elli_pattern_cookie_and_agb_both_emit(self): + # Simulate Elli: cookie under /de/cookies, but cookie-richtlinie 404. + # agb-doc resolves at /de/nutzungsbedingungen with /agb 404. + # Note: nutzungsbedingungen is its own doc_type — Elli's "AGB" + # label thus has no canonical doc on the site. + state = { + "doc_entries": [ + {"doc_type": "cookie", + "url": "https://x.de/de/cookies", + "text": "x" * 500}, + {"doc_type": "nutzungsbedingungen", + "url": "https://x.de/de/nutzungsbedingungen", + "text": "x" * 500}, + ] + } + with patch( + "compliance.services.url_slug_drift_check._head_status", + return_value=404, + ): + result = check_url_slug_drift(state) + # cookie has 2 canonical alts (cookie-richtlinie, cookie-policy); + # nutzungsbedingungen has 2 alts (terms-of-use, terms-of-service). + # Both should emit findings since all alts return 404. + doc_types = {f["doc_type"] for f in result} + assert "cookie" in doc_types + assert "nutzungsbedingungen" in doc_types