feat(b16): Footer-Label-vs-URL-Slug-Drift-Check (GT URL-STRUCTURE-001)
Erkennt: gängige Footer-Labels / Bookmark- + SEO-Erwartungs-Slugs
(z.B. "Cookie-Richtlinie", "AGB", "Datenschutzerklärung") liefern
404, während das Doc tatsächlich unter einem abweichenden Slug
ausgeliefert wird.
GT-Anker (Elli URL-STRUCTURE-001):
Footer-Label "Cookie-Richtlinie" → /cookie-richtlinie 404
Real: /de/cookies
→ externe Bookmarks und Google-Treffer brechen.
Heuristik:
- Aus auto-discovered URLs Origin + Sprach-Prefix extrahieren
(z.B. /de, /de-de)
- Pro doc_type 2-4 kanonische Standard-Slugs probieren (parallel
via ThreadPoolExecutor, 2s Timeout, HEAD → GET fallback bei 405)
- Wenn ein alternativer Slug 404/410 liefert → LOW-Finding pro doc_type
- Probe-Cap auf 18 Requests gesamt (Network-Noise-Schutz)
- Abschaltbar via URL_SLUG_PROBE_DISABLED=1
Severity: LOW (Best-Practice, kein juristisches Hardfail).
Tests: 13/13 grün (Strip-Helper 4 + Origin-Helper 3 + Check-Pfade 6
inkl. mocked _head_status).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
"""Tests for B16 URL-Slug-Drift-Detector (GT URL-STRUCTURE-001)."""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from compliance.services.url_slug_drift_check import (
|
||||
_origin_and_prefix,
|
||||
_strip_path_slug,
|
||||
check_url_slug_drift,
|
||||
)
|
||||
|
||||
|
||||
class TestStripPathSlug:
    """Unit tests for ``_strip_path_slug``: URL in, bare document slug out."""

    def test_simple(self):
        """A single path segment is returned unchanged."""
        slug = _strip_path_slug("https://x.de/cookies")
        assert slug == "cookies"

    def test_with_lang_prefix(self):
        """A leading language segment (``/de``) is not part of the slug."""
        slug = _strip_path_slug("https://x.de/de/cookies")
        assert slug == "cookies"

    def test_trailing_slash(self):
        """A trailing slash does not change the resulting slug."""
        slug = _strip_path_slug("https://x.de/cookies/")
        assert slug == "cookies"

    def test_empty(self):
        """An empty URL yields an empty slug."""
        slug = _strip_path_slug("")
        assert slug == ""
|
||||
|
||||
|
||||
class TestOriginAndPrefix:
    """Unit tests for ``_origin_and_prefix``: URL in, ``(origin, prefix)`` out."""

    def test_lang_prefix(self):
        """A two-letter language segment is reported as the prefix."""
        expected = ("https://www.elli.eco", "/de")
        actual = _origin_and_prefix("https://www.elli.eco/de/cookies")
        assert actual == expected

    def test_no_lang_prefix(self):
        """Without a language segment the prefix is the empty string."""
        expected = ("https://x.de", "")
        actual = _origin_and_prefix("https://x.de/cookies")
        assert actual == expected

    def test_locale_prefix(self):
        """A full locale segment (``/de-de``) is reported as the prefix."""
        expected = ("https://x.de", "/de-de")
        actual = _origin_and_prefix("https://x.de/de-de/cookies")
        assert actual == expected
|
||||
|
||||
|
||||
class TestCheckURLSlugDrift:
    """Tests for ``check_url_slug_drift`` with the HTTP probe mocked out.

    Every test that would otherwise hit the network replaces
    ``_head_status`` with a fixed status code, so the suite runs offline.
    """

    # Dotted path of the probe helper that the tests replace with a fixed status.
    _PROBE_TARGET = "compliance.services.url_slug_drift_check._head_status"

    @staticmethod
    def _cookie_state(text_len=500):
        """Build a state dict holding one cookie document.

        Args:
            text_len: Length of the dummy document text.

        Returns:
            A state dict with a single ``doc_entries`` item for
            ``https://x.de/de/cookies``.
        """
        return {
            "doc_entries": [{
                "doc_type": "cookie",
                "url": "https://x.de/de/cookies",
                "text": "x" * text_len,
            }]
        }

    def test_no_docs_no_findings(self):
        """An empty ``doc_entries`` list produces no findings."""
        assert check_url_slug_drift({"doc_entries": []}) == []

    def test_disabled_via_env_no_findings(self, monkeypatch):
        """With ``URL_SLUG_PROBE_DISABLED=1`` the check returns no findings."""
        import importlib

        from compliance.services import url_slug_drift_check

        monkeypatch.setenv("URL_SLUG_PROBE_DISABLED", "1")
        try:
            # Per the original test, the flag is held in a module-level
            # value, so the module is reloaded to pick up the new env.
            importlib.reload(url_slug_drift_check)
            result = url_slug_drift_check.check_url_slug_drift(
                self._cookie_state()
            )
            assert result == []
        finally:
            # Restore even when the assertion above fails; otherwise the
            # module would stay reloaded in its disabled state and affect
            # every later test in the same process.
            monkeypatch.delenv("URL_SLUG_PROBE_DISABLED", raising=False)
            importlib.reload(url_slug_drift_check)

    def test_all_alternatives_200_no_finding(self):
        """No finding is emitted when every probed slug answers 200."""
        with patch(self._PROBE_TARGET, return_value=200):
            result = check_url_slug_drift(self._cookie_state())
        assert result == []

    def test_alternative_404_emits_finding(self):
        """A 404 on the alternative slugs yields one LOW cookie finding."""
        with patch(self._PROBE_TARGET, return_value=404):
            result = check_url_slug_drift(self._cookie_state())
        assert len(result) == 1
        finding = result[0]
        assert finding["check_id"] == "URL-SLUG-DRIFT-001"
        assert finding["severity"] == "LOW"
        assert finding["doc_type"] == "cookie"
        assert "cookie-richtlinie" in finding["alt_slugs_404"]

    def test_short_text_skipped(self):
        """A document with very short text produces no finding."""
        # Per the original test: text < 400 chars is not counted as a
        # reachable doc (threshold lives in the checker, not visible here).
        with patch(self._PROBE_TARGET, return_value=404):
            result = check_url_slug_drift(self._cookie_state(text_len=50))
        assert result == []

    def test_elli_pattern_cookie_and_agb_both_emit(self):
        """Both doc types emit a finding when all probes return 404."""
        # Simulate Elli: cookie under /de/cookies, but cookie-richtlinie 404.
        # The terms doc resolves at /de/nutzungsbedingungen with /agb 404.
        # Note: nutzungsbedingungen is its own doc_type, so Elli's "AGB"
        # label has no canonical doc on the site.
        state = {
            "doc_entries": [
                {
                    "doc_type": "cookie",
                    "url": "https://x.de/de/cookies",
                    "text": "x" * 500,
                },
                {
                    "doc_type": "nutzungsbedingungen",
                    "url": "https://x.de/de/nutzungsbedingungen",
                    "text": "x" * 500,
                },
            ]
        }
        with patch(self._PROBE_TARGET, return_value=404):
            result = check_url_slug_drift(state)
        # Per the original test: cookie has 2 canonical alts
        # (cookie-richtlinie, cookie-policy); nutzungsbedingungen has 2 alts
        # (terms-of-use, terms-of-service). Both should emit findings since
        # all alts return 404.
        doc_types = {finding["doc_type"] for finding in result}
        assert "cookie" in doc_types
        assert "nutzungsbedingungen" in doc_types
|
||||
Reference in New Issue
Block a user