refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py gains an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings.
|
||||
|
||||
DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as
|
||||
easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and
|
||||
DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly
|
||||
reachable way to change cookie preferences — typically a Footer link
|
||||
labelled "Cookie-Einstellungen" that re-opens the CMP in place.
|
||||
|
||||
Common anti-patterns we want to flag:
|
||||
- Footer points to a Cookie-Policy *page* in a new tab, no CMP
|
||||
- Footer only offers "more info" but no "manage settings"
|
||||
- Only mention is a verbal reference to browser settings inside the
|
||||
privacy-policy text
|
||||
- Mobile footer hides the link in a multi-level accordion
|
||||
|
||||
This module does the STATIC HTML analysis. The dynamic part (mobile
|
||||
viewport rendering, tap-target measurement, click-behaviour
|
||||
verification) is performed by consent-tester via Playwright and feeds
|
||||
back into `evaluate_combined` in a later phase.
|
||||
|
||||
Pure module — no DB, no network. Tests live in
|
||||
tests/test_consent_reachability_check.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Phrases that suggest "open the consent manager" rather than "show
# more info / open a policy page". Matching is a lower-cased substring
# test, so every spelling variant that occurs in the wild (hyphen vs.
# space, umlaut vs. transliteration) has to be listed explicitly.
_REOPEN_PHRASES = (
    "cookie-einstellungen", "cookie einstellungen",
    "cookie-präferenzen", "cookie praeferenzen", "cookie-praferenzen",
    "cookie-einwilligung", "einwilligung verwalten",
    "consent manager", "consent settings", "consent-einstellungen",
    "datenschutz-einstellungen", "datenschutzeinstellungen",
    "cookies verwalten", "manage cookies", "manage preferences",
    "privacy settings", "privacy preferences",
    "tracking-einstellungen",
    # Missing spelling variants of the German "Präferenzen" labels.
    "cookie präferenzen", "cookie-praeferenzen",
    # The most common English labels; without them an English footer
    # link would be classified as "unrelated".
    "cookie settings", "cookie-settings",
    "cookie preferences", "cookie-preferences",
)
|
||||
|
||||
# Weak signal: labels like these normally lead to a policy or
# information page instead of opening the CMP itself.
_INFO_ONLY_PHRASES = (
    # cookie policy pages
    "cookie-richtlinie",
    "cookie richtlinie",
    "cookie-policy",
    "cookie policy",
    "cookies (information)",
    # general privacy pages
    "datenschutz",
    "datenschutzerklärung",
    "privacy policy",
    # generic "read more" links
    "weitere informationen",
    "more information",
)
|
||||
|
||||
# Wording that pushes the withdrawal burden onto the user's browser.
# The data-protection authorities of the German federal states
# explicitly call this insufficient.
_BROWSER_DEFLECTION_PHRASES = (
    "browser-einstellungen",
    "browsereinstellungen",
    "einstellungen ihres browsers",
    "browser settings",
    "in ihrem browser",
    "über ihren browser",
)
|
||||
|
||||
|
||||
class _AnchorCollector(HTMLParser):
    """Collect <a> and <button> elements that belong to a page footer.

    An element opens a *footer scope* when it is a <footer>, carries
    role="contentinfo", or has an id/class containing "footer". Every
    <a>/<button> started while a scope is open is recorded in
    ``self.anchors`` as a dict with the keys tag, href, target,
    aria_label, data_cmp, onclick, id, class and text (whitespace
    collapsed, truncated to 200 characters).

    Scope tracking uses a stack of ``[tag, nested_count]`` entries so a
    scope opened by e.g. ``<div class="footer">`` is closed by its own
    matching ``</div>`` and does not leak over the rest of the document.
    An <a>/<button> that itself carries a footer marker (for example
    ``class="footer-link"``) is collected as an anchor and never treated
    as a scope opener.
    """

    # Tags whose elements are collected.
    _ANCHOR_TAGS = ("a", "button")

    # Void elements never get an end tag, so they must not open a scope
    # (it could never be closed again).
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Number of currently open footer scopes (mirrors len(_scopes)).
        self._footer_depth = 0
        # One [tag, nested_same_tag_count] entry per open footer scope.
        self._scopes: list[list] = []
        # Anchor currently being read, or None between anchors.
        self._current: dict | None = None
        self._text_chunks: list[str] = []
        self.anchors: list[dict] = []

    def _is_footer_open(self, tag: str, attrs: dict) -> bool:
        """Return True when the element carries a footer marker."""
        if tag == "footer":
            return True
        if attrs.get("role", "").lower() == "contentinfo":
            return True
        ident = (attrs.get("id", "") + " " + attrs.get("class", "")).lower()
        return "footer" in ident

    def _finish_current(self) -> None:
        """Store the anchor being read (if any) and reset the text buffer."""
        if self._current is None:
            return
        txt = " ".join(self._text_chunks).strip()
        self._current["text"] = re.sub(r"\s+", " ", txt)[:200]
        self.anchors.append(self._current)
        self._current = None
        self._text_chunks = []

    def handle_starttag(self, tag, attrs):
        a = {k.lower(): (v or "") for k, v in attrs}
        has_marker = self._is_footer_open(tag, a)
        is_anchor = tag in self._ANCHOR_TAGS

        if has_marker and not is_anchor and tag not in self._VOID_TAGS:
            self._scopes.append([tag, 0])
            self._footer_depth = len(self._scopes)
            return

        # A plain element with the same name as the innermost scope
        # opener: count it so its end tag does not close the scope.
        if self._scopes and tag == self._scopes[-1][0]:
            self._scopes[-1][1] += 1

        if is_anchor and (self._scopes or has_marker):
            self._current = {
                "tag": tag,
                "href": a.get("href", ""),
                "target": a.get("target", ""),
                "aria_label": a.get("aria-label", ""),
                "data_cmp": a.get("data-cmp", ""),
                "onclick": a.get("onclick", ""),
                "id": a.get("id", ""),
                "class": a.get("class", ""),
            }
            self._text_chunks = []

    def handle_endtag(self, tag):
        if self._current is not None and tag == self._current["tag"]:
            self._finish_current()
            return
        if not self._scopes or tag != self._scopes[-1][0]:
            return
        innermost = self._scopes[-1]
        if innermost[1] > 0:
            innermost[1] -= 1
            return
        self._scopes.pop()
        self._footer_depth = len(self._scopes)
        if not self._scopes:
            # An anchor left unclosed inside the footer must not keep
            # swallowing text from the rest of the page.
            self._finish_current()

    def handle_data(self, data):
        if self._current is not None:
            self._text_chunks.append(data)
|
||||
|
||||
|
||||
def find_consent_anchors_in_footer(html: str) -> list[dict]:
    """Return the footer <a>/<button> elements that relate to consent.

    The HTML is fed through ``_AnchorCollector``; each collected element
    is matched (lower-cased substring test) against the phrase lists,
    using its text, aria-label, data-cmp and onclick values. Elements
    that match nothing are dropped.

    Each returned dict holds the collector's keys plus ``intent``, one
    of "reopen_cmp", "browser_deflect" or "info_only" — the first
    category that matches, in that priority order.

    Returns an empty list for empty input or when the parser raises.
    """
    if not html:
        return []

    collector = _AnchorCollector()
    try:
        collector.feed(html)
    except Exception as e:  # malformed HTML — recover silently
        logger.warning("footer parser failed: %s", e)
        return []

    # Ordered by priority: the first rule that matches wins.
    intent_rules = (
        ("reopen_cmp", _REOPEN_PHRASES),
        ("browser_deflect", _BROWSER_DEFLECTION_PHRASES),
        ("info_only", _INFO_ONLY_PHRASES),
    )
    label_fields = ("text", "aria_label", "data_cmp", "onclick")

    related: list[dict] = []
    for anchor in collector.anchors:
        haystack = " ".join(
            [anchor.get(field, "") for field in label_fields]
        ).lower()
        matched_intent = next(
            (
                name
                for name, phrases in intent_rules
                if any(phrase in haystack for phrase in phrases)
            ),
            None,
        )
        if matched_intent is None:
            continue
        anchor["intent"] = matched_intent
        related.append(anchor)
    return related
|
||||
|
||||
|
||||
def classify_anchor_target(
    anchor: dict, base_url: str,
) -> str:
    """Decide whether the anchor would open the CMP in place or
    navigate elsewhere.

    Args:
        anchor: Anchor dict; the keys ``href``, ``target``, ``onclick``
            and ``data_cmp`` are read, all optional.
        base_url: URL of the page the anchor was found on; may be empty.

    Returns (checked in this order, first match wins):
        "same_page_cmp" — a data-cmp or onclick value is present
        "javascript"    — javascript: link, probably a CMP trigger
        "new_tab"       — target="_blank"
        "same_page_cmp" — empty href or hash-only link
        "same_origin"   — relative link or same-host page (still a
                          navigation away from the live banner); also
                          used when the URLs cannot be parsed
        "external"      — link to a different host
    """
    href = (anchor.get("href") or "").strip()
    target = (anchor.get("target") or "").strip().lower()
    onclick = anchor.get("onclick", "") or ""
    data_cmp = anchor.get("data_cmp", "") or ""

    if data_cmp or onclick:
        return "same_page_cmp"
    # URI schemes are case-insensitive, so "JavaScript:" must match too.
    if href.lower().startswith("javascript:"):
        return "javascript"
    if target == "_blank":
        return "new_tab"
    if not href or href.startswith("#"):
        return "same_page_cmp"

    try:
        base_host = urlparse(base_url).netloc.lower() if base_url else ""
        target_host = urlparse(urljoin(base_url or "/", href)).netloc.lower()
    except ValueError:
        # urlparse/urljoin reject malformed netlocs (e.g. a broken IPv6
        # literal); treat an undeterminable host like a relative link.
        return "same_origin"
    if not target_host or target_host == base_host:
        return "same_origin"
    return "external"
|
||||
|
||||
|
||||
def evaluate_reachability(
    footer_html: str,
    base_url: str = "",
) -> dict:
    """Statically assess whether consent settings are reachable from
    the footer and build the COOKIE-CONSENT-UX-001 finding.

    Args:
        footer_html: HTML to scan for consent-related footer anchors.
        base_url: Page URL used to classify link targets; may be empty.

    Returns:
        A dict with check_id, anchors_total, has_reopen_anchor,
        info_only_count, browser_deflect_count, reopen_anchor (the first
        "reopen_cmp" anchor or None), passed, severity, severity_reason,
        evidence_phrases and notes.

    Every "reopen_cmp" anchor gets a ``target_class`` key; only the
    first one decides the verdict.
    """
    anchors = find_consent_anchors_in_footer(footer_html)

    reopen_candidates = [a for a in anchors if a.get("intent") == "reopen_cmp"]
    for candidate in reopen_candidates:
        candidate["target_class"] = classify_anchor_target(candidate, base_url)
    first_reopen = reopen_candidates[0] if reopen_candidates else None

    info_links = sum(1 for a in anchors if a.get("intent") == "info_only")
    deflections = sum(1 for a in anchors if a.get("intent") == "browser_deflect")

    notes: list = []
    result: dict = {
        "check_id": "COOKIE-CONSENT-UX-001",
        "anchors_total": len(anchors),
        "has_reopen_anchor": first_reopen is not None,
        "info_only_count": info_links,
        "browser_deflect_count": deflections,
        "reopen_anchor": first_reopen,
        "passed": True,
        "severity": None,
        "severity_reason": None,
        "evidence_phrases": [],
        "notes": notes,
    }

    if first_reopen is None:
        # Hard fail: no reopen anchor at all → withdrawal not as easy
        # as opt-in (Art. 7 Abs. 3 DSGVO).
        result.update(passed=False, severity="HIGH", severity_reason="missing")
        notes.append(
            "no consent-manager link in footer; withdrawal path "
            "missing or only indirect",
        )
        if deflections > 0:
            # Worst variant, explicitly flagged by the LfDI BaWü: the
            # only route offered is the user's browser settings.
            result.update(severity="HIGH", severity_reason="factually_wrong")
            notes.append(
                "withdrawal route only via browser-settings — not gleich "
                "einfach wie Erteilung",
            )
        return result

    # Soft fail: the anchor exists but leaves the page context — DSK OH
    # calls this an avoidable hurdle. MEDIUM rather than HIGH because
    # withdrawal is technically still possible.
    soft_fail_notes = {
        "new_tab": "consent-manager link opens in new tab — context-break",
        "external": "consent-manager link points to external host",
    }
    hurdle = soft_fail_notes.get(first_reopen.get("target_class"))
    if hurdle is not None:
        result.update(
            passed=False, severity="MEDIUM", severity_reason="misclassified",
        )
        notes.append(hurdle)
    return result
|
||||
Reference in New Issue
Block a user