c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
279 lines
9.9 KiB
Python
279 lines
9.9 KiB
Python
"""
|
|
B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings.
|
|
|
|
DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as
|
|
easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and
|
|
DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly
|
|
reachable way to change cookie preferences — typically a Footer link
|
|
labelled "Cookie-Einstellungen" that re-opens the CMP in place.
|
|
|
|
Common anti-patterns we want to flag:
|
|
- Footer points to a Cookie-Policy *page* in a new tab, no CMP
|
|
- Footer only offers "more info" but no "manage settings"
|
|
- Only mention is a verbal reference to browser settings inside the
|
|
privacy-policy text
|
|
- Mobile footer hides the link in a multi-level accordion
|
|
|
|
This module does the STATIC HTML analysis. The dynamic part (mobile
|
|
viewport rendering, tap-target measurement, click-behaviour
|
|
verification) is performed by consent-tester via Playwright and feeds
|
|
back into `evaluate_combined` in a later phase.
|
|
|
|
Pure module — no DB, no network. Tests live in
|
|
tests/test_consent_reachability_check.py.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from html.parser import HTMLParser
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Label fragments (lower-case) that indicate a control meant to re-open the
# consent manager itself, as opposed to a "more info" link or a policy page.
# Matching is done by substring against the lower-cased anchor label.
_REOPEN_PHRASES = (
    # German: cookie settings / preferences / consent
    "cookie-einstellungen",
    "cookie einstellungen",
    "cookie-präferenzen",
    "cookie praeferenzen",
    "cookie-praferenzen",
    "cookie-einwilligung",
    "einwilligung verwalten",
    # Consent-manager wording
    "consent manager",
    "consent settings",
    "consent-einstellungen",
    # Privacy-settings wording (German)
    "datenschutz-einstellungen",
    "datenschutzeinstellungen",
    # "Manage ..." wording
    "cookies verwalten",
    "manage cookies",
    "manage preferences",
    # Privacy-settings wording (English)
    "privacy settings",
    "privacy preferences",
    "tracking-einstellungen",
)
|
|
|
|
# Weaker signal: label fragments (lower-case) that usually lead to a policy
# or information page rather than to the consent manager itself.
_INFO_ONLY_PHRASES = (
    "cookie-richtlinie",
    "cookie richtlinie",
    "cookie-policy",
    "cookie policy",
    "cookies (information)",
    "datenschutz",
    "datenschutzerklärung",
    "privacy policy",
    "weitere informationen",
    "more information",
)
|
|
|
|
# Label fragments (lower-case) that push the burden of withdrawal onto the
# user's browser configuration. German state data-protection authorities
# explicitly call this insufficient.
_BROWSER_DEFLECTION_PHRASES = (
    "browser-einstellungen",
    "browsereinstellungen",
    "einstellungen ihres browsers",
    "browser settings",
    "in ihrem browser",
    "über ihren browser",
)
|
|
|
|
|
|
class _AnchorCollector(HTMLParser):
    """Collect ``<a>`` and ``<button>`` elements located inside a footer.

    A *footer scope* is opened by a ``<footer>`` element, by any element
    with ``role="contentinfo"``, or by any element whose ``id``/``class``
    contains the substring "footer". The scope is closed by the end tag
    that matches the element which opened it, so containers such as
    ``<div class="footer">`` are left again at their own ``</div>``.

    ``<a>`` and ``<button>`` elements never open a scope themselves: a link
    with ``class="footer-link"`` is collected like any other footer link
    instead of being mistaken for a container.

    Collected elements are appended to ``self.anchors`` as dicts with the
    keys ``tag``, ``href``, ``target``, ``aria_label``, ``data_cmp``,
    ``onclick``, ``id``, ``class`` and ``text`` (whitespace-collapsed,
    truncated to 200 characters).
    """

    # Elements without an end tag; they must not take part in nesting counts.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })
    # Elements that are collected and therefore never treated as containers.
    _COLLECTED_TAGS = frozenset({"a", "button"})

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # One entry per open footer scope: [opening tag, number of nested
        # non-scope elements with the same tag name that are still open].
        self._scopes: list[list] = []
        self._current: dict | None = None
        self._text_chunks: list[str] = []
        self.anchors: list[dict] = []

    @property
    def _footer_depth(self) -> int:
        """Number of footer scopes currently open (0 means outside a footer)."""
        return len(self._scopes)

    def _is_footer_open(self, tag: str, attrs: dict) -> bool:
        """Return True when ``tag``/``attrs`` describe a footer container."""
        if tag == "footer":
            return True
        if attrs.get("role", "").lower() == "contentinfo":
            return True
        ident = (attrs.get("id", "") + " " + attrs.get("class", "")).lower()
        return "footer" in ident

    def handle_starttag(self, tag, attrs):
        a = {k.lower(): (v or "") for k, v in attrs}
        if tag in self._COLLECTED_TAGS:
            if self._scopes:
                self._current = {
                    "tag": tag,
                    "href": a.get("href", ""),
                    "target": a.get("target", ""),
                    "aria_label": a.get("aria-label", ""),
                    "data_cmp": a.get("data-cmp", ""),
                    "onclick": a.get("onclick", ""),
                    "id": a.get("id", ""),
                    "class": a.get("class", ""),
                }
                self._text_chunks = []
            return
        if tag in self._VOID_TAGS:
            # No end tag will follow, so there is nothing to balance.
            return
        if self._is_footer_open(tag, a):
            self._scopes.append([tag, 0])
            return
        # A plain element sharing its tag name with an open scope: remember
        # it so that its end tag is not mistaken for the end of the scope.
        for scope in reversed(self._scopes):
            if scope[0] == tag:
                scope[1] += 1
                break

    def handle_endtag(self, tag):
        current = self._current
        if current is not None and tag == current["tag"]:
            txt = " ".join(self._text_chunks).strip()
            current["text"] = re.sub(r"\s+", " ", txt)[:200]
            self.anchors.append(current)
            self._current = None
            self._text_chunks = []
            return
        if tag in self._COLLECTED_TAGS:
            # Stray </a> or </button>; these never open a scope.
            return
        for idx in range(len(self._scopes) - 1, -1, -1):
            scope = self._scopes[idx]
            if scope[0] != tag:
                continue
            if scope[1] > 0:
                # Closes a nested same-name element, not the scope itself.
                scope[1] -= 1
            else:
                # Closes the scope; scopes opened inside it that were never
                # closed explicitly end with it.
                del self._scopes[idx:]
            return

    def handle_data(self, data):
        if self._current is not None:
            self._text_chunks.append(data)
|
|
|
|
|
|
def find_consent_anchors_in_footer(html: str) -> list[dict]:
    """Return the footer links/buttons that look cookie- or consent-related.

    The HTML is fed through ``_AnchorCollector``; each collected element is
    matched (case-insensitively, by substring) against the phrase lists
    using its visible text, ``aria-label``, ``data-cmp`` and ``onclick``
    values joined together.

    Args:
        html: HTML source to scan. Empty or ``None`` yields ``[]``.

    Returns:
        The matching anchor dicts, in document order. Each carries the keys
        ``tag``, ``href``, ``target``, ``text``, ``aria_label``,
        ``data_cmp``, ``onclick``, ``id``, ``class`` plus ``intent``, which
        is one of ``"reopen_cmp"``, ``"browser_deflect"`` or
        ``"info_only"``. Elements matching no phrase list ("unrelated") are
        dropped. If the parser raises, a warning is logged and ``[]`` is
        returned.
    """
    if not html:
        return []

    collector = _AnchorCollector()
    try:
        collector.feed(html)
    except Exception as e:  # malformed HTML — recover silently
        logger.warning("footer parser failed: %s", e)
        return []

    # Order encodes precedence: the first group with a hit wins.
    intent_rules = (
        ("reopen_cmp", _REOPEN_PHRASES),
        ("browser_deflect", _BROWSER_DEFLECTION_PHRASES),
        ("info_only", _INFO_ONLY_PHRASES),
    )
    label_fields = ("text", "aria_label", "data_cmp", "onclick")

    related: list[dict] = []
    for anchor in collector.anchors:
        haystack = " ".join(
            [anchor.get(field, "") for field in label_fields]
        ).lower()
        matched_intent = None
        for intent_name, phrases in intent_rules:
            if any(phrase in haystack for phrase in phrases):
                matched_intent = intent_name
                break
        if matched_intent is None:
            continue
        anchor["intent"] = matched_intent
        related.append(anchor)
    return related
|
|
|
|
|
|
def classify_anchor_target(
    anchor: dict, base_url: str,
) -> str:
    """Decide whether the anchor opens the CMP in place or navigates away.

    The checks run in this order; the first match wins.

    Args:
        anchor: Anchor dict; the keys ``href``, ``target``, ``onclick`` and
            ``data_cmp`` are read, missing or ``None`` values count as "".
        base_url: URL of the page the anchor was found on; used to resolve
            relative links and to compare hosts. May be empty.

    Returns:
        "same_page_cmp" — non-empty ``onclick`` or ``data_cmp``, or an
                          empty / hash-only ``href``
        "javascript"    — ``javascript:`` link (scheme matched
                          case-insensitively), probably a CMP trigger
        "new_tab"       — ``target="_blank"`` (case-insensitive)
        "same_origin"   — resolved link has the same host as ``base_url``,
                          or no host could be determined (still a
                          navigation away from the live banner)
        "external"      — resolved link points at a different host
    """
    href = (anchor.get("href") or "").strip()
    target = (anchor.get("target") or "").strip().lower()
    onclick = anchor.get("onclick", "") or ""
    data_cmp = anchor.get("data_cmp", "") or ""

    if data_cmp or onclick:
        return "same_page_cmp"
    # URL schemes are case-insensitive, so "JavaScript:" must match as well.
    if href.lower().startswith("javascript:"):
        return "javascript"
    if target == "_blank":
        return "new_tab"
    if not href or href.startswith("#"):
        return "same_page_cmp"

    try:
        base_host = urlparse(base_url).netloc.lower() if base_url else ""
        target_host = urlparse(urljoin(base_url or "/", href)).netloc.lower()
    except ValueError:
        # Malformed base URL or href (e.g. a broken IPv6 literal): no host
        # can be determined, fall through to the conservative answer.
        base_host = target_host = ""
    if not target_host or target_host == base_host:
        return "same_origin"
    return "external"
|
|
|
|
|
|
def evaluate_reachability(
    footer_html: str,
    base_url: str = "",
) -> dict:
    """Run the static COOKIE-CONSENT-UX-001 reachability analysis.

    Args:
        footer_html: HTML to scan for consent-related footer links. It is
            passed unchanged to ``find_consent_anchors_in_footer``.
        base_url: Page URL used by ``classify_anchor_target`` to tell
            same-host links from external ones.

    Returns:
        A finding dict with the keys ``check_id``, ``anchors_total``,
        ``has_reopen_anchor``, ``info_only_count``,
        ``browser_deflect_count``, ``reopen_anchor``, ``passed``,
        ``severity``, ``severity_reason``, ``evidence_phrases`` (always an
        empty list here) and ``notes``.

        Verdicts:
          * no reopen anchor            → failed, HIGH / "missing"; upgraded
            to HIGH / "factually_wrong" when browser-deflection links exist
          * first reopen anchor opens a new tab or an external host
                                        → failed, MEDIUM / "misclassified"
          * otherwise                   → passed, no severity

    Note:
        Every reopen anchor gets a ``target_class`` key written into its
        dict, but only the first one in document order decides the verdict.
    """
    anchors = find_consent_anchors_in_footer(footer_html)

    reopen_hits = [a for a in anchors if a.get("intent") == "reopen_cmp"]
    for hit in reopen_hits:
        hit["target_class"] = classify_anchor_target(hit, base_url)
    first_reopen = reopen_hits[0] if reopen_hits else None

    info_total = sum(1 for a in anchors if a.get("intent") == "info_only")
    deflect_total = sum(
        1 for a in anchors if a.get("intent") == "browser_deflect"
    )

    result: dict = {
        "check_id": "COOKIE-CONSENT-UX-001",
        "anchors_total": len(anchors),
        "has_reopen_anchor": first_reopen is not None,
        "info_only_count": info_total,
        "browser_deflect_count": deflect_total,
        "reopen_anchor": first_reopen,
        "passed": True,
        "severity": None,
        "severity_reason": None,
        "evidence_phrases": [],
        "notes": [],
    }
    notes = result["notes"]

    if first_reopen is None:
        # Hard fail: without any reopen anchor, withdrawal is not as easy
        # as opt-in (Art. 7 Abs. 3 DSGVO).
        result.update(
            passed=False, severity="HIGH", severity_reason="missing",
        )
        notes.append(
            "no consent-manager link in footer; withdrawal path "
            "missing or only indirect",
        )
        if deflect_total > 0:
            # Worst variant: the only route offered is the browser's own
            # settings — explicitly flagged by the LfDI BaWü.
            result.update(severity="HIGH", severity_reason="factually_wrong")
            notes.append(
                "withdrawal route only via browser-settings — not gleich "
                "einfach wie Erteilung",
            )
        return result

    # Soft fail: the anchor exists but leaves the current context. DSK OH
    # calls this an avoidable hurdle; MEDIUM rather than HIGH because
    # withdrawal is technically still possible.
    soft_fail_notes = {
        "new_tab": "consent-manager link opens in new tab — context-break",
        "external": "consent-manager link points to external host",
    }
    soft_note = soft_fail_notes.get(first_reopen.get("target_class"))
    if soft_note is not None:
        result.update(
            passed=False, severity="MEDIUM", severity_reason="misclassified",
        )
        notes.append(soft_note)

    return result
|