refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings.
|
||||
|
||||
DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as
|
||||
easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and
|
||||
DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly
|
||||
reachable way to change cookie preferences — typically a Footer link
|
||||
labelled "Cookie-Einstellungen" that re-opens the CMP in place.
|
||||
|
||||
Common anti-patterns we want to flag:
|
||||
- Footer points to a Cookie-Policy *page* in a new tab, no CMP
|
||||
- Footer only offers "more info" but no "manage settings"
|
||||
- Only mention is a verbal reference to browser settings inside the
|
||||
privacy-policy text
|
||||
- Mobile footer hides the link in a multi-level accordion
|
||||
|
||||
This module does the STATIC HTML analysis. The dynamic part (mobile
|
||||
viewport rendering, tap-target measurement, click-behaviour
|
||||
verification) is performed by consent-tester via Playwright and feeds
|
||||
back into `evaluate_combined` in a later phase.
|
||||
|
||||
Pure module — no DB, no network. Tests live in
|
||||
tests/test_consent_reachability_check.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Phrases that suggest "open the consent manager" rather than "show
|
||||
# more info / open a policy page".
|
||||
_REOPEN_PHRASES = (
|
||||
"cookie-einstellungen", "cookie einstellungen",
|
||||
"cookie-präferenzen", "cookie praeferenzen", "cookie-praferenzen",
|
||||
"cookie-einwilligung", "einwilligung verwalten",
|
||||
"consent manager", "consent settings", "consent-einstellungen",
|
||||
"datenschutz-einstellungen", "datenschutzeinstellungen",
|
||||
"cookies verwalten", "manage cookies", "manage preferences",
|
||||
"privacy settings", "privacy preferences",
|
||||
"tracking-einstellungen",
|
||||
)
|
||||
|
||||
# Weaker — these usually point at a policy page, not the CMP itself.
|
||||
_INFO_ONLY_PHRASES = (
|
||||
"cookie-richtlinie", "cookie richtlinie", "cookie-policy",
|
||||
"cookie policy", "cookies (information)",
|
||||
"datenschutz", "datenschutzerklärung", "privacy policy",
|
||||
"weitere informationen", "more information",
|
||||
)
|
||||
|
||||
# Phrases that try to shift the burden to the user's browser —
|
||||
# Bundesländer-Datenschutzbeauftragte explicitly call this insufficient.
|
||||
_BROWSER_DEFLECTION_PHRASES = (
|
||||
"browser-einstellungen", "browsereinstellungen",
|
||||
"einstellungen ihres browsers", "browser settings",
|
||||
"in ihrem browser", "über ihren browser",
|
||||
)
|
||||
|
||||
|
||||
class _AnchorCollector(HTMLParser):
    """Collect footer ``<a>`` / ``<button>`` elements with text and attributes.

    An element opens a *footer scope* when it is a ``<footer>`` tag, has
    ``role="contentinfo"``, or carries ``footer`` in its ``id`` / ``class``.
    The scope is closed by the end tag that matches the element which
    opened it (nested elements of the same tag name are counted so that an
    inner ``</div>`` does not close an outer ``<div class="footer">``).

    ``<a>`` / ``<button>`` elements never open a scope themselves; they are
    collected when they sit inside an open scope or when they carry a
    footer marker of their own (e.g. ``class="footer-link"``).

    Collected anchors are appended to ``self.anchors`` as dicts with the
    keys ``tag, href, target, aria_label, data_cmp, onclick, id, class,
    text`` once their end tag is seen.
    """

    # Void elements never produce an end tag, so letting them open a scope
    # would leave that scope open for the rest of the document.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })
    _COLLECTED_TAGS = ("a", "button")

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Open footer scopes, innermost last. Each entry is
        # [tag_name, number_of_nested_open_elements_with_the_same_tag].
        self._scopes: list[list] = []
        self._current: dict | None = None
        self._text_chunks: list[str] = []
        self.anchors: list[dict] = []

    @property
    def _footer_depth(self) -> int:
        """Number of currently open footer scopes."""
        return len(self._scopes)

    def _is_footer_open(self, tag: str, attrs: dict) -> bool:
        """Return True when *tag* / *attrs* mark a footer region."""
        if tag == "footer":
            return True
        if attrs.get("role", "").lower() == "contentinfo":
            return True
        ident = (attrs.get("id", "") + " " + attrs.get("class", "")).lower()
        return "footer" in ident

    def handle_starttag(self, tag, attrs):
        a = {k.lower(): (v or "") for k, v in attrs}
        marks_footer = self._is_footer_open(tag, a)

        if tag in self._COLLECTED_TAGS:
            # Anchors are leaves for our purposes: collect, never scope.
            if self._scopes or marks_footer:
                self._current = {
                    "tag": tag,
                    "href": a.get("href", ""),
                    "target": a.get("target", ""),
                    "aria_label": a.get("aria-label", ""),
                    "data_cmp": a.get("data-cmp", ""),
                    "onclick": a.get("onclick", ""),
                    "id": a.get("id", ""),
                    "class": a.get("class", ""),
                }
                self._text_chunks = []
            return

        if tag in self._VOID_TAGS:
            return

        if marks_footer:
            self._scopes.append([tag, 0])
            return

        # Plain element inside a scope opened by the same tag name:
        # remember it so its end tag does not close the scope early.
        for scope in reversed(self._scopes):
            if scope[0] == tag:
                scope[1] += 1
                break

    def handle_endtag(self, tag):
        if self._current is not None and tag == self._current["tag"]:
            txt = " ".join(self._text_chunks).strip()
            self._current["text"] = re.sub(r"\s+", " ", txt)[:200]
            self.anchors.append(self._current)
            self._current = None
            self._text_chunks = []
            return

        for pos in range(len(self._scopes) - 1, -1, -1):
            scope = self._scopes[pos]
            if scope[0] != tag:
                continue
            if scope[1] > 0:
                scope[1] -= 1
            else:
                # Closing the opener also discards inner scopes that were
                # left unclosed by sloppy markup.
                del self._scopes[pos:]
            return

    def handle_data(self, data):
        if self._current is not None:
            self._text_chunks.append(data)
|
||||
|
||||
|
||||
def find_consent_anchors_in_footer(html: str) -> list[dict]:
    """Return the footer anchors/buttons that relate to cookie consent.

    The HTML is fed through ``_AnchorCollector``; every collected element
    is then classified by searching its visible text, ``aria-label``,
    ``data-cmp`` and ``onclick`` values (lower-cased) for known phrases.

    Each returned dict carries the collector's keys
    (tag, href, target, text, aria_label, data_cmp, onclick, id, class)
    plus ``intent`` ∈ {"reopen_cmp", "browser_deflect", "info_only"}.
    Elements that match no phrase list ("unrelated") are dropped. An empty
    input or a parser failure yields an empty list.
    """
    if not html:
        return []

    collector = _AnchorCollector()
    try:
        collector.feed(html)
    except Exception as e:  # malformed HTML — recover silently
        logger.warning("footer parser failed: %s", e)
        return []

    # Checked top to bottom; the first phrase list that hits wins.
    intent_rules = (
        ("reopen_cmp", _REOPEN_PHRASES),
        ("browser_deflect", _BROWSER_DEFLECTION_PHRASES),
        ("info_only", _INFO_ONLY_PHRASES),
    )
    label_fields = ("text", "aria_label", "data_cmp", "onclick")

    tagged: list[dict] = []
    for anchor in collector.anchors:
        haystack = " ".join(
            [anchor.get(field, "") for field in label_fields]
        ).lower()
        matched = next(
            (
                intent
                for intent, phrases in intent_rules
                if any(phrase in haystack for phrase in phrases)
            ),
            None,
        )
        if matched is None:
            continue
        anchor["intent"] = matched
        tagged.append(anchor)
    return tagged
|
||||
|
||||
|
||||
def classify_anchor_target(
    anchor: dict, base_url: str,
) -> str:
    """Decide whether the anchor opens the CMP in place or navigates away.

    Args:
        anchor: anchor dict as produced by the footer collector; the keys
            ``href``, ``target``, ``onclick`` and ``data_cmp`` are read.
        base_url: URL of the page the anchor was found on; used to resolve
            relative links and to compare hosts. May be empty.

    Returns (checked in this order):
        "same_page_cmp" — ``onclick`` or ``data-cmp`` present
        "javascript"    — ``javascript:`` href (any letter case),
                          probably a CMP trigger
        "new_tab"       — ``target="_blank"``
        "same_page_cmp" — empty or hash-only href
        "same_origin"   — resolved host equals the base host, or no host
                          could be determined (still a navigation away
                          from the live banner)
        "external"      — resolved host differs from the base host
    """
    href = (anchor.get("href") or "").strip()
    target = (anchor.get("target") or "").strip().lower()

    if anchor.get("data_cmp") or anchor.get("onclick"):
        return "same_page_cmp"
    # URL schemes are case-insensitive, so "JavaScript:void(0)" counts too.
    if href.lower().startswith("javascript:"):
        return "javascript"
    if target == "_blank":
        return "new_tab"
    if not href or href.startswith("#"):
        return "same_page_cmp"

    # urlparse/urljoin raise ValueError on malformed netlocs (e.g. broken
    # IPv6 literals); treat either side failing as "host unknown".
    try:
        base_host = urlparse(base_url).netloc.lower() if base_url else ""
    except ValueError:
        base_host = ""
    try:
        target_host = urlparse(urljoin(base_url or "/", href)).netloc.lower()
    except Exception:
        target_host = ""

    if not target_host or target_host == base_host:
        return "same_origin"
    return "external"
|
||||
|
||||
|
||||
def evaluate_reachability(
    footer_html: str,
    base_url: str = "",
) -> dict:
    """Static reachability analysis for check COOKIE-CONSENT-UX-001.

    Classifies the consent-related footer anchors found in *footer_html*
    and derives a finding dict:

    * no "reopen_cmp" anchor           → failed, HIGH / "missing"
      (upgraded to "factually_wrong" when browser-deflection anchors are
      the only withdrawal hint)
    * first reopen anchor is "new_tab" or "external"
                                       → failed, MEDIUM / "misclassified"
    * otherwise                        → passed

    *base_url* is only used to classify link targets.
    """
    anchors = find_consent_anchors_in_footer(footer_html)

    reopen_anchors: list[dict] = []
    info_only_count = 0
    browser_deflect_count = 0
    for anchor in anchors:
        intent = anchor.get("intent")
        if intent == "reopen_cmp":
            anchor["target_class"] = classify_anchor_target(anchor, base_url)
            reopen_anchors.append(anchor)
        elif intent == "info_only":
            info_only_count += 1
        elif intent == "browser_deflect":
            browser_deflect_count += 1

    # NOTE(review): only the FIRST reopen anchor drives the verdict; a later
    # in-place anchor does not rescue an earlier new-tab one — confirm this
    # is intended.
    reopen_anchor: dict | None = reopen_anchors[0] if reopen_anchors else None
    has_reopen_anchor = reopen_anchor is not None

    result: dict = {
        "check_id": "COOKIE-CONSENT-UX-001",
        "anchors_total": len(anchors),
        "has_reopen_anchor": has_reopen_anchor,
        "info_only_count": info_only_count,
        "browser_deflect_count": browser_deflect_count,
        "reopen_anchor": reopen_anchor,
        "passed": True,
        "severity": None,
        "severity_reason": None,
        "evidence_phrases": [],
        "notes": [],
    }

    if reopen_anchor is None:
        # Hard fail: withdrawal is not as easy as opt-in
        # (Art. 7 Abs. 3 DSGVO).
        result.update(passed=False, severity="HIGH", severity_reason="missing")
        result["notes"].append(
            "no consent-manager link in footer; withdrawal path "
            "missing or only indirect",
        )
        if browser_deflect_count > 0:
            # Worst variant: the only route offered is the browser settings.
            result.update(severity="HIGH", severity_reason="factually_wrong")
            result["notes"].append(
                "withdrawal route only via browser-settings — not gleich "
                "einfach wie Erteilung",
            )
        return result

    # Soft fail: the link exists but breaks context. MEDIUM rather than
    # HIGH because withdrawal is technically still possible.
    soft_fail_notes = {
        "new_tab": "consent-manager link opens in new tab — context-break",
        "external": "consent-manager link points to external host",
    }
    note = soft_fail_notes.get(reopen_anchor.get("target_class"))
    if note is not None:
        result.update(
            passed=False, severity="MEDIUM", severity_reason="misclassified",
        )
        result["notes"].append(note)
    return result
|
||||
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
Evidence ZIP Builder — bundles cookie-evidence slices into one ZIP
|
||||
suitable as email attachment for the audit trail.
|
||||
|
||||
Why: capture_cookie_evidence_slices() produces N PNG slices per check
|
||||
with timestamps + per-slice SHA256. Without an attachment to the
|
||||
compliance report, the evidence chain stops at the backend. The ZIP
|
||||
makes the slices portable so a DSB / lawyer can hand them to an
|
||||
auditor or supervisory authority.
|
||||
|
||||
ZIP layout:
|
||||
evidence.zip
|
||||
├── manifest.json # per-slice metadata
|
||||
├── audit_metadata.json # run-level (check_id, url, build_sha, ...)
|
||||
└── slice_001.png ... # binary PNG per slice
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import zipfile
|
||||
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_evidence_zip(
    slices: list[dict],
    meta: dict | None = None,
    check_id: str = "",
) -> bytes:
    """Build a ZIP archive with all slices, a manifest and run metadata.

    Args:
        slices: list of slice dicts, each expected to carry
            {"idx", "ts", "top_y", "bot_y", "sha256", "png_b64",
             "png_size"}. A slice whose ``idx`` is missing or not numeric
            falls back to its position in the list. A slice whose
            ``png_b64`` cannot be decoded or decodes to zero bytes is
            skipped with a warning.
        meta: run-level dict; the keys "url", "captured_at",
            "accepted_banner", "expanded", "total_height_px", "width_px"
            and "slice_count" are copied into ``audit_metadata.json``.
        check_id: the compliance-check job id.

    Returns:
        Raw ZIP bytes containing ``slice_NNN.png`` members,
        ``manifest.json`` and ``audit_metadata.json``.
    """
    run_meta = meta or {}
    buf = io.BytesIO()
    manifest_slices: list[dict] = []
    used_names: set[str] = set()

    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for position, s in enumerate(slices or []):
            # A broken idx must not abort the whole evidence bundle.
            try:
                idx = int(s.get("idx", position))
            except (TypeError, ValueError):
                idx = position

            try:
                png = base64.b64decode(s.get("png_b64") or "")
            except Exception as e:
                logger.warning(
                    "evidence_zip: skip slice %s, b64 decode failed: %s",
                    idx, e,
                )
                continue
            if not png:
                # A zero-byte "screenshot" is not evidence.
                logger.warning(
                    "evidence_zip: skip slice %s, empty PNG payload", idx,
                )
                continue

            # ZIP permits duplicate member names, but readers then see
            # only one of them — keep every slice addressable.
            fname = f"slice_{idx + 1:03d}.png"
            dup = 1
            while fname in used_names:
                dup += 1
                fname = f"slice_{idx + 1:03d}_{dup}.png"
            used_names.add(fname)

            zf.writestr(fname, png)
            manifest_slices.append({
                "filename": fname,
                "slice_idx": idx,
                "captured_at": s.get("ts", ""),
                "top_y_px": s.get("top_y"),
                "bot_y_px": s.get("bot_y"),
                "sha256_short": s.get("sha256", ""),
                "png_size_bytes": s.get("png_size", len(png)),
            })

        manifest = {
            "schema_version": "1.0",
            "check_id": check_id,
            "slices": manifest_slices,
            "slice_count": len(manifest_slices),
        }
        zf.writestr(
            "manifest.json",
            json.dumps(manifest, indent=2, ensure_ascii=False),
        )

        audit_meta = {
            "schema_version": "1.0",
            "check_id": check_id,
            "build_sha": os.environ.get("BUILD_SHA", "unknown"),
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "source_url": run_meta.get("url", ""),
            "captured_at": run_meta.get("captured_at", ""),
            "accepted_banner": run_meta.get("accepted_banner"),
            "expanded": run_meta.get("expanded"),
            "total_height_px": run_meta.get("total_height_px"),
            "width_px": run_meta.get("width_px"),
            "slice_count": run_meta.get(
                "slice_count", len(manifest_slices),
            ),
            "note": (
                "Each slice_NNN.png is an overlapping screenshot fragment "
                "of the cookie policy page captured at captured_at. "
                "sha256_short is the first 16 hex chars of the SHA-256 of "
                "the raw PNG bytes — use it to verify the slice was not "
                "modified after capture."
            ),
        }
        zf.writestr(
            "audit_metadata.json",
            json.dumps(audit_meta, indent=2, ensure_ascii=False),
        )

    data = buf.getvalue()
    logger.info(
        "evidence_zip built: %d slices, %d bytes, check_id=%s",
        len(manifest_slices), len(data), check_id,
    )
    return data
|
||||
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
B3 — Cross-Doc Retention Consistency Comparator.
|
||||
|
||||
Compares three sources of truth for cookie storage duration:
|
||||
|
||||
1. DSI claim — sentence(s) in the privacy policy mentioning retention
|
||||
("Die Speicherdauer beträgt 14 Monate", "_ga: 14 Monate", ...).
|
||||
2. Cookie-table — the `duration` field parsed from the cookie policy
|
||||
table (parse_flat_cookie_text / OCR / vendor-extract).
|
||||
3. Actual cookie — `Max-Age` / `Expires` from the real Set-Cookie
|
||||
header captured by the consent-tester.
|
||||
|
||||
Output is a per-cookie finding usable by the audit report:
|
||||
- matches=True → all three sources agree (within tolerance)
|
||||
- matches=False → mismatch with explicit type + severity_reason
|
||||
|
||||
Severity hierarchy (see project_audit_report_architecture.md):
|
||||
HIGH/factually_wrong : DSI claim is shorter than reality
|
||||
→ user is told "X" but tracked for longer
|
||||
HIGH/factually_wrong : table duration is shorter than reality
|
||||
→ cookie table understates what is set
|
||||
MEDIUM/misclassified : DSI and table state different durations (internal docs disagree)
|
||||
LOW/incomplete : only one source has data
|
||||
|
||||
The module is pure (no DB, no network) and meant to be called from the
|
||||
report pipeline after cookies+DSI+HAR have already been collected.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 5% tolerance — Safari ITP, leap years, server clocks etc.
_MATCH_TOLERANCE_PCT = 5

# Multipliers in DAYS for the German + English unit vocabulary used in
# our cookie tables and policies. German dative plurals ("nach 14
# Monaten", "nach 30 Tagen") and the English second/minute/hour spellings
# are listed explicitly so the parser does not drop them.
_UNIT_DAYS: dict[str, float] = {
    "sekunden": 1 / 86400, "sekunde": 1 / 86400, "sec": 1 / 86400, "s": 1 / 86400,
    "second": 1 / 86400, "seconds": 1 / 86400,
    "minuten": 1 / 1440, "minute": 1 / 1440, "min": 1 / 1440,
    "minutes": 1 / 1440,
    "stunden": 1 / 24, "stunde": 1 / 24, "h": 1 / 24,
    "hour": 1 / 24, "hours": 1 / 24,
    "tage": 1, "tag": 1, "tagen": 1, "d": 1, "day": 1, "days": 1,
    "wochen": 7, "woche": 7, "week": 7, "weeks": 7,
    "monate": 30, "monat": 30, "monaten": 30, "month": 30, "months": 30,
    "jahre": 365, "jahr": 365, "jahren": 365, "year": 365, "years": 365,
}

# Phrases that mean "session" — cookie deleted when browser closes.
_SESSION_TOKENS = {
    "session", "sitzung", "sitzungsdauer", "browsersitzung",
    "browser session", "browsing session", "tab",
}

# Phrases that mean "persistent without explicit cap".
_NO_EXPIRY_TOKENS = {
    "unbegrenzt", "unbestimmt", "kein ablaufdatum",
    "no expiry", "persistent", "permanent",
}

# Tokens this short are too ambiguous for substring matching ("tab" is
# contained in "Tabelle", "stable", ...), so they must match as whole words.
_WHOLE_WORD_MAX_LEN = 3


def _compile_token_pattern(tokens) -> re.Pattern:
    """Compile *tokens* into one regex.

    Long tokens match as substrings so German compounds ("Sitzungscookie")
    still hit; tokens of at most ``_WHOLE_WORD_MAX_LEN`` characters must
    match as whole words.
    """
    parts = []
    for tok in sorted(tokens, key=len, reverse=True):
        escaped = re.escape(tok)
        if len(tok) <= _WHOLE_WORD_MAX_LEN:
            escaped = rf"\b{escaped}\b"
        parts.append(escaped)
    return re.compile("|".join(parts))


_SESSION_RE = _compile_token_pattern(_SESSION_TOKENS)
_NO_EXPIRY_RE = _compile_token_pattern(_NO_EXPIRY_TOKENS)

# "14 Monate", "1 Jahr", "24h", "30 Tage", "365 Tage", "30d".
# Built from _UNIT_DAYS so the regex and the multiplier table cannot drift
# apart; longest units first so "monaten" is preferred over "monat".
_DURATION_RE = re.compile(
    r"(?P<num>\d+(?:[.,]\d+)?)\s*(?P<unit>"
    + "|".join(sorted(map(re.escape, _UNIT_DAYS), key=len, reverse=True))
    + r")\b"
)


@dataclass
class RetentionClaim:
    """One retention statement found in the DSI text."""
    # Sentence text the claim was taken from.
    sentence: str
    # Duration in days; None for session/unknown.
    days: float | None
    is_session: bool
    is_persistent: bool
    # Cookie names / provider names mentioned in the sentence.
    context_terms: list[str]


def parse_duration_to_days(text: str) -> tuple[float | None, str]:
    """Convert a duration phrase to days.

    Args:
        text: free-text duration such as "14 Monate", "24h", "Session".
            ``None`` and blank strings are accepted.

    Returns:
        ``(days, kind)`` where kind ∈
        {"days", "session", "persistent", "unknown"}.
        For "session" / "persistent" / "unknown" days is None —
        comparisons must handle these as special cases, not as 0 or
        infinity. Session and persistent phrases take precedence over an
        explicit number in the same text.
    """
    if text is None:
        return None, "unknown"
    s = text.strip().lower()
    if not s:
        return None, "unknown"

    if _SESSION_RE.search(s):
        return None, "session"
    if _NO_EXPIRY_RE.search(s):
        return None, "persistent"

    m = _DURATION_RE.search(s)
    if not m:
        return None, "unknown"
    # Accept both decimal separators ("1,5 Jahre" / "1.5 years").
    num = float(m.group("num").replace(",", "."))
    return num * _UNIT_DAYS[m.group("unit")], "days"
|
||||
|
||||
|
||||
def max_age_to_days(max_age_seconds: int | float | None) -> float | None:
    """Translate a Set-Cookie ``Max-Age`` value (seconds) into days.

    Returns None when the value is None or cannot be converted to a float.
    """
    # float(None) raises TypeError, so the None case needs no separate guard.
    try:
        seconds = float(max_age_seconds)
    except (TypeError, ValueError):
        return None
    return seconds / 86400.0
|
||||
|
||||
|
||||
# Sentence splitter that respects German legal text style (lots of
# semicolons + parentheses but few capitalised abbreviations).
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+(?=[A-ZÄÖÜ])")

# Quick anchor terms for retention sentences.
_RETENTION_ANCHORS = (
    "speicherdauer", "speicherfrist", "speicher",
    "aufbewahrungsdauer", "aufbewahrungsfrist",
    "löschfrist", "löschung",
    "gespeichert für", "wird gespeichert", "wird für",
    "retention", "expires", "expiration", "lifetime",
    "gültigkeit", "laufzeit",
)

# "<number> <unit>" in German or English. The optional trailing "n" covers
# German dative plurals ("nach 14 Monaten", "nach 30 Tagen", "nach 2
# Jahren"); the English second/minute/hour spellings are included as well.
_RETENTION_UNIT_RE = re.compile(
    r"\b\d[\d.,]*\s*("
    r"sekunden?|minuten?|stunden?|tage?n?|wochen?|"
    r"monate?n?|jahre?n?|sec|min|h|d|"
    r"seconds?|minutes?|hours?|"
    r"weeks?|months?|years?|days?)\b"
)


def _looks_like_retention(sentence: str) -> bool:
    """Return True when *sentence* plausibly states a retention period.

    The sentence must contain one of ``_RETENTION_ANCHORS`` (case-
    insensitive substring) AND a number directly followed by a duration
    unit — an anchor without a unit is metadata, not a duration.
    """
    s = sentence.lower()
    if not any(anchor in s for anchor in _RETENTION_ANCHORS):
        return False
    return bool(_RETENTION_UNIT_RE.search(s))
|
||||
|
||||
|
||||
def extract_retention_claims(
    dsi_text: str,
    cookie_names: list[str] | None = None,
    vendor_names: list[str] | None = None,
) -> list[RetentionClaim]:
    """Collect the sentences of the DSI text that state a retention period.

    The text is split into sentences; each sentence that passes
    ``_looks_like_retention`` becomes one ``RetentionClaim`` (sentence
    truncated to 400 characters, original case preserved so it can be
    cited verbatim). Every entry of *cookie_names* / *vendor_names* that
    occurs in the sentence (case-insensitive substring) is recorded in
    ``context_terms`` — cookie names first, then vendor names — so the
    comparator can prefer the most specific claim for a given cookie.
    """
    if not dsi_text:
        return []

    # Cookie names are checked before vendor names; the order is kept in
    # context_terms.
    candidates = [*(cookie_names or []), *(vendor_names or [])]

    found: list[RetentionClaim] = []
    for fragment in _SENTENCE_SPLIT.split(dsi_text):
        sentence = fragment.strip()
        if not sentence or not _looks_like_retention(sentence):
            continue
        days, kind = parse_duration_to_days(sentence)
        folded = sentence.lower()
        mentioned = [
            term for term in candidates
            if term and term.lower() in folded
        ]
        found.append(
            RetentionClaim(
                sentence=sentence[:400],
                days=days,
                is_session=(kind == "session"),
                is_persistent=(kind == "persistent"),
                context_terms=mentioned,
            )
        )
    return found
|
||||
|
||||
|
||||
def _best_dsi_claim(
    claims: list[RetentionClaim],
    cookie_name: str,
    vendor_name: str | None,
) -> RetentionClaim | None:
    """Pick the most specific DSI claim for a given cookie.

    Priority: first claim that mentions the cookie name > first claim that
    mentions the vendor > first generic claim (no context terms).

    Returns None when no claim applies. A claim that only names *other*
    cookies or vendors is never returned: attributing it to this cookie
    would compare the cookie against a statement the policy made about
    something else and fabricate mismatches.
    """
    if not claims:
        return None

    if cookie_name:
        for claim in claims:
            if cookie_name in claim.context_terms:
                return claim

    if vendor_name:
        for claim in claims:
            if vendor_name in claim.context_terms:
                return claim

    for claim in claims:
        if not claim.context_terms:
            return claim
    return None
|
||||
|
||||
|
||||
def _within_tolerance(a: float, b: float) -> bool:
    """Return True when *a* and *b* differ by at most the match tolerance.

    The allowed difference is ``_MATCH_TOLERANCE_PCT`` percent of the
    larger absolute value; two zeros always match.
    """
    if a == 0 and b == 0:
        return True
    allowed_diff = max(abs(a), abs(b)) * (_MATCH_TOLERANCE_PCT / 100.0)
    return abs(a - b) <= allowed_diff
|
||||
|
||||
|
||||
def compare_retention(
    cookie_name: str,
    table_duration: str | None,
    actual_max_age_seconds: int | float | None,
    dsi_claims: list[RetentionClaim] | None = None,
    vendor_name: str | None = None,
) -> dict:
    """Three-way retention comparison for one cookie (theme TH-RETENTION).

    Sources: the cookie-table duration text, the observed Max-Age in
    seconds, and the best-matching DSI claim. Checks run from most to
    least severe and the first one that fires decides the finding:

    1. dsi_under_actual   — HIGH / factually_wrong
    2. table_under_actual — HIGH / factually_wrong
    3. dsi_vs_table       — MEDIUM / misclassified
    4. actual_under_table — LOW / incomplete (+ Safari-ITP note)

    With fewer than two numeric sources the finding stays ``matches=True``
    but is marked LOW / incomplete. The output schema is stable —
    extending it must be additive so existing tests stay green.
    """
    table_days, table_kind = parse_duration_to_days(table_duration or "")
    actual_days = max_age_to_days(actual_max_age_seconds)
    dsi_claim = _best_dsi_claim(
        dsi_claims or [], cookie_name, vendor_name,
    )
    dsi_days = dsi_claim.days if dsi_claim else None

    out: dict = {
        "cookie_name": cookie_name,
        "vendor_name": vendor_name,
        "table_duration_raw": table_duration,
        "table_days": table_days,
        "table_kind": table_kind,
        "actual_max_age_seconds": actual_max_age_seconds,
        "actual_days": actual_days,
        "dsi_days": dsi_days,
        "dsi_sentence": dsi_claim.sentence if dsi_claim else None,
        "dsi_context_terms": dsi_claim.context_terms if dsi_claim else [],
        "matches": True,
        "mismatch_type": None,
        "severity_reason": None,
        "severity": None,
        "diff_days": None,
        "notes": [],
    }

    def _flag(mismatch_type: str, reason: str, severity: str, diff: float) -> dict:
        """Record a mismatch on ``out`` and hand it back for returning."""
        out["matches"] = False
        out["mismatch_type"] = mismatch_type
        out["severity_reason"] = reason
        out["severity"] = severity
        out["diff_days"] = diff
        return out

    have_table = table_days is not None
    have_actual = actual_days is not None
    have_dsi = dsi_days is not None

    if have_table + have_actual + have_dsi <= 1:
        out["severity_reason"] = "incomplete"
        out["severity"] = "LOW"
        out["notes"].append("only_one_source_has_data")
        return out

    # 1. DSI claim is shorter than the cookie actually lives — user misled.
    if (have_dsi and have_actual
            and not _within_tolerance(dsi_days, actual_days)
            and dsi_days < actual_days):
        return _flag(
            "dsi_under_actual", "factually_wrong", "HIGH",
            actual_days - dsi_days,
        )

    # Table vs. reality is needed by checks 2 and 4; evaluate it once.
    table_off = (
        have_table and have_actual
        and not _within_tolerance(table_days, actual_days)
    )

    # 2. Cookie table understates reality.
    if table_off and table_days < actual_days:
        return _flag(
            "table_under_actual", "factually_wrong", "HIGH",
            actual_days - table_days,
        )

    # 3. The two documents disagree with each other (neither is proven
    #    wrong by the live cookie at this point).
    if (have_dsi and have_table
            and not _within_tolerance(dsi_days, table_days)):
        return _flag(
            "dsi_vs_table", "misclassified", "MEDIUM",
            abs(dsi_days - table_days),
        )

    # 4. Over-declaration — table says "2 years" but the cookie lives
    #    7 days (Safari ITP). Less severe but worth flagging.
    if table_off and table_days > actual_days:
        out["notes"].append("possible_safari_itp_cap")
        return _flag(
            "actual_under_table", "incomplete", "LOW",
            table_days - actual_days,
        )

    return out
|
||||
|
||||
|
||||
def build_retention_theme_summary(
    findings: list[dict],
) -> dict:
    """Roll per-cookie findings up into the TH-RETENTION theme block.

    Counting rules:
      * ``incomplete`` — severity_reason == "incomplete"
      * ``passed``     — matches is truthy and the finding is not incomplete
      * ``failed``     — everything else
    ``pct`` is the rounded share of passed findings (0 for no findings).
    ``top_fails`` holds up to ten non-matching "factually_wrong" findings,
    largest ``diff_days`` first.
    """
    total = len(findings)
    incomplete = 0
    passed = 0
    by_severity: dict[str, int] = {}
    by_type: dict[str, int] = {}
    hard_fails: list[dict] = []

    for finding in findings:
        reason = finding.get("severity_reason")
        is_match = finding.get("matches")

        # Incomplete findings may keep matches=True (no mismatch was
        # observed), but they are not a verified pass either.
        if reason == "incomplete":
            incomplete += 1
        elif is_match:
            passed += 1

        severity = finding.get("severity")
        if severity:
            by_severity[severity] = by_severity.get(severity, 0) + 1
        mismatch_type = finding.get("mismatch_type")
        if mismatch_type:
            by_type[mismatch_type] = by_type.get(mismatch_type, 0) + 1

        if not is_match and reason == "factually_wrong":
            hard_fails.append(finding)

    hard_fails.sort(key=lambda f: -(f.get("diff_days") or 0))

    if total:
        pct = int(round(100 * passed / total))
    else:
        pct = 0

    return {
        "theme_id": "TH-RETENTION",
        "total": total,
        "passed": passed,
        "failed": total - passed - incomplete,
        "incomplete": incomplete,
        "pct": pct,
        "by_severity": by_severity,
        "by_mismatch_type": by_type,
        "top_fails": hard_fails[:10],
    }
|
||||
@@ -8,9 +8,13 @@ Uses standard smtplib. Configuration via environment variables:
|
||||
SMTP_FROM_ADDR (default: compliance@breakpilot.local)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import smtplib
|
||||
from email import encoders
|
||||
from email.mime.base import MIMEBase
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
|
||||
@@ -28,22 +32,54 @@ def send_email(
|
||||
body_html: str,
|
||||
from_addr: str | None = None,
|
||||
from_name: str | None = None,
|
||||
attachments: list[dict] | None = None,
|
||||
) -> dict:
|
||||
"""Send an email via SMTP. Returns dict with status and message_id."""
|
||||
"""Send an email via SMTP. Returns dict with status and message_id.
|
||||
|
||||
attachments: optional list of dicts:
|
||||
[{"filename": "evidence.zip", "data": <bytes>,
|
||||
"mime": "application/zip"}, ...]
|
||||
"""
|
||||
sender_addr = from_addr or SMTP_FROM_ADDR
|
||||
sender_name = from_name or SMTP_FROM_NAME
|
||||
|
||||
msg = MIMEMultipart("alternative")
|
||||
if attachments:
|
||||
msg = MIMEMultipart("mixed")
|
||||
body = MIMEMultipart("alternative")
|
||||
body.attach(MIMEText(body_html, "html", "utf-8"))
|
||||
msg.attach(body)
|
||||
for att in attachments:
|
||||
mime = att.get("mime", "application/octet-stream")
|
||||
maintype, _, subtype = mime.partition("/")
|
||||
part = MIMEBase(maintype or "application", subtype or "octet-stream")
|
||||
part.set_payload(att.get("data", b""))
|
||||
encoders.encode_base64(part)
|
||||
fname = att.get("filename", "attachment.bin")
|
||||
part.add_header(
|
||||
"Content-Disposition",
|
||||
f'attachment; filename="{fname}"',
|
||||
)
|
||||
msg.attach(part)
|
||||
else:
|
||||
msg = MIMEMultipart("alternative")
|
||||
msg.attach(MIMEText(body_html, "html", "utf-8"))
|
||||
|
||||
msg["From"] = f"{sender_name} <{sender_addr}>"
|
||||
msg["To"] = recipient
|
||||
msg["Subject"] = subject
|
||||
msg.attach(MIMEText(body_html, "html", "utf-8"))
|
||||
|
||||
try:
|
||||
with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=10) as server:
|
||||
with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=30) as server:
|
||||
server.sendmail(sender_addr, [recipient], msg.as_string())
|
||||
logger.info("Email sent to %s: %s", recipient, subject)
|
||||
return {"status": "sent", "recipient": recipient, "subject": subject}
|
||||
att_count = len(attachments or [])
|
||||
logger.info(
|
||||
"Email sent to %s: %s (attachments=%d)",
|
||||
recipient, subject, att_count,
|
||||
)
|
||||
return {
|
||||
"status": "sent", "recipient": recipient, "subject": subject,
|
||||
"attachments": att_count,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error("Failed to send email to %s: %s", recipient, e)
|
||||
return {"status": "failed", "recipient": recipient, "error": str(e)}
|
||||
|
||||
Reference in New Issue
Block a user