refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py gains an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -122,9 +122,9 @@ consent-sdk/src/mobile/ios/ConsentManager.swift
|
|||||||
consent-tester/services/dsi_discovery.py
|
consent-tester/services/dsi_discovery.py
|
||||||
|
|
||||||
# --- backend-compliance: unified compliance check orchestrator ---
|
# --- backend-compliance: unified compliance check orchestrator ---
|
||||||
# Sequential 7-step pipeline (text resolve, profile detect, check documents,
|
# 2026-06-06: REMOVED — file split into agent_check/ subpackage
|
||||||
# banner scan, cross-check, profile extract, report). Phase 5 split target.
|
# (19 files, main module now 347 LOC). Phase 5 target completed.
|
||||||
backend-compliance/compliance/api/agent_compliance_check_routes.py
|
# [guardrail-change]
|
||||||
|
|
||||||
# --- docs-src: binary office files (not source code) ---
|
# --- docs-src: binary office files (not source code) ---
|
||||||
# (Also excluded by extension in scripts/check-loc.sh — kept here for legibility.)
|
# (Also excluded by extension in scripts/check-loc.sh — kept here for legibility.)
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
"""
|
||||||
|
Subpackage for the compliance-check route — extracted to keep
|
||||||
|
`agent_compliance_check_routes.py` under the 500-line guardrail.
|
||||||
|
|
||||||
|
The route module still owns the public HTTP endpoints and re-exports
|
||||||
|
all helpers from this subpackage, so external callers
|
||||||
|
(`saving_scan_routes`, `agent_migration_routes`, tests) continue to
|
||||||
|
import them from `compliance.api.agent_compliance_check_routes`
|
||||||
|
unchanged.
|
||||||
|
"""
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
"""B1 wiring — Mobile Consent-Reachability check + HTML block.
|
||||||
|
|
||||||
|
Fetches the homepage of the first submitted URL, runs the static
|
||||||
|
`evaluate_reachability` analysis on the footer, and renders the
|
||||||
|
result as an HTML block for the audit mail.
|
||||||
|
|
||||||
|
Only renders a block when the check FAILS — a passing site doesn't
|
||||||
|
need a block. The block is severity-colored and lists the specific
|
||||||
|
notes that triggered the finding (missing reopen anchor, new-tab
|
||||||
|
break, browser-deflection language).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from compliance.services.consent_reachability_check import (
|
||||||
|
evaluate_reachability,
|
||||||
|
)
|
||||||
|
|
||||||
|
from ._helpers import _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_b1(state: dict) -> None:
    """Run the mobile consent-reachability check and store its results.

    Reads ``state["req"]`` and ``state["check_id"]``. The homepage is the
    ``scheme://netloc/`` of the first request document whose URL parses to
    both a scheme and a netloc. On success the function writes
    ``state["reachability_finding"]`` and ``state["reachability_html"]``.

    The function returns without touching ``state`` when no usable URL
    exists, when the homepage does not answer with HTTP 200, or when the
    fetch raises (best-effort: the error is logged, not propagated).
    """
    from urllib.parse import urlparse

    request = state["req"]
    job_id = state["check_id"]

    # Derive the site root from the first document URL that is absolute.
    origin = ""
    for doc in request.documents:
        if not doc.url:
            continue
        parts = urlparse(doc.url)
        if parts.scheme and parts.netloc:
            origin = f"{parts.scheme}://{parts.netloc}/"
            break
    if not origin:
        return

    _update(job_id, "Mobile Consent-Reachability prüfen...", 95)

    # Fetch with an iPhone Safari User-Agent string.
    mobile_headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 "
        "like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.5 Mobile/15E148 Safari/604.1"
    }
    try:
        async with httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            headers=mobile_headers,
        ) as session:
            response = await session.get(origin)
            if response.status_code != 200:
                logger.info(
                    "B1: homepage fetch %s → HTTP %d",
                    origin, response.status_code,
                )
                return
            markup = response.text
    except Exception as exc:
        logger.warning("B1: homepage fetch failed: %s", exc)
        return

    result = evaluate_reachability(markup, origin)
    state["reachability_finding"] = result
    state["reachability_html"] = _render_block(result)
    logger.info(
        "B1 Reachability: passed=%s severity=%s reason=%s",
        result["passed"],
        result.get("severity"),
        result.get("severity_reason"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _render_block(finding: dict) -> str:
    """Build the audit-mail HTML block for a reachability finding.

    Returns an empty string when ``finding["passed"]`` is truthy. Otherwise
    returns a ``<div>`` whose accent colour is red for severity ``HIGH`` and
    amber for anything else. It contains the severity line, one ``<li>`` per
    entry in ``finding["notes"]`` and, when ``finding["reopen_anchor"]`` is
    non-empty, a line describing that footer link. Notes, the severity
    reason and the anchor fields are HTML-escaped; anchor text is cut to 80
    characters and the href to 120 before escaping.
    """
    if finding["passed"]:
        return ""

    level = (finding.get("severity") or "").upper()
    accent = "#f59e0b"
    if level == "HIGH":
        accent = "#dc2626"

    items = [f"<li>{html.escape(note)}</li>" for note in finding.get("notes") or []]
    notes_markup = "".join(items)

    link = finding.get("reopen_anchor") or {}
    link_markup = ""
    if link:
        label = html.escape((link.get("text") or "")[:80])
        target = html.escape((link.get("href") or "")[:120])
        klass = html.escape(link.get("target_class") or "—")
        link_markup = (
            "<p style='margin:8px 0 0;font-size:13px;color:#475569;'>"
            "Gefundener Footer-Link: "
            f"<code>{label}</code> "
            f"→ <code>{target}</code> "
            f"(target_class: {klass})"
            "</p>"
        )

    reason = html.escape(finding.get("severity_reason") or "")
    pieces = [
        f"<div style='margin:24px 0;padding:16px;border-left:4px solid {accent};",
        "background:#fef2f2;border-radius:4px;'>",
        f"<h2 style='margin:0 0 8px;color:{accent};font-size:16px;'>",
        "COOKIE-CONSENT-UX-001 — Mobile Consent-Reachability</h2>",
        "<p style='margin:0 0 8px;font-size:14px;'><strong>Severity:</strong> ",
        f"{level} ({reason})</p>",
        "<p style='margin:0 0 4px;font-size:14px;'>",
        "Art. 7 Abs. 3 DSGVO: Widerruf muss so einfach wie Erteilung sein. ",
        "Auf Mobile-Safari konnten wir folgendes Problem feststellen:</p>",
        "<ul style='margin:8px 0 0 20px;font-size:14px;color:#7f1d1d;'>",
        f"{notes_markup}</ul>",
        link_markup,
        "</div>",
    ]
    return "".join(pieces)
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
"""B3 wiring — Cross-doc retention consistency check + HTML block.
|
||||||
|
|
||||||
|
Combines three sources of retention truth per cookie:
|
||||||
|
|
||||||
|
- DSI text (state["doc_texts"]["dse"] or "cookie")
|
||||||
|
- cookie-table `duration` from cmp_vendors[i]["cookies"][j]
|
||||||
|
- actual cookie expiry from banner_result["cookies_detailed"][k]
|
||||||
|
|
||||||
|
and produces per-cookie findings + a TH-RETENTION theme summary. Only
|
||||||
|
renders an HTML block when there are findings to show; the block is
|
||||||
|
sorted by severity (HIGH first) and shows the top-10 mismatches.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
from compliance.services.retention_comparator import (
|
||||||
|
build_retention_theme_summary,
|
||||||
|
compare_retention,
|
||||||
|
extract_retention_claims,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _actual_max_age_seconds(cookie: dict) -> float | None:
    """Return the remaining lifetime of *cookie* in seconds, or ``None``.

    A positive numeric ``max_age`` wins and is returned as-is. Otherwise a
    positive numeric ``expires`` is treated as a Unix timestamp in seconds
    and compared against the current wall-clock time. ``None`` is returned
    for everything else: missing or non-numeric values, zero or negative
    values (session cookies), and timestamps that are already in the past.
    """
    lifetime = cookie.get("max_age")
    if isinstance(lifetime, (int, float)) and lifetime > 0:
        return float(lifetime)

    expires_at = cookie.get("expires")
    if not isinstance(expires_at, (int, float)) or not (expires_at > 0):
        return None

    remaining = expires_at - time.time()
    return float(remaining) if remaining > 0 else None
|
||||||
|
|
||||||
|
|
||||||
|
def run_b3(state: dict) -> None:
    """Compare retention claims across documents and store the results.

    Reads ``state["doc_texts"]``, ``state["cmp_vendors"]`` and
    ``state["banner_result"]``. The policy text is the ``dse`` text, falling
    back to the ``cookie`` text. One record is built per named vendor cookie,
    using its ``duration`` / ``persistence`` / ``expiry`` as the declared
    table value. Each record is matched case-insensitively by name against
    ``banner_result["cookies_detailed"]`` to obtain the observed lifetime.

    Writes ``retention_findings``, ``retention_theme_summary`` and
    ``retention_html`` into ``state``. Returns without writing anything when
    there is no policy text or no named cookie.
    """
    texts = state["doc_texts"]
    vendors = state["cmp_vendors"]
    banner = state["banner_result"]

    policy_text = texts.get("dse") or texts.get("cookie") or ""
    if not policy_text:
        return

    records: list[dict] = []
    names: list[str] = []
    vendor_labels: list[str] = []
    for vendor in vendors or []:
        label = (vendor.get("name") or "").strip()
        if label:
            vendor_labels.append(label)
        for entry in vendor.get("cookies") or []:
            cookie_name = (entry.get("name") or "").strip()
            if not cookie_name:
                continue
            declared = (
                entry.get("duration")
                or entry.get("persistence")
                or entry.get("expiry")
                or ""
            )
            names.append(cookie_name)
            records.append(
                {
                    "name": cookie_name,
                    "vendor": label,
                    "table_duration": declared,
                    "actual_max_age": None,
                }
            )

    if not records:
        return

    # Attach the observed lifetime from the banner scan; for duplicate
    # names the last scanned cookie wins.
    if banner:
        observed: dict[str, dict] = {}
        for scanned in banner.get("cookies_detailed") or []:
            key = (scanned.get("name") or "").lower()
            if key:
                observed[key] = scanned
        for record in records:
            key = record["name"].lower()
            if key in observed:
                record["actual_max_age"] = _actual_max_age_seconds(observed[key])

    claims = extract_retention_claims(policy_text, names, vendor_labels)

    findings: list[dict] = [
        compare_retention(
            cookie_name=record["name"],
            table_duration=record["table_duration"],
            actual_max_age_seconds=record["actual_max_age"],
            dsi_claims=claims,
            vendor_name=record["vendor"] or None,
        )
        for record in records
    ]

    summary = build_retention_theme_summary(findings)
    state["retention_findings"] = findings
    state["retention_theme_summary"] = summary
    state["retention_html"] = _render_block(summary, findings)
    logger.info(
        "B3 Retention: %d findings, %d passed, %d failed, %d incomplete",
        summary["total"],
        summary["passed"],
        summary["failed"],
        summary["incomplete"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_days(d: float | None) -> str:
    """Format a duration given in days as a short label.

    ``None`` becomes an em dash. Values under one day are shown in whole
    hours, under 30 days in whole days, under 365 days in whole 30-day
    months, and everything else in years with one decimal place. Hours,
    days and months are truncated, not rounded.
    """
    if d is None:
        return "—"

    # (exclusive upper bound in days, formatter), checked in ascending order.
    buckets = (
        (1, lambda days: f"{int(days * 24)}h"),
        (30, lambda days: f"{int(days)}d"),
        (365, lambda days: f"{int(days / 30)}mo"),
    )
    for upper, render in buckets:
        if d < upper:
            return render(d)
    return f"{d / 365:.1f}y"
|
||||||
|
|
||||||
|
|
||||||
|
def _render_block(summary: dict, findings: list[dict]) -> str:
    """Build the TH-RETENTION HTML block for the audit mail.

    Returns an empty string when ``summary["total"]`` is 0 or when no
    finding is a real mismatch. A real mismatch is a finding whose
    ``matches`` is falsy and whose ``severity_reason`` is not
    ``"incomplete"``. Otherwise the mismatches are ordered by severity
    (HIGH, MEDIUM, LOW, then anything else) and by descending ``diff_days``.
    The first ten are rendered as table rows beneath a line showing the
    total / passed / failed / incomplete counters from ``summary``.
    """
    if summary["total"] == 0:
        return ""

    mismatches = []
    for item in findings:
        if item.get("matches"):
            continue
        if item.get("severity_reason") == "incomplete":
            continue
        mismatches.append(item)
    if not mismatches:
        return ""  # nothing failed, so no block is needed

    rank_of = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}

    def _order(item: dict) -> tuple:
        level = (item.get("severity") or "").upper()
        return rank_of.get(level, 9), -(item.get("diff_days") or 0)

    cell_open = "<td style='padding:6px 10px;border-bottom:1px solid #e5e7eb;'>"
    rows = []
    for item in sorted(mismatches, key=_order)[:10]:
        level = (item.get("severity") or "").upper()
        if level == "HIGH":
            tone = "#dc2626"
        elif level == "MEDIUM":
            tone = "#f59e0b"
        else:
            tone = "#64748b"
        name_cell = f"<code>{html.escape(item.get('cookie_name') or '—')}</code></td>"
        vendor_cell = f"{html.escape((item.get('vendor_name') or '—'))}</td>"
        values_cell = (
            f"DSI: {_fmt_days(item.get('dsi_days'))} • "
            f"Tabelle: {_fmt_days(item.get('table_days'))} • "
            f"Realität: {_fmt_days(item.get('actual_days'))}</td>"
        )
        verdict_cell = (
            "<td style='padding:6px 10px;border-bottom:1px solid #e5e7eb;"
            f"color:{tone};font-weight:600;'>"
            f"{level} ({html.escape(item.get('mismatch_type') or '—')})</td>"
        )
        rows.append(
            "<tr>"
            + cell_open + name_cell
            + cell_open + vendor_cell
            + cell_open + values_cell
            + verdict_cell
            + "</tr>"
        )

    total = summary["total"]
    passed = summary["passed"]
    failed = summary["failed"]
    incomplete = summary["incomplete"]

    header_cells = "".join(
        f"<th style='text-align:left;padding:6px 10px;'>{title}</th>"
        for title in ("Cookie", "Vendor", "Werte", "Mismatch")
    )
    pieces = [
        "<div style='margin:24px 0;padding:16px;border-left:4px solid #dc2626;",
        "background:#fefce8;border-radius:4px;'>",
        "<h2 style='margin:0 0 8px;color:#854d0e;font-size:16px;'>",
        "TH-RETENTION — Speicherdauer-Konsistenz (DSI ↔ Cookie-Tabelle ↔ Realität)",
        "</h2>",
        "<p style='margin:0 0 8px;font-size:14px;color:#3f3f46;'>",
        f"<strong>{total}</strong> Cookies verglichen: ",
        f"<strong style='color:#15803d;'>{passed} ✓</strong> / ",
        f"<strong style='color:#dc2626;'>{failed} ✗</strong> / ",
        f"<strong style='color:#64748b;'>{incomplete} ?</strong></p>",
        "<table style='width:100%;border-collapse:collapse;font-size:13px;",
        "margin-top:8px;background:#fff;'>",
        "<thead><tr style='background:#f1f5f9;'>",
        header_cells,
        "</tr></thead>",
        f"<tbody>{''.join(rows)}</tbody>",
        "</table>",
        "</div>",
    ]
    return "".join(pieces)
|
||||||
@@ -0,0 +1,93 @@
|
|||||||
|
"""Module-level constants + shared job state for the compliance-check
|
||||||
|
route.
|
||||||
|
|
||||||
|
`_compliance_check_jobs` is the SINGLE source of truth for in-flight
|
||||||
|
job progress. Other modules MUST import the same object — never
|
||||||
|
re-declare it — otherwise progress updates land in a detached dict.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Internal hostname of the consent-tester container.
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"

# In-memory job registry, keyed by check_id. Each value has the shape:
#   {"status": "running"|"completed"|"failed"|"skipped_tdm",
#    "progress": str, "progress_pct": int, "result": dict, ...}
# Read/written by:
#   - agent_compliance_check_routes (start/status/_run/_update)
#   - saving_scan_routes (start)
#   - agent_migration_routes (status mirror)
# Always import this object; re-declaring it elsewhere would create a
# detached dict that never sees progress updates.
_compliance_check_jobs: dict[str, dict] = {}


# Canonical doc types, in the same order the frontend ComplianceCheckTab
# renders them. The route pads `results` to always include an entry for
# each; missing rows are flagged as 'Nicht eingereicht' or
# 'Auf der Website nicht gefunden'.
#
# DSB-Kontakt is NOT canonical: per GDPR practice the DSB is named
# inside the DSI/datenschutz document (email or contact block), not as
# a separate page. We check 'DSB benannt' as a sub-check of the DSE.
_ALL_DOC_TYPES = [
    "dse", "impressum", "social_media", "cookie",
    "agb", "nutzungsbedingungen", "widerruf",
]


# Human-readable (German) labels per doc_type. Used in the report + emails.
# Includes alias keys ("datenschutz", "privacy") and non-canonical types
# that are absent from _ALL_DOC_TYPES.
_DOC_TYPE_LABELS = {
    "dse": "Datenschutzerklaerung",
    "datenschutz": "Datenschutzerklaerung",
    "privacy": "Datenschutzerklaerung",
    "impressum": "Impressum",
    "agb": "AGB",
    "widerruf": "Widerrufsbelehrung",
    "cookie": "Cookie-Richtlinie",
    "avv": "Auftragsverarbeitung",
    "loeschkonzept": "Loeschkonzept",
    "dsfa": "Datenschutz-Folgenabschaetzung",
    "social_media": "Social Media Datenschutz",
    "nutzungsbedingungen": "Nutzungsbedingungen",
    "dsb": "DSB-Kontakt",
    # P74: legal notice / "Rechtliche Hinweise" (IP, forward-looking, risk)
    "legal_notice": "Rechtliche Hinweise",
    # P96: Digital Services Act mandatory disclosures (Art. 12+17 DSA)
    "dsa": "DSA-Pflichtangaben",
    # P97: third-party licence notices (OSS compliance)
    "lizenzhinweise": "Lizenzhinweise Dritter",
}


# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
# NOTE(review): the consumer of this table is not visible here; presumably
# the first rule with a matching keyword wins. Confirm against the
# classifier before reordering.
_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [
    ("cookie", ("cookie", "kuche", "biscuit", "cookies-")),
    ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation",
                  "right-of-withdrawal", "ruecktritts", "rücktritts")),
    ("social_media", ("social-media", "soziale-medien", "social_media",
                      "social-media-policy")),
    # P23: 'terms-and-conditions' can mean general terms and conditions
    # (AGB) OR terms of use (Nutzungsbedingungen). The discovery function
    # classifies more precisely later, by title + content. This table only
    # provides a URL hint:
    ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen",
             "general-terms")),
    ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen",
                             "terms-of-use", "terms-and-conditions",
                             "nutzungsordnung", "terms-of-service",
                             "allgemeine-nutzungsbedingungen")),
    ("dsb", ("datenschutzbeauftragt", "data-protection-officer",
             "dpo-contact", "/dsb")),
    ("impressum", ("impressum", "imprint", "legal-notice", "site-notice",
                   "anbieterkennzeichnung", "legal-disclaimer-pool")),
    ("dse", ("data-privacy", "datenschutz", "data-protection",
             "privacy-policy", "privacy-notice", "dsgvo",
             "data_privacy", "datenschutzinformation")),
]


# Compound TLDs that count as 2 labels when extracting the second-level
# domain (e.g. shop.example.co.uk → 'example', not 'co').
_COMPOUND_TLDS = {
    "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
    "com.au", "com.br", "com.mx", "com.tr", "com.sg",
}
|
||||||
@@ -0,0 +1,230 @@
|
|||||||
|
"""Auto-discovery of missing canonical doc-types.
|
||||||
|
|
||||||
|
For each canonical type the user did NOT submit, try to find it on the
|
||||||
|
homepage of the URLs they DID submit. Also follow same-owner subdomains
|
||||||
|
mentioned in the submitted text (BMW Group → bmwgroup.com etc.).
|
||||||
|
|
||||||
|
Discovered docs are classified by `_classify_discovered_doc` and merged
|
||||||
|
back into `doc_entries`; entries that stayed empty get
|
||||||
|
`discovery_attempted=True` so the padding step can differentiate
|
||||||
|
"Nicht eingereicht" from "Auf der Website nicht gefunden".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL
|
||||||
|
from ._helpers import _classify_discovered_doc, _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_www(netloc: str) -> str:
    """Lower-case *netloc* and drop one leading ``www.`` label, if present.

    ``str.lstrip("www.")`` must not be used for this: it strips any leading
    run of the characters ``w`` and ``.``, so ``web.de`` would become
    ``eb.de`` and ``www.wikipedia.org`` would become ``ikipedia.org``.
    """
    host = netloc.lower()
    return host[4:] if host.startswith("www.") else host


async def _autodiscover_missing(
    check_id: str,
    doc_entries: list[dict],
    doc_texts: dict[str, str],
    url_text_cache: dict[str, str],
) -> None:
    """Try to find canonical documents the user did not (usefully) submit.

    For each canonical doc_type without a substantial submitted text, the
    consent-tester ``/dsi-discovery`` endpoint is asked to crawl the
    homepage of the most frequently submitted site. It also crawls up to
    two related domains mentioned in the submitted texts whose host
    contains the owner's name.

    Args:
        check_id: Job id used for the progress update.
        doc_entries: Submitted documents. Modified in place: discovered
            documents fill ``text``/``url``/``word_count`` and set
            ``auto_discovered=True``. Every missing type gets
            ``discovery_attempted=True`` once a crawl was attempted, even
            when nothing was found, so the report can distinguish
            'Nicht eingereicht' from 'Auf der Website nicht gefunden'.
        doc_texts: doc_type → text map; updated for every filled type.
        url_text_cache: Accepted for interface compatibility; this
            function does not read or write it.

    Network or HTTP errors during discovery are logged and skipped; they
    never propagate to the caller.
    """
    # VW fix: only doc types that yielded substantial text count as
    # 'submitted'. A submitted URL that returns 404 or an SPA shell with
    # fewer than 200 characters is treated as 'missing', so the discovery
    # pass can look for an alternative URL.
    _MIN_USEFUL_CHARS = 200
    submitted_types = {
        e["doc_type"] for e in doc_entries
        if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS
    }
    # Submitted URLs that failed to produce useful text. They are recorded
    # as `rejected_url` when discovery finds a replacement.
    failed_urls: set[str] = {
        (e.get("url") or "").strip()
        for e in doc_entries
        if (e.get("url") or "").strip()
        and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS
    }
    if failed_urls:
        logger.info(
            "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
            "soll Alternativen probieren: %s",
            len(failed_urls), _MIN_USEFUL_CHARS,
            ", ".join(list(failed_urls)[:3]),
        )

    # Map alias types to the canonical 'dse'.
    submitted_canon = {
        "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
    }
    # Missing = canonical types the user did NOT submit.
    missing = set(_ALL_DOC_TYPES) - submitted_canon
    if not missing:
        return
    # Iterate in canonical order so appended entries are deterministic
    # (plain set iteration order is arbitrary).
    missing_ordered = [dt for dt in _ALL_DOC_TYPES if dt in missing]

    # Count how often each base (scheme://netloc) occurs in submitted URLs.
    bases: dict[str, int] = {}
    for e in doc_entries:
        u = (e.get("url") or "").strip()
        if u and "://" in u:
            p = urlparse(u)
            base = f"{p.scheme}://{p.netloc}"
            bases[base] = bases.get(base, 0) + 1
    if not bases:
        # No submitted URL at all, so there is nothing to crawl from. Add
        # empty placeholders (with discovery_attempted=False) so the padding
        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
        for dt in missing_ordered:
            doc_entries.append({
                "doc_type": dt, "url": "", "text": "", "word_count": 0,
                "auto_discovered": False, "discovery_attempted": False,
            })
        return

    # Build the crawl plan: primary base + any related domains mentioned in
    # the submitted texts that contain the owner's name. Example: BMW Group
    # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
    primary_base = max(bases, key=bases.get) + "/"
    crawl_bases: list[str] = [primary_base]
    primary_netloc = _strip_www(urlparse(primary_base).netloc)
    owner_token = primary_netloc.split(".")[0]  # e.g. 'bmw'

    if owner_token and len(owner_token) >= 3:
        domain_re = re.compile(
            r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
            + r"[a-z0-9\-]*\.[a-z]{2,}",
            re.IGNORECASE,
        )
        seen_bases = {primary_base}
        for entry in doc_entries:
            text = entry.get("text") or ""
            for m in domain_re.finditer(text):
                p = urlparse(m.group(0))
                base = f"{p.scheme}://{p.netloc}/"
                if _strip_www(p.netloc) == primary_netloc:
                    continue
                if base in seen_bases:
                    continue
                seen_bases.add(base)
                crawl_bases.append(base)
                if len(crawl_bases) >= 3:
                    break
            if len(crawl_bases) >= 3:
                break

    _update(
        check_id,
        f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
        18,
    )

    discovered: list[dict] = []
    disc_payloads: list[dict] = []
    disc_cookie_texts: list[str] = []
    for base in crawl_bases:
        try:
            async with httpx.AsyncClient(timeout=300.0) as client:  # P90: 180s -> 300s
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/dsi-discovery",
                    json={"url": base, "max_documents": 15},
                    timeout=300.0,  # P90: 180s -> 300s
                )
                if resp.status_code != 200:
                    logger.warning("auto-discovery: HTTP %d for %s",
                                   resp.status_code, base)
                    continue
                body = resp.json()
                discovered.extend(body.get("documents", []) or [])
                disc_payloads.extend(body.get("cmp_payloads") or [])
                cmp_text = body.get("cmp_cookie_text") or ""
                if cmp_text:
                    disc_cookie_texts.append(cmp_text)
                logger.info("auto-discovery on %s: %d docs, %d CMP payloads, "
                            "cmp_cookie_text=%d words", base,
                            len(body.get("documents", []) or []),
                            len(body.get("cmp_payloads") or []),
                            len(cmp_text.split()))
        except Exception as e:
            # P90: verbose exception details for diagnosis. Best-effort:
            # a failing base must not abort the remaining crawl.
            logger.warning("auto-discovery failed for %s: %s (%s)",
                           base, str(e) or "(empty)", type(e).__name__)

    # Classify each discovered doc into a canonical doc_type; the first
    # sufficiently long document per missing type wins.
    by_type: dict[str, dict] = {}
    for d in discovered:
        title = (d.get("title") or "").lower()
        url = (d.get("url") or "").lower()
        wc = d.get("word_count") or 0
        if wc < 100:
            continue
        canon = _classify_discovered_doc(title, url)
        if canon and canon in missing and canon not in by_type:
            by_type[canon] = d

    # Append/update an entry for every missing canonical type. Auto-discovered
    # ones get the text/URL filled; unmatched ones stay empty so the
    # padding step renders them as 'Auf der Website nicht gefunden'.
    # VW fix: if an empty entry already exists (URL set, but the fetch
    # returned no or very little text), update it in place instead of
    # appending a duplicate.
    filled = 0
    for dt in missing_ordered:
        existing = next((e for e in doc_entries
                         if e.get("doc_type") == dt), None)
        new_entry: dict = existing if existing else {
            "doc_type": dt, "url": "", "text": "", "word_count": 0,
            "auto_discovered": False, "discovery_attempted": True,
            "cmp_payloads": [],
        }
        new_entry["discovery_attempted"] = True
        d = by_type.get(dt)
        if d:
            full = d.get("full_text") or d.get("text_preview") or ""
            # For cookie: prefer the CMP-reconstructed text when it is
            # richer than the auto-discovered DOM extraction.
            # BMW homepage CMP yields ~1800 words of authoritative policy;
            # DOM extraction typically yields ~600 words of site chrome.
            if dt == "cookie" and disc_cookie_texts:
                cmp_merged = "\n\n".join(disc_cookie_texts)
                if len(cmp_merged.split()) > len(full.split()):
                    logger.info(
                        "cookie: using CMP-reconstructed text (%d words) "
                        "instead of DOM (%d words)",
                        len(cmp_merged.split()), len(full.split()),
                    )
                    full = cmp_merged
            if len(full.split()) >= 100:
                new_entry["text"] = full
                # Keep the original URL as "rejected_url" so the audit can
                # show 'X was 404, we found Y instead'.
                if existing and (existing.get("url") or "").strip() in failed_urls:
                    new_entry["rejected_url"] = existing.get("url")
                new_entry["url"] = d.get("url", "")
                new_entry["word_count"] = len(full.split())
                new_entry["auto_discovered"] = True
                if dt == "cookie" and disc_payloads:
                    new_entry["cmp_payloads"] = disc_payloads
                doc_texts[dt] = full
                filled += 1
                # NOTE: `base` is the last crawled base here, which is not
                # necessarily the site this document was found on.
                logger.info(
                    "auto-discovered %s on %s: %s (%d words)%s",
                    dt, base, d.get("url", "")[:80], new_entry["word_count"],
                    " [REPLACED failed URL]" if existing else "",
                )
        if not existing:
            doc_entries.append(new_entry)

    logger.info(
        "auto-discovery: filled %d/%d missing types from %s",
        filled, len(missing), base,
    )
|
||||||
@@ -0,0 +1,142 @@
|
|||||||
|
"""URL → text fetch helper for the compliance-check pipeline.
|
||||||
|
|
||||||
|
Tries the consent-tester service first (Playwright, full JS render +
|
||||||
|
CMP capture). On any failure or empty result, falls back to a direct
|
||||||
|
HTTP GET with an identifiable User-Agent and per-domain rate limiting.
|
||||||
|
|
||||||
|
For cookie/dse/social_media doc types we cap discovery to 1 sub-page
|
||||||
|
(the policy itself is authoritative). For Impressum/AGB/Widerruf and
|
||||||
|
similar enterprise-split pages we follow up to 3 sub-pages.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from ._constants import CONSENT_TESTER_URL
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Patterns for the HTTP-fallback HTML → text reduction, compiled once.
_SCRIPT_BLOCK_RE = _re.compile(r"<script[^>]*>.*?</script>", _re.DOTALL | _re.IGNORECASE)
_STYLE_BLOCK_RE = _re.compile(r"<style[^>]*>.*?</style>", _re.DOTALL | _re.IGNORECASE)
_ANY_TAG_RE = _re.compile(r"<[^>]+>")
_WHITESPACE_RUN_RE = _re.compile(r"\s+")


def _html_to_text(markup: str) -> str:
    """Reduce raw HTML to whitespace-normalised plain text with entities decoded."""
    from html import unescape

    text = _SCRIPT_BLOCK_RE.sub(" ", markup)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    text = _ANY_TAG_RE.sub(" ", text)
    # Decode entities only AFTER tag removal so an escaped "&lt;p&gt;" is kept
    # as literal text instead of being stripped as markup.
    text = unescape(text)
    return _WHITESPACE_RUN_RE.sub(" ", text).strip()


async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Canonical document type; selects how many sub-documents the
            consent-tester is asked for (1 for policy-like types, 3 otherwise).

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` holds the raw CMP payloads
        reported by the consent-tester plus one ``"html_table"`` record per
        extracted table with at least 3 rows. It is empty when the HTTP
        fallback produced the text. ``("", [])`` when both strategies fail.

    Never raises: every failure is logged and downgraded to the next strategy.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    #    max_documents depends on doc_type:
    #    - cookie/dse/social_media: the page itself is authoritative and
    #      sub-pages dilute the policy text -> 1.
    #    - impressum/agb/widerruf/...: enterprise sites split these across
    #      several short sub-pages -> 3.
    short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
    max_docs = 1 if (doc_type or "") in short_extract_types else 3
    try:
        # 240s: 120s proved too short when 3 sub-documents are followed.
        async with httpx.AsyncClient(timeout=240.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": max_docs},
                timeout=240.0,
            )
            if resp.status_code == 200:
                payload = resp.json()
                docs = payload.get("documents", [])
                cmp_payloads = payload.get("cmp_payloads") or []
                cmp_cookie_text = payload.get("cmp_cookie_text") or ""
                # Tables the consent-tester extracted from the DOM are passed
                # on as "html_table" payloads so the backend table parser can
                # process them alongside the CMP payloads.
                for doc in (docs or []):
                    for tbl in (doc.get("tables") or []):
                        if not tbl or len(tbl) < 3:
                            continue
                        cmp_payloads.append({
                            "kind": "html_table",
                            "url": doc.get("url", ""),
                            "rows": tbl,
                        })
                if docs:
                    texts = []
                    for doc in docs:
                        t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
                        if t and len(t) > 50:
                            texts.append(t)
                    merged = "\n\n".join(texts)
                    # For policy-like types, prefer the CMP-reconstructed text
                    # when it has more words than the DOM extraction.
                    if (doc_type in short_extract_types
                            and cmp_cookie_text
                            and len(cmp_cookie_text.split()) > len(merged.split())):
                        logger.info(
                            "Preferring CMP-reconstructed text for %s on %s "
                            "(%d words CMP vs %d words DOM)",
                            doc_type, url,
                            len(cmp_cookie_text.split()),
                            len(merged.split()),
                        )
                        merged = cmp_cookie_text
                    if merged and len(merged.split()) > 100:
                        if len(texts) > 1:
                            logger.info("Merged %d docs from %s (%d words)",
                                        len(texts), url, len(merged.split()))
                        return merged, cmp_payloads
                    # Even when the text is below the 100-word threshold, the
                    # captured payloads must not be discarded: the backend
                    # derives the vendor table from them.
                    # NOTE(review): the log line says the HTTP fallback "runs
                    # in parallel", but this branch returns immediately, so the
                    # fallback below is NOT attempted — confirm intent.
                    if cmp_payloads:
                        logger.info(
                            "P90: keeping %d CMP payloads for %s despite "
                            "short text (%d words) — HTTP fallback runs in parallel",
                            len(cmp_payloads), url,
                            len((merged or cmp_cookie_text).split()),
                        )
                        fallback_text = merged or cmp_cookie_text or ""
                        return fallback_text, cmp_payloads
    except Exception as e:
        # str(e) is empty for some httpx errors, so log the type as well.
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    # 2. Fallback: direct HTTP fetch (works for server-side-rendered pages),
    #    using the project's request headers and per-domain rate limiter.
    try:
        from compliance.services.compliance_user_agent import (
            default_request_headers, DomainRateLimiter,
        )
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True,
            headers=default_request_headers(),
        ) as client:
            async with DomainRateLimiter(url):
                resp = await client.get(url)
            # Media types are case-insensitive, so compare in lower case.
            content_type = resp.headers.get("content-type", "").lower()
            if resp.status_code == 200 and "text/html" in content_type:
                text = _html_to_text(resp.text)
                if len(text.split()) > 100:
                    logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
                    return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return "", []
|
||||||
@@ -0,0 +1,228 @@
|
|||||||
|
"""Pure helpers for the compliance-check route — no I/O, no async.
|
||||||
|
|
||||||
|
Grouped here because each is small and they share the same constants
|
||||||
|
imports. Splitting further would not improve readability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from ._constants import (
|
||||||
|
_ALL_DOC_TYPES,
|
||||||
|
_COMPOUND_TLDS,
|
||||||
|
_DISCOVERY_RULES,
|
||||||
|
_DOC_TYPE_LABELS,
|
||||||
|
_compliance_check_jobs,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _update(check_id: str, msg: str, pct: int | None = None) -> None:
    """Record a progress message (and optionally a percentage) on a job.

    The percentage is truncated to an int and clamped to the range 0..100.
    Raises KeyError when ``check_id`` is not a registered job.
    """
    entry = _compliance_check_jobs[check_id]
    entry["progress"] = msg
    if pct is None:
        return
    clamped = int(pct)
    if clamped < 0:
        clamped = 0
    elif clamped > 100:
        clamped = 100
    entry["progress_pct"] = clamped
|
||||||
|
|
||||||
|
|
||||||
|
def _doc_type_label(doc_type: str) -> str:
    """Return the display label for ``doc_type``; unknown types are upper-cased."""
    fallback = doc_type.upper()
    return _DOC_TYPE_LABELS.get(doc_type, fallback)
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_discovered_doc(title: str, url: str) -> str | None:
    """Classify a discovered document by keyword search over its title and URL.

    Rules are tried in the order given by ``_DISCOVERY_RULES``; the first rule
    with any keyword contained in "<title> <url>" wins. Matching is a plain,
    case-sensitive substring test. Returns None when no rule matches.
    """
    combined = "{} {}".format(title, url)
    for doc_type, needles in _DISCOVERY_RULES:
        for needle in needles:
            if needle in combined:
                return doc_type
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_domain(doc_entries: list[dict]) -> str | None:
    """Return the host of the first usable entry URL, without a leading "www.".

    Uses the parsed ``hostname`` rather than ``netloc`` so that credentials
    ("user:pw@") and ports (":8443") never end up in the result. Entries whose
    URL is missing, has no scheme, cannot be parsed, or has no host are
    skipped, so a later entry can still provide the domain.

    Returns None when no entry yields a host.
    """
    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" not in url:
            continue
        try:
            host = urlparse(url).hostname or ""
        except ValueError:
            # e.g. malformed IPv6 literal — try the next entry instead of failing
            continue
        host = host.rstrip(".")
        if host.startswith("www."):
            host = host[4:]
        if host:
            return host
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _company_name_from_url(doc_entries: list[dict]) -> str | None:
    """Derive a display company name from the entered URLs.

    Heuristic: take the second-level domain label (e.g. "bmw" from
    "www.bmw.de"), upper-case short acronyms (<= 4 chars, no hyphens) and
    capitalise each hyphen-separated part otherwise. For hosts ending in a
    compound TLD listed in ``_COMPOUND_TLDS`` the label one further left is
    used.

    The parsed ``hostname`` is used (not ``netloc``) so credentials and ports
    in the URL cannot leak into the label or defeat the compound-TLD lookup.
    Entries without a usable URL, single-label hosts and IPv4 literals are
    skipped.

    Examples:
        www.bmw.de            -> BMW
        mercedes-benz.de      -> Mercedes-Benz
        shop.example.co.uk    -> Example
        juris.de              -> Juris

    Returns None when no entry yields a name.
    """
    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" not in url:
            continue
        try:
            host = urlparse(url).hostname or ""
        except ValueError:
            # unparsable URL (e.g. malformed IPv6 literal) — try the next entry
            continue
        host = host.rstrip(".")
        if host.startswith("www."):
            host = host[4:]
        parts = host.split(".")
        if len(parts) < 2:
            continue
        if all(p.isdigit() for p in parts):
            # IPv4 literal: no company name can be derived from it
            continue
        # Handle compound TLDs (.co.uk etc.)
        if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
            sld = parts[-3]
        else:
            sld = parts[-2]
        if not sld:
            continue
        if len(sld) <= 4 and "-" not in sld:
            return sld.upper()
        return "-".join(p.capitalize() for p in sld.split("-"))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_skip_types(profile) -> dict[str, str]:
    """Map doc_types that must be skipped entirely to a per-type reason.

    Currently only one rule exists: when ``profile.no_direct_sales`` is truthy
    (site does not conclude purchase contracts itself, e.g. an OEM
    configurator whose contracts go through dealers), AGB, Widerruf and
    Nutzungsbedingungen are reported as not applicable. Any other profile —
    including one without that attribute — yields an empty mapping.
    """
    if not getattr(profile, "no_direct_sales", False):
        return {}
    reason = (
        "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
        "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
        "ueber Vertragshaendler). AGB/Widerruf werden beim "
        "Haendler ausgehaendigt."
    )
    return dict.fromkeys(("agb", "widerruf", "nutzungsbedingungen"), reason)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_profile_filter(result, profile, doc_type: str):
    """Adjust checks of one document result to the detected business profile.

    Mutates the check objects in ``result.checks`` in place and returns
    ``result``. Three rules are applied to every check, in this order:

    1. ODR / OS-link / Streitbeilegung checks: when ``profile.needs_odr`` is
       truthy a failed check gets an action-oriented hint, otherwise the
       check is marked skipped.
    2. For ``doc_type == "widerruf"`` and a business type other than
       "b2c"/"unknown", every check is downgraded to severity "INFO" and
       failed checks get a hint that no Widerrufsbelehrung is needed.
    3. Kammer / Berufsordnung checks are marked skipped unless
       ``profile.is_regulated_profession`` is truthy.

    ``profile.business_type`` is compared case-insensitively, consistent with
    how the value is normalised elsewhere in the pipeline; a missing check id
    or label is treated as an empty string.
    """
    for check in result.checks:
        cid = (check.id or "").lower()
        label = (check.label or "").lower()

        # ODR/OS-Link: relevant ONLY for B2C online shops. The check's default
        # hint explains why it is not relevant (B2B wording), so for B2C it is
        # replaced with concrete guidance — otherwise the report would
        # contradict itself.
        if "odr" in cid or "os-link" in cid or "streitbeilegung" in label:
            if profile.needs_odr:
                if not check.passed:
                    check.hint = (
                        "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
                        "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
                        "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
                        "§36 VSBG: angeben, ob Sie an Verbraucher-"
                        "Streitbeilegungsverfahren teilnehmen (oder nicht)."
                    )
            else:
                check.skipped = True
                check.hint = "Nicht relevant (kein B2C Online-Shop)"

        # Widerruf: flag the entire document as unnecessary for B2B.
        business_type = (profile.business_type or "").strip().lower()
        if doc_type == "widerruf" and business_type not in ("b2c", "unknown"):
            check.severity = "INFO"
            if not check.passed:
                check.hint = (
                    "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
                    "(§355 BGB gilt nur fuer Verbrauchervertraege). "
                    "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
                    "Ihrer Website, da sie Verwirrung stiften kann."
                )

        # Regulated profession: chamber information only matters there.
        if "kammer" in cid or "berufsordnung" in label:
            if not profile.is_regulated_profession:
                check.skipped = True
                check.hint = "Nicht relevant (kein regulierter Beruf)"

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _pad_results_with_missing(
    results: list,
    discovery_attempted: set[str] | None = None,
) -> list:
    """Return ``results`` padded so every canonical doc_type has an entry.

    Results are keyed by canonical type ("datenschutz" and "privacy" count as
    "dse"); when several results share a type the last one is kept. The
    output follows the order of ``_ALL_DOC_TYPES``. A type without a result
    gets a placeholder ``DocCheckResult`` (scenario "missing") whose error
    text depends on whether the type is in ``discovery_attempted``:

    - attempted      -> 'Auf der Website nicht gefunden ...'
    - not attempted  -> 'Nicht eingereicht ...'

    Results whose canonical type is not in ``_ALL_DOC_TYPES`` are appended at
    the end in their original order.
    """
    from ..agent_doc_check_routes import DocCheckResult

    def _canon(doc_type):
        # legacy aliases of the privacy policy collapse onto "dse"
        return "dse" if doc_type in ("datenschutz", "privacy") else doc_type

    tried = discovery_attempted or set()
    latest_by_type: dict[str, object] = {_canon(r.doc_type): r for r in results}

    padded: list = []
    for doc_type in _ALL_DOC_TYPES:
        if doc_type in latest_by_type:
            padded.append(latest_by_type[doc_type])
            continue
        if doc_type in tried:
            reason = ("Auf der Website nicht gefunden — bitte URL des "
                      "Dokuments manuell eintragen, falls vorhanden")
        else:
            reason = "Nicht eingereicht — Quelle nicht angegeben"
        placeholder = DocCheckResult(
            label=_doc_type_label(doc_type),
            url="",
            doc_type=doc_type,
            word_count=0,
            completeness_pct=0,
            correctness_pct=0,
            checks=[],
            findings_count=0,
            error=reason,
            scenario="missing",
        )
        padded.append(placeholder)

    for r in results:
        if _canon(r.doc_type) not in _ALL_DOC_TYPES:
            padded.append(r)
    return padded
|
||||||
|
|
||||||
|
|
||||||
|
def _result_to_dict(r) -> dict:
    """Flatten a document check result into a plain dict.

    Copies the result's scalar attributes and, for every entry in
    ``r.checks``, a fixed set of check attributes. ``scenario`` is optional
    on the result and defaults to an empty string.
    """
    check_fields = ("id", "label", "passed", "severity", "matched_text",
                    "level", "parent", "skipped", "hint")
    out: dict = {}
    for name in ("label", "url", "doc_type", "word_count",
                 "completeness_pct", "correctness_pct"):
        out[name] = getattr(r, name)
    out["checks"] = [
        {name: getattr(check, name) for name in check_fields}
        for check in r.checks
    ]
    out["findings_count"] = r.findings_count
    out["error"] = r.error
    out["scenario"] = getattr(r, "scenario", "")
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _build_profile_html(profile) -> str:
    """Delegate to the report module's ``build_profile_html``.

    The import happens at call time rather than at module import.
    """
    from ..agent_doc_check_report import build_profile_html as _render

    rendered = _render(profile)
    return rendered
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
"""Thin orchestrator — runs the 6 phases of the compliance check.
|
||||||
|
|
||||||
|
The original `_run_compliance_check` was a 1620-line monolith. It is
|
||||||
|
now decomposed into six phases (A=resolve, B=profile+check,
|
||||||
|
C=banner+extract, D=report-build [D1 raw vendors, D2 finalize,
|
||||||
|
D3-top/mid/bot blocks], E=email, F=persist), each in its own module.
|
||||||
|
|
||||||
|
State flows through a single mutable `dict` (see `_state.new_state`).
|
||||||
|
This intentionally trades type safety for additive flexibility: the
|
||||||
|
report-building phase routinely adds new optional keys for each new
|
||||||
|
HTML block, and a typed dataclass would freeze the schema before the
|
||||||
|
new blocks could land.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from ._b1_wiring import run_b1
|
||||||
|
from ._b3_wiring import run_b3
|
||||||
|
from ._constants import _compliance_check_jobs
|
||||||
|
from ._phase_a_resolve import run_phase_a
|
||||||
|
from ._phase_b_profile_check import run_phase_b
|
||||||
|
from ._phase_c_banner import run_phase_c
|
||||||
|
from ._phase_d1_vendors_raw import run_phase_d1
|
||||||
|
from ._phase_d2_vendors_finalize import run_phase_d2
|
||||||
|
from ._phase_d3_blocks_bot import run_phase_d3_bot
|
||||||
|
from ._phase_d3_blocks_mid import run_phase_d3_mid
|
||||||
|
from ._phase_d3_blocks_top import run_phase_d3_top
|
||||||
|
from ._phase_e_email import run_phase_e
|
||||||
|
from ._phase_f_persist import run_phase_f
|
||||||
|
from ._state import new_state
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_compliance_check(check_id: str, req) -> None:
    """Background task: run all phases of a compliance check for one job.

    Phases run strictly in sequence and communicate through the mutable
    ``state`` dict created by ``new_state``. Phase A may end the run early
    (it returns False); in that case nothing further is executed.

    Never raises: any exception — including one raised while building the
    initial state — is logged with traceback and recorded on the job entry as
    status "failed" plus a truncated error text. If the job entry no longer
    exists at that point, only a log line is written.
    """
    try:
        # Built inside the try so a failure here also marks the job failed
        # instead of leaving it in its previous status.
        state = new_state(check_id, req)
        # Phase A: TDM gate + Step 1 (resolve / discover / split / dedup)
        continue_run = await run_phase_a(state)
        if not continue_run:
            return  # TDM denied — job already marked skipped_tdm
        # Phase B: Step 2 (profile detect) + Step 3 (per-doc checks)
        await run_phase_b(state)
        # Phase C: Step 3b-d (banner + cross-check + TCF) + Step 4
        await run_phase_c(state)
        # Phase D-1/D-2: Step 5 vendor extraction + finalize
        await run_phase_d1(state)
        await run_phase_d2(state)
        # B1 + B3: cross-cutting checks that need the finalized vendor
        # list + DSI text. Render their own HTML blocks consumed by
        # phase D-3 bot's full_html composition.
        await run_b1(state)
        run_b3(state)
        # Phase D-3 top/mid/bot: Step 5 HTML blocks
        await run_phase_d3_top(state)
        await run_phase_d3_mid(state)
        await run_phase_d3_bot(state)
        # Phase E: Step 6 send mail (with A1 ZIP attachment)
        # NOTE(review): called without await — if this performs blocking
        # network I/O it stalls the event loop for its duration; confirm.
        run_phase_e(state)
        # Phase F: Step 7 persist + audit log + unified findings
        run_phase_f(state)
    except Exception as e:
        logger.error("Compliance check %s failed: %s",
                     check_id, e, exc_info=True)
        try:
            job = _compliance_check_jobs[check_id]
        except KeyError:
            # Without this guard the handler itself would raise and the
            # failure would surface as an unhandled task exception.
            logger.warning(
                "Compliance check %s: job entry missing, failure not recorded",
                check_id,
            )
            return
        job["status"] = "failed"
        job["error"] = str(e)[:500]
|
||||||
@@ -0,0 +1,232 @@
|
|||||||
|
"""Phase A — TDM gate + text resolution + section split + dedup.
|
||||||
|
|
||||||
|
Covers (in the original `_run_compliance_check`):
|
||||||
|
- TDM-reservation pre-check (§ 44b UrhG)
|
||||||
|
- Step 1 Resolve texts (URL fetch / pasted text / auto-reclassify)
|
||||||
|
- Step 1a Auto-discovery of missing canonical doc_types
|
||||||
|
- Step 1b Section splitting (shared URL → multiple doc_types,
|
||||||
|
DSI → Cookie/Social-Media auto-fill)
|
||||||
|
- Step 1c Cross-document keyword search
|
||||||
|
- P15 Dedup of doc_types referencing the same source document
|
||||||
|
|
||||||
|
Returns True to continue, False if the run was aborted (TDM denied).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from ._constants import _compliance_check_jobs
|
||||||
|
from ._discovery import _autodiscover_missing
|
||||||
|
from ._fetch import _fetch_text
|
||||||
|
from ._helpers import _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _tdm_gate(check_id: str, req) -> bool:
    """Run the TDM-reservation pre-check; return False when the crawl must stop."""
    try:
        from compliance.services.tdm_reservation_check import (
            check_tdm_reservation, is_crawl_allowed,
        )
        first_url = next(
            (d.url for d in req.documents if d.url), "",
        )
        if not first_url:
            return True
        tdm = await check_tdm_reservation(first_url)
        job = _compliance_check_jobs[check_id]
        job["tdm"] = tdm
        # An override only counts together with a reason of at least 10
        # characters; an override without a reason is ignored.
        override_active = (
            req.tdm_override
            and len((req.tdm_override_reason or "").strip()) >= 10
        )
        if is_crawl_allowed(tdm):
            return True
        if not override_active:
            # Build the message first so a malformed signals list cannot
            # leave the job half-updated (status set, crawl continuing).
            denial = (
                f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt "
                f"(status={tdm.get('status')}) — Crawl nach § 44b "
                f"UrhG nicht zulaessig. Signals: "
                f"{[s.get('src') for s in tdm.get('signals', [])]}"
            )
            job["status"] = "skipped_tdm"
            job["error"] = denial
            job["progress_pct"] = 100
            logger.info("TDM-skip check_id=%s domain=%s status=%s",
                        check_id, tdm.get("domain"), tdm.get("status"))
            return False
        # Reservation present but overridden: document it and carry on.
        job["tdm_override"] = {
            "reason": req.tdm_override_reason.strip()[:500],
            "original_status": tdm.get("status"),
        }
        logger.warning(
            "TDM-Override aktiv: check_id=%s domain=%s "
            "status=%s reason=%r",
            check_id, tdm.get("domain"), tdm.get("status"),
            req.tdm_override_reason.strip()[:80],
        )
    except Exception as e:
        # The gate is best-effort: a failing check must not block the run.
        logger.warning("TDM-check failed (proceeding): %s", e)
    return True


def _dedupe_same_source(doc_entries: list[dict], doc_texts: dict[str, str]) -> None:
    """Blank out lower-priority doc_types whose text starts like a higher-priority one."""
    # Priority: dse > impressum > cookie > widerruf > agb > nutzungsbedingungen > ...
    doc_priority = ["dse", "impressum", "cookie", "widerruf", "agb",
                    "nutzungsbedingungen", "social_media", "dsb"]
    # NOTE(review): only the first 1000 characters are compared, so two
    # different pages sharing a long identical preamble would be merged —
    # confirm this is acceptable for the fetched texts.
    seen_text_hash: dict[int, str] = {}
    for dt in doc_priority:
        entry = next((e for e in doc_entries if e.get("doc_type") == dt
                      and e.get("text")), None)
        if not entry:
            continue
        text_hash = hash((entry.get("text") or "").strip()[:1000])
        if text_hash not in seen_text_hash:
            seen_text_hash[text_hash] = dt
            continue
        primary = seen_text_hash[text_hash]
        logger.info(
            "P15 dedup: doc_type=%s referenziert dasselbe Dokument "
            "wie %s (URL=%s) -> als Duplikat markiert.",
            # the stored url is None for pasted text without a URL
            dt, primary, (entry.get("url") or "")[:60],
        )
        entry["text"] = ""
        entry["word_count"] = 0
        entry["url"] = ""
        entry["dup_of"] = primary
        doc_texts.pop(dt, None)


async def run_phase_a(state: dict) -> bool:
    """Run TDM gate + Step 1 + Step 1a-c + P15 dedup. Mutate state in place.

    Reads ``state["check_id"]`` and ``state["req"]``. On success writes
    ``doc_texts``, ``doc_entries``, ``url_text_cache``,
    ``pasted_table_vendors`` and ``placement_findings`` into ``state``.

    Returns:
        True to continue with the next phase, False when the TDM gate denied
        the crawl (the job entry is then already marked "skipped_tdm").
    """
    check_id = state["check_id"]
    req = state["req"]

    # Reset anchor-locator cache per run (avoid cross-run leak). Best-effort,
    # but a failure is at least visible at debug level.
    try:
        from compliance.services.doc_anchor_locator import reset_cache
        reset_cache()
    except Exception as e:
        logger.debug("anchor-locator cache reset skipped: %s", e)

    # TDM reservation check of the base domain: stop before any crawl.
    if not await _tdm_gate(check_id, req):
        return False

    # Step 1: Resolve texts (fetch from URL if needed) — 0-30%
    _update(check_id, "Texte werden geladen...", 1)
    doc_texts: dict[str, str] = {}
    doc_entries: list[dict] = []

    # Cache fetched URLs to detect duplicates
    url_text_cache: dict[str, str] = {}

    n_docs = max(1, len(req.documents))
    # Vendors parsed deterministically from user-pasted cookie tables;
    # handed to later phases through the state.
    pasted_table_vendors: list[dict] = []
    for i, doc in enumerate(req.documents):
        pct = int(1 + (i / n_docs) * 29)
        _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
        text = (doc.text or "").strip()
        input_source = "url"
        cmp_payloads: list[dict] = []
        if text:
            input_source = "text"
            if doc.url:
                input_source = "text+url"  # user supplied both
                logger.info(
                    "doc_type=%s: User hat URL UND Text geliefert — "
                    "Text gewinnt, URL wird als Quellen-Referenz behalten",
                    doc.doc_type,
                )
        elif doc.url:
            url_key = doc.url.strip().rstrip("/").lower()
            if url_key in url_text_cache:
                text = url_text_cache[url_key]
            else:
                text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
                if text:
                    url_text_cache[url_key] = text

        # Auto-reclassify: detect text pasted into the wrong doc-type field
        # (e.g. Impressum text submitted as DSE) and re-tag it if advised.
        actual_doc_type = doc.doc_type
        reclassify_hint: dict | None = None
        if input_source.startswith("text") and len(text) >= 500:
            try:
                from compliance.services.doc_type_classifier import (
                    detect_mismatch,
                )
                reclassify_hint = detect_mismatch(doc.doc_type, text)
                if reclassify_hint and reclassify_hint["action"] == "reclassify":
                    actual_doc_type = reclassify_hint["detected"]
                    logger.info(
                        "doc_type AUTO-RECLASSIFY: deklariert=%s "
                        "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ",
                        doc.doc_type, actual_doc_type,
                        reclassify_hint["detected_score"],
                        reclassify_hint["declared_score"],
                    )
            except Exception as e:
                logger.warning("doc_type_classifier failed: %s", e)

        # Cookie table: pasted tables are parsed deterministically and the
        # vendors derived right away.
        if input_source.startswith("text") and actual_doc_type == "cookie":
            try:
                from compliance.services.cookies_table_parser import (
                    parse_cookie_table,
                )
                tab_vendors = parse_cookie_table(text)
                if tab_vendors:
                    pasted_table_vendors.extend(tab_vendors)
                    logger.info(
                        "Cookie-Tabelle erkannt im pasted Text — "
                        "%d Vendors / %d Cookies deterministisch geparst",
                        len(tab_vendors),
                        sum(len(v.get("cookies", [])) for v in tab_vendors),
                    )
            except Exception as e:
                logger.warning("cookies_table_parser failed: %s", e)

        if text:
            doc_texts[actual_doc_type] = text
        # The entry is recorded even without text so later steps can see
        # which submitted documents could not be resolved.
        doc_entries.append({
            "doc_type": actual_doc_type,
            "declared_doc_type": doc.doc_type,
            "url": doc.url,
            "text": text,
            "word_count": len(text.split()),
            "auto_discovered": False,
            "discovery_attempted": False,
            "cmp_payloads": cmp_payloads,
            "input_source": input_source,
            "reclassify_hint": reclassify_hint,
        })

    # Step 1a-bis: AUTO-DISCOVERY
    await _autodiscover_missing(
        check_id, doc_entries, doc_texts, url_text_cache,
    )

    # Step 1b: Section splitting — two cases:
    # 1. Same URL used for multiple doc_types → split by heading
    # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
    from compliance.services.section_splitter import (
        split_shared_texts, auto_fill_from_dsi, cross_search_documents,
    )
    split_shared_texts(doc_entries, url_text_cache)
    auto_fill_from_dsi(doc_entries)

    # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
    _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
    placement_findings = cross_search_documents(doc_entries)

    # Refresh doc_texts after all splitting/searching
    for entry in doc_entries:
        if entry.get("text"):
            doc_texts[entry["doc_type"]] = entry["text"]

    # P15: when several doc_types reference the same document, keep only the
    # highest-priority one; the others are emptied and marked with dup_of.
    _dedupe_same_source(doc_entries, doc_texts)

    state["doc_texts"] = doc_texts
    state["doc_entries"] = doc_entries
    state["url_text_cache"] = url_text_cache
    state["pasted_table_vendors"] = pasted_table_vendors
    state["placement_findings"] = placement_findings
    return True
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
"""Phase B — Business-profile detection + per-document checks.
|
||||||
|
|
||||||
|
Covers (in the original `_run_compliance_check`):
|
||||||
|
- Step 2 Detect business profile (with optional homepage merge for
|
||||||
|
P16 keywords)
|
||||||
|
- Step 3 Run regex + MC + LLM checks on each submitted document
|
||||||
|
(`_check_single`), applying skip rules + profile filter
|
||||||
|
+ placement findings
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re as _re
|
||||||
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from ._helpers import (
|
||||||
|
_apply_profile_filter,
|
||||||
|
_doc_type_label,
|
||||||
|
_get_skip_types,
|
||||||
|
_update,
|
||||||
|
)
|
||||||
|
from ._single_check import _check_single
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_b(state: dict) -> None:
|
||||||
|
"""Detect business profile + check each document. Mutates state in place."""
|
||||||
|
check_id = state["check_id"]
|
||||||
|
req = state["req"]
|
||||||
|
doc_texts = state["doc_texts"]
|
||||||
|
doc_entries = state["doc_entries"]
|
||||||
|
placement_findings = state["placement_findings"]
|
||||||
|
|
||||||
|
# Step 2: Detect business profile (35-40%)
|
||||||
|
from compliance.services.business_profiler import detect_business_profile
|
||||||
|
_update(check_id, "Geschaeftsmodell wird erkannt...", 37)
|
||||||
|
# P16: Homepage-Text mit fuer Profile-Detection (no_direct_sales
|
||||||
|
# B2B-Indikatoren wie "CE-Zertifizierung" / "Schulungen" stehen oft
|
||||||
|
# nur im Homepage-Menue, nicht im Pflichttext).
|
||||||
|
profile_input = dict(doc_texts)
|
||||||
|
try:
|
||||||
|
base_url = ""
|
||||||
|
for e in doc_entries:
|
||||||
|
if e.get("url"):
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
p = urlparse(e["url"])
|
||||||
|
if p.scheme and p.netloc:
|
||||||
|
base_url = f"{p.scheme}://{p.netloc}/"
|
||||||
|
break
|
||||||
|
if base_url:
|
||||||
|
async with httpx.AsyncClient(
|
||||||
|
timeout=8.0, follow_redirects=True,
|
||||||
|
headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
|
||||||
|
"AppleWebKit/537.36 HeadlessChrome/120.0.0.0"},
|
||||||
|
) as _hc:
|
||||||
|
_hr = await _hc.get(base_url)
|
||||||
|
if _hr.status_code == 200 and "text/html" in _hr.headers.get(
|
||||||
|
"content-type", ""):
|
||||||
|
_html = _hr.text[:60000]
|
||||||
|
_html = _re.sub(r"<script[^>]*>.*?</script>", " ",
|
||||||
|
_html, flags=_re.DOTALL | _re.IGNORECASE)
|
||||||
|
_html = _re.sub(r"<style[^>]*>.*?</style>", " ",
|
||||||
|
_html, flags=_re.DOTALL | _re.IGNORECASE)
|
||||||
|
_html = _re.sub(r"<[^>]+>", " ", _html)
|
||||||
|
_html = _re.sub(r"\s+", " ", _html).strip()
|
||||||
|
if len(_html.split()) > 30:
|
||||||
|
profile_input["__homepage"] = _html[:20000]
|
||||||
|
logger.info("P16 homepage merged for profile: %d words",
|
||||||
|
len(_html.split()))
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("homepage fetch for profile failed: %s", e)
|
||||||
|
profile = await detect_business_profile(profile_input)
|
||||||
|
profile_dict = asdict(profile)
|
||||||
|
|
||||||
|
# Step 3: Check each document
|
||||||
|
from ..agent_doc_check_routes import CheckItem, DocCheckResult
|
||||||
|
results: list[DocCheckResult] = []
|
||||||
|
total_findings = 0
|
||||||
|
use_agent_flag = req.use_agent or os.getenv(
|
||||||
|
"COMPLIANCE_USE_AGENT", "false",
|
||||||
|
).lower() == "true"
|
||||||
|
|
||||||
|
# Filter out doc_types that don't apply to this business profile
|
||||||
|
skip_types = _get_skip_types(profile)
|
||||||
|
|
||||||
|
# Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag).
|
||||||
|
# MCs that explicitly require a feature (e.g. 'biometric_processing',
|
||||||
|
# 'ai_decision_making', 'child_targeting') get dropped when the
|
||||||
|
# detected profile doesn't declare it.
|
||||||
|
business_scope: set[str] = set()
|
||||||
|
for svc in (getattr(profile, "detected_services", []) or []):
|
||||||
|
business_scope.add(str(svc).lower())
|
||||||
|
if (getattr(profile, "business_type", "") or "").lower() == "b2c":
|
||||||
|
business_scope.add("b2c")
|
||||||
|
if getattr(profile, "has_online_shop", False):
|
||||||
|
business_scope.add("ecommerce")
|
||||||
|
if getattr(profile, "is_regulated_profession", False):
|
||||||
|
business_scope.add("regulated_profession")
|
||||||
|
|
||||||
|
# Document checks: 40-80%
|
||||||
|
n_entries = max(1, len(doc_entries))
|
||||||
|
for i, entry in enumerate(doc_entries):
|
||||||
|
text = entry["text"]
|
||||||
|
doc_type = entry["doc_type"]
|
||||||
|
label = _doc_type_label(doc_type)
|
||||||
|
url = entry["url"]
|
||||||
|
|
||||||
|
if doc_type in skip_types:
|
||||||
|
results.append(DocCheckResult(
|
||||||
|
label=label, url=url, doc_type=doc_type,
|
||||||
|
error=skip_types[doc_type],
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
|
||||||
|
pct = int(40 + (i / n_entries) * 40)
|
||||||
|
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
|
||||||
|
|
||||||
|
if not text or len(text) < 50:
|
||||||
|
# P15: duplicate doc that was deduped against a primary doc
|
||||||
|
if entry.get("dup_of"):
|
||||||
|
results.append(DocCheckResult(
|
||||||
|
label=label, url="", doc_type=doc_type,
|
||||||
|
error=f"Nicht separat vorhanden — wird im Dokument "
|
||||||
|
f"'{_doc_type_label(entry['dup_of'])}' "
|
||||||
|
f"mit-geprueft.",
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
# P24: DSB-Kontakt ist Pflichtangabe in der DSE (Art. 13(1)(b)
|
||||||
|
# DSGVO) — wenn kein separates DSB-Dokument vorliegt, ist das
|
||||||
|
# KEIN Fehler. DSB-Pruefung passiert ohnehin in der DSE.
|
||||||
|
if doc_type == "dsb" and not (entry.get("url") or "").strip():
|
||||||
|
results.append(DocCheckResult(
|
||||||
|
label=label, url="", doc_type=doc_type,
|
||||||
|
error="Nicht separat vorhanden — DSB-Kontaktdaten "
|
||||||
|
"werden in der Datenschutzerklaerung als "
|
||||||
|
"Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.",
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
# Empty entry — either from auto-discovery padding (no URL
|
||||||
|
# to fetch) or from a fetch that returned nothing. If there
|
||||||
|
# was a URL we keep the error so the user knows the fetch
|
||||||
|
# failed; otherwise let the padding step label it
|
||||||
|
# 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
|
||||||
|
if (entry.get("url") or "").strip():
|
||||||
|
results.append(DocCheckResult(
|
||||||
|
label=label, url=url, doc_type=doc_type,
|
||||||
|
error="Kein Text vorhanden oder zu kurz",
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
|
||||||
|
result = await _check_single(
|
||||||
|
text, doc_type, label, url,
|
||||||
|
entry["word_count"], use_agent_flag,
|
||||||
|
business_scope=business_scope,
|
||||||
|
business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply profile context filter
|
||||||
|
result = _apply_profile_filter(result, profile, doc_type)
|
||||||
|
|
||||||
|
# Add placement findings — but only if the regex checks confirm
|
||||||
|
# the text doesn't match. If completeness >= 50%, the text IS the
|
||||||
|
# right doc_type despite missing cross-search keywords.
|
||||||
|
if result.completeness_pct < 50:
|
||||||
|
for pf in placement_findings:
|
||||||
|
if pf.get("doc_type") == doc_type:
|
||||||
|
result.checks.insert(0, CheckItem(**{
|
||||||
|
k: v for k, v in pf.items() if k != "doc_type"
|
||||||
|
}))
|
||||||
|
|
||||||
|
results.append(result)
|
||||||
|
total_findings += result.findings_count
|
||||||
|
|
||||||
|
state["profile"] = profile
|
||||||
|
state["profile_dict"] = profile_dict
|
||||||
|
state["business_scope"] = business_scope
|
||||||
|
state["results"] = results
|
||||||
|
state["total_findings"] = total_findings
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
"""Phase C — Banner scan + Cookie/DSE cross-check + TCF check + profile extract.
|
||||||
|
|
||||||
|
Covers (in the original `_run_compliance_check`):
|
||||||
|
- Step 3b Cookie-banner scan via consent-tester /scan (homepage,
|
||||||
|
3-phase consent test)
|
||||||
|
- Step 3c Cross-check banner findings vs. cookie-policy text
|
||||||
|
- Step 3d TCF vendor vs. DSI cross-check + VVT entries
|
||||||
|
- Step 4 Extract profile hints from documents
|
||||||
|
- Step 4b Determine scenario per document (skip / regenerate / fix /
|
||||||
|
import)
|
||||||
|
- Step 4c Pad missing canonical doc_types so the report always shows
|
||||||
|
every checklist row
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from ._constants import CONSENT_TESTER_URL
|
||||||
|
from ._helpers import _pad_results_with_missing, _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_c(state: dict) -> None:
    """Run banner scan, cross-checks, profile extraction and result padding.

    Reads ``check_id``, ``req``, ``doc_texts``, ``doc_entries``, ``results``
    and ``profile_dict`` from ``state`` and writes back ``banner_result``,
    ``banner_url``, ``tcf_vendors``, ``vvt_entries``, ``extracted_profile``
    and ``results``.

    The banner scan is best-effort: any failure is logged and leaves
    ``banner_result`` as ``None`` so that the later steps still run.

    Args:
        state: Shared pipeline state; mutated in place.
    """
    check_id = state["check_id"]
    req = state["req"]
    doc_texts = state["doc_texts"]
    doc_entries = state["doc_entries"]
    results = state["results"]
    profile_dict = state["profile_dict"]

    # Step 3b: banner check against the homepage of the first submitted URL.
    banner_result = None
    first_url = req.documents[0].url if req.documents and req.documents[0].url else ""
    banner_url = ""
    if first_url:
        from urllib.parse import urlparse

        try:
            parsed = urlparse(first_url)
        except ValueError:
            # urlparse rejects some malformed hosts; treat as "no homepage".
            parsed = None
        # Only accept absolute URLs; a scheme-less input would otherwise
        # produce the truthy but useless string "://".
        if parsed is not None and parsed.scheme and parsed.netloc:
            banner_url = f"{parsed.scheme}://{parsed.netloc}"
        else:
            logger.warning(
                "Banner check skipped: cannot derive homepage from %r", first_url,
            )
    if banner_url:
        _update(check_id, "Cookie-Banner wird geprueft...", 82)
        try:
            # P50: +10min for vendor-detail-phase
            async with httpx.AsyncClient(timeout=900.0) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/scan",
                    json={"url": banner_url, "timeout_per_phase": 10},
                )
            if resp.status_code == 200:
                payload = resp.json()
                if isinstance(payload, dict):
                    banner_result = payload
                else:
                    # Later steps call .get() on the result, so anything
                    # but a JSON object is unusable.
                    logger.warning(
                        "Banner check returned unexpected payload type: %s",
                        type(payload).__name__,
                    )
            else:
                logger.warning(
                    "Banner check returned HTTP %s", resp.status_code,
                )
        except Exception as e:
            # str(e) can be empty (e.g. for timeouts); the exception object
            # itself is always truthy, so test the string, not the object.
            logger.warning(
                "Banner check failed: %s (%s)", str(e) or "<empty>", type(e).__name__,
            )

    # Step 3c: cross-check banner vs. cookie policy (88-90%)
    if banner_result and "cookie" in doc_texts:
        from compliance.services.banner_cookie_cross_check import (
            cross_check_banner_vs_cookie,
        )
        from ..agent_doc_check_routes import CheckItem
        _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
        cross_findings = cross_check_banner_vs_cookie(
            banner_result, doc_texts["cookie"],
        )
        if cross_findings:
            for r in results:
                if r.doc_type != "cookie":
                    continue
                for cf in cross_findings:
                    r.checks.append(CheckItem(**cf))
                # Recompute the level-2 pass rate now that checks were added.
                l2 = [c for c in r.checks if c.level == 2 and not c.skipped]
                l2p = sum(1 for c in l2 if c.passed)
                r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0

    # Step 3d: TCF vendor cross-check against the privacy policy ("dse")
    tcf_vendors = (banner_result.get("tcf_vendors") or []) if banner_result else []
    vvt_entries: list[dict] = []
    if tcf_vendors and "dse" in doc_texts:
        _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
        from compliance.services.banner_cookie_cross_check import (
            cross_check_vendors_vs_dsi,
        )
        from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
        from ..agent_doc_check_routes import CheckItem
        vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
        if vendor_findings:
            for r in results:
                if r.doc_type == "dse":
                    for vf in vendor_findings:
                        r.checks.append(CheckItem(**vf))
        vvt_entries = map_vendors_to_vvt(tcf_vendors)

    # Step 4: extract profile hints from documents (92-95%)
    _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
    from compliance.services.profile_extractor import (
        extract_profile_from_documents,
    )
    extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)

    # Step 4b: determine scenario per document from error state and
    # completeness percentage.
    for r in results:
        if r.error:
            r.scenario = "skip"
        elif r.completeness_pct < 30:
            r.scenario = "regenerate"
        elif r.completeness_pct < 95:
            r.scenario = "fix"
        else:
            r.scenario = "import"

    # Step 4c: always render all canonical doc types. Missing types are
    # differentiated by the padding helper:
    #   - discovery was tried but found nothing -> 'Auf der Website nicht
    #     gefunden' (suggest the user provides the URL manually)
    #   - no submitted URLs at all              -> 'Nicht eingereicht'
    attempted = {
        e["doc_type"] for e in doc_entries if e.get("discovery_attempted")
    }
    results = _pad_results_with_missing(results, discovery_attempted=attempted)

    state["banner_result"] = banner_result
    state["banner_url"] = banner_url
    state["tcf_vendors"] = tcf_vendors
    state["vvt_entries"] = vvt_entries
    state["extracted_profile"] = extracted_profile
    state["results"] = results
|
||||||
@@ -0,0 +1,315 @@
|
|||||||
|
"""Phase D-1 — Vendor-extraction raw stages.
|
||||||
|
|
||||||
|
Covers (in the original Step 5 of `_run_compliance_check`):
|
||||||
|
- Aggregate cmp_payloads from all doc_entries + banner_result (P30/P48)
|
||||||
|
- Fallback: use DSE text when cookie was deduped (P17-D)
|
||||||
|
- Extract structured vendor records from CMP payloads
|
||||||
|
- LLM-cascade fallback when structured extract yields < 5 vendors (P52)
|
||||||
|
- Phase-G vendor-details append (P57)
|
||||||
|
- HTML-table DOM parse (Stage D)
|
||||||
|
- Crawled cookie-table parse (Stage B)
|
||||||
|
- Tesseract OCR over evidence slices (Stage C) — also captures the
|
||||||
|
cookie_evidence_slices used by A1 e-mail attachment
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from ._helpers import _company_name_from_url, _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _vendor_key(record: dict) -> str:
    """Return the stripped, lower-cased vendor name used for de-duplication."""
    return (record.get("name") or "").strip().lower()


def _select_new_vendors(vendors: list[dict], candidates) -> list[dict]:
    """Return the named candidates whose name is not yet in ``vendors``.

    Candidates are also de-duplicated among themselves (first one wins).
    Neither argument is modified; ``None`` candidates count as empty.
    """
    known = {_vendor_key(v) for v in vendors}
    fresh: list[dict] = []
    for cand in candidates or []:
        key = _vendor_key(cand)
        if not key or key in known:
            continue
        known.add(key)
        fresh.append(cand)
    return fresh


def _merge_new_vendors(vendors: list[dict], candidates, source=None) -> int:
    """Append not-yet-known candidates to ``vendors`` and return how many.

    When ``source`` is given it is stored under the ``"source"`` key of
    every appended record that does not already carry one.
    """
    fresh = _select_new_vendors(vendors, candidates)
    if source is not None:
        for rec in fresh:
            rec.setdefault("source", source)
    vendors.extend(fresh)
    return len(fresh)


def _collect_payloads(target: list[dict], payloads, seen_urls: set) -> None:
    """Append payloads to ``target``, skipping URLs already in ``seen_urls``.

    Payloads without a URL are always appended.
    """
    for payload in payloads or []:
        url = payload.get("url") or ""
        if url and url in seen_urls:
            continue
        if url:
            seen_urls.add(url)
        target.append(payload)


def _merge_ocr_cookies(existing: dict, ocr_vendor: dict) -> None:
    """Add the OCR vendor's not-yet-listed cookies to an existing vendor.

    Also tags the existing record's ``"source"`` with ``tesseract_ocr``.
    Cookies without a name are ignored.
    """
    seen = {
        (c.get("name") or "").lower()
        for c in (existing.get("cookies") or [])
    }
    for cookie in ocr_vendor.get("cookies") or []:
        key = (cookie.get("name") or "").lower()
        if not key or key in seen:
            continue
        cookies = existing.get("cookies")
        if not isinstance(cookies, list):
            # Key missing or holding None/other iterable: normalise to a list.
            cookies = list(cookies or [])
            existing["cookies"] = cookies
        cookies.append(cookie)
        seen.add(key)
    current = existing.get("source") or ""
    if "tesseract_ocr" not in current:
        existing["source"] = (current + ";tesseract_ocr").strip(";")


async def run_phase_d1(state: dict) -> None:
    """Run the raw vendor-extraction stages and store the results in state.

    Reads ``check_id``, ``doc_entries``, ``doc_texts``, ``banner_result``
    and ``pasted_table_vendors`` from ``state``. Writes ``cmp_vendors``,
    ``cookie_payloads``, ``cookie_text``, ``cookie_evidence_slices`` and
    ``cookie_evidence_meta``.

    Stages, in order: CMP payload aggregation, structured extract, LLM
    fallback, Phase-G vendor details, HTML-table parse, crawled-text
    table parse, screenshot + OCR, user-pasted table. Vendors are
    de-duplicated by case-insensitive name; an earlier stage wins.

    The whole phase is best-effort: failures are logged and whatever was
    collected up to that point is still written to ``state``.

    Args:
        state: Shared pipeline state; mutated in place.
    """
    check_id = state["check_id"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    pasted_table_vendors = state["pasted_table_vendors"]

    cmp_vendors: list[dict] = []
    cookie_payloads: list[dict] = []
    cookie_text = ""
    cookie_evidence_slices: list[dict] | None = None
    cookie_evidence_meta: dict | None = None

    try:
        from compliance.services.vendor_extractor import (
            extract_vendors_from_payloads,
        )

        # P30: aggregate cmp_payloads from ALL doc_entries — sites
        # like Mercedes load Usercentrics only on the homepage, so the
        # JSON gets captured during DSE/Impressum discovery, not in the
        # cookies.html fetch. Dedup by URL since the same payload is
        # captured on every page load.
        seen_cmp_urls: set[str] = set()
        for entry in doc_entries:
            _collect_payloads(
                cookie_payloads, entry.get("cmp_payloads"), seen_cmp_urls,
            )
            if entry.get("doc_type") == "cookie" and entry.get("text"):
                cookie_text = entry["text"]
        # P48: also pull cmp_payloads from the Banner-Scan (homepage 3-phase
        # consent test). Mercedes' Usercentrics-JSON is captured there even
        # when not in DSI-Discovery of static legal pages.
        if banner_result:
            _collect_payloads(
                cookie_payloads, banner_result.get("cmp_payloads"), seen_cmp_urls,
            )
            if cookie_payloads:
                logger.info("P48: %d CMP-payloads available for vendor-extract "
                            "(after Banner-Scan merge)", len(cookie_payloads))
        # P17-D: fallback when the cookie doc was deduped via P15 — use the
        # DSE text if it mentions cookie terms, so the LLM vendor extract
        # can still run.
        if not cookie_text and not cookie_payloads:
            dse_t = doc_texts.get("dse", "")
            if dse_t and any(w in dse_t.lower() for w in
                             ("cookie", "tracking", "google analytics", "consent")):
                cookie_text = dse_t
                logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)")
        owner_name = _company_name_from_url(doc_entries) or ""
        if cookie_payloads:
            cmp_vendors = extract_vendors_from_payloads(
                cookie_payloads, owner_name=owner_name,
            ) or []

        # P52: LLM fallback not only for 0 vendors but also when the
        # structured sources yielded < 5 vendors and the cookie text is
        # substantial. Guarded on its own so that an LLM failure does not
        # skip the later stages (notably the evidence capture in stage C).
        # NOTE(review): progress goes 94 here and 92/93 in stage C below,
        # i.e. it is not monotonic — confirm whether that is intended.
        if (len(cmp_vendors) < 5
                and cookie_text and len(cookie_text.split()) >= 500):
            try:
                from compliance.services.vendor_llm_extractor import (
                    extract_vendors_via_llm,
                )
                from compliance.services.vendor_classifier import classify
                _update(check_id, "Vendor-Liste per LLM extrahieren...", 94)
                llm_vendors = await extract_vendors_via_llm(cookie_text)
                fresh_llm = _select_new_vendors(cmp_vendors, llm_vendors)
                for rec in fresh_llm:
                    rec["recipient_type"] = classify(
                        vendor_name=(rec.get("name") or "").strip(),
                        category=rec.get("category", ""),
                        owner_name=owner_name,
                    )
                    rec.setdefault("source", "llm_cascade")
                cmp_vendors.extend(fresh_llm)
                if fresh_llm:
                    logger.info("P52 LLM-Cascade: +%d Vendors (total: %d)",
                                len(fresh_llm), len(cmp_vendors))
            except Exception as e:
                logger.warning("P52 LLM-Cascade failed: %s (%s)",
                               str(e) or "(no msg)", type(e).__name__)

        # P57: Phase G vendor_details as an additional vendor source.
        if banner_result:
            details = [
                d for d in (banner_result.get("vendor_details") or [])
                if isinstance(d, dict) and d.get("name") != "__TDM_OPTOUT__"
            ]
            # Entries carrying one of these names are skipped, not vendors.
            category_labels = (
                "technisch erforderlich", "analyse und statistik",
                "marketing", "alles auswählen", "alles auswaehlen",
            )
            new_details = [
                d for d in _select_new_vendors(cmp_vendors, details)
                if _vendor_key(d) not in category_labels
            ]
            if new_details:
                from compliance.services.vendor_classifier import classify
                for d in new_details:
                    n = (d.get("name") or "").strip()
                    cmp_vendors.append({
                        "name": n,
                        "country": "",
                        # "or" guards against an explicit None description.
                        "purpose": (d.get("description") or "")[:500],
                        "category": "",
                        "opt_out_url": d.get("opt_out_url", ""),
                        "privacy_policy_url": d.get("privacy_url", ""),
                        "persistence": d.get("retention", ""),
                        "cookies": d.get("cookies", []),
                        "processing_company": d.get("processing_company", ""),
                        "address": d.get("address", ""),
                        "purposes": d.get("purposes", []),
                        "technologies": d.get("technologies", []),
                        "recipient_type": classify(
                            vendor_name=n, category="", owner_name=owner_name,
                        ),
                    })
                logger.info("P57: added %d new vendors from Phase G (total: %d)",
                            len(new_details), len(cmp_vendors))

        # D — HTML tables captured from the DOM
        for pl in (cookie_payloads or []):
            if pl.get("kind") != "html_table":
                continue
            rows = pl.get("rows") or []
            if len(rows) < 3:
                continue
            try:
                from compliance.services.cookies_table_parser import (
                    parse_cookie_table as _parse_ct_d,
                )
                d_vendors = _parse_ct_d("\n".join(rows))
                added_d = _merge_new_vendors(
                    cmp_vendors, d_vendors, source="html_table_dom",
                )
                if added_d:
                    logger.info("D HTML-Table-DOM-Parse: +%d Vendors aus "
                                "%d-Zeilen-Tabelle (total: %d)",
                                added_d, len(rows), len(cmp_vendors))
            except Exception as e:
                logger.warning("html_table parse failed: %s", e)

        # B — run cookies_table_parser on the crawled cookie text as well
        if cookie_text and len(cookie_text) >= 500:
            try:
                from compliance.services.cookies_table_parser import (
                    parse_cookie_table as _parse_ct,
                    parse_flat_cookie_text as _parse_flat,
                )
                crawled_table_vendors = _parse_ct(cookie_text)
                if not crawled_table_vendors:
                    crawled_table_vendors = _parse_flat(cookie_text)
                added_c = _merge_new_vendors(
                    cmp_vendors, crawled_table_vendors, source="table_crawled",
                )
                if added_c:
                    logger.info("B Crawled-Tabellen-Parse: +%d Vendors "
                                "(total: %d)", added_c, len(cmp_vendors))
            except Exception as e:
                logger.warning("crawled-table-parse failed: %s", e)

        # C — screenshot + Tesseract OCR (also the source for the A1 ZIP
        # attachment via cookie_evidence_slices)
        cookie_url_for_shot = next(
            (
                entry["url"] for entry in doc_entries
                if entry.get("doc_type") == "cookie" and entry.get("url")
            ),
            "",
        )
        if cookie_url_for_shot:
            try:
                from compliance.services.cookie_screenshot_ocr import (
                    capture_cookie_evidence_slices,
                    cookies_to_vendor_records,
                    ocr_slices_extract_cookies,
                )
                from compliance.services.cookies_table_parser import (
                    _guess_vendor as _gv,
                )
                _update(check_id,
                        "Cookie-Richtlinie wird fotografiert "
                        "(lueckenlose Beweiskette)...", 92)
                ev = await capture_cookie_evidence_slices(
                    cookie_url_for_shot, check_id=check_id,
                    viewport_h=1024, overlap_px=200, max_slices=40,
                )
                if ev.get("slices"):
                    cookie_evidence_slices = ev["slices"]
                    cookie_evidence_meta = {
                        "total_height_px": ev.get("total_height_px"),
                        "width_px": ev.get("width_px"),
                        "accepted_banner": ev.get("accepted_banner"),
                        "expanded": ev.get("expanded"),
                        "url": ev.get("url"),
                        "slice_count": len(ev["slices"]),
                    }
                    _update(check_id, "Tesseract OCR über alle Slices...", 93)
                    ocr_cookies, ocr_stats = ocr_slices_extract_cookies(
                        ev["slices"],
                    )
                    if ocr_cookies:
                        ocr_vendors = cookies_to_vendor_records(
                            ocr_cookies, guess_vendor_fn=_gv,
                        )
                        known = {_vendor_key(v) for v in cmp_vendors}
                        added_v = 0
                        for v in ocr_vendors:
                            key = _vendor_key(v)
                            if not key:
                                continue
                            if key in known:
                                # Known vendor: only fold the OCR cookies
                                # into the first record with that name.
                                match = next(
                                    (ex for ex in cmp_vendors
                                     if _vendor_key(ex) == key),
                                    None,
                                )
                                if match is not None:
                                    _merge_ocr_cookies(match, v)
                                continue
                            cmp_vendors.append(v)
                            known.add(key)
                            added_v += 1
                        logger.info(
                            "C Tesseract-OCR: +%d Vendors / %d Cookies "
                            "(über %d Slices, total: %d)",
                            added_v, len(ocr_cookies),
                            ocr_stats.get("slices", 0), len(cmp_vendors),
                        )
            except Exception as e:
                logger.warning("Tesseract-OCR pipeline failed: %s (%s)",
                               str(e) or "(no msg)", type(e).__name__)

        # User-pasted cookie table (deterministic, no LLM).
        # NOTE(review): the original comment says this table ALWAYS has
        # precedence because it is 100% accurate, but it is merged last and
        # names already present are skipped — confirm the intended priority.
        if pasted_table_vendors:
            added_p = _merge_new_vendors(cmp_vendors, pasted_table_vendors)
            if added_p:
                logger.info("Pasted-Tabellen-Merge: +%d Vendors (total: %d)",
                            added_p, len(cmp_vendors))
    except Exception as e:
        logger.warning("VVT vendor extraction skipped: %s", e)

    state["cmp_vendors"] = cmp_vendors
    state["cookie_payloads"] = cookie_payloads
    state["cookie_text"] = cookie_text
    state["cookie_evidence_slices"] = cookie_evidence_slices
    state["cookie_evidence_meta"] = cookie_evidence_meta
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
"""Phase D-2 — Vendor finalize: enrich + normalize + library fallback.
|
||||||
|
|
||||||
|
Covers (in the original Step 5 of `_run_compliance_check`):
|
||||||
|
- Cookie-Library-Fallback (P52 Lite) — when < 20 vendors but many
|
||||||
|
after-accept cookies, resolve via library
|
||||||
|
- Vendor-Normalizer (Google-Familie dedup, garbage filter)
|
||||||
|
- Detail-modal enrichment from Phase G (P50) + TDM-opt-out sentinel
|
||||||
|
- Cookie-Behavior-Validator (P59b) — 3-Tier severity findings
|
||||||
|
- Implicit cookies detection (P61) — GTM brings GA/GCL/DoubleClick
|
||||||
|
- validate_vendor_urls + score_vendors + cookie-function classify
|
||||||
|
- Vendor-Redundanz (O4) + EU-Alternativen + Cost/Savings
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_d2(state: dict) -> None:
|
||||||
|
"""Vendor finalize stages + redundancy. Mutates state in place."""
|
||||||
|
cmp_vendors = state["cmp_vendors"]
|
||||||
|
cookie_text = state.get("cookie_text", "")
|
||||||
|
banner_result = state["banner_result"]
|
||||||
|
banner_url = state["banner_url"]
|
||||||
|
profile = state["profile"]
|
||||||
|
business_scope = state["business_scope"]
|
||||||
|
|
||||||
|
tdm_opt_out_notice = ""
|
||||||
|
cookie_behavior_findings: list[dict] = []
|
||||||
|
redundancy_report = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from compliance.services.cookie_link_validator import (
|
||||||
|
score_vendors, validate_vendor_urls,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige
|
||||||
|
# Vendors aber viele after_accept-Cookies, aus Library auflösen.
|
||||||
|
# VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library
|
||||||
|
# holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern.
|
||||||
|
# Schwelle: immer probieren wenn < 20 Vendors.
|
||||||
|
if banner_result and len(cmp_vendors) < 20:
|
||||||
|
try:
|
||||||
|
from compliance.services.cookie_to_vendor_fallback import (
|
||||||
|
fallback_vendors_for_run,
|
||||||
|
)
|
||||||
|
from database import SessionLocal as _SLfb
|
||||||
|
_fb_db = _SLfb()
|
||||||
|
try:
|
||||||
|
extra = fallback_vendors_for_run(
|
||||||
|
_fb_db, banner_result, len(cmp_vendors),
|
||||||
|
cookie_doc_text=cookie_text,
|
||||||
|
)
|
||||||
|
if extra:
|
||||||
|
existing_names = {(v.get("name") or "").strip().lower()
|
||||||
|
for v in cmp_vendors}
|
||||||
|
for v in extra:
|
||||||
|
if v["name"].lower() in existing_names:
|
||||||
|
continue
|
||||||
|
cmp_vendors.append(v)
|
||||||
|
logger.info(
|
||||||
|
"Cookie-Library-Fallback: cmp_vendors %d -> %d",
|
||||||
|
len(cmp_vendors) - len(extra), len(cmp_vendors),
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
_fb_db.close()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Cookie-Library-Fallback skipped: %s", e)
|
||||||
|
|
||||||
|
# Vendor-Normalizer: Dedup (Google-Familie etc) + Garbage-Filter
|
||||||
|
try:
|
||||||
|
from compliance.services.vendor_normalizer import (
|
||||||
|
normalize_vendors as _norm_v,
|
||||||
|
)
|
||||||
|
cmp_vendors = _norm_v(cmp_vendors)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("vendor_normalizer skipped: %s", e)
|
||||||
|
|
||||||
|
# P50: enrich vendors with per-vendor detail-modal-extracts
|
||||||
|
if cmp_vendors and banner_result:
|
||||||
|
vendor_details = banner_result.get("vendor_details") or []
|
||||||
|
# P50f: filter out TDM-opt-out sentinel
|
||||||
|
tdm_sentinel = next((v for v in vendor_details
|
||||||
|
if v.get("name") == "__TDM_OPTOUT__"), None)
|
||||||
|
if tdm_sentinel:
|
||||||
|
tdm_opt_out_notice = tdm_sentinel.get("description", "")
|
||||||
|
logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors")
|
||||||
|
vendor_details = [v for v in vendor_details
|
||||||
|
if v.get("name") != "__TDM_OPTOUT__"]
|
||||||
|
if vendor_details:
|
||||||
|
details_by_name = {}
|
||||||
|
for d in vendor_details:
|
||||||
|
n = (d.get("name") or "").strip().lower()
|
||||||
|
if n:
|
||||||
|
details_by_name[n] = d
|
||||||
|
enriched = 0
|
||||||
|
for v in cmp_vendors:
|
||||||
|
key = (v.get("name") or "").strip().lower()
|
||||||
|
d = details_by_name.get(key)
|
||||||
|
if not d:
|
||||||
|
for dn, dv in details_by_name.items():
|
||||||
|
if key in dn or dn in key:
|
||||||
|
d = dv
|
||||||
|
break
|
||||||
|
if not d:
|
||||||
|
continue
|
||||||
|
if not v.get("country") and (d.get("processing_company") or d.get("address")):
|
||||||
|
addr = d.get("address", "")
|
||||||
|
if re.search(r"\b(deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b", addr, re.I):
|
||||||
|
v["country"] = "DE"
|
||||||
|
elif re.search(r"\bireland|irland|dublin\b", addr, re.I):
|
||||||
|
v["country"] = "IE"
|
||||||
|
elif re.search(r"\busa|united states|california|new york|delaware\b", addr, re.I):
|
||||||
|
v["country"] = "US"
|
||||||
|
if not v.get("purpose"):
|
||||||
|
v["purpose"] = d.get("description", "")[:500]
|
||||||
|
if not v.get("opt_out_url"):
|
||||||
|
v["opt_out_url"] = d.get("opt_out_url", "")
|
||||||
|
if not v.get("privacy_policy_url"):
|
||||||
|
v["privacy_policy_url"] = d.get("privacy_url", "")
|
||||||
|
if not v.get("cookies"):
|
||||||
|
v["cookies"] = d.get("cookies", [])
|
||||||
|
v["purposes"] = d.get("purposes", [])
|
||||||
|
v["technologies"] = d.get("technologies", [])
|
||||||
|
if not v.get("persistence"):
|
||||||
|
v["persistence"] = d.get("retention", "")
|
||||||
|
v["processing_company"] = d.get("processing_company", "")
|
||||||
|
v["address"] = d.get("address", "")
|
||||||
|
enriched += 1
|
||||||
|
logger.info("P50: enriched %d/%d vendors with detail-modal data",
|
||||||
|
enriched, len(cmp_vendors))
|
||||||
|
|
||||||
|
# P59b: Cookie-Behavior-Validator
|
||||||
|
if banner_result:
|
||||||
|
cookies_detailed = banner_result.get("cookies_detailed") or []
|
||||||
|
if cookies_detailed:
|
||||||
|
cb_session = None
|
||||||
|
try:
|
||||||
|
from database import SessionLocal
|
||||||
|
from compliance.services.cookie_behavior_validator import (
|
||||||
|
validate_cookie_behavior,
|
||||||
|
)
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
fp_domain = ""
|
||||||
|
if banner_url:
|
||||||
|
fp_domain = urlparse(banner_url).netloc.replace("www.", "")
|
||||||
|
cb_session = SessionLocal()
|
||||||
|
cookie_behavior_findings = validate_cookie_behavior(
|
||||||
|
cb_session, cookies_detailed,
|
||||||
|
network_requests=[], # TODO Layer B in P59d
|
||||||
|
first_party_domain=fp_domain,
|
||||||
|
)
|
||||||
|
if cookie_behavior_findings:
|
||||||
|
sevs = {f["severity"] for f in cookie_behavior_findings}
|
||||||
|
logger.info(
|
||||||
|
"P59b: Cookie-Behavior-Check %d findings (severities: %s) "
|
||||||
|
"ueber %d Cookies",
|
||||||
|
len(cookie_behavior_findings),
|
||||||
|
sorted(sevs), len(cookies_detailed),
|
||||||
|
)
|
||||||
|
banner_result["cookie_behavior_findings"] = (
|
||||||
|
cookie_behavior_findings
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"P59b: Cookie-Behavior-Check 0 findings ueber %d Cookies "
|
||||||
|
"(library miss / clean)", len(cookies_detailed),
|
||||||
|
)
|
||||||
|
except Exception as cb_err:
|
||||||
|
logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err)
|
||||||
|
finally:
|
||||||
|
if cb_session is not None:
|
||||||
|
try:
|
||||||
|
cb_session.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# P61: "Untergeschobene Cookies"
|
||||||
|
if banner_result and cmp_vendors:
|
||||||
|
try:
|
||||||
|
from compliance.services.vendor_package_cookies import (
|
||||||
|
detect_implicit_cookies,
|
||||||
|
)
|
||||||
|
declared = [v.get("name", "") for v in cmp_vendors if v.get("name")]
|
||||||
|
actual_cookies: list[str] = []
|
||||||
|
for phase_data in (banner_result.get("phases") or {}).values():
|
||||||
|
if isinstance(phase_data, dict):
|
||||||
|
for ck in (phase_data.get("cookies") or []):
|
||||||
|
if isinstance(ck, dict) and ck.get("name"):
|
||||||
|
actual_cookies.append(ck["name"])
|
||||||
|
implicit_findings = detect_implicit_cookies(
|
||||||
|
declared, actual_cookies_set=actual_cookies or None,
|
||||||
|
)
|
||||||
|
if implicit_findings:
|
||||||
|
banner_result["implicit_vendor_findings"] = implicit_findings
|
||||||
|
logger.info(
|
||||||
|
"P61: %d implicit vendor-package items detected "
|
||||||
|
"(%d cookies + %d vendors)",
|
||||||
|
len(implicit_findings),
|
||||||
|
sum(1 for f in implicit_findings if f["implicit"]["type"] == "cookie"),
|
||||||
|
sum(1 for f in implicit_findings if f["implicit"]["type"] == "vendor"),
|
||||||
|
)
|
||||||
|
except Exception as p61_err:
|
||||||
|
logger.warning("P61 implicit-vendor detection failed: %s", p61_err)
|
||||||
|
|
||||||
|
if cmp_vendors:
|
||||||
|
logger.info("VVT: %d vendors extracted, validating links",
|
||||||
|
len(cmp_vendors))
|
||||||
|
cmp_vendors = await validate_vendor_urls(cmp_vendors)
|
||||||
|
cmp_vendors = score_vendors(cmp_vendors)
|
||||||
|
try:
|
||||||
|
from compliance.services.cookie_function_classifier import (
|
||||||
|
annotate_vendor_cookies,
|
||||||
|
)
|
||||||
|
cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors]
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Cookie function classification skipped: %s", e)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("VVT vendor finalize skipped: %s", e)
|
||||||
|
|
||||||
|
# Vendor-Redundanz + EU-Alternativen + Cost/Savings (O4)
|
||||||
|
try:
|
||||||
|
from compliance.services.vendor_cost_estimator import infer_company_tier
|
||||||
|
from compliance.services.vendor_redundancy import (
|
||||||
|
analyze as analyze_redundancy,
|
||||||
|
)
|
||||||
|
if cmp_vendors:
|
||||||
|
bp_dict = {
|
||||||
|
"type": getattr(profile, "business_type", ""),
|
||||||
|
"features": list(business_scope),
|
||||||
|
}
|
||||||
|
ctier = infer_company_tier(bp_dict)
|
||||||
|
redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier)
|
||||||
|
logger.info(
|
||||||
|
"Redundanz: %d Kategorien mit Mehrfach-Anbietern, "
|
||||||
|
"Spar-Schaetzung %s pro Jahr (company_tier=%s)",
|
||||||
|
redundancy_report["summary"]["redundancy_count"],
|
||||||
|
redundancy_report["summary"]["estimated_saving_pct"],
|
||||||
|
ctier,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Vendor redundancy analysis skipped: %s", e)
|
||||||
|
|
||||||
|
state["cmp_vendors"] = cmp_vendors
|
||||||
|
state["tdm_opt_out_notice"] = tdm_opt_out_notice
|
||||||
|
state["cookie_behavior_findings"] = cookie_behavior_findings
|
||||||
|
state["redundancy_report"] = redundancy_report
|
||||||
@@ -0,0 +1,220 @@
|
|||||||
|
"""Phase D-3-Bot — Bottom HTML blocks + final composition.
|
||||||
|
|
||||||
|
Covers (in the original Step 5):
|
||||||
|
- P71 JC-vs-AVV Entscheidungsbaum (only when DSE ambig)
|
||||||
|
- P6/P53/P55 Branchen-Kontext + Site-History
|
||||||
|
- P106 Internal-Checks-Block
|
||||||
|
- P85 Banner-Screenshot
|
||||||
|
- A Audit-Quality-Checks (Banner-Detect-Failure, vendor-extract dünn)
|
||||||
|
- P82 GF-1-Pager
|
||||||
|
- Doc-Input-Warnings (User text in falsches Feld gepastet)
|
||||||
|
- P86 Branchen-Benchmark
|
||||||
|
- P84 Diff-Mode (since-last-run delta)
|
||||||
|
- Final HTML composition
|
||||||
|
|
||||||
|
NOTE: in the original code `audit_quality_findings` was used by
|
||||||
|
build_gf_one_pager_html BEFORE it was initialised — a silent
|
||||||
|
UnboundLocalError caught by the surrounding try/except, so the
|
||||||
|
gf_one_pager block effectively never rendered. Here we run
|
||||||
|
audit-quality FIRST so the data is actually available.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_d3_bot(state: dict) -> None:
    """Render the bottom-of-mail blocks and assemble ``state["full_html"]``.

    Each optional block is best-effort: its builder is imported lazily inside
    a ``try`` and any exception is logged as a warning, which leaves that
    block as an empty string. Audit-quality checks run before the GF
    one-pager so their findings can be passed into it.

    Reads from ``state`` (required): ``check_id``, ``req``, ``doc_entries``,
    ``doc_texts``, ``banner_result``, ``cmp_vendors``, ``mc_split``,
    ``scorecard``, ``html_blocks``.
    Reads from ``state`` (optional): ``prev_scorecard``, ``mismatches``,
    ``site_name_for_exec``, ``domain_for_exec``, ``reachability_html``,
    ``retention_html``.
    Writes to ``state``: ``audit_quality_findings`` and ``full_html``.

    Raises:
        KeyError: if a required ``state`` entry, or one of the pre-rendered
            ``html_blocks`` entries used in the composition, is missing.
    """
    check_id = state["check_id"]
    req = state["req"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    cmp_vendors = state["cmp_vendors"]
    mc_split = state["mc_split"]
    scorecard = state["scorecard"]
    html_blocks = state["html_blocks"]
    prev_scorecard = state.get("prev_scorecard")
    mismatches = state.get("mismatches") or []
    site_name_for_exec = state.get("site_name_for_exec", "")
    domain_for_exec = state.get("domain_for_exec")

    # P71: joint-controllership vs. processor-agreement decision tree.
    jc_decision_html = ""
    try:
        from compliance.services.jc_avv_decision import (
            build_jc_avv_decision_html,
        )
        dse_text = doc_texts.get("dse")
        jc_decision_html = build_jc_avv_decision_html(dse_text)
    except Exception as jc_err:
        logger.warning("P71 jc_avv_decision skipped: %s", jc_err)

    # P6/P53/P55: industry context + site history.
    industry_ctx_html = ""
    try:
        from compliance.services.industry_library import (
            build_industry_context_block_html, load_site_profile,
        )
        from database import SessionLocal as _SLib
        industry_session = _SLib()
        try:
            scan_ctx = req.scan_context
            ind = scan_ctx.get("industry") if scan_ctx else None
            site_prof = load_site_profile(industry_session, domain_for_exec or "")
            industry_ctx_html = build_industry_context_block_html(ind, site_prof)
        finally:
            industry_session.close()
    except Exception as ind_err:
        logger.warning("industry context skipped: %s", ind_err)

    # P106: internal-checks block.
    internal_checks_html = ""
    try:
        from compliance.services.mc_audit_type import (
            build_internal_checks_block_html,
        )
        internal_checks = (mc_split or {}).get("internal_checks") or []
        if internal_checks:
            internal_checks_html = build_internal_checks_block_html(internal_checks)
            logger.info("P106: %d interne Checks (statt FAIL) im Block",
                        len(internal_checks))
    except Exception as ic_err:
        logger.warning("P106 internal_checks_html skipped: %s", ic_err)

    # P85: banner screenshot.
    banner_shot_html = ""
    try:
        from compliance.services.banner_screenshot_block import (
            build_banner_screenshot_html,
        )
        banner_shot_html = build_banner_screenshot_html(banner_result)
    except Exception as shot_err:
        logger.warning("P85 banner-screenshot skipped: %s", shot_err)

    # Audit-quality checks. These must run before the GF one-pager because
    # the one-pager receives audit_quality_findings as an argument.
    audit_quality_html = ""
    audit_quality_findings: list[dict] = []
    try:
        from compliance.services.audit_quality_checks import (
            build_audit_quality_block_html, run_all as run_audit_quality,
        )
        cookie_text_for_aq = doc_texts.get("cookie") or ""
        audit_quality_findings = run_audit_quality(
            banner_result, cookie_text_for_aq, cmp_vendors, doc_entries,
        )
        if audit_quality_findings:
            audit_quality_html = build_audit_quality_block_html(
                audit_quality_findings,
            )
            logger.info("audit-quality: %d Vorbehalte erkannt",
                        len(audit_quality_findings))
    except Exception as aq_err:
        logger.warning("audit-quality-checks failed: %s", aq_err)

    # P82: GF one-pager.
    gf_one_pager_html = ""
    try:
        from compliance.services.gf_one_pager import build_gf_one_pager_html
        gf_one_pager_html = build_gf_one_pager_html(
            site_name=site_name_for_exec,
            scorecard=scorecard,
            previous_scorecard=prev_scorecard,
            banner_result=banner_result,
            library_mismatch_findings=mismatches,
            scan_context=req.scan_context,
            audit_quality_findings=audit_quality_findings,
        )
    except Exception as gf_err:
        logger.warning("P82 GF-1-pager skipped: %s", gf_err)

    # Doc-input warnings: the user pasted text into the wrong field.
    input_warn_html = ""
    try:
        from compliance.services.doc_input_warnings import (
            build_warnings_block_html, collect_warnings,
        )
        input_warnings = collect_warnings(doc_entries)
        if input_warnings:
            input_warn_html = build_warnings_block_html(input_warnings)
            logger.info("doc-input-warnings: %d Mismatches gefunden",
                        len(input_warnings))
    except Exception as warn_err:
        logger.warning("doc-input-warnings skipped: %s", warn_err)

    # P86: industry benchmark.
    bench_html = ""
    try:
        from compliance.services.industry_benchmark import (
            _extract_score, build_benchmark_html, compute_benchmark,
        )
        from database import SessionLocal as _SLb
        scan_ctx = req.scan_context
        industry = scan_ctx.get("industry") if scan_ctx else None
        curr_score = _extract_score(banner_result)
        if industry and curr_score is not None:
            bench_session = _SLb()
            try:
                bench = compute_benchmark(
                    bench_session, industry, curr_score, check_id,
                )
                if bench:
                    bench_html = build_benchmark_html(bench)
            finally:
                bench_session.close()
    except Exception as bench_err:
        logger.warning("P86 industry-benchmark skipped: %s", bench_err)

    # P84: diff mode (delta since the last run).
    diff_html = ""
    try:
        from compliance.services.run_diff import (
            build_diff_block_html, compute_diff,
        )
        from database import SessionLocal as _SL
        diff_session = _SL()
        try:
            diff = compute_diff(
                diff_session, check_id, domain_for_exec or "",
                banner_result, scorecard,
            )
            if diff:
                diff_html = build_diff_block_html(diff)
        finally:
            diff_session.close()
    except Exception as diff_err:
        logger.warning("P84 diff-mode skipped: %s", diff_err)

    # B1 / B3 cross-cutting findings (rendered elsewhere, may be empty).
    reachability_html = state.get("reachability_html", "")
    retention_html = state.get("retention_html", "")

    # Block order is sales-optimised. B1 (reachability) sits next to the
    # critical block because it is an Art. 7(3) finding; B3 (retention) sits
    # next to the cookie audit because both compare cookie metadata sources.
    ordered_parts = (
        gf_one_pager_html,
        audit_quality_html,
        input_warn_html,
        bench_html,
        diff_html,
        html_blocks["critical_html"],
        reachability_html,
        html_blocks["scope_disclaimer_html"],
        html_blocks["exec_summary_html"],
        html_blocks["cookie_arch_html"],
        html_blocks["summary_html"],
        html_blocks["scanned_html"],
        html_blocks["profile_html"],
        html_blocks["scorecard_html"],
        internal_checks_html,
        html_blocks["redundancy_html"],
        industry_ctx_html,
        banner_shot_html,
        html_blocks["providers_html"],
        html_blocks["banner_deep_html"],
        html_blocks["cookie_audit_html"],
        retention_html,
        html_blocks["tcf_authority_html"],
        html_blocks["entropy_html"],
        html_blocks["network_trace_html"],
        html_blocks["library_mismatch_html"],
        html_blocks["consistency_html"],
        html_blocks["signals_html"],
        html_blocks["solutions_html"],
        jc_decision_html,
        html_blocks["vvt_html"],
        html_blocks["report_html"],
    )
    full_html = "".join(ordered_parts)

    state["audit_quality_findings"] = audit_quality_findings
    state["full_html"] = full_html
|
||||||
@@ -0,0 +1,221 @@
|
|||||||
|
"""Phase D-3-Mid — Mid HTML blocks (P62/P103/P104/P105/audit/mismatch/signals).
|
||||||
|
|
||||||
|
Covers (in the original Step 5):
|
||||||
|
- P62 Scope-Disclaimer
|
||||||
|
- P103 Cookie-Value-Entropy + P104 Network-Tracing
|
||||||
|
- P105 IAB TCF Authority cross-reference
|
||||||
|
- Cookie-Compliance-Audit (3-Quellen-Vergleich, central USP)
|
||||||
|
- P102 Cookie-Klassifikations-Pruefung (library mismatch)
|
||||||
|
- P35/P77/P78 Doc-Text signals
|
||||||
|
- P92/P94 Banner-Konsistenz
|
||||||
|
- P73 MC-Solution-Generator (LLM suggestions per HIGH-Fail)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_d3_mid(state: dict) -> None:
    """Render the mid-section HTML blocks and merge them into ``html_blocks``.

    Each block is best-effort: its builder is imported lazily inside a
    ``try`` and any exception is logged as a warning, which leaves that block
    as an empty string.

    Reads from ``state`` (all required): ``doc_entries``, ``doc_texts``,
    ``banner_result``, ``cmp_vendors``, ``fails_by_doc``, ``html_blocks``.
    Writes to ``state``: ``cookie_audit`` and ``mismatches``; additionally
    updates ``state["html_blocks"]`` in place with ``scope_disclaimer_html``,
    ``entropy_html``, ``network_trace_html``, ``tcf_authority_html``,
    ``cookie_audit_html``, ``library_mismatch_html``, ``signals_html``,
    ``consistency_html`` and ``solutions_html``.

    Raises:
        KeyError: if one of the required ``state`` entries is missing.
    """
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    cmp_vendors = state["cmp_vendors"]
    fails_by_doc = state["fails_by_doc"]
    html_blocks = state["html_blocks"]

    # P62: scope disclaimer.
    scope_disclaimer_html = ""
    try:
        from ..scope_disclaimer import build_scope_disclaimer_html
        scope_disclaimer_html = build_scope_disclaimer_html()
    except Exception as e:
        logger.warning("Scope-disclaimer block skipped: %s", e)

    # P103 + P104: cookie-value entropy + network tracing.
    entropy_html = ""
    network_trace_html = ""
    try:
        from compliance.services.cookie_network_tracer import (
            build_network_trace_block_html,
            trace_cookie_network,
        )
        from compliance.services.cookie_value_entropy import (
            build_entropy_block_html,
            check_cookies_for_entropy_mismatch,
        )
        cookies_detailed = (banner_result or {}).get("cookies_detailed") or []
        entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed)
        if entropy_findings:
            entropy_html = build_entropy_block_html(entropy_findings)
            logger.info("P103 Entropy: %d Findings", len(entropy_findings))
        # The first document entry that carries a URL is the primary URL.
        primary_url = next(
            (entry["url"] for entry in doc_entries if entry.get("url")), "",
        )
        net_findings = trace_cookie_network(cookies_detailed, primary_url)
        if net_findings:
            network_trace_html = build_network_trace_block_html(net_findings)
            logger.info("P104 Network-Trace: %d Findings", len(net_findings))
    except Exception as e:
        logger.warning("P103/P104 entropy/network-trace skipped: %s", e)

    # P105: IAB TCF authority cross-reference.
    tcf_authority_html = ""
    try:
        from compliance.services.tcf_vendor_authority import (
            build_tcf_authority_block_html, cross_reference_with_tcf,
        )
        from database import SessionLocal as _SLtcf
        _tcf_db = _SLtcf()
        try:
            tcf_findings = cross_reference_with_tcf(_tcf_db, cmp_vendors)
            if tcf_findings:
                tcf_authority_html = build_tcf_authority_block_html(tcf_findings)
                logger.info(
                    "TCF-Authority: %d Vendor-Discrepancies gefunden",
                    len(tcf_findings),
                )
        finally:
            _tcf_db.close()
    except Exception as e:
        logger.warning("TCF-Authority-Check skipped: %s", e)

    # Cookie compliance audit (three-source comparison).
    cookie_audit: dict = {}
    cookie_audit_html = ""
    try:
        from compliance.services.cookie_compliance_audit import (
            audit_cookie_compliance, build_cookie_audit_block_html,
        )
        from database import SessionLocal as _SLca
        _ca_db = _SLca()
        try:
            cookie_audit = audit_cookie_compliance(
                _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"),
                banner_result,
            )
            if cookie_audit and (cookie_audit.get("declared_count") or
                                 cookie_audit.get("browser_count")):
                cookie_audit_html = build_cookie_audit_block_html(cookie_audit)
                # The guard above only requires ONE of the two counters to be
                # truthy, so the other may be missing/None. "%d" cannot format
                # None (logging would drop the record), hence the "or 0".
                logger.info(
                    "Cookie-Audit: %d deklariert, %d im Browser, "
                    "%d undokumentiert, %d compliant",
                    cookie_audit.get("declared_count") or 0,
                    cookie_audit.get("browser_count") or 0,
                    len(cookie_audit.get("undeclared_in_browser") or []),
                    len(cookie_audit.get("compliant") or []),
                )
        finally:
            _ca_db.close()
    except Exception as e:
        logger.warning("cookie-compliance-audit skipped: %s", e)

    # P102: cookie classification check (library mismatch).
    library_mismatch_html = ""
    mismatches: list[dict] = []
    try:
        from compliance.services.cookie_library_mismatch import (
            build_mismatch_block_html, detect_mismatches,
        )
        from database import SessionLocal
        cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
        # Collect cookie names across all banner phases; a phase may list
        # cookies either as plain names or as dicts carrying a "name".
        all_cookies_seen: list[str] = []
        if banner_result:
            for ph in (banner_result.get("phases") or {}).values():
                if not isinstance(ph, dict):
                    continue
                for ck in (ph.get("cookies") or []):
                    if isinstance(ck, str):
                        all_cookies_seen.append(ck)
                    elif isinstance(ck, dict) and ck.get("name"):
                        all_cookies_seen.append(ck["name"])
        if all_cookies_seen and cookie_doc_for_check:
            _mm_db = SessionLocal()
            try:
                mismatches = detect_mismatches(
                    _mm_db, all_cookies_seen, cookie_doc_for_check,
                )
                if mismatches:
                    library_mismatch_html = build_mismatch_block_html(mismatches)
                    logger.info(
                        "P102: %d Cookie-Mismatches gefunden", len(mismatches),
                    )
            finally:
                _mm_db.close()
    except Exception as e:
        logger.warning("P102 mismatch detection failed: %s", e)

    # P35 + P77 + P78: document text-signal checks.
    signals_html = ""
    try:
        from compliance.services.doc_text_signals import (
            build_signals_block_html, run_all as run_signal_checks,
        )
        cookie_doc_missing = not bool(doc_texts.get("cookie"))
        sig_findings = run_signal_checks(
            banner_result, doc_texts, cookie_doc_missing,
        )
        if sig_findings:
            signals_html = build_signals_block_html(sig_findings)
    except Exception as e:
        logger.warning("P35/P77/P78 signals-check failed: %s", e)

    # P92 + P94: banner consistency.
    consistency_html = ""
    try:
        from compliance.services.banner_consistency_checks import (
            build_consistency_block_html, run_all as run_consistency_checks,
        )
        cookie_doc_for_check = (doc_texts.get("cookie")
                                or doc_texts.get("dse") or "")
        cons_findings = run_consistency_checks(
            banner_result or {}, cookie_doc_for_check, cmp_vendors,
            doc_texts=doc_texts,
        )
        if cons_findings:
            consistency_html = build_consistency_block_html(cons_findings)
            logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings))
    except Exception as e:
        logger.warning("P92/P94 consistency-check failed: %s", e)

    # P73: MC solution generator (LLM suggestions per failed check).
    solutions_html = ""
    try:
        from compliance.services.mc_solution_generator import (
            build_solutions_block_html, generate_solutions_for_fails,
        )
        all_solutions: list[dict] = []
        for dt, fails in fails_by_doc.items():
            if not fails:
                continue
            # Fall back to the DSE text when the doc type has no own text;
            # documents shorter than 500 characters are skipped.
            doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or ""
            if not doc_txt or len(doc_txt) < 500:
                continue
            sols = await generate_solutions_for_fails(
                fails, doc_txt, dt, limit=3,
            )
            all_solutions.extend(sols)
            # At most 8 solutions are rendered, so stop generating once reached.
            if len(all_solutions) >= 8:
                break
        if all_solutions:
            solutions_html = build_solutions_block_html(all_solutions[:8])
            logger.info("P73: %d MC-Solutions generiert", len(all_solutions))
    except Exception as e:
        logger.warning("P73 MC-Solution-Generator skipped: %s", e)

    html_blocks.update({
        "scope_disclaimer_html": scope_disclaimer_html,
        "entropy_html": entropy_html,
        "network_trace_html": network_trace_html,
        "tcf_authority_html": tcf_authority_html,
        "cookie_audit_html": cookie_audit_html,
        "library_mismatch_html": library_mismatch_html,
        "signals_html": signals_html,
        "consistency_html": consistency_html,
        "solutions_html": solutions_html,
    })
    state["cookie_audit"] = cookie_audit
    state["mismatches"] = mismatches
|
||||||
@@ -0,0 +1,198 @@
|
|||||||
|
"""Phase D-3-Top — Top-of-mail HTML blocks.
|
||||||
|
|
||||||
|
Covers (in the original Step 5 of `_run_compliance_check`):
|
||||||
|
- Summary / Scanned-URLs / Provider-list / Banner-deep / VVT HTML
|
||||||
|
- MC-scorecard aggregation (all_mc_checks + scorecard) + trend lookup
|
||||||
|
- P106 mc_audit_type split (internal_checks vs. verifiable_fails)
|
||||||
|
- Profile HTML / Redundancy HTML
|
||||||
|
- P1 Executive Summary
|
||||||
|
- P18 Critical Findings block
|
||||||
|
- P10 Cookie-Policy-Architecture detection
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from ._helpers import _build_profile_html, _company_name_from_url, _extract_domain
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_phase_d3_top(state: dict) -> None:
    """Render the top-of-mail HTML blocks and the MC scorecard.

    Reads from ``state`` (required): ``req``, ``results``, ``doc_entries``,
    ``doc_texts``, ``banner_result``, ``vvt_entries``, ``cmp_vendors``,
    ``profile``; optional: ``redundancy_report`` and ``cookie_payloads``.

    Writes to ``state``: ``scorecard``, ``prev_scorecard``, ``mc_split``,
    ``fails_by_doc``, ``site_name_for_exec``, ``domain_for_exec`` and a fresh
    ``html_blocks`` dict containing the twelve blocks rendered here.

    The summary, provider, banner, VVT, scorecard, report, profile,
    redundancy and exec-summary builders run unguarded, so their exceptions
    propagate. The P106 split, trend lookup, critical-findings and
    cookie-architecture steps are best-effort and only log on failure.
    """
    req = state["req"]
    results = state["results"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    vvt_entries = state["vvt_entries"]
    cmp_vendors = state["cmp_vendors"]
    profile = state["profile"]
    redundancy_report = state.get("redundancy_report")

    from ..agent_doc_check_banner import build_banner_deep_html
    from ..agent_doc_check_critical import build_critical_findings_html
    from ..agent_doc_check_exec_summary import build_exec_summary_html
    from ..agent_doc_check_extras import build_vvt_table_html
    from ..agent_doc_check_redundancy import build_redundancy_html
    from ..agent_doc_check_report import (
        build_html_report,
        build_management_summary,
        build_provider_list_html,
        build_scanned_urls_html,
    )
    from ..agent_doc_check_scorecard import build_scorecard_html
    from compliance.services.mc_scorecard import build_scorecard

    # Blocks are collected in render order; the dict is published to state
    # only at the very end, so a failure leaves state untouched.
    blocks: dict[str, str] = {}
    blocks["summary_html"] = build_management_summary(results)
    blocks["scanned_html"] = build_scanned_urls_html(doc_entries)
    blocks["providers_html"] = build_provider_list_html(banner_result, vvt_entries)
    # P18: deep block (phases + quality score + per-category tracker).
    blocks["banner_deep_html"] = build_banner_deep_html(banner_result)
    blocks["vvt_html"] = build_vvt_table_html(cmp_vendors)

    # MC scorecard input, aggregated across all documents. Only checks whose
    # id starts with "mc-" are considered.
    all_mc_checks: list[dict] = []
    fails_by_doc: dict[str, list[dict]] = {}
    for result in results:
        for check in result.checks:
            if not check.id.startswith("mc-"):
                continue
            record = {
                "id": check.id, "label": check.label, "passed": check.passed,
                "severity": check.severity, "skipped": check.skipped,
                "regulation": check.regulation,
                "hint": getattr(check, "hint", "") or "",
            }
            all_mc_checks.append(record)
            if check.passed or check.skipped:
                continue
            if (check.severity or "").upper() not in ("CRITICAL", "HIGH"):
                continue
            fails_by_doc.setdefault(result.doc_type, []).append(record)

    # P106: audit-type classification per MC.
    # NOTE(review): on success this replaces the per-doc-type grouping built
    # above with a single "dse" bucket holding every "verifiable_fails"
    # entry. Confirm this is intended, since the fallback path keeps the
    # per-doc grouping.
    mc_split: dict = {"internal_checks": [], "verifiable_fails": all_mc_checks}
    try:
        from compliance.services.mc_audit_type import (
            annotate_mc_results, split_by_audit_type,
        )
        annotate_mc_results(all_mc_checks)
        mc_split = split_by_audit_type(all_mc_checks)
        fails_by_doc = {}
        verifiable = mc_split.get("verifiable_fails") or []
        if verifiable:
            fails_by_doc["dse"] = list(verifiable)
    except Exception as split_err:
        logger.warning("P106 mc_audit_type skipped: %s", split_err)

    if all_mc_checks:
        scorecard = build_scorecard(all_mc_checks)
    else:
        scorecard = {}

    # Trend: previous scorecard for the same tenant + domain (best-effort).
    prev_scorecard: dict | None = None
    if scorecard:
        try:
            from compliance.services.compliance_audit_log import (
                list_runs_for_tenant,
            )
            tenant_id_for_trend = req.recipient or ""
            base_domain_for_trend = _extract_domain(doc_entries) or ""
            prev_runs = list_runs_for_tenant(
                tenant_id_for_trend,
                base_domain=base_domain_for_trend,
                limit=1,
            )
            if prev_runs:
                prev_scorecard = prev_runs[0].get("scorecard")
        except Exception as trend_err:
            logger.debug("trend lookup skipped: %s", trend_err)

    if scorecard:
        scorecard_html = build_scorecard_html(
            scorecard, previous_scorecard=prev_scorecard,
        )
    else:
        scorecard_html = ""
    blocks["scorecard_html"] = scorecard_html

    blocks["report_html"] = build_html_report(results, None, doc_texts)
    blocks["profile_html"] = _build_profile_html(profile)

    # O4: vendor redundancy / EU alternatives + cost-savings block.
    blocks["redundancy_html"] = build_redundancy_html(redundancy_report)

    # P1: executive summary. The site label prefers the URL-derived company
    # name and falls back to the bare domain.
    url_company_for_exec = _company_name_from_url(doc_entries)
    domain_for_exec = _extract_domain(doc_entries)
    site_name_for_exec = url_company_for_exec or domain_for_exec or ""
    blocks["exec_summary_html"] = build_exec_summary_html(
        scorecard=scorecard,
        previous_scorecard=prev_scorecard,
        cmp_vendors=cmp_vendors,
        redundancy_report=redundancy_report,
        site_name=site_name_for_exec,
    )

    # P18: critical-findings block (best-effort).
    critical_html = ""
    try:
        critical_html = build_critical_findings_html(
            banner_result=banner_result,
            scorecard=scorecard,
            results=results,
        )
    except Exception as crit_err:
        logger.warning("Critical-findings block skipped: %s", crit_err)
    blocks["critical_html"] = critical_html

    # P10: cookie-policy architecture detection (best-effort).
    cookie_arch_html = ""
    try:
        from compliance.services.cookie_policy_architecture import (
            build_architecture_html,
            detect_architecture,
        )
        cookie_doc_url = ""
        cookie_doc_text = doc_texts.get("cookie", "")
        cookie_cmp_payloads: list[dict] = []
        for entry in doc_entries:
            if (entry.get("doc_type") or "").lower() in ("cookie", "cookie_policy"):
                cookie_doc_url = entry.get("url", "")
                cookie_cmp_payloads = entry.get("cmp_payloads") or []
                break
        # P17-A: without a cookie text, fall back to the DSE when it
        # mentions cookie/tracking keywords.
        if not cookie_doc_text:
            dse_text = doc_texts.get("dse", "")
            keywords = ("cookie", "tracking", "google analytics", "consent")
            if dse_text and any(kw in dse_text.lower() for kw in keywords):
                cookie_doc_text = dse_text
                dse_entry = next(
                    (d for d in doc_entries if d.get("doc_type") == "dse"), {},
                )
                cookie_doc_url = dse_entry.get("url", "")
                cookie_cmp_payloads = dse_entry.get("cmp_payloads") or []
                logger.info("P17-A: cookie-arch fallback auf DSE")
        if cookie_doc_text:
            arch = detect_architecture(
                doc_url=cookie_doc_url,
                doc_text=cookie_doc_text,
                cmp_payloads=cookie_cmp_payloads,
                homepage_cmp_payloads=state.get("cookie_payloads") or [],
            )
            cookie_arch_html = build_architecture_html(arch)
            logger.info("cookie-arch: layer=%s versioned=%s risk=%s",
                        arch["layer_separation"], arch["versioned"],
                        arch["risk_label"])
    except Exception as arch_err:
        logger.warning("cookie-architecture detection failed: %s", arch_err)
    blocks["cookie_arch_html"] = cookie_arch_html

    state["scorecard"] = scorecard
    state["prev_scorecard"] = prev_scorecard
    state["mc_split"] = mc_split
    state["fails_by_doc"] = fails_by_doc
    state["site_name_for_exec"] = site_name_for_exec
    state["domain_for_exec"] = domain_for_exec
    state["html_blocks"] = blocks
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
"""Phase E — Send compliance-check email, with A1 ZIP-Anhang.
|
||||||
|
|
||||||
|
Original Step 6 of `_run_compliance_check`, extended with the A1
|
||||||
|
attachment: when the Tesseract pipeline captured evidence slices,
|
||||||
|
bundle them into evidence-{check_id}.zip (manifest.json +
|
||||||
|
audit_metadata.json + slice_NNN.png) and attach to the e-mail. The
|
||||||
|
attachment makes the evidence chain portable so a DSB / lawyer can
|
||||||
|
hand it to an external auditor or supervisory authority.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from compliance.services.smtp_sender import send_email
|
||||||
|
|
||||||
|
from ._helpers import _company_name_from_url, _extract_domain, _update
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase_e(state: dict) -> None:
    """Send the compliance-check e-mail and record the outcome in ``state``.

    Reads ``check_id``, ``req``, ``results``, ``doc_entries`` and
    ``full_html`` (all required) plus the optional
    ``cookie_evidence_slices`` / ``cookie_evidence_meta``.

    Writes ``email_result``, ``site_name``, ``domain`` and ``doc_count``.
    """
    check_id = state["check_id"]
    req = state["req"]
    results = state["results"]
    doc_entries = state["doc_entries"]
    full_html = state["full_html"]
    slices = state.get("cookie_evidence_slices")
    slices_meta = state.get("cookie_evidence_meta")

    # The site label comes from the entered URL on purpose: the
    # extracted_profile.companyName is often noisy (e.g. it picks up
    # juris.de from legal references), while a domain-derived name stays
    # predictable for the e-mail subject.
    doc_count = sum(1 for r in results if not r.error)
    url_company = _company_name_from_url(doc_entries)
    domain = _extract_domain(doc_entries)
    site_name = url_company or domain or "Unbekannt"
    _update(check_id, "E-Mail wird versendet...", 98)

    attachments = _evidence_attachments(check_id, slices, slices_meta)

    state["email_result"] = send_email(
        recipient=req.recipient,
        subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft",
        body_html=full_html,
        attachments=attachments or None,
    )
    state["site_name"] = site_name
    state["domain"] = domain
    state["doc_count"] = doc_count


def _evidence_attachments(check_id, slices, meta) -> list[dict]:
    """A1: bundle the cookie-evidence slices into one ZIP attachment.

    Returns a list with a single attachment dict, or an empty list when
    there are no slices or the ZIP could not be built. Building is
    best-effort: any failure is logged and the mail goes out without the
    attachment.
    """
    if not slices:
        return []
    try:
        from compliance.services.evidence_zip_builder import (
            build_evidence_zip,
        )
        archive = build_evidence_zip(
            slices=slices,
            meta=meta,
            check_id=check_id,
        )
        return [{
            "filename": f"evidence-{check_id[:8]}.zip",
            "data": archive,
            "mime": "application/zip",
        }]
    except Exception as e:
        logger.warning("A1 evidence-zip build failed: %s", e)
        return []
|
||||||
@@ -0,0 +1,166 @@
|
|||||||
|
"""Phase F — Build response + persist snapshot/audit-log/unified-findings.
|
||||||
|
|
||||||
|
Covers (in the original `_run_compliance_check`):
|
||||||
|
- Step 7 Build response dict, mark job as completed
|
||||||
|
- P80 Persist raw scan data so we can replay the audit pipeline
|
||||||
|
without re-crawling (7min → 5sec test cycle)
|
||||||
|
- SQLite audit log (compliance.api/audit endpoints + trend view A6)
|
||||||
|
- P5 Unified findings (MC + Pflichtangaben + Vendor + Redundanz
|
||||||
|
in one searchable table behind /agent/findings/<id>)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ._constants import _compliance_check_jobs
|
||||||
|
from ._helpers import _result_to_dict
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase_f(state: dict) -> None:
    """Build the API response, mark the job completed, then persist.

    Order of effects:
      1. build the response dict from ``state``
      2. flip the in-memory job record to ``completed``
      3. best-effort snapshot save (P80)
      4. best-effort audit-log + unified-findings persistence (P5)
      5. store the response under ``state["response"]``

    Persistence failures are logged and never affect the response.
    """
    check_id = state["check_id"]
    req = state["req"]
    results = state["results"]
    profile = state["profile"]
    profile_dict = state["profile_dict"]
    extracted_profile = state["extracted_profile"]
    banner_result = state["banner_result"]
    tcf_vendors = state["tcf_vendors"]
    vvt_entries = state["vvt_entries"]
    cmp_vendors = state["cmp_vendors"]
    cookie_audit = state["cookie_audit"]
    total_findings = state["total_findings"]
    email_result = state["email_result"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    redundancy_report = state.get("redundancy_report")
    scorecard = state["scorecard"]
    site_name = state.get("site_name", "")
    domain = state.get("domain", "")
    doc_count = state.get("doc_count", 0)

    serialized_results = [_result_to_dict(r) for r in results]

    # P18: pass the full consent-tester output through instead of only
    # four fields. phases (before / after-accept / reject),
    # banner_checks.violations and category_tests are consumed by the
    # renderer and the critical-findings block.
    banner_summary = None
    if banner_result:
        violations = (banner_result.get("banner_checks") or {}).get("violations", [])
        banner_summary = {
            "detected": banner_result.get("banner_detected", False),
            "provider": banner_result.get("banner_provider", ""),
            "violations": len(violations),
            "tcf_vendor_count": len(tcf_vendors),
            "completeness_pct": banner_result.get("completeness_pct"),
            "correctness_pct": banner_result.get("correctness_pct"),
            "phases": banner_result.get("phases", {}),
            "banner_checks": banner_result.get("banner_checks", {}),
            "category_tests": banner_result.get("category_tests", []),
            "structured_checks": banner_result.get("structured_checks", []),
            "summary": banner_result.get("summary", {}),
        }

    response = {
        "check_id": check_id,
        "results": serialized_results,
        "business_profile": profile_dict,
        "extracted_profile": extracted_profile,
        "banner_result": banner_summary,
        "tcf_vendors": vvt_entries if tcf_vendors else [],
        "cmp_vendors": cmp_vendors,
        "cookie_audit": cookie_audit or None,
        "total_documents": len(results),
        "total_findings": total_findings,
        "email_status": email_result.get("status", "failed"),
        "checked_at": datetime.now(timezone.utc).isoformat(),
    }

    job = _compliance_check_jobs[check_id]
    job["status"] = "completed"
    job["result"] = response
    job["progress"] = "Fertig"
    job["progress_pct"] = 100

    _persist_snapshot(
        check_id=check_id,
        req=req,
        doc_entries=doc_entries,
        banner_result=banner_result,
        profile=profile,
        cmp_vendors=cmp_vendors,
        site_name=site_name,
    )
    _persist_audit_log(
        check_id=check_id,
        req=req,
        results=results,
        site_name=site_name,
        domain=domain,
        doc_count=doc_count,
        scorecard=scorecard,
        cmp_vendors=cmp_vendors,
        extracted_profile=extracted_profile,
        banner_result=banner_result,
        redundancy_report=redundancy_report,
        doc_texts=doc_texts,
    )

    state["response"] = response


def _persist_snapshot(
    *, check_id, req, doc_entries, banner_result, profile, cmp_vendors,
    site_name,
) -> None:
    """P80: save the raw scan data so the audit pipeline can be replayed
    without re-crawling. Best-effort; failures are logged only."""
    try:
        from database import SessionLocal
        from compliance.services.check_snapshot import save_snapshot
        session = SessionLocal()
        try:
            save_snapshot(
                session,
                check_id=check_id,
                doc_entries=doc_entries,
                banner_result=banner_result,
                profile=profile,
                cmp_vendors=cmp_vendors,
                scan_context=req.scan_context,  # P79
                site_label=site_name,
                notes=f"recipient={req.recipient}",
            )
        finally:
            session.close()
    except Exception as snap_err:
        logger.warning("P80 snapshot save skipped: %s", snap_err)


def _persist_audit_log(
    *, check_id, req, results, site_name, domain, doc_count, scorecard,
    cmp_vendors, extracted_profile, banner_result, redundancy_report,
    doc_texts,
) -> None:
    """Write the run to the sidecar audit log and the unified-findings
    store. Best-effort; failures are logged and do not reach the caller."""

    def _vendors_of_type(*recipient_types: str) -> int:
        # Count vendors whose (case-insensitive) recipient_type is listed.
        return sum(
            1 for v in cmp_vendors
            if (v.get("recipient_type") or "").upper() in recipient_types
        )

    try:
        from compliance.services.compliance_audit_log import record_check_run
        from compliance.services.mc_scorecard import full_audit_records

        mc_rows: list[dict] = []
        for doc_result in results:
            # Only Master-Control checks ("mc-" prefix) go into the log.
            mc_checks = [
                {"id": c.id, "label": c.label, "passed": c.passed,
                 "severity": c.severity, "skipped": c.skipped,
                 "regulation": c.regulation, "matched_text": c.matched_text,
                 "hint": c.hint, "level": c.level}
                for c in doc_result.checks
                if c.id.startswith("mc-")
            ]
            mc_rows.extend(full_audit_records(
                mc_checks,
                check_id=check_id,
                doc_type=doc_result.doc_type,
            ))

        record_check_run(
            check_id=check_id,
            tenant_id=req.recipient or "",
            site_name=site_name,
            base_domain=domain or "",
            doc_count=doc_count,
            scorecard=scorecard,
            vvt_summary={
                "total": len(cmp_vendors),
                "internal": _vendors_of_type("INTERNAL", "GROUP_COMPANY"),
                "external": _vendors_of_type("PROCESSOR", "CONTROLLER"),
            },
            mc_records=mc_rows,
        )

        # Imported only after record_check_run has run, as in the original
        # flow, so a missing symbol cannot prevent the run record.
        from compliance.services.compliance_audit_log import record_check_payload
        record_check_payload(
            check_id=check_id,
            vendors=cmp_vendors,
            profile=extracted_profile,
            banner=banner_result,
        )

        # Unified findings (P5): bundle MC + mandatory-disclosure + vendor
        # + redundancy findings in one searchable table behind
        # /agent/findings/<id>. Has its own guard so a failure here is
        # reported separately from the audit-log steps above.
        try:
            from compliance.services.unified_findings_collector import collect
            from compliance.services.unified_findings_store import record_findings
            unified = collect(
                check_id=check_id,
                results=results,
                cmp_vendors=cmp_vendors,
                redundancy_report=redundancy_report,
                doc_texts=doc_texts,
            )
            record_findings(check_id, unified)
        except Exception as e:
            logger.warning("Unified findings collect failed: %s", e)
    except Exception as e:
        logger.warning("Audit persistence skipped: %s", e)
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
"""Pydantic request/response schemas for the compliance-check route."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
# Request body carrying a single URL.
# NOTE(review): presumably consumed by a text-extraction endpoint — the
# route itself is not visible in this module; confirm against the router.
class ExtractTextRequest(BaseModel):
    url: str
|
||||||
|
|
||||||
|
|
||||||
|
# One document submitted for checking: a type tag plus either a URL or the
# pasted text. Both `url` and `text` default to the empty string.
class DocumentInput(BaseModel):
    doc_type: str  # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc.
    url: str = ""
    text: str = ""  # text has priority over URL
|
||||||
|
|
||||||
|
|
||||||
|
# Request body that starts a compliance-check run.
class ComplianceCheckRequest(BaseModel):
    documents: list[DocumentInput]
    use_agent: bool = False
    # Address the report e-mail is sent to.
    recipient: str = "dsb@breakpilot.local"
    # P12: override for the TDM reservation when the customer's permission
    # is documented. tdm_override_reason is meant to be mandatory when
    # tdm_override=True
    # (e.g. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026").
    # NOTE(review): this model does not enforce that rule itself — confirm
    # it is validated where the override is applied.
    tdm_override: bool = False
    tdm_override_reason: str = ""
    # P79: 8-field pre-scan wizard (industry, B2B/B2C, direct sales, legal
    # form, corporate group, employee count, special-category data, third
    # country). Persisted in the snapshot and used to filter the MC
    # evaluation (P72).
    scan_context: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Response returned when a check run has been started: the job id plus an
# initial status that defaults to "running".
class ComplianceCheckStartResponse(BaseModel):
    check_id: str
    status: str = "running"
|
||||||
|
|
||||||
|
|
||||||
|
# Polling response for a check run. The fields correspond to the keys of
# the in-memory job record (status / progress / progress_pct / result).
class ComplianceCheckStatusResponse(BaseModel):
    check_id: str
    status: str
    progress: str = ""  # human-readable progress label
    progress_pct: int = 0
    result: dict | None = None  # full response dict once the run completed
    error: str = ""
|
||||||
@@ -0,0 +1,118 @@
|
|||||||
|
"""Per-document regex + MC + LLM checks for the compliance-check route.
|
||||||
|
|
||||||
|
Each document goes through:
|
||||||
|
1. regex completeness/correctness checklist
|
||||||
|
2. Master Control evaluation (all MCs for this doc_type)
|
||||||
|
3. LLM verification of failed regex checks (overturns where evidence
|
||||||
|
was missed by the regex)
|
||||||
|
4. Cookie-only: opt-out + privacy-policy URL health-check
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_single(
    text: str, doc_type: str, label: str, url: str,
    word_count: int, use_agent: bool,
    business_scope: set[str] | None = None,
    business_profile: dict | None = None,
):
    """Run all per-document checks and return one ``DocCheckResult``.

    Stages (each later stage is best-effort and only logs on failure):
      1. regex completeness/correctness checklist
      2. Master-Control evaluation for ``doc_type``
      3. LLM verification of failed checks that carry a hint
      4. cookie documents only: HTTP health-check of advertised links

    ``correctness`` is re-derived from the active level-2 checks whenever
    a stage adds or overturns checks.
    """
    from compliance.services.doc_checks.runner import check_document_completeness
    from compliance.services.rag_document_checker import check_document_with_controls
    from ..agent_doc_check_routes import CheckItem, DocCheckResult

    all_checks: list[CheckItem] = []

    def _l2_pass_rate() -> int | None:
        """Percentage of passed, non-skipped level-2 checks (None if none)."""
        active = [c for c in all_checks if c.level == 2 and not c.skipped]
        if not active:
            return None
        passed = sum(1 for c in active if c.passed)
        return round(passed / len(active) * 100)

    # Stage 1: regex checklist
    findings = check_document_completeness(text, doc_type, label, url,
                                           business_profile=business_profile)
    completeness = 0
    correctness = 0
    for finding in findings:
        if "SCORE" not in finding.get("code", ""):
            continue
        for raw in finding.get("all_checks", []):
            all_checks.append(CheckItem(
                id=raw["id"], label=raw["label"], passed=raw["passed"],
                severity=raw["severity"], matched_text=raw.get("matched_text", ""),
                level=raw.get("level", 1), parent=raw.get("parent"),
                skipped=raw.get("skipped", False), hint=raw.get("hint", ""),
            ))
        completeness = finding.get("completeness_pct", 0)
        correctness = finding.get("correctness_pct", 0)

    # Stage 2: Master-Control checks.
    # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has 1874
    # across 8 types; regex matching is cheap and stays well under 1s per
    # doc). Caps remain on the LLM-enrich step (top-10 FAILs) so cost
    # stays bounded.
    try:
        mc_results = await check_document_with_controls(
            text, doc_type, label, max_controls=0, use_agent=use_agent,
            business_scope=business_scope,
        )
        if mc_results:
            all_checks.extend(CheckItem(**mc) for mc in mc_results)
            rate = _l2_pass_rate()
            # Unlike the later stages, this one falls back to 0 when no
            # level-2 check is active.
            correctness = rate if rate is not None else 0
    except Exception as e:
        logger.warning("MC check skipped for %s: %s", label, e)

    # Stage 3: LLM verification of failed checks
    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
    if failed:
        try:
            from compliance.services.doc_checks.llm_verify import verify_failed_checks
            overturns = await verify_failed_checks(
                text,
                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
                label,
            )
            for check in all_checks:
                if check.id in overturns and overturns[check.id]["overturned"]:
                    check.passed = True
                    check.matched_text = f"[LLM] {overturns[check.id]['evidence']}"
            rate = _l2_pass_rate()
            if rate is not None:
                correctness = rate
        except Exception as e:
            logger.warning("LLM verification skipped: %s", e)

    # Stage 4 (cookie policy only): actively HTTP-probe the opt-out and
    # privacy-policy URLs the document advertises. Broken links make
    # individual provider entries non-compliant under Art. 7(3) DSGVO.
    if doc_type == "cookie":
        try:
            from compliance.services.cookie_link_validator import (
                extract_links, validate_links, build_check_items,
            )
            links = extract_links(text)
            if links:
                logger.info("Cookie-link validator: %d urls extracted from %s",
                            len(links), label)
                validated = await validate_links(links)
                all_checks.extend(
                    CheckItem(**item) for item in build_check_items(validated)
                )
                # Re-compute correctness with the new L2 items
                rate = _l2_pass_rate()
                if rate is not None:
                    correctness = rate
        except Exception as e:
            logger.warning("Cookie-link validation skipped for %s: %s", label, e)

    findings_count = sum(1 for f in findings if "SCORE" not in f.get("code", ""))
    return DocCheckResult(
        label=label, url=url, doc_type=doc_type,
        word_count=word_count or len(text.split()),
        completeness_pct=completeness, correctness_pct=correctness,
        checks=all_checks, findings_count=findings_count,
    )
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
"""Shared state for the compliance-check pipeline.
|
||||||
|
|
||||||
|
The 7-step pipeline accumulates ~60 named values that flow across
|
||||||
|
phases (doc_entries, profile, results, banner_result, cmp_vendors,
|
||||||
|
scorecard, HTML blocks, …). Rather than threading 60 parameters
|
||||||
|
through each function, we pass one mutable `CheckState` dict.
|
||||||
|
|
||||||
|
Phases read what they need with `state[key]` and write their outputs
|
||||||
|
with `state[key] = value`. This is intentionally untyped: enforcing
|
||||||
|
strict typing would require freezing the schema before all phases
|
||||||
|
landed, and the report-building phase routinely adds new optional
|
||||||
|
keys (P1, P10, P50, P59b, P82, P103, P104, P106, …).
|
||||||
|
|
||||||
|
`new_state(check_id, req)` initialises the dict with the few
|
||||||
|
keys that must exist from the start.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def new_state(check_id: str, req) -> dict:
    """Return a fresh, fully independent state dict for one check run.

    Besides ``check_id`` and ``req`` the dict is seeded with neutral
    defaults for the keys downstream phases read without checking for
    presence (e.g. ``cmp_vendors`` as ``[]``). Every container is created
    anew on each call, so runs never share mutable state.
    """
    state: dict = {"check_id": check_id, "req": req}

    # Phase-1 outputs
    state.update(
        doc_texts={},
        doc_entries=[],
        url_text_cache={},
        pasted_table_vendors=[],
        placement_findings=[],
    )
    # Phase-2/3/4 outputs
    state.update(
        profile=None,
        profile_dict={},
        results=[],
        total_findings=0,
        business_scope=set(),
        banner_result=None,
        banner_url="",
        tcf_vendors=[],
        vvt_entries=[],
        extracted_profile={},
    )
    # Phase-5 outputs
    state.update(
        cmp_vendors=[],
        cookie_audit={},
        cookie_evidence_slices=None,
        cookie_evidence_meta=None,
        scorecard={},
        full_html="",
        audit_quality_findings=[],
    )
    # Phase-6/7 outputs
    state.update(
        email_result={"status": "skipped"},
        site_name="",
    )
    return state
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,278 @@
|
|||||||
|
"""
|
||||||
|
B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings.
|
||||||
|
|
||||||
|
DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as
|
||||||
|
easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and
|
||||||
|
DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly
|
||||||
|
reachable way to change cookie preferences — typically a Footer link
|
||||||
|
labelled "Cookie-Einstellungen" that re-opens the CMP in place.
|
||||||
|
|
||||||
|
Common anti-patterns we want to flag:
|
||||||
|
- Footer points to a Cookie-Policy *page* in a new tab, no CMP
|
||||||
|
- Footer only offers "more info" but no "manage settings"
|
||||||
|
- Only mention is a verbal reference to browser settings inside the
|
||||||
|
privacy-policy text
|
||||||
|
- Mobile footer hides the link in a multi-level accordion
|
||||||
|
|
||||||
|
This module does the STATIC HTML analysis. The dynamic part (mobile
|
||||||
|
viewport rendering, tap-target measurement, click-behaviour
|
||||||
|
verification) is performed by consent-tester via Playwright and feeds
|
||||||
|
back into `evaluate_combined` in a later phase.
|
||||||
|
|
||||||
|
Pure module — no DB, no network. Tests live in
|
||||||
|
tests/test_consent_reachability_check.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Phrases that suggest "open the consent manager" rather than "show
# more info / open a policy page". All entries are lower-case because
# they are matched against a lower-cased label.
_REOPEN_PHRASES = (
    "cookie-einstellungen", "cookie einstellungen",
    "cookie-präferenzen", "cookie praeferenzen", "cookie-praferenzen",
    "cookie-einwilligung", "einwilligung verwalten",
    "consent manager", "consent settings", "consent-einstellungen",
    "datenschutz-einstellungen", "datenschutzeinstellungen",
    "cookies verwalten", "manage cookies", "manage preferences",
    "privacy settings", "privacy preferences",
    "tracking-einstellungen",
    # English counterparts of "Cookie-Einstellungen" — the most common
    # label on English-language footers; without them such sites were
    # reported as having no withdrawal link at all.
    "cookie settings", "cookie-settings", "cookies settings",
    "cookie preferences",
)

# Weaker — these usually point at a policy page, not the CMP itself.
_INFO_ONLY_PHRASES = (
    "cookie-richtlinie", "cookie richtlinie", "cookie-policy",
    "cookie policy", "cookies (information)",
    "datenschutz", "datenschutzerklärung", "privacy policy",
    "weitere informationen", "more information",
)

# Phrases that try to shift the burden to the user's browser — the
# data-protection authorities of the German federal states explicitly
# call this insufficient.
_BROWSER_DEFLECTION_PHRASES = (
    "browser-einstellungen", "browsereinstellungen",
    "einstellungen ihres browsers", "browser settings",
    "in ihrem browser", "über ihren browser",
)
|
||||||
|
|
||||||
|
|
||||||
|
class _AnchorCollector(HTMLParser):
    """Collect <a> and <button> elements that live in the page footer.

    An element counts as "in the footer" when it is a descendant of a
    footer container — a <footer> tag, an element with
    role="contentinfo", or an element whose id/class contains "footer" —
    or when the <a>/<button> itself carries "footer" in its id/class
    (e.g. class="footer-link").

    Footer scope is tracked with a stack of open tags so that it ends at
    the container's own closing tag, whatever element opened it. The
    results are available in ``anchors`` after ``feed()``; each entry has
    the keys tag, href, target, aria_label, data_cmp, onclick, id, class
    and text (whitespace-collapsed, at most 200 characters).
    """

    # Elements without an end tag; pushing them on the scope stack would
    # leave entries that are never popped.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })
    _CLICKABLE_TAGS = ("a", "button")

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Open tags since (and including) the outermost footer container.
        # Empty means "currently outside any footer".
        self._scope_stack: list[str] = []
        self._current: dict | None = None
        self._text_chunks: list[str] = []
        self.anchors: list[dict] = []

    @property
    def _footer_depth(self) -> int:
        """Nesting depth inside the footer scope; 0 when outside."""
        return len(self._scope_stack)

    def _is_footer_open(self, tag: str, attrs: dict) -> bool:
        """Return True when the element is marked as (part of) a footer."""
        if tag == "footer":
            return True
        if attrs.get("role", "").lower() == "contentinfo":
            return True
        ident = (attrs.get("id", "") + " " + attrs.get("class", "")).lower()
        return "footer" in ident

    def handle_starttag(self, tag, attrs):
        a = {k.lower(): (v or "") for k, v in attrs}
        clickable = tag in self._CLICKABLE_TAGS
        in_footer = bool(self._scope_stack)

        if in_footer:
            if tag not in self._VOID_TAGS:
                self._scope_stack.append(tag)
        elif not clickable and self._is_footer_open(tag, a):
            # A container opens the footer scope. Links and buttons never
            # do: a class such as "footer-link" marks the element itself
            # as a footer anchor and must not swallow it.
            if tag not in self._VOID_TAGS:
                self._scope_stack.append(tag)
            return

        if clickable and (in_footer or self._is_footer_open(tag, a)):
            self._current = {
                "tag": tag,
                "href": a.get("href", ""),
                "target": a.get("target", ""),
                "aria_label": a.get("aria-label", ""),
                "data_cmp": a.get("data-cmp", ""),
                "onclick": a.get("onclick", ""),
                "id": a.get("id", ""),
                "class": a.get("class", ""),
            }
            self._text_chunks = []

    def handle_endtag(self, tag):
        if self._current is not None and tag == self._current["tag"]:
            txt = " ".join(self._text_chunks).strip()
            self._current["text"] = re.sub(r"\s+", " ", txt)[:200]
            self.anchors.append(self._current)
            self._current = None
            self._text_chunks = []

        if tag in self._scope_stack:
            # Close the innermost matching element and, leniently, every
            # unclosed child above it (e.g. <li> without </li>). When the
            # footer container itself closes, the stack becomes empty and
            # the footer scope ends.
            innermost = (
                len(self._scope_stack) - 1 - self._scope_stack[::-1].index(tag)
            )
            del self._scope_stack[innermost:]

    def handle_data(self, data):
        if self._current is not None:
            self._text_chunks.append(data)
|
||||||
|
|
||||||
|
|
||||||
|
def find_consent_anchors_in_footer(html: str) -> list[dict]:
    """Return the footer <a>/<button> elements that relate to cookie or
    consent management, each tagged with an ``intent``.

    The intent is derived from the lower-cased concatenation of the
    element's text, aria-label, data-cmp and onclick values and is the
    first match in this priority order:

        "reopen_cmp"      — a phrase from _REOPEN_PHRASES
        "browser_deflect" — a phrase from _BROWSER_DEFLECTION_PHRASES
        "info_only"       — a phrase from _INFO_ONLY_PHRASES

    Elements matching none of the phrase lists are dropped. Empty input,
    or HTML the parser cannot handle, yields an empty list (a parser
    failure is logged as a warning).

    Each returned dict has the keys
        tag, href, target, text, aria_label, data_cmp, onclick, id,
        class, intent
    """
    if not html:
        return []

    collector = _AnchorCollector()
    try:
        collector.feed(html)
    except Exception as e:  # malformed HTML — log and report no anchors
        logger.warning("footer parser failed: %s", e)
        return []

    # Checked top to bottom; the first rule that matches wins.
    intent_rules = (
        ("reopen_cmp", _REOPEN_PHRASES),
        ("browser_deflect", _BROWSER_DEFLECTION_PHRASES),
        ("info_only", _INFO_ONLY_PHRASES),
    )
    label_fields = ("text", "aria_label", "data_cmp", "onclick")

    related: list[dict] = []
    for anchor in collector.anchors:
        haystack = " ".join(anchor.get(f, "") for f in label_fields).lower()
        intent = next(
            (name for name, phrases in intent_rules
             if any(phrase in haystack for phrase in phrases)),
            None,
        )
        if intent is None:
            continue
        anchor["intent"] = intent
        related.append(anchor)
    return related
|
||||||
|
|
||||||
|
|
||||||
|
def classify_anchor_target(
    anchor: dict, base_url: str,
) -> str:
    """Decide whether the anchor would open the CMP in place or
    navigate elsewhere.

    Args:
        anchor: anchor dict; the keys ``href``, ``target``, ``onclick``
            and ``data_cmp`` are read, missing keys count as empty.
        base_url: URL of the page the anchor was found on; may be empty.

    Returns (first matching rule wins):
        "same_page_cmp" — has an onclick / data-cmp handler, or the href
                          is empty or a hash-only link
        "javascript"    — javascript: link, probably a CMP trigger
        "new_tab"       — target="_blank"
        "same_origin"   — relative link or same-site page (still a
                          navigation away from the live banner)
        "external"      — link to a different host
    """
    href = (anchor.get("href") or "").strip()
    target = (anchor.get("target") or "").strip().lower()
    onclick = anchor.get("onclick", "") or ""
    data_cmp = anchor.get("data_cmp", "") or ""

    if data_cmp or onclick:
        return "same_page_cmp"
    # URI schemes are case-insensitive ("JavaScript:void(0)" is common).
    if href.lower().startswith("javascript:"):
        return "javascript"
    if target == "_blank":
        return "new_tab"
    if not href or href.startswith("#"):
        return "same_page_cmp"

    def _site_host(url: str) -> str:
        # Host in lower case without a leading "www.", so the apex domain
        # and its www alias are treated as the same site.
        host = urlparse(url).netloc.lower()
        return host[4:] if host.startswith("www.") else host

    # urlparse / urljoin raise ValueError on malformed hosts (e.g. a
    # broken IPv6 literal); treat an unparsable URL as "host unknown".
    try:
        base_host = _site_host(base_url) if base_url else ""
    except Exception:
        base_host = ""
    try:
        target_host = _site_host(urljoin(base_url or "/", href))
    except Exception:
        target_host = ""

    if not target_host or target_host == base_host:
        return "same_origin"
    return "external"
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_reachability(
    footer_html: str,
    base_url: str = "",
) -> dict:
    """Run the static reachability analysis on a footer HTML fragment.

    Returns the finding dict for the COOKIE-CONSENT-UX-001 check with the
    keys check_id, anchors_total, has_reopen_anchor, info_only_count,
    browser_deflect_count, reopen_anchor, passed, severity,
    severity_reason, evidence_phrases and notes.

    Verdict:
      * no "reopen_cmp" anchor            → failed, HIGH / "missing";
        if browser-deflection anchors exist the reason is upgraded to
        "factually_wrong"
      * first reopen anchor opens a new tab or an external host
                                          → failed, MEDIUM / "misclassified"
      * otherwise                         → passed

    Every reopen anchor gets a ``target_class`` entry; only the first one
    is reported and judged.
    """
    anchors = find_consent_anchors_in_footer(footer_html)

    reopen_candidates = [a for a in anchors if a.get("intent") == "reopen_cmp"]
    for candidate in reopen_candidates:
        candidate["target_class"] = classify_anchor_target(candidate, base_url)
    first_reopen = reopen_candidates[0] if reopen_candidates else None

    info_only_count = sum(1 for a in anchors if a.get("intent") == "info_only")
    browser_deflect_count = sum(
        1 for a in anchors if a.get("intent") == "browser_deflect"
    )

    passed = True
    severity = None
    severity_reason = None
    notes: list[str] = []

    if first_reopen is None:
        # Hard fail: no reopen anchor at all → withdrawal is not as easy
        # as opt-in (Art. 7 Abs. 3 DSGVO).
        passed = False
        severity = "HIGH"
        severity_reason = "missing"
        notes.append(
            "no consent-manager link in footer; withdrawal path "
            "missing or only indirect"
        )
        if browser_deflect_count > 0:
            # Worst variant: the only withdrawal route offered is the
            # browser settings — explicitly flagged by the LfDI BaWü.
            severity_reason = "factually_wrong"
            notes.append(
                "withdrawal route only via browser-settings — not gleich "
                "einfach wie Erteilung"
            )
    else:
        # Soft fail: the anchor exists but breaks context — DSK OH calls
        # this an avoidable hurdle. MEDIUM rather than HIGH because
        # withdrawal is technically still possible.
        soft_fail_notes = {
            "new_tab": "consent-manager link opens in new tab — context-break",
            "external": "consent-manager link points to external host",
        }
        note = soft_fail_notes.get(first_reopen.get("target_class"))
        if note is not None:
            passed = False
            severity = "MEDIUM"
            severity_reason = "misclassified"
            notes.append(note)

    return {
        "check_id": "COOKIE-CONSENT-UX-001",
        "anchors_total": len(anchors),
        "has_reopen_anchor": first_reopen is not None,
        "info_only_count": info_only_count,
        "browser_deflect_count": browser_deflect_count,
        "reopen_anchor": first_reopen,
        "passed": passed,
        "severity": severity,
        "severity_reason": severity_reason,
        "evidence_phrases": [],
        "notes": notes,
    }
|
||||||
@@ -0,0 +1,119 @@
|
|||||||
|
"""
|
||||||
|
Evidence ZIP Builder — bundles cookie-evidence slices into one ZIP
|
||||||
|
suitable as email attachment for the audit trail.
|
||||||
|
|
||||||
|
Why: capture_cookie_evidence_slices() produces N PNG slices per check
|
||||||
|
with timestamps + per-slice SHA256. Without an attachment to the
|
||||||
|
compliance report, the evidence chain stops at the backend. The ZIP
|
||||||
|
makes the slices portable so a DSB / lawyer can hand them to an
|
||||||
|
auditor or supervisory authority.
|
||||||
|
|
||||||
|
ZIP layout:
|
||||||
|
evidence.zip
|
||||||
|
├── manifest.json # per-slice metadata
|
||||||
|
├── audit_metadata.json # run-level (check_id, url, build_sha, ...)
|
||||||
|
└── slice_001.png ... # binary PNG per slice
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def build_evidence_zip(
    slices: list[dict],
    meta: dict | None = None,
    check_id: str = "",
) -> bytes:
    """Build a ZIP archive with all slices + a manifest.

    Every usable slice is written as ``slice_NNN.png`` (NNN = idx + 1) and
    described in ``manifest.json``; run-level data goes into
    ``audit_metadata.json``. A slice that cannot be used is skipped with a
    warning instead of aborting the whole archive. That covers entries
    that are not dicts, undecodable or empty base64 payloads, and entries
    whose target filename was already written.

    Args:
        slices: list of dicts from capture_cookie_evidence_slices():
            each {"idx", "ts", "top_y", "bot_y", "sha256", "png_b64",
            "png_size"}. A missing or non-numeric "idx" falls back to
            the slice's position in the list.
        meta: run-level dict from the same call:
            {"total_height_px", "width_px", "accepted_banner",
            "expanded", "url", "captured_at", "slice_count"}
        check_id: the compliance-check job id

    Returns:
        raw ZIP bytes (suitable as email attachment payload)
    """
    run_meta = meta or {}
    buf = io.BytesIO()
    manifest_slices: list[dict] = []
    written_names: set[str] = set()

    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for position, s in enumerate(slices or []):
            if not isinstance(s, dict):
                logger.warning(
                    "evidence_zip: skip slice at position %d, not a dict",
                    position,
                )
                continue

            # A broken index must not take the whole archive down; the
            # list position is the best remaining ordering information.
            try:
                idx = int(s.get("idx", position))
            except (TypeError, ValueError):
                idx = position
            fname = f"slice_{idx + 1:03d}.png"

            # binascii.Error is a ValueError subclass; TypeError covers
            # payloads that are neither str nor bytes-like.
            try:
                png = base64.b64decode(s.get("png_b64") or "")
            except (TypeError, ValueError) as e:
                logger.warning(
                    "evidence_zip: skip slice %s, b64 decode failed: %s",
                    idx, e,
                )
                continue

            # A zero-byte "PNG" is not evidence; listing it in the
            # manifest would claim a screenshot that does not exist.
            if not png:
                logger.warning(
                    "evidence_zip: skip slice %s, empty PNG payload", idx,
                )
                continue

            # Two members with the same name make the archive ambiguous
            # on extraction, so only the first one is kept.
            if fname in written_names:
                logger.warning(
                    "evidence_zip: skip slice %s, duplicate filename %s",
                    idx, fname,
                )
                continue
            written_names.add(fname)

            zf.writestr(fname, png)
            manifest_slices.append({
                "filename": fname,
                "slice_idx": idx,
                "captured_at": s.get("ts", ""),
                "top_y_px": s.get("top_y"),
                "bot_y_px": s.get("bot_y"),
                "sha256_short": s.get("sha256", ""),
                "png_size_bytes": s.get("png_size", len(png)),
            })

        manifest = {
            "schema_version": "1.0",
            "check_id": check_id,
            "slices": manifest_slices,
            "slice_count": len(manifest_slices),
        }
        zf.writestr(
            "manifest.json",
            json.dumps(manifest, indent=2, ensure_ascii=False),
        )

        audit_meta = {
            "schema_version": "1.0",
            "check_id": check_id,
            "build_sha": os.environ.get("BUILD_SHA", "unknown"),
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "source_url": run_meta.get("url", ""),
            "captured_at": run_meta.get("captured_at", ""),
            "accepted_banner": run_meta.get("accepted_banner"),
            "expanded": run_meta.get("expanded"),
            "total_height_px": run_meta.get("total_height_px"),
            "width_px": run_meta.get("width_px"),
            "slice_count": run_meta.get(
                "slice_count", len(manifest_slices),
            ),
            "note": (
                "Each slice_NNN.png is an overlapping screenshot fragment "
                "of the cookie policy page captured at captured_at. "
                "sha256_short is the first 16 hex chars of the SHA-256 of "
                "the raw PNG bytes — use it to verify the slice was not "
                "modified after capture."
            ),
        }
        zf.writestr(
            "audit_metadata.json",
            json.dumps(audit_meta, indent=2, ensure_ascii=False),
        )

    data = buf.getvalue()
    logger.info(
        "evidence_zip built: %d slices, %d bytes, check_id=%s",
        len(manifest_slices), len(data), check_id,
    )
    return data
|
||||||
@@ -0,0 +1,362 @@
|
|||||||
|
"""
|
||||||
|
B3 — Cross-Doc Retention Consistency Comparator.
|
||||||
|
|
||||||
|
Compares three sources of truth for cookie storage duration:
|
||||||
|
|
||||||
|
1. DSI claim — sentence(s) in the privacy policy mentioning retention
|
||||||
|
("Die Speicherdauer beträgt 14 Monate", "_ga: 14 Monate", ...).
|
||||||
|
2. Cookie-table — the `duration` field parsed from the cookie policy
|
||||||
|
table (parse_flat_cookie_text / OCR / vendor-extract).
|
||||||
|
3. Actual cookie — `Max-Age` / `Expires` from the real Set-Cookie
|
||||||
|
header captured by the consent-tester.
|
||||||
|
|
||||||
|
Output is a per-cookie finding usable by the audit report:
|
||||||
|
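- matches=True → no mismatch was observed between the available sources (within tolerance); this includes findings with fewer than two usable sources (severity_reason="incomplete")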
|
||||||
|
- matches=False → mismatch with explicit type + severity_reason
|
||||||
|
|
||||||
|
Severity hierarchy (see project_audit_report_architecture.md):
|
||||||
|
HIGH/factually_wrong : DSI claim is shorter than reality
|
||||||
|
→ user is told "X" but tracked for longer
|
||||||
|
HIGH/factually_wrong : table duration is shorter than reality
|
||||||
|
→ cookie table understates what is set
|
||||||
|
MEDIUM/misclassified : DSI and table disagree with each other, in either direction (internal docs disagree)
|
||||||
|
LOW/incomplete : only one source has data, or the table declares a longer lifetime than the live cookie has (possible Safari ITP cap)
|
||||||
|
|
||||||
|
The module is pure (no DB, no network) and meant to be called from the
|
||||||
|
report pipeline after cookies+DSI+HAR have already been collected.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 5% tolerance — Safari ITP, leap years, server clocks etc.
_MATCH_TOLERANCE_PCT = 5

# Multipliers in DAYS for the German + English unit vocabulary used in
# our cookie tables and policies (a month counts as 30 days, a year as
# 365). German dative plurals ("nach 6 Monaten") and English long forms
# ("24 hours") are listed explicitly: the duration regexes below are
# generated from these keys, so a unit missing here is never recognised.
_UNIT_DAYS: dict[str, float] = {
    "sekunden": 1 / 86400, "sekunde": 1 / 86400,
    "seconds": 1 / 86400, "second": 1 / 86400,
    "sec": 1 / 86400, "s": 1 / 86400,
    "minuten": 1 / 1440, "minute": 1 / 1440,
    "minutes": 1 / 1440, "min": 1 / 1440,
    "stunden": 1 / 24, "stunde": 1 / 24,
    "hours": 1 / 24, "hour": 1 / 24, "h": 1 / 24,
    "tagen": 1, "tage": 1, "tag": 1, "d": 1, "day": 1, "days": 1,
    "wochen": 7, "woche": 7, "week": 7, "weeks": 7,
    "monaten": 30, "monate": 30, "monat": 30, "month": 30, "months": 30,
    "jahren": 365, "jahre": 365, "jahr": 365, "year": 365, "years": 365,
}


def _unit_alternation(units) -> str:
    """Return a regex alternation of *units*, longest spelling first."""
    ordered = sorted(units, key=len, reverse=True)
    return "|".join(re.escape(u) for u in ordered)


# "14 Monate", "1 Jahr", "24h", "30 Tage", "365 Tage", "30d"
_DURATION_RE = re.compile(
    r"(?P<num>\d+(?:[.,]\d+)?)\s*(?P<unit>"
    + _unit_alternation(_UNIT_DAYS)
    + r")\b"
)

# Sentence-level "is there a number followed by a unit" probe. The bare
# unit "s" is deliberately left out: in lower-cased German legal text a
# citation like "Abs. 1 S. 1" would otherwise read as "1 s".
_HAS_DURATION_RE = re.compile(
    r"\b\d[\d.,]*\s*(?:"
    + _unit_alternation(u for u in _UNIT_DAYS if u != "s")
    + r")\b"
)

# Phrases that mean "session" — cookie deleted when browser closes.
_SESSION_TOKENS = {
    "session", "sitzung", "sitzungsdauer", "browsersitzung",
    "browser session", "browsing session", "tab",
}

# Phrases that mean "persistent without explicit cap".
_NO_EXPIRY_TOKENS = {
    "unbegrenzt", "unbestimmt", "kein ablaufdatum",
    "no expiry", "persistent", "permanent",
}

# Tokens up to this length only count as a whole word. Longer tokens
# keep plain substring matching so German compounds ("Sitzungsende",
# "Benutzersitzung") still hit.
_WHOLE_WORD_TOKEN_MAX_LEN = 3


@dataclass
class RetentionClaim:
    """One retention statement found in the DSI text."""
    sentence: str
    days: float | None  # None for session/unknown
    is_session: bool
    is_persistent: bool
    context_terms: list[str]  # cookie names / provider names mentioned nearby


def _contains_token(text: str, tokens) -> bool:
    """Return True when any of *tokens* occurs in the lower-cased *text*.

    Very short tokens such as "tab" must stand alone as a word; as raw
    substrings they would also match "tabelle" or "etabliert".
    """
    for tok in tokens:
        if len(tok) <= _WHOLE_WORD_TOKEN_MAX_LEN:
            if re.search(rf"(?<!\w){re.escape(tok)}(?!\w)", text):
                return True
        elif tok in text:
            return True
    return False


def parse_duration_to_days(text: str) -> tuple[float | None, str]:
    """Convert a duration phrase to days.

    Returns (days, kind) where kind ∈
    {"days", "session", "persistent", "unknown"}.
    For "session" / "persistent" days is None — comparisons must
    handle these as special cases, not as 0 or infinity.

    Session and no-expiry phrases are checked before numeric durations.
    When the text contains several "<number> <unit>" candidates, the
    first one with a spelled-out unit wins over single-letter units, so
    "Art. 6 Abs. 1 S. 1 ... 14 Monate" yields 14 months, not 1 second.
    """
    if text is None:
        return None, "unknown"
    # str() keeps non-string table values (e.g. a bare int) from raising.
    s = str(text).strip().lower()
    if not s:
        return None, "unknown"

    if _contains_token(s, _SESSION_TOKENS):
        return None, "session"
    if _contains_token(s, _NO_EXPIRY_TOKENS):
        return None, "persistent"

    candidates = list(_DURATION_RE.finditer(s))
    if not candidates:
        return None, "unknown"
    # Single-letter units (s/h/d) are only trusted when nothing better
    # is present; a lone "24h" or "30 d" is still parsed as before.
    m = next(
        (c for c in candidates if len(c.group("unit")) > 1),
        candidates[0],
    )
    num = float(m.group("num").replace(",", "."))
    mult = _UNIT_DAYS.get(m.group("unit"))
    if mult is None:
        return None, "unknown"
    return num * mult, "days"


def max_age_to_days(max_age_seconds: int | float | None) -> float | None:
    """Convert a Set-Cookie Max-Age (in seconds) to days.

    Returns None when the value is None or cannot be converted to float.
    """
    if max_age_seconds is None:
        return None
    try:
        seconds = float(max_age_seconds)
    except (TypeError, ValueError):
        return None
    return seconds / 86400.0


# Sentence splitter that respects German legal text style (lots of
# semicolons + parentheses but few capitalised abbreviations).
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+(?=[A-ZÄÖÜ])")

# Quick anchor terms for retention sentences.
_RETENTION_ANCHORS = (
    "speicherdauer", "speicherfrist", "speicher",
    "aufbewahrungsdauer", "aufbewahrungsfrist",
    "löschfrist", "löschung",
    "gespeichert für", "wird gespeichert", "wird für",
    "retention", "expires", "expiration", "lifetime",
    "gültigkeit", "laufzeit",
)


def _looks_like_retention(sentence: str) -> bool:
    """Return True when *sentence* plausibly states a retention period.

    Two conditions must hold: the sentence contains one of the retention
    anchor terms, and it contains a number followed by a duration unit.
    """
    s = sentence.lower()
    if not any(a in s for a in _RETENTION_ANCHORS):
        return False
    # Need a unit token nearby — otherwise it's metadata not duration.
    return bool(_HAS_DURATION_RE.search(s))
|
||||||
|
|
||||||
|
|
||||||
|
def _mentions_term(term: str, lowered_sentence: str) -> bool:
    """Return True when *term* occurs in *lowered_sentence* as a whole token.

    The term must not be glued to further word characters on either
    side, so "_ga" does not match inside "_gat" or "_ga_ABC", and a
    vendor called "Meta" does not match inside "Metadaten".
    """
    pattern = rf"(?<!\w){re.escape(term.lower())}(?!\w)"
    return re.search(pattern, lowered_sentence) is not None


def extract_retention_claims(
    dsi_text: str,
    cookie_names: list[str] | None = None,
    vendor_names: list[str] | None = None,
) -> list[RetentionClaim]:
    """Find sentences in the DSI that state a retention period.

    cookie_names / vendor_names attach themselves to a sentence when
    they are mentioned in it as a whole token (case-insensitive); the
    comparator uses this to prefer the most specific claim available
    for a given cookie.

    Args:
        dsi_text: full privacy-policy text; empty or None yields [].
        cookie_names: cookie names to look for in each retention sentence.
        vendor_names: vendor names to look for in each retention sentence.

    Returns:
        One RetentionClaim per sentence that passes
        _looks_like_retention, in document order. The cited sentence is
        truncated to 400 characters; context_terms lists matching cookie
        names first, then matching vendor names.
    """
    if not dsi_text:
        return []
    cookie_names = cookie_names or []
    vendor_names = vendor_names or []
    # Normalise — keep original case for the sentence so it can be
    # cited verbatim in the audit report.
    claims: list[RetentionClaim] = []
    for raw in _SENTENCE_SPLIT.split(dsi_text):
        s = raw.strip()
        if not s or not _looks_like_retention(s):
            continue
        days, kind = parse_duration_to_days(s)
        lower = s.lower()
        contexts = [
            name
            for name in (*cookie_names, *vendor_names)
            if name and _mentions_term(name, lower)
        ]
        claims.append(RetentionClaim(
            sentence=s[:400],
            days=days,
            is_session=(kind == "session"),
            is_persistent=(kind == "persistent"),
            context_terms=contexts,
        ))
    return claims
|
||||||
|
|
||||||
|
|
||||||
|
def _best_dsi_claim(
    claims: list[RetentionClaim],
    cookie_name: str,
    vendor_name: str | None,
) -> RetentionClaim | None:
    """Pick the most specific DSI claim for a given cookie.

    Priority: claim that mentions the cookie name > claim that mentions
    the vendor > generic (no context).

    Returns None when no tier matches. A claim whose context_terms name
    only other cookies or vendors is never returned: attributing it to
    this cookie would compare the cookie against a retention statement
    made about something else.
    """
    if not claims:
        return None
    if cookie_name:
        for claim in claims:
            if cookie_name in claim.context_terms:
                return claim
    if vendor_name:
        for claim in claims:
            if vendor_name in claim.context_terms:
                return claim
    # Generic statements ("Die Speicherdauer beträgt ...") apply to any
    # cookie; first one in document order wins.
    return next((c for c in claims if not c.context_terms), None)
|
||||||
|
|
||||||
|
|
||||||
|
def _within_tolerance(a: float, b: float) -> bool:
    """Return True when *a* and *b* differ by at most the match tolerance.

    The allowed difference is _MATCH_TOLERANCE_PCT percent of the larger
    absolute value; two zeros always match.
    """
    largest = max(abs(a), abs(b))
    if largest == 0:
        # Both inputs are zero — nothing to scale the tolerance by.
        return True
    allowed = largest * (_MATCH_TOLERANCE_PCT / 100.0)
    return abs(a - b) <= allowed
|
||||||
|
|
||||||
|
|
||||||
|
def compare_retention(
    cookie_name: str,
    table_duration: str | None,
    actual_max_age_seconds: int | float | None,
    dsi_claims: list[RetentionClaim] | None = None,
    vendor_name: str | None = None,
) -> dict:
    """Per-cookie three-way retention comparison.

    Returns a finding dict suitable for the audit-report aggregator
    (theme = TH-RETENTION). Output schema is stable — extending it must
    be additive so existing tests stay green.

    The checks run in descending severity and the first one that fires
    decides the finding:
      1. dsi_under_actual   (HIGH)   DSI claims less than the cookie lives
      2. table_under_actual (HIGH)   table states less than the cookie lives
      3. dsi_vs_table       (MEDIUM) DSI and table disagree
      4. actual_under_table (LOW)    table states more than the cookie lives
    With fewer than two usable sources the finding is returned as
    LOW/incomplete and keeps matches=True.
    """
    table_days, table_kind = parse_duration_to_days(table_duration or "")
    actual_days = max_age_to_days(actual_max_age_seconds)
    claim = _best_dsi_claim(dsi_claims or [], cookie_name, vendor_name)
    dsi_days = claim.days if claim else None

    finding: dict = {
        "cookie_name": cookie_name,
        "vendor_name": vendor_name,
        "table_duration_raw": table_duration,
        "table_days": table_days,
        "table_kind": table_kind,
        "actual_max_age_seconds": actual_max_age_seconds,
        "actual_days": actual_days,
        "dsi_days": dsi_days,
        "dsi_sentence": claim.sentence if claim else None,
        "dsi_context_terms": claim.context_terms if claim else [],
        "matches": True,
        "mismatch_type": None,
        "severity_reason": None,
        "severity": None,
        "diff_days": None,
        "notes": [],
    }

    usable = [d for d in (table_days, actual_days, dsi_days) if d is not None]
    if len(usable) < 2:
        finding["severity_reason"] = "incomplete"
        finding["severity"] = "LOW"
        finding["notes"].append("only_one_source_has_data")
        return finding

    def differs(x: float | None, y: float | None) -> bool:
        # True only when both values exist and are outside the tolerance.
        return (
            x is not None
            and y is not None
            and not _within_tolerance(x, y)
        )

    # (mismatch_type, severity_reason, severity, diff_days)
    if differs(dsi_days, actual_days) and dsi_days < actual_days:
        # DSI claim is shorter than the cookie actually lives — user
        # was misled.
        verdict = (
            "dsi_under_actual", "factually_wrong", "HIGH",
            actual_days - dsi_days,
        )
    elif differs(table_days, actual_days) and table_days < actual_days:
        # Cookie table understates reality — second highest.
        verdict = (
            "table_under_actual", "factually_wrong", "HIGH",
            actual_days - table_days,
        )
    elif differs(dsi_days, table_days):
        # Internal disagreement DSI vs. table (less severe — both are
        # documentation, neither contradicts the live cookie).
        verdict = (
            "dsi_vs_table", "misclassified", "MEDIUM",
            abs(dsi_days - table_days),
        )
    elif differs(table_days, actual_days) and table_days > actual_days:
        # Over-declaration — table says "2 years" but cookie lives
        # 7 days (Safari ITP). Less severe but worth flagging.
        finding["notes"].append("possible_safari_itp_cap")
        verdict = (
            "actual_under_table", "incomplete", "LOW",
            table_days - actual_days,
        )
    else:
        return finding

    finding["matches"] = False
    (
        finding["mismatch_type"],
        finding["severity_reason"],
        finding["severity"],
        finding["diff_days"],
    ) = verdict
    return finding
|
||||||
|
|
||||||
|
|
||||||
|
def build_retention_theme_summary(
    findings: list[dict],
) -> dict:
    """Aggregate per-cookie findings into the per-theme block used by
    the report (theme = TH-RETENTION).

    Each finding lands in exactly one bucket:
      failed     — matches is falsy (a mismatch was observed, whatever
                   its severity_reason)
      incomplete — matches is truthy but severity_reason == "incomplete"
                   (not enough sources to verify anything)
      passed     — matches is truthy and the finding is not incomplete

    Returns a dict with the bucket counts, "pct" (share of passed
    findings, rounded to an int, 0 for an empty input), per-severity and
    per-mismatch-type counts, and "top_fails": up to ten
    factually_wrong mismatches, largest diff_days first.
    """
    total = len(findings)
    passed = failed = incomplete = 0
    by_severity: dict[str, int] = {}
    by_type: dict[str, int] = {}

    for f in findings:
        # An observed mismatch is a failure even when its
        # severity_reason is "incomplete" (actual_under_table); only
        # findings that kept matches=True can be "not verifiable".
        if not f.get("matches"):
            failed += 1
        elif f.get("severity_reason") == "incomplete":
            incomplete += 1
        else:
            passed += 1

        sev = f.get("severity")
        if sev:
            by_severity[sev] = by_severity.get(sev, 0) + 1
        mt = f.get("mismatch_type")
        if mt:
            by_type[mt] = by_type.get(mt, 0) + 1

    worst = sorted(
        (
            f for f in findings
            if not f.get("matches")
            and f.get("severity_reason") == "factually_wrong"
        ),
        key=lambda f: f.get("diff_days") or 0,
        reverse=True,
    )

    return {
        "theme_id": "TH-RETENTION",
        "total": total,
        "passed": passed,
        "failed": failed,
        "incomplete": incomplete,
        "pct": int(round(100 * passed / total)) if total else 0,
        "by_severity": by_severity,
        "by_mismatch_type": by_type,
        "top_fails": worst[:10],
    }
|
||||||
@@ -8,9 +8,13 @@ Uses standard smtplib. Configuration via environment variables:
|
|||||||
SMTP_FROM_ADDR (default: compliance@breakpilot.local)
|
SMTP_FROM_ADDR (default: compliance@breakpilot.local)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import smtplib
|
import smtplib
|
||||||
|
from email import encoders
|
||||||
|
from email.mime.base import MIMEBase
|
||||||
from email.mime.multipart import MIMEMultipart
|
from email.mime.multipart import MIMEMultipart
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
|
|
||||||
@@ -28,22 +32,54 @@ def send_email(
|
|||||||
body_html: str,
|
body_html: str,
|
||||||
from_addr: str | None = None,
|
from_addr: str | None = None,
|
||||||
from_name: str | None = None,
|
from_name: str | None = None,
|
||||||
|
attachments: list[dict] | None = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Send an email via SMTP. Returns dict with status and message_id."""
|
"""Send an email via SMTP. Returns dict with status and message_id.
|
||||||
|
|
||||||
|
attachments: optional list of dicts:
|
||||||
|
[{"filename": "evidence.zip", "data": <bytes>,
|
||||||
|
"mime": "application/zip"}, ...]
|
||||||
|
"""
|
||||||
sender_addr = from_addr or SMTP_FROM_ADDR
|
sender_addr = from_addr or SMTP_FROM_ADDR
|
||||||
sender_name = from_name or SMTP_FROM_NAME
|
sender_name = from_name or SMTP_FROM_NAME
|
||||||
|
|
||||||
msg = MIMEMultipart("alternative")
|
if attachments:
|
||||||
|
msg = MIMEMultipart("mixed")
|
||||||
|
body = MIMEMultipart("alternative")
|
||||||
|
body.attach(MIMEText(body_html, "html", "utf-8"))
|
||||||
|
msg.attach(body)
|
||||||
|
for att in attachments:
|
||||||
|
mime = att.get("mime", "application/octet-stream")
|
||||||
|
maintype, _, subtype = mime.partition("/")
|
||||||
|
part = MIMEBase(maintype or "application", subtype or "octet-stream")
|
||||||
|
part.set_payload(att.get("data", b""))
|
||||||
|
encoders.encode_base64(part)
|
||||||
|
fname = att.get("filename", "attachment.bin")
|
||||||
|
part.add_header(
|
||||||
|
"Content-Disposition",
|
||||||
|
f'attachment; filename="{fname}"',
|
||||||
|
)
|
||||||
|
msg.attach(part)
|
||||||
|
else:
|
||||||
|
msg = MIMEMultipart("alternative")
|
||||||
|
msg.attach(MIMEText(body_html, "html", "utf-8"))
|
||||||
|
|
||||||
msg["From"] = f"{sender_name} <{sender_addr}>"
|
msg["From"] = f"{sender_name} <{sender_addr}>"
|
||||||
msg["To"] = recipient
|
msg["To"] = recipient
|
||||||
msg["Subject"] = subject
|
msg["Subject"] = subject
|
||||||
msg.attach(MIMEText(body_html, "html", "utf-8"))
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=10) as server:
|
with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=30) as server:
|
||||||
server.sendmail(sender_addr, [recipient], msg.as_string())
|
server.sendmail(sender_addr, [recipient], msg.as_string())
|
||||||
logger.info("Email sent to %s: %s", recipient, subject)
|
att_count = len(attachments or [])
|
||||||
return {"status": "sent", "recipient": recipient, "subject": subject}
|
logger.info(
|
||||||
|
"Email sent to %s: %s (attachments=%d)",
|
||||||
|
recipient, subject, att_count,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"status": "sent", "recipient": recipient, "subject": subject,
|
||||||
|
"attachments": att_count,
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to send email to %s: %s", recipient, e)
|
logger.error("Failed to send email to %s: %s", recipient, e)
|
||||||
return {"status": "failed", "recipient": recipient, "error": str(e)}
|
return {"status": "failed", "recipient": recipient, "error": str(e)}
|
||||||
|
|||||||
@@ -0,0 +1,153 @@
|
|||||||
|
"""Tests for B1 static consent-reachability analysis."""
|
||||||
|
|
||||||
|
from compliance.services.consent_reachability_check import (
|
||||||
|
classify_anchor_target,
|
||||||
|
evaluate_reachability,
|
||||||
|
find_consent_anchors_in_footer,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap(footer_inner: str) -> str:
    """Return a minimal HTML page whose <footer> contains *footer_inner*."""
    page_head = "<html><body><main>some content</main>"
    page_tail = "</body></html>"
    return f"{page_head}<footer>{footer_inner}</footer>{page_tail}"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindConsentAnchors:
    """find_consent_anchors_in_footer: footer detection + intent labels."""

    def test_finds_reopen_link_german(self):
        """An onclick link labelled "Cookie-Einstellungen" reopens the CMP."""
        markup = _wrap(
            '<a href="#" onclick="UC_UI.showSecondLayer()">'
            'Cookie-Einstellungen</a>'
        )
        found = find_consent_anchors_in_footer(markup)
        assert len(found) == 1
        assert found[0]["intent"] == "reopen_cmp"

    def test_finds_reopen_button(self):
        """A <button> with data-cmp counts as a reopen control too."""
        markup = _wrap('<button data-cmp="show">Cookies verwalten</button>')
        found = find_consent_anchors_in_footer(markup)
        assert found[0]["intent"] == "reopen_cmp"

    def test_info_only_link_to_policy(self):
        """A plain link to the cookie policy is information, not a control."""
        markup = _wrap('<a href="/cookie-richtlinie">Cookie-Richtlinie</a>')
        found = find_consent_anchors_in_footer(markup)
        assert len(found) == 1
        assert found[0]["intent"] == "info_only"

    def test_browser_deflection_link(self):
        """Pointing users at their browser settings is a deflection."""
        markup = _wrap('<a href="/cookies">Browser-Einstellungen</a>')
        found = find_consent_anchors_in_footer(markup)
        assert found[0]["intent"] == "browser_deflect"

    def test_ignores_anchors_outside_footer(self):
        """Consent links in the page body must not be reported."""
        markup = (
            '<html><body>'
            '<a href="#">Cookie-Einstellungen</a>'
            '<footer><a href="/impressum">Impressum</a></footer>'
            '</body></html>'
        )
        assert find_consent_anchors_in_footer(markup) == []

    def test_role_contentinfo_treated_as_footer(self):
        """role="contentinfo" marks a footer even without a <footer> tag."""
        markup = (
            '<html><body>'
            '<div role="contentinfo">'
            '<a href="#" data-cmp="open">Cookie-Einstellungen</a>'
            '</div></body></html>'
        )
        found = find_consent_anchors_in_footer(markup)
        assert len(found) == 1

    def test_class_with_footer_treated_as_footer(self):
        """A container whose class contains "footer" is treated as footer."""
        markup = (
            '<html><body>'
            '<div class="site-footer">'
            '<a href="#" data-cmp="open">Cookies verwalten</a>'
            '</div></body></html>'
        )
        found = find_consent_anchors_in_footer(markup)
        assert len(found) == 1

    def test_empty_html(self):
        """Empty input yields an empty result."""
        assert find_consent_anchors_in_footer("") == []

    def test_malformed_html(self):
        """Broken markup must not raise; the result content is unspecified."""
        found = find_consent_anchors_in_footer("<footer><a>foo")
        assert isinstance(found, list)
|
||||||
|
|
||||||
|
|
||||||
|
class TestClassifyAnchorTarget:
    """classify_anchor_target: where does a footer consent link lead?"""

    def test_onclick_classifies_as_cmp(self):
        """An onclick handler means the CMP opens on the same page."""
        anchor = {"href": "#", "onclick": "showCmp()"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "same_page_cmp"

    def test_data_cmp_classifies_as_cmp(self):
        """A data_cmp attribute means the CMP opens on the same page."""
        anchor = {"href": "#", "data_cmp": "show"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "same_page_cmp"

    def test_javascript_link(self):
        """A javascript: href gets its own class."""
        anchor = {"href": "javascript:void(0)"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "javascript"

    def test_new_tab(self):
        """target=_blank is classified as new_tab."""
        anchor = {"href": "/cookie", "target": "_blank"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "new_tab"

    def test_hash_only(self):
        """A fragment-only href stays on the same page."""
        anchor = {"href": "#cookies"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "same_page_cmp"

    def test_same_origin_relative(self):
        """A relative path resolves to the same origin."""
        anchor = {"href": "/cookie-richtlinie"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "same_origin"

    def test_external_origin(self):
        """An absolute URL on another host is external."""
        anchor = {"href": "https://other.de/policy"}
        kind = classify_anchor_target(anchor, "https://x.de/")
        assert kind == "external"
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvaluateReachability:
    """evaluate_reachability: overall verdict for COOKIE-CONSENT-UX-001."""

    def test_pass_when_reopen_in_same_page(self):
        """A same-page reopen control passes without a severity."""
        markup = _wrap('<a href="#" data-cmp="open">Cookie-Einstellungen</a>')
        verdict = evaluate_reachability(markup, "https://x.de/")
        assert verdict["check_id"] == "COOKIE-CONSENT-UX-001"
        assert verdict["passed"] is True
        assert verdict["severity"] is None
        assert verdict["has_reopen_anchor"] is True

    def test_fail_missing_when_no_reopen(self):
        """Only an info link in the footer: HIGH / missing."""
        markup = _wrap('<a href="/cookie-richtlinie">Cookie-Richtlinie</a>')
        verdict = evaluate_reachability(markup, "https://x.de/")
        assert verdict["passed"] is False
        assert verdict["severity"] == "HIGH"
        assert verdict["severity_reason"] == "missing"

    def test_medium_when_reopen_opens_new_tab(self):
        """The Elli case: the footer link points at the cookie policy in
        a new tab, with no in-place CMP open."""
        markup = _wrap(
            '<a href="/cookie-einstellungen" target="_blank">'
            'Cookie-Einstellungen</a>'
        )
        verdict = evaluate_reachability(markup, "https://x.de/")
        assert verdict["passed"] is False
        assert verdict["severity"] == "MEDIUM"
        assert verdict["severity_reason"] == "misclassified"

    def test_high_when_only_browser_deflection(self):
        """Nothing but a browser-settings link: HIGH / factually_wrong."""
        markup = _wrap('<a href="/cookies">Browser-Einstellungen</a>')
        verdict = evaluate_reachability(markup, "https://x.de/")
        assert verdict["passed"] is False
        assert verdict["severity"] == "HIGH"
        assert verdict["severity_reason"] == "factually_wrong"

    def test_empty_footer_is_fail(self):
        """An empty footer fails with HIGH severity."""
        verdict = evaluate_reachability(_wrap(""), "https://x.de/")
        assert verdict["passed"] is False
        assert verdict["severity"] == "HIGH"

    def test_reopen_external_origin_is_medium(self):
        """A reopen link that leaves the site is a MEDIUM failure."""
        markup = _wrap(
            '<a href="https://privacy.other.com/manage">'
            'Cookie-Einstellungen</a>'
        )
        verdict = evaluate_reachability(markup, "https://x.de/")
        assert verdict["passed"] is False
        assert verdict["severity"] == "MEDIUM"
|
||||||
@@ -0,0 +1,259 @@
|
|||||||
|
"""Tests for B3 cross-doc retention comparator."""
|
||||||
|
|
||||||
|
from compliance.services.retention_comparator import (
|
||||||
|
RetentionClaim,
|
||||||
|
build_retention_theme_summary,
|
||||||
|
compare_retention,
|
||||||
|
extract_retention_claims,
|
||||||
|
max_age_to_days,
|
||||||
|
parse_duration_to_days,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseDurationToDays:
    """parse_duration_to_days: unit vocabulary and special kinds."""

    def test_months(self):
        """German months are counted as 30 days each."""
        days, kind = parse_duration_to_days("14 Monate")
        assert kind == "days"
        assert days == 14 * 30

    def test_jahre(self):
        """German years are counted as 365 days each."""
        days, kind = parse_duration_to_days("2 Jahre")
        assert kind == "days"
        assert days == 2 * 365

    def test_hours_short(self):
        """The compact form "24h" is one day."""
        days, kind = parse_duration_to_days("24h")
        assert kind == "days"
        assert days == 1.0

    def test_days(self):
        """German days map one-to-one."""
        days, kind = parse_duration_to_days("30 Tage")
        assert kind == "days"
        assert days == 30

    def test_minutes(self):
        """One minute is 1/1440 of a day."""
        days, kind = parse_duration_to_days("1 Minute")
        assert kind == "days"
        assert abs(days - 1 / 1440) < 1e-9

    def test_session(self):
        """"Sitzungsdauer" is a session cookie with no day count."""
        days, kind = parse_duration_to_days("Sitzungsdauer")
        assert kind == "session"
        assert days is None

    def test_session_token(self):
        """The bare word "Session" is recognised case-insensitively."""
        _, kind = parse_duration_to_days("Session")
        assert kind == "session"

    def test_persistent(self):
        """"unbegrenzt" means persistent without an explicit cap."""
        _, kind = parse_duration_to_days("unbegrenzt")
        assert kind == "persistent"

    def test_empty(self):
        """An empty string is unknown."""
        days, kind = parse_duration_to_days("")
        assert kind == "unknown"
        assert days is None

    def test_none(self):
        """None is unknown."""
        days, kind = parse_duration_to_days(None)
        assert kind == "unknown"
        assert days is None

    def test_decimal_comma(self):
        """A German decimal comma is parsed as a decimal point."""
        days, kind = parse_duration_to_days("1,5 Jahre")
        assert kind == "days"
        assert days == 1.5 * 365
|
||||||
|
|
||||||
|
|
||||||
|
class TestMaxAgeToDays:
    """max_age_to_days: seconds-to-days conversion and bad input."""

    def test_one_year(self):
        """365 days' worth of seconds converts back to 365 days."""
        one_year_seconds = 365 * 86400
        converted = max_age_to_days(one_year_seconds)
        assert abs(converted - 365) < 1e-9

    def test_session_none(self):
        """A missing Max-Age stays None."""
        assert max_age_to_days(None) is None

    def test_bad_input(self):
        """A non-numeric value yields None instead of raising."""
        assert max_age_to_days("bad") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractRetentionClaims:
    """extract_retention_claims: sentence selection and context terms."""

    def test_finds_global_claim(self):
        """A generic retention sentence yields exactly one claim."""
        policy_text = (
            "Wir verarbeiten Ihre Daten gemäß Art. 6 DSGVO. "
            "Die Speicherdauer der Daten beträgt grundsätzlich 6 Monate. "
            "Danach werden die Daten gelöscht."
        )
        found = extract_retention_claims(policy_text)
        assert len(found) == 1
        assert found[0].days == 6 * 30

    def test_finds_cookie_specific(self):
        """A sentence naming the cookie carries it in context_terms."""
        policy_text = (
            "Wir nutzen Google Analytics. "
            "Das Cookie _ga wird für 14 Monate gespeichert. "
            "Weitere Hinweise finden Sie unten."
        )
        found = extract_retention_claims(
            policy_text,
            cookie_names=["_ga"],
            vendor_names=["Google Analytics"],
        )
        assert len(found) >= 1
        ga_claim = next(c for c in found if "_ga" in c.context_terms)
        assert ga_claim.days == 14 * 30

    def test_ignores_non_retention_sentence(self):
        """"14 Monate" without a retention anchor in the same sentence
        is skipped."""
        policy_text = "Wir sind 14 Monate am Markt. Das ist keine Speicherdauer."
        assert extract_retention_claims(policy_text) == []

    def test_empty_text(self):
        """Empty policy text yields no claims."""
        assert extract_retention_claims("") == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestCompareRetention:
    """Tests for compare_retention: DSI claim vs. cookie table vs. actual Max-Age."""

    @staticmethod
    def _claim(sentence, days, context_terms=None):
        """Build a RetentionClaim that is neither session nor persistent."""
        return RetentionClaim(
            sentence=sentence,
            days=days,
            is_session=False,
            is_persistent=False,
            context_terms=[] if context_terms is None else context_terms,
        )

    def test_match_all_three(self):
        """All three sources at 14 months match and carry no severity."""
        fourteen_months = 14 * 30
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=fourteen_months * 86400,
            dsi_claims=[self._claim("Speicherdauer 14 Monate.", fourteen_months)],
        )
        assert result["matches"] is True
        assert result["severity"] is None

    def test_dsi_under_actual_is_HIGH(self):
        """A DSI claim shorter than the real lifetime is a HIGH mismatch."""
        # The DSI states 6 months while the cookie really lives 14 months.
        claimed_days = 6 * 30
        actual_days = 14 * 30
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=actual_days * 86400,
            dsi_claims=[self._claim("Speicherdauer 6 Monate.", claimed_days)],
        )
        assert result["matches"] is False
        assert result["mismatch_type"] == "dsi_under_actual"
        assert result["severity_reason"] == "factually_wrong"
        assert result["severity"] == "HIGH"
        assert result["diff_days"] == actual_days - claimed_days

    def test_table_under_actual_is_HIGH(self):
        """A table duration shorter than the real lifetime is a HIGH mismatch."""
        # The table states 7 days while the cookie really lives 365 days.
        one_year_seconds = 365 * 86400
        result = compare_retention(
            cookie_name="_fbp",
            table_duration="7 Tage",
            actual_max_age_seconds=one_year_seconds,
        )
        assert result["matches"] is False
        assert result["mismatch_type"] == "table_under_actual"
        assert result["severity"] == "HIGH"

    def test_dsi_vs_table_is_MEDIUM(self):
        """DSI and table disagreeing without an actual value is MEDIUM."""
        # The DSI states 6 months, the table 14 months; no actual lifetime given.
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=None,
            dsi_claims=[self._claim("Speicherdauer 6 Monate.", 6 * 30)],
        )
        assert result["matches"] is False
        assert result["mismatch_type"] == "dsi_vs_table"
        assert result["severity"] == "MEDIUM"

    def test_actual_under_table_is_LOW_safari_itp_hint(self):
        """A real lifetime below the table value is LOW with a Safari ITP note."""
        # The table states 2 years while the cookie really lives 7 days
        # (the Safari ITP scenario).
        one_week_seconds = 7 * 86400
        result = compare_retention(
            cookie_name="_ga",
            table_duration="2 Jahre",
            actual_max_age_seconds=one_week_seconds,
        )
        assert result["matches"] is False
        assert result["mismatch_type"] == "actual_under_table"
        assert result["severity"] == "LOW"
        assert "possible_safari_itp_cap" in result["notes"]

    def test_only_one_source_is_incomplete(self):
        """With only the table value available the result is incomplete / LOW."""
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=None,
            dsi_claims=[],
        )
        assert result["severity_reason"] == "incomplete"
        assert result["severity"] == "LOW"

    def test_tolerance_5pct(self):
        """A small deviation between table and actual still counts as a match."""
        # 14 Monate (420d) against 410d lies inside the 5% tolerance.
        actual_seconds = 410 * 86400
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=actual_seconds,
        )
        assert result["matches"] is True

    def test_cookie_specific_dsi_beats_generic(self):
        """A claim naming the cookie takes precedence over a generic claim."""
        generic_claim = self._claim("Speicherdauer grundsätzlich 6 Monate.", 6 * 30)
        specific_claim = self._claim(
            "_ga: Speicherdauer 14 Monate.", 14 * 30, context_terms=["_ga"],
        )
        result = compare_retention(
            cookie_name="_ga",
            table_duration="14 Monate",
            actual_max_age_seconds=14 * 30 * 86400,
            dsi_claims=[generic_claim, specific_claim],
        )
        # The cookie-specific claim is expected to win, so all three agree.
        assert result["matches"] is True
        assert result["dsi_days"] == 14 * 30
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildRetentionThemeSummary:
    """Tests for build_retention_theme_summary over compare_retention results."""

    def _claim(self, sentence, days):
        """Build a generic RetentionClaim (no session/persistent flag, no context)."""
        return RetentionClaim(
            sentence=sentence,
            days=days,
            is_session=False,
            is_persistent=False,
            context_terms=[],
        )

    def test_aggregate(self):
        """One passing, one failing and one incomplete finding are tallied."""
        fourteen_months_seconds = 14 * 30 * 86400

        # All three sources agree on 14 months.
        matching = compare_retention(
            "_a", "14 Monate", fourteen_months_seconds,
            [self._claim("14 Monate", 14 * 30)],
        )
        # DSI and table say 6 months, the cookie really lives 14 months.
        mismatching = compare_retention(
            "_b", "6 Monate", fourteen_months_seconds,
            [self._claim("6 Monate", 6 * 30)],
        )
        # Only the table value is available.
        table_only = compare_retention(
            "_c", "14 Monate", None, [],
        )

        summary = build_retention_theme_summary(
            [matching, mismatching, table_only]
        )

        assert summary["theme_id"] == "TH-RETENTION"
        assert summary["total"] == 3
        assert summary["passed"] == 1
        assert summary["incomplete"] == 1
        assert summary["failed"] == 1
        assert summary["by_severity"].get("HIGH") == 1
        assert summary["by_mismatch_type"].get("dsi_under_actual") == 1
        assert len(summary["top_fails"]) == 1
|
||||||
Reference in New Issue
Block a user