c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 Native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py gains an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
143 lines
6.9 KiB
Python
143 lines
6.9 KiB
Python
"""URL → text fetch helper for the compliance-check pipeline.
|
|
|
|
Tries the consent-tester service first (Playwright, full JS render +
|
|
CMP capture). On any failure or empty result, falls back to a direct
|
|
HTTP GET with an identifiable User-Agent and per-domain rate limiting.
|
|
|
|
For cookie/dse/social_media doc types we cap discovery to 1 sub-page
|
|
(the policy itself is authoritative). For Impressum/AGB/Widerruf and
|
|
similar enterprise-split pages we follow up to 3 sub-pages.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re as _re
|
|
|
|
import httpx
|
|
|
|
from ._constants import CONSENT_TESTER_URL
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _table_payloads(docs: list[dict]) -> list[dict]:
    """Wrap DOM-extracted tables of *docs* as ``html_table`` payload entries.

    Tables that are empty or have fewer than 3 rows are skipped.
    """
    return [
        {
            "kind": "html_table",
            "url": doc.get("url", ""),
            "rows": tbl,
        }
        for doc in docs
        for tbl in (doc.get("tables") or [])
        if tbl and len(tbl) >= 3
    ]


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    Drops ``<script>``/``<style>`` elements including their content, removes
    the remaining tags, decodes character references and collapses runs of
    whitespace to single spaces.
    """
    from html import unescape

    flags = _re.DOTALL | _re.IGNORECASE
    text = _re.sub(r"<script[^>]*>.*?</script>", " ", markup, flags=flags)
    text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=flags)
    text = _re.sub(r"<[^>]+>", " ", text)
    # Decode entities only AFTER tag removal, so that escaped markup such as
    # "&lt;div&gt;" survives as literal text instead of being stripped.
    text = unescape(text)
    # Also folds the non-breaking spaces produced by "&nbsp;".
    return _re.sub(r"\s+", " ", text).strip()


async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Document category. For the short-extract types
            (cookie, dse, datenschutz, privacy, social_media) discovery is
            capped at 1 document and CMP-reconstructed text may replace the
            DOM text; every other value allows up to 3 documents.

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` holds the raw CMP payloads
        reported by the consent-tester plus one ``html_table`` entry per
        extracted DOM table with at least 3 rows. It is empty when the HTTP
        fallback produced the text. ``("", [])`` is returned when neither
        path yields a usable result.

    Never raises for fetch/parse problems: both stages log a warning and the
    function falls through to the next stage or to the empty result.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    # max_documents depends on doc_type:
    # - cookie/dse/social_media: self-extract (often + CMP capture) is
    #   authoritative, sub-pages dilute the policy text. max=1.
    # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
    #   enterprise sites split this across 3-4 short sub-pages
    #   (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
    #   them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
    short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
    max_docs = 1 if (doc_type or "") in short_extract_types else 3
    try:
        # P90: 120s is not enough for the BMW Impressum (auto-discovery
        # follows 3 sub-documents); 240s leaves headroom. Mercedes also often
        # fails at 120s because of Akamai latency.
        async with httpx.AsyncClient(timeout=240.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": max_docs},
            )
        if resp.status_code == 200:
            payload = resp.json()
            docs = payload.get("documents") or []
            cmp_payloads = payload.get("cmp_payloads") or []
            cmp_cookie_text = payload.get("cmp_cookie_text") or ""
            # D - HTML tables the consent-tester extracted from the DOM are
            # injected into cmp_payloads so the backend can process them via
            # cookies_table_parser.
            cmp_payloads.extend(_table_payloads(docs))
            if docs:
                texts = []
                for doc in docs:
                    doc_text = doc.get("full_text", "") or doc.get("text_preview", "") or ""
                    # Ignore stubs of 50 characters or fewer.
                    if len(doc_text) > 50:
                        texts.append(doc_text)
                merged = "\n\n".join(texts)
                merged_words = len(merged.split())
                # For cookie/dse/social_media: when CMP reconstruction is
                # richer than DOM extraction, use it. This fixes the BMW case
                # where DOM yields ~600 words of navigation but the ePaaS
                # payload reconstructs to ~1800 words of actual cookie policy.
                if doc_type in short_extract_types and cmp_cookie_text:
                    cmp_words = len(cmp_cookie_text.split())
                    if cmp_words > merged_words:
                        logger.info(
                            "Preferring CMP-reconstructed text for %s on %s "
                            "(%d words CMP vs %d words DOM)",
                            doc_type, url, cmp_words, merged_words,
                        )
                        merged = cmp_cookie_text
                        merged_words = cmp_words
                if merged_words > 100:
                    if len(texts) > 1:
                        logger.info("Merged %d docs from %s (%d words)",
                                    len(texts), url, merged_words)
                    return merged, cmp_payloads
                # P90 bug fix: even when the text is below the 100-word
                # threshold, do NOT discard the captured CMP payloads.
                # BMW case: the DSE yields a 10-word SPA shell, but the ePaaS
                # JSON (393KB) was captured and the backend needs it for
                # extract_vendors_from_payloads (VVT table).
                # NOTE(review): this branch is kept inside `if docs:` so that
                # `merged` is always bound here; with empty `docs` the
                # function proceeds to the HTTP fallback - confirm that
                # dropping payloads in that case is intended.
                if cmp_payloads:
                    fallback_text = merged or cmp_cookie_text or ""
                    logger.info(
                        "P90: keeping %d CMP payloads for %s despite "
                        "short text (%d words) — HTTP fallback runs in parallel",
                        len(cmp_payloads), url,
                        len(fallback_text.split()),
                    )
                    return fallback_text, cmp_payloads
    except Exception as e:
        # P90: verbose exception for diagnosis (message used to be empty).
        # Deliberately broad: any consent-tester problem degrades to the
        # HTTP fallback below instead of failing the whole check.
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
    # P7: identifiable UA + per-domain rate limit.
    try:
        from compliance.services.compliance_user_agent import (
            default_request_headers, DomainRateLimiter,
        )
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True,
            headers=default_request_headers(),
        ) as client:
            async with DomainRateLimiter(url):
                resp = await client.get(url)
        # Media types are case-insensitive, so normalise before matching.
        content_type = resp.headers.get("content-type", "").lower()
        if resp.status_code == 200 and "text/html" in content_type:
            text = _html_to_text(resp.text)
            word_count = len(text.split())
            if word_count > 100:
                logger.info("HTTP fallback for %s: %d words", url, word_count)
                return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return "", []
|