refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,142 @@
|
||||
"""URL → text fetch helper for the compliance-check pipeline.
|
||||
|
||||
Tries the consent-tester service first (Playwright, full JS render +
|
||||
CMP capture). On any failure or empty result, falls back to a direct
|
||||
HTTP GET with an identifiable User-Agent and per-domain rate limiting.
|
||||
|
||||
For cookie/dse/datenschutz/privacy/social_media doc types we cap discovery to 1 document
|
||||
(the policy itself is authoritative). For Impressum/AGB/Widerruf and
|
||||
similar enterprise-split pages we follow up to 3 sub-pages.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re as _re
|
||||
|
||||
import httpx
|
||||
|
||||
from ._constants import CONSENT_TESTER_URL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Document category. For the "short extract" categories
            (cookie, dse, datenschutz, privacy, social_media) the
            consent-tester is asked for a single document; for everything
            else it is asked for up to three.

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` is the raw CMP JSON
        captured during navigation (ePaaS, OneTrust, ...) plus one
        ``{"kind": "html_table", ...}`` entry per extracted DOM table with
        at least three rows. It is empty when no CMP fired or when the HTTP
        fallback was used. Returns ``("", [])`` when neither path yields
        more than 100 words and no CMP payloads were captured.

    Exceptions raised by either fetch path are logged as warnings and
    swallowed; the function itself is best-effort.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    # max_documents depends on doc_type:
    #   - cookie/dse/social_media: self-extract (often + CMP capture) is
    #     authoritative, sub-pages dilute the policy text. max=1.
    #   - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
    #     enterprise sites split this across 3-4 short sub-pages
    #     (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
    #     them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
    short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
    max_docs = 1 if (doc_type or "") in short_extract_types else 3
    try:
        # P90: 120s is not enough for the BMW Impressum (auto-discovery
        # follows 3 sub-docs). 240s leaves headroom. Mercedes currently
        # also often fails at 120s because of Akamai latency.
        async with httpx.AsyncClient(timeout=240.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": max_docs},
                timeout=240.0,
            )
            if resp.status_code == 200:
                payload = resp.json()
                docs = payload.get("documents", [])
                cmp_payloads = payload.get("cmp_payloads") or []
                cmp_cookie_text = payload.get("cmp_cookie_text") or ""

                # D: when the consent-tester extracted HTML tables from the
                # DOM, inject them into cmp_payloads as "html_table" entries
                # so the backend can process them via cookies_table_parser.
                for doc in (docs or []):
                    for tbl in (doc.get("tables") or []):
                        # Tables with fewer than three rows are skipped.
                        if not tbl or len(tbl) < 3:
                            continue
                        cmp_payloads.append({
                            "kind": "html_table",
                            "url": doc.get("url", ""),
                            "rows": tbl,
                        })

                if docs:
                    # Keep only documents with more than 50 characters of text.
                    texts = []
                    for doc in docs:
                        t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
                        if t and len(t) > 50:
                            texts.append(t)
                    merged = "\n\n".join(texts)

                    # For cookie/dse/social_media: when CMP reconstruction is
                    # substantially richer than DOM extraction, use it. This
                    # fixes the BMW case where DOM yields ~600 words of
                    # navigation but the ePaaS payload reconstructs to ~1800
                    # words of actual cookie policy.
                    if doc_type in short_extract_types and cmp_cookie_text:
                        cmp_words = len(cmp_cookie_text.split())
                        dom_words = len(merged.split())
                        if cmp_words > dom_words:
                            logger.info(
                                "Preferring CMP-reconstructed text for %s on %s "
                                "(%d words CMP vs %d words DOM)",
                                doc_type, url,
                                cmp_words,
                                dom_words,
                            )
                            merged = cmp_cookie_text

                    merged_words = len(merged.split())
                    if merged and merged_words > 100:
                        if len(texts) > 1:
                            logger.info("Merged %d docs from %s (%d words)",
                                        len(texts), url, merged_words)
                        return merged, cmp_payloads

                    # P90 bug fix: even when the text is below the 100-word
                    # threshold, do NOT discard the captured CMP payloads.
                    # BMW bug: the DSE yields a 10-word SPA shell, but the
                    # ePaaS JSON (393KB) was captured. The backend needs it
                    # for extract_vendors_from_payloads (VVT table).
                    # NOTE(review): this branch only runs when the service
                    # returned documents; with an empty documents list the
                    # payloads are dropped and the HTTP fallback below runs
                    # instead - confirm that is intended.
                    if cmp_payloads:
                        # NOTE(review): the message mentions a parallel HTTP
                        # fallback, but this function returns right here;
                        # presumably the caller handles that - verify.
                        logger.info(
                            "P90: keeping %d CMP payloads for %s despite "
                            "short text (%d words) — HTTP fallback runs in parallel",
                            len(cmp_payloads), url,
                            len((merged or cmp_cookie_text).split()),
                        )
                        fallback_text = merged or cmp_cookie_text or ""
                        return fallback_text, cmp_payloads
    except Exception as e:
        # P90: verbose exception output for diagnosis (str(e) used to be empty).
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
    # P7: identifiable UA + per-domain rate limit.
    try:
        from compliance.services.compliance_user_agent import (
            default_request_headers, DomainRateLimiter,
        )
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True,
            headers=default_request_headers(),
        ) as client:
            async with DomainRateLimiter(url):
                resp = await client.get(url)
            # Header values are case-insensitive, so compare in lower case.
            content_type = resp.headers.get("content-type", "").lower()
            if resp.status_code == 200 and "text/html" in content_type:
                text = _html_to_text(resp.text)
                word_count = len(text.split())
                if word_count > 100:
                    logger.info("HTTP fallback for %s: %d words", url, word_count)
                    return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return "", []


def _html_to_text(raw_html: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    Drops <script> and <style> elements including their content, strips all
    remaining tags, decodes character references (``&amp;``, ``&nbsp;``,
    ``&#8211;`` ...) and collapses every run of whitespace to one space.
    """
    # Function-scope import so the module's import block stays untouched.
    from html import unescape

    text = _re.sub(r"<script[^>]*>.*?</script>", " ", raw_html, flags=_re.DOTALL | _re.IGNORECASE)
    text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=_re.DOTALL | _re.IGNORECASE)
    text = _re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tag stripping, so an escaped "&lt;b&gt;"
    # stays literal text instead of being removed as a tag.
    text = unescape(text)
    return _re.sub(r"\s+", " ", text).strip()
|
||||
Reference in New Issue
Block a user