08c08fcba2
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:
A — _find_dsi_links pierct jetzt Shadow-DOM (Web-Components wie Usercentrics/
Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als
Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.
Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
154 lines
7.5 KiB
Python
154 lines
7.5 KiB
Python
"""URL → text fetch helper for the compliance-check pipeline.
|
|
|
|
Tries the consent-tester service first (Playwright, full JS render +
|
|
CMP capture). On any failure or empty result, falls back to a direct
|
|
HTTP GET with an identifiable User-Agent and per-domain rate limiting.
|
|
|
|
For cookie/dse/datenschutz/privacy/social_media doc types we cap discovery to 1 sub-page
|
|
(the policy itself is authoritative). For Impressum/AGB/Widerruf and
|
|
similar enterprise-split pages we follow up to 3 sub-pages.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re as _re
|
|
|
|
import httpx
|
|
|
|
from ._constants import CONSENT_TESTER_URL
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Doc types whose own page (often together with the CMP capture) is treated
# as authoritative; following sub-pages would only dilute the policy text.
_SHORT_EXTRACT_TYPES = frozenset(
    {"cookie", "dse", "datenschutz", "privacy", "social_media"}
)

# Minimum number of whitespace-separated words for a text to count as a
# usable result (applies to both the consent-tester and the HTTP path).
_MIN_RESULT_WORDS = 100

# Patterns used by the HTTP fallback to reduce raw HTML to plain text.
_SCRIPT_BLOCK_RE = _re.compile(
    r"<script[^>]*>.*?</script>", _re.DOTALL | _re.IGNORECASE
)
_STYLE_BLOCK_RE = _re.compile(
    r"<style[^>]*>.*?</style>", _re.DOTALL | _re.IGNORECASE
)
_ANY_TAG_RE = _re.compile(r"<[^>]+>")
_WHITESPACE_RE = _re.compile(r"\s+")


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    Drops ``<script>``/``<style>`` blocks, removes all remaining tags,
    decodes character references (``&amp;``, ``&auml;``, ``&nbsp;`` …) and
    collapses runs of whitespace into single spaces.
    """
    # Function-scope import so the block does not depend on a new
    # top-of-file import.
    import html as _html

    text = _SCRIPT_BLOCK_RE.sub(" ", markup)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    text = _ANY_TAG_RE.sub(" ", text)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # survives as literal text instead of being stripped as markup.
    text = _html.unescape(text)
    # Runs last: "&nbsp;" decodes to U+00A0, which "\s" also matches.
    return _WHITESPACE_RE.sub(" ", text).strip()


async def _fetch_via_consent_tester(
    url: str, doc_type: str
) -> "tuple[str, list[dict]] | None":
    """Ask the consent-tester service for the rendered text of *url*.

    Returns ``(text, cmp_payloads)`` when the service produced something
    worth keeping, or ``None`` when the caller should try the HTTP fallback
    (non-200 response, no documents, or short text without CMP payloads).
    Exceptions are not handled here; they propagate to the caller.
    """
    # max_documents depends on doc_type:
    # - short-extract types: the page itself is authoritative -> 1.
    # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW and similar
    #   enterprise sites split these across 3-4 short sub-pages
    #   (Versicherungsvermittler, Aufsicht, Berufsrecht) -> follow up to 3.
    max_docs = 1 if (doc_type or "") in _SHORT_EXTRACT_TYPES else 3

    # P90: 120s was not enough for the BMW Impressum (auto-discovery follows
    # 3 sub-docs) and Mercedes often timed out at 120s as well; 240s leaves
    # headroom. The client-level timeout already applies to the request.
    async with httpx.AsyncClient(timeout=240.0) as client:
        resp = await client.post(
            f"{CONSENT_TESTER_URL}/dsi-discovery",
            json={"url": url, "max_documents": max_docs},
        )
    if resp.status_code != 200:
        return None

    payload = resp.json()
    docs = payload.get("documents") or []
    cmp_payloads = payload.get("cmp_payloads") or []
    cmp_cookie_text = payload.get("cmp_cookie_text") or ""
    coverage = payload.get("coverage") or {}

    if coverage:
        logger.info(
            "Crawl-Coverage %s: %d Interaktions-Runden, "
            "%d Elemente expandiert, %d Shadow-Links, "
            "%d versteckte Links",
            url, coverage.get("interaction_rounds", 0),
            coverage.get("elements_expanded", 0),
            coverage.get("shadow_links_found", 0),
            coverage.get("hidden_links_found", 0),
        )

    # D: HTML tables the consent-tester extracted from the DOM are appended
    # to cmp_payloads with kind "html_table" so the backend can process them
    # (cookies_table_parser, per the original note). Tables with fewer than
    # 3 rows are skipped.
    for doc in docs:
        for tbl in (doc.get("tables") or []):
            if not tbl or len(tbl) < 3:
                continue
            cmp_payloads.append({
                "kind": "html_table",
                "url": doc.get("url", ""),
                "rows": tbl,
            })

    if not docs:
        # NOTE(review): payloads captured without any document are dropped
        # here because the caller's HTTP fallback returns an empty payload
        # list. This matches the previous behaviour; confirm it is intended.
        return None

    # Keep only documents with more than 50 characters of text.
    texts = []
    for doc in docs:
        doc_text = doc.get("full_text", "") or doc.get("text_preview", "") or ""
        if doc_text and len(doc_text) > 50:
            texts.append(doc_text)
    merged = "\n\n".join(texts)
    merged_words = len(merged.split())

    # For short-extract types: when the CMP reconstruction is richer than
    # the DOM extraction, use it. This fixes the BMW case where the DOM
    # yields ~600 words of navigation but the ePaaS payload reconstructs to
    # ~1800 words of actual cookie policy.
    cmp_words = len(cmp_cookie_text.split())
    if (doc_type in _SHORT_EXTRACT_TYPES
            and cmp_cookie_text
            and cmp_words > merged_words):
        logger.info(
            "Preferring CMP-reconstructed text for %s on %s "
            "(%d words CMP vs %d words DOM)",
            doc_type, url,
            cmp_words,
            merged_words,
        )
        merged = cmp_cookie_text
        merged_words = cmp_words

    if merged and merged_words > _MIN_RESULT_WORDS:
        if len(texts) > 1:
            logger.info("Merged %d docs from %s (%d words)",
                        len(texts), url, merged_words)
        return merged, cmp_payloads

    # P90 bug fix: even when the text is below the 100-word threshold, do
    # NOT discard captured CMP payloads. BMW case: the DSE yields a 10-word
    # SPA shell while a 393KB ePaaS JSON was captured; the backend needs it
    # for extract_vendors_from_payloads (VVT table).
    if cmp_payloads:
        short_text = merged or cmp_cookie_text or ""
        logger.info(
            "P90: keeping %d CMP payloads for %s despite "
            "short text (%d words) — HTTP fallback runs in parallel",
            len(cmp_payloads), url,
            len(short_text.split()),
        )
        return short_text, cmp_payloads

    return None


async def _fetch_via_http(url: str) -> str:
    """Fetch *url* with a plain HTTP GET and return its plain text.

    Returns ``""`` unless the response is a 200 with an HTML content type
    and yields more than 100 words after tag stripping. Exceptions
    (including a failing project import) propagate to the caller.
    """
    # P7: identifiable User-Agent + per-domain rate limit.
    from compliance.services.compliance_user_agent import (
        default_request_headers, DomainRateLimiter,
    )

    async with httpx.AsyncClient(
        timeout=30.0, follow_redirects=True,
        headers=default_request_headers(),
    ) as client:
        async with DomainRateLimiter(url):
            resp = await client.get(url)

    # Lowercased so a value such as "Text/HTML" is still recognised.
    content_type = resp.headers.get("content-type", "").lower()
    if resp.status_code != 200 or "text/html" not in content_type:
        return ""

    text = _html_to_text(resp.text)
    word_count = len(text.split())
    if word_count <= _MIN_RESULT_WORDS:
        return ""
    logger.info("HTTP fallback for %s: %d words", url, word_count)
    return text


async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Document type; controls how many documents the
            consent-tester is asked for (1 for cookie/dse/datenschutz/
            privacy/social_media, otherwise 3) and whether CMP-reconstructed
            text may replace the DOM text.

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` holds the raw CMP
        payloads reported by the consent-tester plus any extracted HTML
        tables; it is empty when the HTTP fallback was used. The result is
        ``("", [])`` when neither path produced usable text.

    Never raises for fetch problems: failures of either path are logged as
    warnings and treated as "no result" (deliberate best-effort).
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    try:
        rendered = await _fetch_via_consent_tester(url, doc_type)
        if rendered is not None:
            return rendered
    except Exception as e:
        # P90: verbose exception for diagnosis (the message was empty before).
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
    try:
        text = await _fetch_via_http(url)
        if text:
            return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return "", []
|