Files
breakpilot-compliance/backend-compliance/compliance/api/agent_check/_fetch.py
T
Benjamin Admin 08c08fcba2
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:

A — _find_dsi_links pierct jetzt Shadow-DOM (Web-Components wie Usercentrics/
    Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als
    Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
    auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
    Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
    versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
    archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
    Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
    URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.

Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 12:33:34 +02:00

154 lines
7.5 KiB
Python

"""URL → text fetch helper for the compliance-check pipeline.
Tries the consent-tester service first (Playwright, full JS render +
CMP capture). On any failure or empty result, falls back to a direct
HTTP GET with an identifiable User-Agent and per-domain rate limiting.
For cookie/dse/datenschutz/privacy/social_media doc types we cap discovery
at max_documents=1 (the policy itself is authoritative). For Impressum/AGB/
Widerruf and similar enterprise-split pages we allow max_documents=3.
"""
from __future__ import annotations
import logging
import re as _re
import httpx
from ._constants import CONSENT_TESTER_URL
logger = logging.getLogger(__name__)
# Patterns used by the HTTP fallback to turn raw HTML into plain text.
# Compiled once at import time instead of on every fetch.
_SCRIPT_BLOCK_RE = _re.compile(r"<script[^>]*>.*?</script>", _re.DOTALL | _re.IGNORECASE)
_STYLE_BLOCK_RE = _re.compile(r"<style[^>]*>.*?</style>", _re.DOTALL | _re.IGNORECASE)
_HTML_COMMENT_RE = _re.compile(r"<!--.*?-->", _re.DOTALL)
_HTML_TAG_RE = _re.compile(r"<[^>]+>")
_WHITESPACE_RE = _re.compile(r"\s+")


def _html_to_text(markup: str) -> str:
    """Reduce raw HTML to whitespace-normalised plain text.

    Drops ``<script>``/``<style>`` blocks, HTML comments and all remaining
    tags, decodes character references (``&amp;``, ``&nbsp;``, ``&uuml;`` …)
    and collapses every whitespace run into a single space.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the module's import section.
    from html import unescape

    text = _SCRIPT_BLOCK_RE.sub(" ", markup)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    # Comments may contain '>' and would otherwise leave residue behind
    # after the generic tag strip below.
    text = _HTML_COMMENT_RE.sub(" ", text)
    text = _HTML_TAG_RE.sub(" ", text)
    # Decode entities only AFTER tags are gone, so an escaped "&lt;p&gt;"
    # stays visible text instead of being stripped as markup. Decoding
    # before whitespace collapsing lets "&nbsp;" (U+00A0) be normalised too.
    text = unescape(text)
    return _WHITESPACE_RE.sub(" ", text).strip()


async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    Args:
        url: Page to fetch.
        doc_type: Document category. For cookie/dse/datenschutz/privacy/
            social_media the consent-tester is asked for a single document,
            for every other value (including the empty default) for up to 3.

    Returns:
        ``(text, cmp_payloads)``. ``cmp_payloads`` holds the raw CMP JSON
        captured during navigation (ePaaS, OneTrust, …) plus one
        ``{"kind": "html_table", ...}`` entry per extracted HTML table; it is
        empty when nothing was captured or the HTTP fallback was used.
        Returns ``("", [])`` when neither path yields more than 100 words
        and no CMP payloads were captured.

    Never raises for network/parse failures: both stages log a warning and
    fall through to the next stage / the empty result.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    # max_documents depends on doc_type:
    #   - cookie/dse/social_media: self-extract (often + CMP capture) is
    #     authoritative, sub-pages dilute the policy text. max=1.
    #   - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
    #     enterprise sites split this across 3-4 short sub-pages
    #     (insurance broker, supervisory authority, professional law).
    #     max=3 follows them. Per the original author's note, the 15s
    #     networkidle bail (dsi_helpers) keeps timing safe.
    short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
    is_short_extract = (doc_type or "") in short_extract_types
    max_docs = 1 if is_short_extract else 3
    try:
        # P90: 120s is not enough for the BMW Impressum (auto-discovery
        # follows 3 sub-docs); 240s leaves headroom. Mercedes also often
        # exceeded 120s because of Akamai latency.
        # The client-level timeout applies to every request of this client,
        # so no separate per-request timeout is needed.
        async with httpx.AsyncClient(timeout=240.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": max_docs},
            )
        if resp.status_code == 200:
            payload = resp.json()
            docs = payload.get("documents") or []
            cmp_payloads = payload.get("cmp_payloads") or []
            cmp_cookie_text = payload.get("cmp_cookie_text") or ""
            coverage = payload.get("coverage") or {}
            if coverage:
                logger.info(
                    "Crawl-Coverage %s: %d Interaktions-Runden, "
                    "%d Elemente expandiert, %d Shadow-Links, "
                    "%d versteckte Links",
                    url, coverage.get("interaction_rounds", 0),
                    coverage.get("elements_expanded", 0),
                    coverage.get("shadow_links_found", 0),
                    coverage.get("hidden_links_found", 0),
                )
            # D — HTML tables the consent-tester extracted from the DOM are
            # injected into cmp_payloads (kind "html_table") so the backend
            # can process them via cookies_table_parser.
            # NOTE(review): the original comment called the kind
            # "generic_table" while the code emits "html_table" — confirm
            # which one the parser expects.
            # Tables with fewer than 3 rows are skipped.
            cmp_payloads.extend(
                {"kind": "html_table", "url": doc.get("url", ""), "rows": tbl}
                for doc in docs
                for tbl in (doc.get("tables") or [])
                if tbl and len(tbl) >= 3
            )
            if docs:
                texts = []
                for doc in docs:
                    doc_text = doc.get("full_text") or doc.get("text_preview") or ""
                    # Ignore near-empty extractions (50 characters or less).
                    if len(doc_text) > 50:
                        texts.append(doc_text)
                merged = "\n\n".join(texts)
                # For the short-extract doc types: when the CMP
                # reconstruction has more words than the DOM extraction,
                # use it. This fixes the BMW case where DOM yields ~600
                # words of navigation but the ePaaS payload reconstructs
                # to ~1800 words of actual cookie policy.
                if is_short_extract and cmp_cookie_text:
                    cmp_words = len(cmp_cookie_text.split())
                    dom_words = len(merged.split())
                    if cmp_words > dom_words:
                        logger.info(
                            "Preferring CMP-reconstructed text for %s on %s "
                            "(%d words CMP vs %d words DOM)",
                            doc_type, url, cmp_words, dom_words,
                        )
                        merged = cmp_cookie_text
                merged_words = len(merged.split())
                if merged_words > 100:
                    if len(texts) > 1:
                        logger.info("Merged %d docs from %s (%d words)",
                                    len(texts), url, merged_words)
                    return merged, cmp_payloads
                # P90 bug fix: even when the text is below the 100-word
                # threshold, do NOT discard the captured CMP payloads.
                # BMW bug: the DSE yields a 10-word SPA shell, but the
                # ePaaS JSON (393KB) was captured. The backend needs it
                # for extract_vendors_from_payloads (VVT table).
                if cmp_payloads:
                    short_text = merged or cmp_cookie_text
                    # NOTE(review): the message mentions a parallel HTTP
                    # fallback, but this function returns right here and
                    # never reaches step 2 on this path — confirm whether
                    # the caller performs that fallback itself.
                    logger.info(
                        "P90: keeping %d CMP payloads for %s despite "
                        "short text (%d words) — HTTP fallback runs in parallel",
                        len(cmp_payloads), url,
                        len(short_text.split()),
                    )
                    return short_text, cmp_payloads
    except Exception as e:
        # Deliberate best-effort boundary: any consent-tester problem must
        # lead to the HTTP fallback, not to a crash.
        # P90: verbose exception for diagnosis (message used to be empty).
        logger.warning("Consent-tester fetch failed for %s: %s (%s)",
                       url, str(e) or "(empty)", type(e).__name__)
    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
    # P7: identifiable User-Agent + per-domain rate limit.
    try:
        from compliance.services.compliance_user_agent import (
            default_request_headers, DomainRateLimiter,
        )
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True,
            headers=default_request_headers(),
        ) as client:
            async with DomainRateLimiter(url):
                resp = await client.get(url)
        # Media types are case-insensitive, so compare in lower case.
        content_type = resp.headers.get("content-type", "").lower()
        if resp.status_code == 200 and "text/html" in content_type:
            text = _html_to_text(resp.text)
            word_count = len(text.split())
            # Same 100-word threshold as the consent-tester path: shorter
            # results are treated as "nothing useful fetched".
            if word_count > 100:
                logger.info("HTTP fallback for %s: %d words", url, word_count)
                return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)
    return "", []