cb5dad1a2f
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-python-backend (push) Successful in 45s
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 20s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Drei zusammenhaengende Fixes fuer den VW-Befund (6 Vendors statt 100+): A — audit_quality_checks.py: drei systemische Vorbehalte die IMMER prominent gezeigt werden: * banner_detected=False trotz Cookie-Doc → HIGH 'CMP-Tool ungeladen' * cookie_doc >= 30k chars aber cmp_vendors < 15 → HIGH/MEDIUM 'Vendor-Liste auffaellig kurz fuer Doc-Groesse' * submitted URL aber 0/Mini-Text → MEDIUM 'URL nicht ladbar' Rote Audit-Vorbehalt-Box ueber dem GF-1-Pager. GF-Summary sagt 'Audit unvollstaendig' statt faelschlich 'Keine kritischen Themen'. gf_one_pager nimmt audit_quality_findings in top_findings auf (BEVOR andere Findings). B — cookies_table_parser laeuft jetzt auch auf gecrawltem Cookie-Doc- Text (nicht nur bei User-Paste). Wenn der dsi-discovery-Response Tab/ Pipe-getrennte Tabellen-Reihen liefert, parsen wir sie deterministisch. D — consent-tester/dsi-discovery extrahiert jetzt zusaetzlich zum Text die <table>-Elemente aus dem DOM als list[str] (Tab-getrennt pro Zeile, mind. 2 Zellen, mind. 3 Zeilen, max 10 Tabellen pro Doc). Backend schleust diese als 'html_table'-cmp_payload ein und jagt sie zuerst durch cookies_table_parser → 100% deterministische Vendor-Extraktion ohne LLM. VW-Erwartung: aus der 65k-Cookie-Tabelle werden jetzt 30-50 Vendors deterministisch geparst statt 6 vom LLM-Cascade. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
964 lines
42 KiB
Python
964 lines
42 KiB
Python
"""
|
|
DSI Discovery — Generic privacy document finder and parser.
|
|
|
|
Finds all privacy/data protection documents on any website regardless of:
|
|
- Technology (static HTML, SPA, WordPress, Typo3, etc.)
|
|
- Structure (accordion, sidebar, footer, inline links, separate pages)
|
|
- Format (HTML sections, PDF downloads, cross-domain links)
|
|
- Language (all 26 EU/EEA official languages)
|
|
|
|
Flow:
|
|
1. Load page with Playwright (full JS rendering)
|
|
2. Find all links matching DSI keywords (26 languages)
|
|
3. Expand accordions, click tabs, open dropdowns
|
|
4. Follow cross-domain links (e.g. instagram.com → help.instagram.com)
|
|
5. Extract document text from each link target
|
|
6. Return structured list of discovered documents
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
from playwright.async_api import Page
|
|
|
|
from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
|
|
from services.cmp_extractor import CMPCapture
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Legal-document keywords in the EU/EEA official languages.
# Covers privacy notices (DSI), general terms and conditions (AGB),
# withdrawal/cancellation (Widerruf), cookie policies, imprint (Impressum)
# and terms of use (Nutzungsbedingungen).
#
# Keys are two-letter language codes; values are lowercase phrases. The
# helpers in this module match them as plain substrings of lowercased link
# text, hrefs, URL paths and page titles, and report the FIRST language (in
# dict order) that contains a hit.
#
# NOTE(review): "privacy policy" is listed under both "en" and "it", and the
# English imprint phrases live under "de"; because the first hit wins, such
# matches are reported as "en" and "de" respectively.
DSI_KEYWORDS: dict[str, list[str]] = {
    "de": [
        # Privacy
        "datenschutz", "datenschutzerklaerung", "datenschutzinformation",
        "datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
        "datenschutzbestimmung", "verarbeitung personenbezogener daten",
        # General terms and conditions / terms of use
        "allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
        "nutzungsordnung", "geschäftsbedingungen",
        # Withdrawal
        "widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
        "widerruf", "rücktrittsrecht",
        # Cookie
        "cookie-richtlinie", "cookie-policy", "cookie-hinweis",
        # Imprint
        "impressum", "anbieterkennzeichnung",
        # Imprint (English wording used on German sites)
        "imprint", "legal notice", "site notice",
    ],
    "en": [
        "privacy policy", "privacy notice", "data protection", "data policy",
        "privacy statement", "gdpr", "personal data", "cookie policy",
        "terms of service", "terms and conditions", "terms of use",
        "cancellation policy", "right of withdrawal", "refund policy",
        "cookie notice",
    ],
    "fr": [
        "politique de confidentialité", "protection des données",
        "données personnelles", "vie privée", "rgpd",
        "conditions générales", "conditions d'utilisation",
        "droit de rétractation", "politique de cookies",
    ],
    "es": [
        "política de privacidad", "protección de datos",
        "datos personales", "aviso de privacidad",
        "términos y condiciones", "condiciones de uso",
        "derecho de desistimiento", "política de cookies",
    ],
    "it": [
        "informativa sulla privacy", "protezione dei dati",
        "dati personali", "privacy policy",
        "termini e condizioni", "condizioni d'uso",
        "diritto di recesso", "politica dei cookie",
    ],
    "nl": [
        "privacybeleid", "gegevensbescherming", "privacyverklaring",
        "persoonsgegevens", "avg",
        "algemene voorwaarden", "gebruiksvoorwaarden",
        "herroepingsrecht", "cookiebeleid",
    ],
    "pl": [
        "polityka prywatności", "ochrona danych osobowych",
        "dane osobowe", "rodo",
        "regulamin", "warunki korzystania",
        "prawo odstąpienia", "polityka cookies",
    ],
    "pt": [
        "política de privacidade", "proteção de dados",
        "dados pessoais", "lgpd",
        "termos e condições", "condições de utilização",
        "direito de resolução", "política de cookies",
    ],
    "sv": [
        "integritetspolicy", "dataskydd", "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor", "användarvillkor",
        "ångerrätt", "cookiepolicy",
    ],
    "da": [
        "privatlivspolitik", "databeskyttelse", "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser", "brugsbetingelser",
        "fortrydelsesret", "cookiepolitik",
    ],
    "fi": [
        "tietosuojaseloste", "tietosuoja", "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot", "käyttöehdot",
        "peruutusoikeus", "evästekäytäntö",
    ],
    "cs": ["zásady ochrany osobních údajů", "ochrana osobních údajů",
           "zpracování osobních údajů", "obchodní podmínky", "zásady cookies"],
    "el": ["πολιτική απορρήτου", "προστασία δεδομένων",
           "προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies"],
    "hu": ["adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
           "általános szerződési feltételek", "cookie szabályzat"],
    "ro": ["politica de confidențialitate", "protecția datelor",
           "date cu caracter personal", "termeni și condiții", "politica cookies"],
    "bg": ["политика за поверителност", "защита на данните",
           "лични данни", "общи условия", "политика за бисквитки"],
    "hr": ["politika privatnosti", "zaštita podataka", "osobni podaci",
           "opći uvjeti", "politika kolačića"],
    "sk": ["zásady ochrany osobných údajov", "ochrana osobných údajov",
           "obchodné podmienky", "zásady cookies"],
    "sl": ["politika zasebnosti", "varstvo podatkov", "osebni podatki",
           "splošni pogoji", "politika piškotkov"],
    "et": ["privaatsuspoliitika", "andmekaitse", "isikuandmed",
           "kasutustingimused", "küpsiste poliitika"],
    "lt": ["privatumo politika", "duomenų apsauga", "asmens duomenys",
           "naudojimosi sąlygos", "slapukų politika"],
    "lv": ["privātuma politika", "datu aizsardzība", "personas dati",
           "lietošanas noteikumi", "sīkdatņu politika"],
    "mt": ["politika tal-privatezza", "protezzjoni tad-data",
           "termini u kundizzjonijiet"],
    "ga": ["polasaí príobháideachais", "cosaint sonraí",
           "téarmaí agus coinníollacha"],
    "is": ["persónuverndarstefna", "persónuvernd",
           "skilmálar og skilyrði"],
    "no": ["personvernerklæring", "personvern", "personopplysninger",
           "brukervilkår", "angrerett", "informasjonskapsler"],
}

# Flat list of every keyword (dict order, duplicates kept) for quick
# language-agnostic matching. Built with a comprehension so no loop variable
# is left behind in the module namespace.
ALL_DSI_KEYWORDS: list[str] = [
    keyword for keywords in DSI_KEYWORDS.values() for keyword in keywords
]
|
|
|
|
@dataclass
class DiscoveredDSI:
    """A single legal/privacy document found during discovery."""

    # Human-readable title (link text, heading text or page title).
    title: str
    # Address of the document itself.
    url: str
    # Page where the link was found.
    source_url: str
    # Language code of the keyword that matched; empty when none matched.
    language: str = ""
    # Kind of document. This module emits "html_full_page", "html_section",
    # "html_page", "cross_domain" and "pdf"; "accordion" is reserved by the
    # original field comment but not produced in the code visible here.
    doc_type: str = ""
    # Extracted full text.
    text: str = ""
    # Parsed sections.
    sections: list[dict] = field(default_factory=list)
    # Number of whitespace-separated words in the extracted text.
    word_count: int = 0
    # HTML tables taken from the DOM: one entry per table, each a list of
    # rows, each row a single tab-separated string. Lets the backend parse
    # cookie tables deterministically without an LLM.
    tables: list[list[str]] = field(default_factory=list)
|
|
|
|
@dataclass
class DSIDiscoveryResult:
    """Outcome of one DSI discovery scan."""

    # URL the scan was started with.
    base_url: str
    # Documents found.
    documents: list[DiscoveredDSI] = field(default_factory=list)
    # Number of entries in `documents`.
    total_found: int = 0
    # Distinct non-empty language codes of the documents (unordered).
    languages_detected: list[str] = field(default_factory=list)
    # Human-readable error messages collected while scanning.
    errors: list[str] = field(default_factory=list)
    # Raw CMP payloads captured during navigation (one per matched JSON).
    # As built in this module each entry is {"kind": ..., "data": ...}.
    # Backend uses these to build vendor records + run per-vendor checks.
    cmp_payloads: list[dict] = field(default_factory=list)
    # Cookie-policy text reconstructed from all captured CMP payloads
    # (CMP-library reconstruct + generic heuristic). Backend treats it as
    # the authoritative cookie text so MC checks run on the real policy,
    # not on the homepage navigation that DOM extraction returns.
    cmp_cookie_text: str = ""
|
|
|
|
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Report whether *text* contains a DSI keyword.

    Matching is a case-insensitive substring test against every phrase in
    ``DSI_KEYWORDS``, language by language in dict order.

    Returns:
        ``(True, language_code)`` for the first language with a hit,
        otherwise ``(False, "")``.
    """
    # NOTE(review): plain substring matching means short keywords such as
    # "agb", "avg" or "rodo" also hit inside longer unrelated words.
    haystack = text.lower().strip()
    for language, phrases in DSI_KEYWORDS.items():
        if any(phrase in haystack for phrase in phrases):
            return True, language
    return False, ""
|
|
|
|
def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
|
"""Allow same domain + known related domains (e.g. help.instagram.com)."""
|
|
try:
|
|
link_domain = urlparse(href).netloc.replace("www.", "")
|
|
base_clean = base_domain.replace("www.", "")
|
|
# Same domain
|
|
if link_domain == base_clean:
|
|
return True
|
|
# Subdomain (help.instagram.com for instagram.com)
|
|
if link_domain.endswith(f".{base_clean}"):
|
|
return True
|
|
# Parent domain (instagram.com links from about.instagram.com)
|
|
if base_clean.endswith(f".{link_domain}"):
|
|
return True
|
|
# Known related patterns
|
|
parts_base = base_clean.split(".")
|
|
parts_link = link_domain.split(".")
|
|
if len(parts_base) >= 2 and len(parts_link) >= 2:
|
|
if parts_base[-2] == parts_link[-2] and parts_base[-1] == parts_link[-1]:
|
|
return True # Same registrable domain
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
def _finalize_discovery(
    result: DSIDiscoveryResult,
    cmp_capture: CMPCapture,
) -> DSIDiscoveryResult:
    """Fill the summary and CMP fields of *result* and log the outcome."""
    result.total_found = len(result.documents)
    result.languages_detected = list(
        {d.language for d in result.documents if d.language}
    )
    result.cmp_payloads = [
        {"kind": kind, "data": data} for kind, data in cmp_capture.payloads
    ]
    if cmp_capture.payloads:
        try:
            result.cmp_cookie_text = cmp_capture.reconstruct_cookie_policy()
        except Exception as e:
            logger.warning("CMP reconstruct on discovery failed: %s", e)
    logger.info(
        "DSI discovery complete: %d documents found in %s, %d CMP payloads, "
        "cmp_cookie_text=%d words",
        result.total_found, result.languages_detected, len(result.cmp_payloads),
        len(result.cmp_cookie_text.split()) if result.cmp_cookie_text else 0,
    )
    return result


async def discover_dsi_documents(
    page: Page,
    url: str,
    max_documents: int = 100,
    timeout_seconds: int = 300,
) -> DSIDiscoveryResult:
    """Discover all privacy/data protection documents on a website.

    Works generically regardless of website technology, structure, or
    language. Loads *url*, optionally extracts the page itself when it looks
    like a DSI page, collects keyword-matching links (before and after
    expanding interactive elements), records inline H1/H2 sections and then
    follows every link, searching each visited page for further links.

    Args:
        page: Playwright page used for all navigation. It is left on
            whatever URL was loaded last.
        url: Start URL (homepage or a legal document itself).
        max_documents: Link following stops once this many documents have
            been collected (counted before de-duplication).
        timeout_seconds: Time budget for the link-following loop, measured
            from the start of the call.

    Returns:
        The populated ``DSIDiscoveryResult``. Failures never propagate; they
        are recorded in ``result.errors``.
    """
    import time
    from collections import deque

    # Monotonic clock: the budget must not jump with wall-clock adjustments.
    deadline = time.monotonic() + timeout_seconds

    result = DSIDiscoveryResult(base_url=url)
    base_domain = urlparse(url).netloc
    seen_urls: set[str] = set()
    seen_titles: set[str] = set()

    # CMP capture must be wired BEFORE navigation so we catch the JSON requests
    # that fire as soon as the consent widget initializes (e.g. BMW ePaaS).
    cmp_capture = CMPCapture()
    cmp_capture.attach(page)

    # Also collect a generic JSON response log for the LLM fallback (Phase C+D)
    # if everything else fails. Keep it small (header info only, not bodies).
    network_log: list[dict] = []

    async def _on_response_log(response):
        try:
            ct = (response.headers.get("content-type") or "").lower()
            if "json" not in ct:
                return
            network_log.append({
                "url": response.url,
                "status": response.status,
                "content_type": ct,
                "size": int(response.headers.get("content-length") or 0),
            })
        except Exception:
            # Best effort: one malformed response must not disturb the scan.
            pass

    page.on("response", _on_response_log)

    try:
        # Step 1: Load the page (with networkidle → domcontentloaded fallback)
        await goto_resilient(page, url, timeout=60000)
        await page.wait_for_timeout(2000)

        # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
        final_url = page.url
        if is_pdf_redirect(url, final_url):
            is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
            if is_dsi_url:
                result.documents.append(DiscoveredDSI(
                    title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
                    url=final_url,
                    source_url=url,
                    language=dsi_lang or "de",
                    doc_type="pdf",
                    text="[PDF — Textextraktion erforderlich]",
                ))
            logger.info("PDF redirect detected: %s -> %s", url, final_url)
            # Return early — a PDF redirect means no HTML content to scan.
            # The shared finalizer keeps the summary fields consistent with
            # the regular exit path.
            return _finalize_discovery(result, cmp_capture)

        # Step 1b: Try dismissing cookie consent banners before extraction.
        # Many German sites (dm.de, Zalando, etc.) block page content behind
        # a consent wall. Dismissing it reveals the actual DSI text.
        if await try_dismiss_consent_banner(page):
            # After consent, page may reload or reveal hidden content
            await page.wait_for_timeout(2000)
            # Re-navigate if the page redirected after consent
            try:
                if page.url != url:
                    await goto_resilient(page, url, timeout=30000)
                    await page.wait_for_timeout(2000)
            except Exception as e:
                logger.debug("Re-navigation after consent failed for %s: %s", url, e)

        # Step 1c: Self-extraction — if the URL itself is a DSI page,
        # extract its full text as the first document. This handles the
        # case where the user provides the DSE URL directly (e.g.
        # example.com/datenschutz) instead of the homepage.
        is_self_dsi, self_lang = _matches_dsi_keyword(urlparse(url).path.lower())
        if not is_self_dsi:
            # Also check the page title
            page_title = await page.title() or ""
            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
        if is_self_dsi:
            try:
                # Wait for substantive content to appear (SPAs need time to render).
                # Polls body.innerText length up to 10s. Many sites (BMW, Daimler)
                # render via React/Vue after domcontentloaded fires.
                try:
                    await page.wait_for_function(
                        "() => (document.body && document.body.innerText || '').length > 500",
                        timeout=10000,
                    )
                except Exception:
                    pass  # Continue anyway, extractor below has fallbacks

                # Scroll to bottom (twice) to trigger lazy-loading of full content
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1500)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1000)

                self_text = await _extract_text_robust(page)
                self_wc = len(self_text.split()) if self_text else 0

                # If still too short, try same-origin iframes (some sites
                # embed cookie policies via OneTrust/Sourcepoint iframes).
                if self_wc < 100:
                    iframe_text = await _extract_text_from_iframes(page)
                    if iframe_text and len(iframe_text.split()) > self_wc:
                        self_text = iframe_text
                        self_wc = len(self_text.split())
                        logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)

                # If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
                # the authoritative source for the cookie policy — far more
                # reliable than the rendered DOM, which usually only contains
                # site chrome (navigation/footer) when the policy widget hasn't
                # finished rendering yet.
                #
                # Replace DOM with CMP only when CMP is *strictly larger* AND
                # at least one of:
                #   - DOM extraction was very thin (< 300 words)
                #   - CMP text is substantial (>= 1000 words)
                #   - CMP text is more than 1.5x longer than DOM
                # The strict-larger guard prevents a tiny heuristic match
                # (e.g. an unrelated /api/data JSON) from clobbering a
                # bigger DOM extraction.
                if cmp_capture.payloads:
                    cmp_text = cmp_capture.reconstruct_cookie_policy()
                    cmp_wc = len(cmp_text.split()) if cmp_text else 0
                    if cmp_wc > self_wc and (
                        self_wc < 300
                        or cmp_wc >= 1000
                        or cmp_wc > self_wc * 1.5
                    ):
                        logger.info(
                            "Self-extraction via CMP capture for %s: %d words "
                            "(replacing %d-word DOM extraction, %d CMP payloads)",
                            url, cmp_wc, self_wc, len(cmp_capture.payloads),
                        )
                        self_text = cmp_text
                        self_wc = cmp_wc

                # Phase C/D: LLM cascade fallback. Triggers only when both
                # named CMPs (Phase B) and the generic heuristic (Phase A)
                # produced nothing AND the DOM is too thin to be a real policy.
                if self_wc < 300 and not cmp_capture.payloads:
                    llm_text, llm_wc = await _try_llm_cascade(
                        page, url, network_log,
                    )
                    if llm_wc > self_wc:
                        logger.info(
                            "Self-extraction via LLM cascade for %s: %d words "
                            "(replacing %d-word DOM)",
                            url, llm_wc, self_wc,
                        )
                        self_text = llm_text
                        self_wc = llm_wc

                if self_wc >= 100:
                    page_title = await page.title() or url
                    # NOTE(review): unlike documents from followed links
                    # (Step 5), this self-extracted document carries no
                    # `tables` — confirm whether the backend needs them here.
                    result.documents.append(DiscoveredDSI(
                        title=page_title.strip(),
                        url=url,
                        source_url=url,
                        language=self_lang or "de",
                        doc_type="html_full_page",
                        text=self_text.strip(),
                        word_count=self_wc,
                    ))
                    seen_urls.add(url)
                    logger.info("Self-extracted %d words from %s", self_wc, url)
                else:
                    logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
            except Exception as e:
                logger.warning("Self-extraction failed for %s: %s", url, e)

        # Step 2: Find DSI links in current page
        links = await _find_dsi_links(page, base_domain)
        logger.info("Found %d DSI links on %s", len(links), url)

        # Step 3: Expand accordions, tabs, dropdowns to find hidden content
        await _expand_all_interactive(page)
        await page.wait_for_timeout(1000)

        # Step 3b: Re-scan after expanding (may reveal new links). A set of
        # known hrefs replaces the per-link list rebuild of a naive check.
        queued_hrefs = {link["href"] for link in links}
        for link in await _find_dsi_links(page, base_domain):
            if link["href"] not in queued_hrefs:
                queued_hrefs.add(link["href"])
                links.append(link)

        # Step 4: Check for inline DSI sections (accordion content already visible)
        for section in await _find_inline_dsi_sections(page):
            title_norm = section["title"].strip().lower()
            if title_norm in seen_titles:
                continue
            seen_titles.add(title_norm)
            _, lang = _matches_dsi_keyword(section["title"])
            result.documents.append(DiscoveredDSI(
                title=section["title"],
                url=f"{url}#{section.get('id', '')}",
                source_url=url,
                language=lang,
                doc_type="html_section",
                text=section["text"],
                word_count=len(section["text"].split()),
            ))

        # Step 5: Follow each DSI link and extract content.
        # Exhaustive: processes ALL found links. On each visited page,
        # searches for MORE links (recursive discovery). Stops only when
        # all links are visited, the timeout is reached or max_documents
        # documents have been collected.
        pending_links = deque(links)

        while (
            pending_links
            and time.monotonic() < deadline
            and len(result.documents) < max_documents
        ):
            link_info = pending_links.popleft()
            href = link_info["href"]
            if href in seen_urls:
                continue
            seen_urls.add(href)

            is_pdf = href.lower().endswith(".pdf")

            # Skip anchor links on the same page — they are sections of the
            # parent doc. This is decided BEFORE the title is registered so
            # that a table-of-contents anchor ("Cookies" → #cookies) cannot
            # block a later real link that carries the same text.
            if not is_pdf and "#" in href and href.split("#")[0] in (
                url.split("#")[0], page.url.split("#")[0],
            ):
                continue

            title = link_info["text"]
            title_norm = title.strip().lower()
            # Title de-duplication only applies to links that have a text;
            # otherwise every icon/image link would collapse onto "" and
            # only the first one would ever be followed.
            if title_norm:
                if title_norm in seen_titles:
                    continue
                seen_titles.add(title_norm)

            _, lang = _matches_dsi_keyword(title)

            if is_pdf:
                result.documents.append(DiscoveredDSI(
                    title=title, url=href, source_url=url,
                    language=lang, doc_type="pdf",
                    text="[PDF — Textextraktion erforderlich]",
                ))
                continue

            try:
                # Navigate to page — with networkidle/domcontentloaded fallback
                await goto_resilient(page, href, timeout=45000)
                resp_url = page.url

                # Check for PDF redirect on followed links
                if is_pdf_redirect(href, resp_url):
                    result.documents.append(DiscoveredDSI(
                        title=title, url=resp_url, source_url=url,
                        language=lang, doc_type="pdf",
                        text="[PDF — Textextraktion erforderlich]",
                    ))
                    await goto_resilient(page, url, timeout=45000)
                    continue

                await try_dismiss_consent_banner(page)
                await _expand_all_interactive(page)
                await page.wait_for_timeout(500)

                # Extract text — try specific content areas, fall back to full body
                text = await page.evaluate("""
                    () => {
                        // Try progressively broader content selectors
                        const selectors = [
                            '.article-content', '.page-content', '.entry-content',
                            '[class*="content-area"]', '[class*="main-content"]',
                            'main article', 'main', 'article',
                            '[role="main"]', '.content', '#content',
                        ];
                        for (const sel of selectors) {
                            const el = document.querySelector(sel);
                            if (el) {
                                // P98: innerText instead of textContent — innerText
                                // respects whitespace between block elements.
                                // textContent concatenates HTML table cells without
                                // spaces (VW cookie table: ~100 cookie names
                                // became one lump "smartSignals2UiDsmartSignals2sUiD...").
                                const txt = (el.innerText || el.textContent || '').trim();
                                if (txt.length > 200) return txt;
                            }
                        }
                        // Fallback: full body minus nav/header/footer
                        const body = document.body.cloneNode(true);
                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
                        // P98: innerText respects whitespace (see above)
                        return (body.innerText || body.textContent || '').trim();
                    }
                """)
                # D — extract HTML tables separately. One array of rows per
                # table, each row a tab-separated string. This lets the
                # backend parse columns deterministically
                # (cookies_table_parser) without LLM hallucinations.
                tables = await page.evaluate("""
                    () => {
                        const out = [];
                        document.querySelectorAll('table').forEach(t => {
                            const rows = [];
                            t.querySelectorAll('tr').forEach(tr => {
                                const cells = [];
                                tr.querySelectorAll('th, td').forEach(c => {
                                    cells.push((c.innerText || c.textContent || '').trim().replace(/\\s+/g, ' '));
                                });
                                if (cells.length >= 2) rows.push(cells.join('\\t'));
                            });
                            if (rows.length >= 3) out.push(rows);
                        });
                        return out;
                    }
                """)
                if text and len(text) > 50:
                    result.documents.append(DiscoveredDSI(
                        title=title, url=href, source_url=url,
                        language=lang,
                        doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
                        text=text[:200000], word_count=len(text.split()),
                        tables=(tables or [])[:10],
                    ))

                # Recursive: search THIS page for more DSI links
                for nl in await _find_dsi_links(page, base_domain):
                    if nl["href"] not in seen_urls and nl["href"] not in queued_hrefs:
                        queued_hrefs.add(nl["href"])
                        pending_links.append(nl)

                # Navigate back for next link
                await goto_resilient(page, url, timeout=45000)
                await page.wait_for_timeout(500)
                await _expand_all_interactive(page)

            except Exception as e:
                result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                try:
                    await goto_resilient(page, url, timeout=45000)
                except Exception:
                    # Best effort: the next iteration navigates again anyway.
                    pass

    except Exception as e:
        result.errors.append(f"Discovery failed: {str(e)[:100]}")
        logger.error("DSI discovery failed: %s", e)
    finally:
        # Detach the response logger so repeated scans on the same page do
        # not accumulate listeners (and keep their closures alive).
        try:
            page.remove_listener("response", _on_response_log)
        except Exception as e:
            logger.debug("Could not detach response logger: %s", e)

    # Deduplicate: remove noise titles + merge docs with identical word_count
    result.documents = _deduplicate_documents(result.documents)

    return _finalize_discovery(result, cmp_capture)
|
|
|
|
# Lowercase titles of navigation/utility elements that are not documents.
# NOTE: "datenschutz" was removed — it's a legitimate document title
NOISE_TITLES = {
    "drucken",
    "print",
    "nach oben",
    "back to top",
    "teilen",
    "share",
    "kontakt",
    "contact",
    "suche",
    "search",
    "menü",
    "menu",
    "home",
}
|
|
|
|
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
    """Drop navigation noise and collapse repeated extractions of one text.

    Pass 1 removes documents whose title is a known navigation label, whose
    title is just a URL, or that have fewer than 50 words (PDF placeholders
    are exempt from the word limit).

    Pass 2 treats documents with more than 200 words and an identical word
    count as the same text under different titles and keeps one of them: the
    first seen, unless a later one has a title starting with
    "datenschutzinformation" and the kept one does not.
    """
    # NOTE(review): the word count is the only fingerprint in pass 2, so two
    # genuinely different documents of equal length are merged as well.

    def _is_real_document(doc: DiscoveredDSI) -> bool:
        label = doc.title.strip().lower()
        if label in NOISE_TITLES:
            return False
        if label.startswith(("http", "www.")):
            return False
        return doc.doc_type == "pdf" or doc.word_count >= 50

    first_by_word_count: dict[int, DiscoveredDSI] = {}
    kept: list[DiscoveredDSI] = []
    for doc in filter(_is_real_document, docs):
        if doc.word_count <= 200:
            # Short documents are never merged.
            kept.append(doc)
            continue

        previous = first_by_word_count.get(doc.word_count)
        if previous is None:
            first_by_word_count[doc.word_count] = doc
            kept.append(doc)
            continue

        # Same length as an earlier document: swap only when the newcomer
        # carries the preferred "Datenschutzinformation*" title.
        newcomer_preferred = doc.title.lower().startswith("datenschutzinformation")
        previous_preferred = previous.title.lower().startswith("datenschutzinformation")
        if newcomer_preferred and not previous_preferred:
            kept = [entry for entry in kept if entry is not previous]
            kept.append(doc)
            first_by_word_count[doc.word_count] = doc

    return kept
|
|
|
|
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
    """Find all links whose text or href matches DSI keywords.

    Every ``a[href]`` element of the current page is inspected. A link
    qualifies when any keyword from ``ALL_DSI_KEYWORDS`` occurs in its
    lowercased text / ``aria-label`` / ``title`` or in its lowercased href,
    AND it either points to an allowed (same or related) domain or to a
    PDF. The ``.pdf`` suffix is tested case-insensitively, matching the
    check used when links are followed.

    Args:
        page: Playwright page to scan.
        base_domain: Network location of the site being scanned.

    Returns:
        A list of ``{"href", "text", "visible"}`` dicts in DOM order, or an
        empty list when the scan fails (the failure is logged).
    """
    try:
        all_links = await page.evaluate("""
            () => [...document.querySelectorAll('a[href]')].map(a => ({
                href: a.href,
                text: (a.textContent || '').trim().substring(0, 200),
                ariaLabel: a.getAttribute('aria-label') || '',
                title: a.getAttribute('title') || '',
                visible: a.getBoundingClientRect().width > 0,
            }))
        """)
        dsi_links = []
        for link in (all_links or []):
            search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
            href = link["href"]
            href_lower = href.lower()

            # Match by link text or href
            if not any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS):
                continue

            # Allow same domain + related domains + PDFs (any letter case,
            # e.g. "Datenschutz.PDF" hosted on a CDN).
            if _is_allowed_domain(href, base_domain) or href_lower.endswith(".pdf"):
                dsi_links.append({
                    "href": href,
                    "text": link["text"],
                    "visible": link["visible"],
                })

        return dsi_links
    except Exception as e:
        logger.warning("DSI link scan failed: %s", e)
        return []
|
|
|
|
async def _expand_all_interactive(page: Page) -> None:
    """Expand all accordions, tabs, details, dropdowns on the page.

    IMPORTANT: Only expand CLOSED elements. Never click elements that
    are already expanded (aria-expanded="true") — that would close them.
    BMW, for example, has accordions open by default.

    "Show more" style controls are clicked only when they cannot navigate:
    buttons always, anchors only when their href is empty, a fragment
    ("#...") or a "javascript:" pseudo-URL. Clicking a real link here would
    move the page away from the document that is about to be extracted.

    Best effort: a failure is logged at debug level and otherwise ignored.
    """
    try:
        await page.evaluate("""() => {
            // 1. Open all <details> that are closed
            document.querySelectorAll('details:not([open])').forEach(d => d.open = true);

            // 2. Click buttons that are explicitly CLOSED (aria-expanded="false")
            document.querySelectorAll('button[aria-expanded="false"]').forEach(b => {
                try { b.click(); } catch {}
            });

            // 3. Bootstrap/jQuery collapse triggers (only closed ones)
            document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => {
                try { e.click(); } catch {}
            });
            document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => {
                try { e.click(); } catch {}
            });

            // 4. "Show more" / "Mehr anzeigen" controls (never real links)
            document.querySelectorAll('button,a').forEach(b => {
                const t = (b.textContent || '').trim();
                if (!/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t)) return;
                if (b.tagName === 'A') {
                    const target = (b.getAttribute('href') || '').trim();
                    if (target && !target.startsWith('#') && !/^javascript:/i.test(target)) return;
                }
                try { b.click(); } catch {}
            });

            // 5. Tabs — do not click; just make hidden tab panels visible
            document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
                p.removeAttribute('hidden');
                p.style.display = '';
            });
        }""")
    except Exception as e:
        logger.debug("Expanding interactive elements failed: %s", e)
|
|
|
|
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
    """Find DSI content already visible on the page (e.g. expanded accordions).

    Only counts top-level documents (H1/H2 with DSI keywords).
    Sub-sections (H3/H4 like 'Cookies', 'Betroffenenrechte') are NOT counted
    as separate documents — their text is part of the parent document.

    For each matching heading the text of up to 200 following sibling
    elements is collected, stopping at the next H1/H2. Sections with 100
    characters of content or fewer are dropped.

    Returns:
        A list of ``{"title", "text", "id"}`` dicts (title capped at 200
        characters, text at 50,000; ``id`` is the heading's id or ""), or an
        empty list when the page evaluation fails (logged at debug level).
    """
    try:
        sections = await page.evaluate("""
            () => {
                const results = [];
                // Only H1 and H2 count as document-level headings
                const headings = document.querySelectorAll('h1, h2');
                const dsiKeywords = [
                    'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
                    'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
                    'databeskyttelse', 'ochrana', 'adatvédel', 'confidential',
                ];
                for (const h of headings) {
                    const text = (h.textContent || '').trim();
                    const textLower = text.toLowerCase();
                    if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;

                    // Get ALL content until the next H1/H2 (include sub-sections H3-H5)
                    let content = '';
                    let el = h.nextElementSibling;
                    let count = 0;
                    while (el && count < 200) {
                        // Stop at next H1 or H2 (next top-level document)
                        if (el.tagName === 'H1' || el.tagName === 'H2') break;
                        content += (el.textContent || '').trim() + '\\n';
                        el = el.nextElementSibling;
                        count++;
                    }

                    if (content.length > 100) {
                        results.push({
                            title: text.substring(0, 200),
                            text: content.substring(0, 50000),
                            id: h.id || '',
                        });
                    }
                }
                return results;
            }
        """)
        return sections or []
    except Exception as e:
        # Best effort, but leave a trace like the sibling helpers do.
        logger.debug("Inline DSI section scan failed: %s", e)
        return []
|
|
|
|
|
|
async def _extract_text_robust(page: Page) -> str:
    """Extract the main page text using three progressively broader strategies.

    Intended for SPA-heavy pages (BMW, Daimler, etc). The in-page script
    returns the first strategy that yields a result:

    1. The first element matching a known content-container selector whose
       trimmed text is longer than 200 characters.
    2. A clone of ``<body>`` with navigation, header, footer, script/style
       and cookie/banner elements removed, if longer than 200 characters.
    3. The joined text of all paragraph/list/cell/heading tags whose text is
       longer than 20 characters.

    Whitespace runs are collapsed to single spaces in every case.

    Returns:
        The extracted text, or "" when the script yields nothing or raises
        (a warning is logged in the latter case).
    """
    extraction_script = """
    () => {
        // 1) Specific content containers
        const selectors = [
            '.article-content', '.page-content', '.entry-content',
            '[class*="content-area"]', '[class*="main-content"]',
            '[class*="legal-text"]', '[class*="policy-content"]',
            'main article', 'main', 'article',
            '[role="main"]', '.content', '#content', '.bodytext',
        ];
        for (const sel of selectors) {
            const el = document.querySelector(sel);
            if (el && el.textContent.trim().length > 200) {
                return el.textContent.trim().replace(/\\s+/g, ' ');
            }
        }
        // 2) Body minus nav/header/footer/scripts
        const body = document.body.cloneNode(true);
        body.querySelectorAll(
            'nav, header, footer, script, style, noscript,' +
            ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' +
            ' [class*="banner"], [id*="cookie"], [id*="banner"]'
        ).forEach(e => e.remove());
        const bodyText = (body.textContent || '').trim().replace(/\\s+/g, ' ');
        if (bodyText.length > 200) return bodyText;
        // 3) Final fallback: collect all text-bearing tags
        const blocks = document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4');
        const parts = [];
        for (const b of blocks) {
            const t = (b.textContent || '').trim();
            if (t.length > 20) parts.push(t);
        }
        return parts.join(' ').replace(/\\s+/g, ' ');
    }
    """
    try:
        extracted = await page.evaluate(extraction_script)
    except Exception as e:
        logger.warning("Robust text extraction failed: %s", e)
        return ""
    # The script may hand back an empty/None result; normalise to a string.
    return extracted or ""
|
|
|
|
|
|
async def _extract_text_from_iframes(page: Page) -> str:
    """Gather body text from child frames that may hold a cookie policy.

    Many sites render cookie policies inside iframes managed by CMP vendors
    (OneTrust, Sourcepoint embeds). A child frame is read when its host is
    empty, equals the page's host, or contains one of the known CMP vendor
    markers; every other cross-origin frame is skipped. Only frames whose
    text has more than 50 whitespace-separated words contribute.

    Returns:
        The accepted frame texts joined by blank lines, or "" when none
        qualify or the frame walk itself fails. Frames that raise while
        being read are skipped individually.
    """
    known_cmp_markers = ("onetrust", "cookiebot", "consensu", "sourcepoint",
                         "usercentrics", "didomi", "klaro")
    body_text_script = "() => (document.body && document.body.innerText || '').trim()"
    try:
        from urllib.parse import urlparse

        top_host = urlparse(page.url).netloc
        collected: list[str] = []
        for child in page.frames:
            if child == page.main_frame:
                continue
            try:
                child_host = urlparse(child.url).netloc
                is_foreign = bool(child_host) and child_host != top_host
                # Accept same-origin or known CMP frames
                if is_foreign and not any(
                    marker in child_host for marker in known_cmp_markers
                ):
                    continue
                frame_text = await child.evaluate(body_text_script)
                if frame_text and len(frame_text.split()) > 50:
                    collected.append(frame_text)
            except Exception:
                continue
        return "\n\n".join(collected)
    except Exception as e:
        logger.debug("Iframe extraction failed: %s", e)
        return ""
|
|
|
|
|
|
async def _try_llm_cascade(
    page: Page, target_url: str, network_log: list[dict],
) -> tuple[str, int]:
    """Phase C/D fallback: let the LLM cascade locate the cookie policy.

    The hint for a domain is looked up in the cache first (docstring of the
    original: Valkey, per netloc, 7d TTL) and applied directly; only when it
    is missing or no longer yields enough text is the cascade (Qwen, then
    OVH) asked again with a 5000-character DOM snapshot and the network log.
    A fresh hint is cached only if it produces at least 300 words.

    Args:
        page: Page the hint is applied to.
        target_url: URL under analysis; its lower-cased netloc is the cache key.
        network_log: Recorded network entries forwarded to the cascade.

    Returns:
        ``(text, word_count)``. ``("", 0)`` when the URL has no netloc or the
        cascade returns no hint.
    """
    from urllib.parse import urlparse
    from services.cmp_llm_fallback import (
        LLMCascade, cache_get, cache_set,
    )

    domain = urlparse(target_url).netloc.lower()
    if not domain:
        return "", 0

    # Cache hit: apply hint directly
    remembered_hint = await cache_get(domain)
    if remembered_hint:
        cached_text = await _apply_llm_hint(page, remembered_hint)
        cached_words = len(cached_text.split()) if cached_text else 0
        if cached_words >= 300:
            logger.info("LLM cache hit for %s: %d words", domain, cached_words)
            return cached_text, cached_words
        # Cached hint stale — fall through to fresh LLM call

    # DOM snapshot for the LLM prompt
    try:
        snapshot = await page.evaluate(
            "() => (document.body && document.body.innerText || '').slice(0, 5000)"
        ) or ""
    except Exception:
        snapshot = ""

    hint = await LLMCascade.from_env().analyze(target_url, snapshot, network_log)
    if not hint:
        return "", 0

    text = await _apply_llm_hint(page, hint)
    word_count = len(text.split()) if text else 0
    if word_count < 300:
        # Too little text: hand it back but do not remember the hint.
        return text, word_count

    await cache_set(domain, hint)
    logger.info("LLM cached for %s (%s): %d words", domain, hint.get("_tier"), word_count)
    # Phase E: log discovery + (if eligible) auto-promote to named CMP
    # NOTE(review): discovery logging is kept on the success path only
    # (>= 300 words) — confirm this matches the intended nesting.
    try:
        from services.cmp_discovery_log import record_discovery
        record_discovery(
            domain=domain,
            llm_used=hint.get("_tier", "unknown"),
            strategy=hint.get("strategy", ""),
            value=hint.get("value", ""),
            extracted_text=text,
        )
    except Exception as e:
        logger.debug("CMP discovery log failed: %s", e)
    return text, word_count
|
|
|
|
|
|
async def _apply_llm_hint(page: Page, hint: dict) -> str:
|
|
"""Execute the LLM's suggested strategy and return extracted text."""
|
|
strategy = hint.get("strategy")
|
|
value = hint.get("value", "")
|
|
|
|
if strategy == "text":
|
|
return value or ""
|
|
|
|
if strategy == "selector" and value:
|
|
try:
|
|
return await page.evaluate(
|
|
"(sel) => { const e = document.querySelector(sel); "
|
|
"return e ? (e.innerText || e.textContent || '').trim() : ''; }",
|
|
value,
|
|
) or ""
|
|
except Exception as e:
|
|
logger.debug("LLM selector failed (%s): %s", value, e)
|
|
return ""
|
|
|
|
if strategy == "url" and value:
|
|
try:
|
|
resp = await page.context.request.get(value, timeout=30000)
|
|
if resp.status != 200:
|
|
return ""
|
|
ct = (resp.headers.get("content-type") or "").lower()
|
|
if "json" in ct:
|
|
from services.cmp_heuristic import (
|
|
looks_like_cookie_policy, reconstruct_generic,
|
|
)
|
|
data = await resp.json()
|
|
if looks_like_cookie_policy(data):
|
|
return reconstruct_generic(data)
|
|
# Even if heuristic rejects, try generic walker
|
|
return reconstruct_generic(data)
|
|
text = await resp.text()
|
|
# Strip HTML if HTML response
|
|
if "html" in ct:
|
|
import re as _re
|
|
text = _re.sub(r"<[^>]+>", " ", text)
|
|
text = _re.sub(r"\s+", " ", text).strip()
|
|
return text
|
|
except Exception as e:
|
|
logger.debug("LLM url fetch failed (%s): %s", value[:80], e)
|
|
return ""
|
|
|
|
return ""
|