Files
breakpilot-compliance/consent-tester/services/dsi_discovery.py
Benjamin Admin 608fb7faf5
Build + Deploy / build-developer-portal (push) Successful in 1m26s
Build + Deploy / build-tts (push) Successful in 1m38s
Build + Deploy / build-document-crawler (push) Successful in 37s
Build + Deploy / build-dsms-gateway (push) Successful in 26s
Build + Deploy / build-dsms-node (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 18s
CI / secret-scan (push) Has been skipped
CI / nodejs-build (push) Successful in 3m7s
CI / dep-audit (push) Has been skipped
Build + Deploy / build-admin-compliance (push) Successful in 2m22s
Build + Deploy / build-backend-compliance (push) Successful in 3m20s
Build + Deploy / build-ai-sdk (push) Successful in 54s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 45s
CI / test-python-document-crawler (push) Successful in 30s
CI / test-python-dsms-gateway (push) Successful in 27s
CI / validate-canonical-controls (push) Successful in 17s
Build + Deploy / trigger-orca (push) Successful in 3m37s
CI / sbom-scan (push) Has been skipped
fix: DSI self-extraction + banner L1/L2 check definitions
1. DSI Discovery fix for direct-URL use case (e.g. example.com/datenschutz):
   - Self-extraction: if the URL itself is a DSE page, extract its text
     directly from the page body (main/article/content element)
   - Remove "datenschutz" from NOISE_TITLES — it's a legitimate doc title
   - Fixes safetykon.de/datenschutz returning 0 documents

2. Banner check definitions (36 checks: 6 L1 + 30 L2):
   - consent-tester/checks/banner_checks.py with expert-level hints
   - EDPB 3/2022, CNIL rulings, EuGH C-673/17, §25 TDDDG references
   - check_key maps to existing consent_scanner check codes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 20:53:13 +02:00
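
For reference, a minimal sketch of how the direct-URL case from point 1 could be driven end to end. Only discover_dsi_documents comes from the file below; the Playwright scaffolding, import path, and target URL are illustrative assumptions.

# Illustrative driver only: discover_dsi_documents is defined in
# consent-tester/services/dsi_discovery.py; the import path, browser setup
# and target URL are assumptions for demonstration.
import asyncio

from playwright.async_api import async_playwright

from services.dsi_discovery import discover_dsi_documents  # import path assumed


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        # Direct DSE URL: with the self-extraction fix, the page itself becomes
        # the first discovered document instead of the scan returning 0 results.
        result = await discover_dsi_documents(page, "https://safetykon.de/datenschutz")
        for doc in result.documents:
            print(doc.doc_type, doc.word_count, doc.title)
        await browser.close()


asyncio.run(main())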

541 lines
24 KiB
Python

"""
DSI Discovery — Generic privacy document finder and parser.
Finds all privacy/data protection documents on any website regardless of:
- Technology (static HTML, SPA, WordPress, Typo3, etc.)
- Structure (accordion, sidebar, footer, inline links, separate pages)
- Format (HTML sections, PDF downloads, cross-domain links)
- Language (all 26 EU/EEA official languages)
Flow:
1. Load page with Playwright (full JS rendering)
2. Find all links matching DSI keywords (26 languages)
3. Expand accordions, click tabs, open dropdowns
4. Follow cross-domain links (e.g. instagram.com → help.instagram.com)
5. Extract document text from each link target
6. Return structured list of discovered documents
"""
import logging
import re
from dataclasses import dataclass, field
from urllib.parse import urlparse, urljoin
from playwright.async_api import Page
logger = logging.getLogger(__name__)
# Legal document keywords in all EU/EEA official languages.
# Covers: DSI (privacy), AGB (terms), Widerruf (cancellation),
# Cookie-Richtlinie, Impressum, NB (Nutzungsbedingungen).
DSI_KEYWORDS: dict[str, list[str]] = {
    "de": [
        # Datenschutz
        "datenschutz", "datenschutzerklaerung", "datenschutzinformation",
        "datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
        "datenschutzbestimmung", "verarbeitung personenbezogener daten",
        # AGB / Nutzungsbedingungen
        "allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
        "nutzungsordnung", "geschäftsbedingungen",
        # Widerruf
        "widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
        "widerruf", "rücktrittsrecht",
        # Cookie
        "cookie-richtlinie", "cookie-policy", "cookie-hinweis",
    ],
    "en": [
        "privacy policy", "privacy notice", "data protection", "data policy",
        "privacy statement", "gdpr", "personal data", "cookie policy",
        "terms of service", "terms and conditions", "terms of use",
        "cancellation policy", "right of withdrawal", "refund policy",
        "cookie notice",
    ],
    "fr": [
        "politique de confidentialité", "protection des données",
        "données personnelles", "vie privée", "rgpd",
        "conditions générales", "conditions d'utilisation",
        "droit de rétractation", "politique de cookies",
    ],
    "es": [
        "política de privacidad", "protección de datos",
        "datos personales", "aviso de privacidad",
        "términos y condiciones", "condiciones de uso",
        "derecho de desistimiento", "política de cookies",
    ],
    "it": [
        "informativa sulla privacy", "protezione dei dati",
        "dati personali", "privacy policy",
        "termini e condizioni", "condizioni d'uso",
        "diritto di recesso", "politica dei cookie",
    ],
    "nl": [
        "privacybeleid", "gegevensbescherming", "privacyverklaring",
        "persoonsgegevens", "avg",
        "algemene voorwaarden", "gebruiksvoorwaarden",
        "herroepingsrecht", "cookiebeleid",
    ],
    "pl": [
        "polityka prywatności", "ochrona danych osobowych",
        "dane osobowe", "rodo",
        "regulamin", "warunki korzystania",
        "prawo odstąpienia", "polityka cookies",
    ],
    "pt": [
        "política de privacidade", "proteção de dados",
        "dados pessoais", "lgpd",
        "termos e condições", "condições de utilização",
        "direito de resolução", "política de cookies",
    ],
    "sv": [
        "integritetspolicy", "dataskydd", "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor", "användarvillkor",
        "ångerrätt", "cookiepolicy",
    ],
    "da": [
        "privatlivspolitik", "databeskyttelse", "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser", "brugsbetingelser",
        "fortrydelsesret", "cookiepolitik",
    ],
    "fi": [
        "tietosuojaseloste", "tietosuoja", "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot", "käyttöehdot",
        "peruutusoikeus", "evästekäytäntö",
    ],
    "cs": ["zásady ochrany osobních údajů", "ochrana osobních údajů",
           "zpracování osobních údajů", "obchodní podmínky", "zásady cookies"],
    "el": ["πολιτική απορρήτου", "προστασία δεδομένων",
           "προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies"],
    "hu": ["adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
           "általános szerződési feltételek", "cookie szabályzat"],
    "ro": ["politica de confidențialitate", "protecția datelor",
           "date cu caracter personal", "termeni și condiții", "politica cookies"],
    "bg": ["политика за поверителност", "защита на данните",
           "лични данни", "общи условия", "политика за бисквитки"],
    "hr": ["politika privatnosti", "zaštita podataka", "osobni podaci",
           "opći uvjeti", "politika kolačića"],
    "sk": ["zásady ochrany osobných údajov", "ochrana osobných údajov",
           "obchodné podmienky", "zásady cookies"],
    "sl": ["politika zasebnosti", "varstvo podatkov", "osebni podatki",
           "splošni pogoji", "politika piškotkov"],
    "et": ["privaatsuspoliitika", "andmekaitse", "isikuandmed",
           "kasutustingimused", "küpsiste poliitika"],
    "lt": ["privatumo politika", "duomenų apsauga", "asmens duomenys",
           "naudojimosi sąlygos", "slapukų politika"],
    "lv": ["privātuma politika", "datu aizsardzība", "personas dati",
           "lietošanas noteikumi", "sīkdatņu politika"],
    "mt": ["politika tal-privatezza", "protezzjoni tad-data",
           "termini u kundizzjonijiet"],
    "ga": ["polasaí príobháideachais", "cosaint sonraí",
           "téarmaí agus coinníollacha"],
    "is": ["persónuverndarstefna", "persónuvernd",
           "skilmálar og skilyrði"],
    "no": ["personvernerklæring", "personvern", "personopplysninger",
           "brukervilkår", "angrerett", "informasjonskapsler"],
}
# Flatten all keywords for quick matching
ALL_DSI_KEYWORDS: list[str] = []
for kw_list in DSI_KEYWORDS.values():
    ALL_DSI_KEYWORDS.extend(kw_list)
@dataclass
class DiscoveredDSI:
    """A discovered privacy/data protection document."""
    title: str
    url: str
    source_url: str  # Page where the link was found
    language: str = ""
    doc_type: str = ""  # "html_section", "html_page", "pdf", "accordion", "cross_domain"
    text: str = ""  # Extracted full text
    sections: list[dict] = field(default_factory=list)  # Parsed sections
    word_count: int = 0


@dataclass
class DSIDiscoveryResult:
    """Result of DSI discovery scan."""
    base_url: str
    documents: list[DiscoveredDSI] = field(default_factory=list)
    total_found: int = 0
    languages_detected: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Check if text contains any DSI keyword. Returns (match, language)."""
    text_lower = text.lower().strip()
    for lang, keywords in DSI_KEYWORDS.items():
        for kw in keywords:
            if kw in text_lower:
                return True, lang
    return False, ""
def _is_allowed_domain(href: str, base_domain: str) -> bool:
    """Allow same domain + known related domains (e.g. help.instagram.com)."""
    try:
        link_domain = urlparse(href).netloc.replace("www.", "")
        base_clean = base_domain.replace("www.", "")
        # Same domain
        if link_domain == base_clean:
            return True
        # Subdomain (help.instagram.com for instagram.com)
        if link_domain.endswith(f".{base_clean}"):
            return True
        # Parent domain (instagram.com links from about.instagram.com)
        if base_clean.endswith(f".{link_domain}"):
            return True
        # Known related patterns
        parts_base = base_clean.split(".")
        parts_link = link_domain.split(".")
        if len(parts_base) >= 2 and len(parts_link) >= 2:
            if parts_base[-2] == parts_link[-2] and parts_base[-1] == parts_link[-1]:
                return True  # Same registrable domain
    except Exception:
        pass
    return False
async def discover_dsi_documents(
    page: Page,
    url: str,
    max_documents: int = 100,
    timeout_seconds: int = 300,
) -> DSIDiscoveryResult:
    """Discover all privacy/data protection documents on a website.

    Works generically regardless of website technology, structure, or language.
    Searches exhaustively until no new documents are found — no arbitrary page limit.
    Stops when: all discovered links have been visited OR timeout reached.
    """
    import time

    deadline = time.time() + timeout_seconds
    result = DSIDiscoveryResult(base_url=url)
    base_domain = urlparse(url).netloc
    seen_urls: set[str] = set()
    seen_titles: set[str] = set()
    try:
        # Step 1: Load the page
        await page.goto(url, wait_until="networkidle", timeout=60000)
        await page.wait_for_timeout(2000)

        # Step 1b: Self-extraction — if the URL itself is a DSI page,
        # extract its full text as the first document. This handles the
        # case where the user provides the DSE URL directly (e.g.
        # example.com/datenschutz) instead of the homepage.
        current_url_path = urlparse(url).path.lower()
        is_self_dsi, self_lang = _matches_dsi_keyword(current_url_path)
        if not is_self_dsi:
            # Also check the page title
            page_title = await page.title() or ""
            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
        if is_self_dsi:
            try:
                self_text = await page.evaluate("""() => {
                    const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
                        || document.body;
                    return main ? main.innerText : document.body.innerText;
                }""")
                self_wc = len(self_text.split()) if self_text else 0
                if self_wc >= 100:
                    page_title = await page.title() or url
                    result.documents.append(DiscoveredDSI(
                        title=page_title.strip(),
                        url=url,
                        source_url=url,
                        language=self_lang or "de",
                        doc_type="html_full_page",
                        text=self_text.strip(),
                        word_count=self_wc,
                    ))
                    seen_urls.add(url)
                    logger.info("Self-extracted %d words from %s", self_wc, url)
            except Exception as e:
                logger.warning("Self-extraction failed for %s: %s", url, e)

        # Step 2: Find DSI links in current page
        links = await _find_dsi_links(page, base_domain)
        logger.info("Found %d DSI links on %s", len(links), url)

        # Step 3: Expand accordions, tabs, dropdowns to find hidden content
        await _expand_all_interactive(page)
        await page.wait_for_timeout(1000)

        # Step 3b: Re-scan after expanding (may reveal new links)
        links_after = await _find_dsi_links(page, base_domain)
        for link in links_after:
            if link["href"] not in [l["href"] for l in links]:
                links.append(link)

        # Step 4: Check for inline DSI sections (accordion content already visible)
        inline_sections = await _find_inline_dsi_sections(page)
        for section in inline_sections:
            title_norm = section["title"].strip().lower()
            if title_norm not in seen_titles:
                seen_titles.add(title_norm)
                is_dsi, lang = _matches_dsi_keyword(section["title"])
                doc = DiscoveredDSI(
                    title=section["title"],
                    url=f"{url}#{section.get('id', '')}",
                    source_url=url,
                    language=lang,
                    doc_type="html_section",
                    text=section["text"],
                    word_count=len(section["text"].split()),
                )
                result.documents.append(doc)

        # Step 5: Follow each DSI link and extract content.
        # Exhaustive: processes ALL found links. On each visited page,
        # searches for MORE links (recursive discovery). Stops only when
        # all links visited or timeout reached.
        pending_links = list(links)
        pages_to_revisit: list[str] = []  # Pages where we found docs — may have more links
        while pending_links and time.time() < deadline and len(result.documents) < max_documents:
            link_info = pending_links.pop(0)
            href = link_info["href"]
            if href in seen_urls:
                continue
            seen_urls.add(href)
            title = link_info["text"]
            title_norm = title.strip().lower()
            if title_norm in seen_titles:
                continue
            seen_titles.add(title_norm)
            is_dsi, lang = _matches_dsi_keyword(title)
            is_pdf = href.lower().endswith(".pdf")
            if is_pdf:
                result.documents.append(DiscoveredDSI(
                    title=title, url=href, source_url=url,
                    language=lang, doc_type="pdf",
                    text="[PDF — Textextraktion erforderlich]",
                ))
                continue
            try:
                # Skip anchor links on same page — they are sections of the parent doc
                is_anchor = "#" in href and href.split("#")[0] in (url.split("#")[0], page.url.split("#")[0])
                if is_anchor:
                    continue
                # Navigate to page — wait for JS to load content
                resp = await page.goto(href, wait_until="networkidle", timeout=45000)
                if resp and resp.status < 400:
                    await page.wait_for_timeout(2000)
                    await _expand_all_interactive(page)
                    await page.wait_for_timeout(500)
                    # Extract text — try specific content areas, fall back to full body
                    text = await page.evaluate("""
                        () => {
                            // Try progressively broader content selectors
                            const selectors = [
                                '.article-content', '.page-content', '.entry-content',
                                '[class*="content-area"]', '[class*="main-content"]',
                                'main article', 'main', 'article',
                                '[role="main"]', '.content', '#content',
                            ];
                            for (const sel of selectors) {
                                const el = document.querySelector(sel);
                                if (el && el.textContent.trim().length > 200) {
                                    return el.textContent.trim();
                                }
                            }
                            // Fallback: full body minus nav/header/footer
                            const body = document.body.cloneNode(true);
                            body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
                            return body.textContent?.trim() || '';
                        }
                    """)
                    if text and len(text) > 50:
                        result.documents.append(DiscoveredDSI(
                            title=title, url=href, source_url=url,
                            language=lang,
                            doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
                            text=text[:50000], word_count=len(text.split()),
                        ))
                    # Recursive: search THIS page for more DSI links
                    new_links = await _find_dsi_links(page, base_domain)
                    for nl in new_links:
                        if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
                            pending_links.append(nl)
                # Navigate back for next link
                await page.goto(url, wait_until="networkidle", timeout=45000)
                await page.wait_for_timeout(500)
                await _expand_all_interactive(page)
            except Exception as e:
                result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                try:
                    await page.goto(url, wait_until="networkidle", timeout=45000)
                except Exception:
                    pass
    except Exception as e:
        result.errors.append(f"Discovery failed: {str(e)[:100]}")
        logger.error("DSI discovery failed: %s", e)

    # Deduplicate: remove noise titles + merge docs with identical word_count
    result.documents = _deduplicate_documents(result.documents)
    result.total_found = len(result.documents)
    result.languages_detected = list(set(
        d.language for d in result.documents if d.language
    ))
    logger.info("DSI discovery complete: %d documents found in %s",
                result.total_found, result.languages_detected)
    return result
# Nav elements, not real documents
# NOTE: "datenschutz" was removed — it's a legitimate document title
NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
                "kontakt", "contact", "suche", "search", "menü", "menu", "home"}


def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
    """Remove duplicate and noise documents."""
    # Step 1: Filter noise titles (nav elements, not real docs)
    filtered = []
    for d in docs:
        title_lower = d.title.strip().lower()
        # Skip very short titles that are nav elements
        if title_lower in NOISE_TITLES:
            continue
        # Skip titles that are just URLs
        if title_lower.startswith("http") or title_lower.startswith("www."):
            continue
        # Skip very short documents (< 50 words) — likely nav snippets
        if d.word_count < 50 and d.doc_type != "pdf":
            continue
        filtered.append(d)

    # Step 2: Merge docs with identical word_count (same page text, different title)
    seen_wordcounts: dict[int, DiscoveredDSI] = {}
    unique = []
    for d in filtered:
        if d.word_count > 200:  # Only dedup substantial docs
            if d.word_count in seen_wordcounts:
                existing = seen_wordcounts[d.word_count]
                # Prefer "Datenschutzinformation*" titles over section headings
                d_is_dsi = d.title.lower().startswith("datenschutzinformation")
                ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
                if d_is_dsi and not ex_is_dsi:
                    unique = [x for x in unique if x is not existing]
                    unique.append(d)
                    seen_wordcounts[d.word_count] = d
                continue
            seen_wordcounts[d.word_count] = d
        unique.append(d)
    return unique
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
    """Find all links whose text or href matches DSI keywords."""
    try:
        all_links = await page.evaluate("""
            () => [...document.querySelectorAll('a[href]')].map(a => ({
                href: a.href,
                text: (a.textContent || '').trim().substring(0, 200),
                ariaLabel: a.getAttribute('aria-label') || '',
                title: a.getAttribute('title') || '',
                visible: a.getBoundingClientRect().width > 0,
            }))
        """)
        dsi_links = []
        for link in (all_links or []):
            search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
            href = link["href"]
            href_lower = href.lower()
            # Match by link text or href
            is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
            if not is_match:
                continue
            # Allow same domain + related domains + PDFs (case-insensitive extension check)
            if _is_allowed_domain(href, base_domain) or href_lower.endswith(".pdf"):
                dsi_links.append({
                    "href": href,
                    "text": link["text"],
                    "visible": link["visible"],
                })
        return dsi_links
    except Exception as e:
        logger.warning("DSI link scan failed: %s", e)
        return []
async def _expand_all_interactive(page: Page) -> None:
    """Expand all accordions, tabs, details, dropdowns on the page."""
    try:
        await page.evaluate("""() => {
            document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
            const sels = ['button[aria-expanded="false"]', '[data-toggle="collapse"]',
                          '[data-bs-toggle="collapse"]', '[class*="accordion"] > button',
                          '[class*="collapse"] > button', '.panel-heading a'];
            sels.forEach(s => document.querySelectorAll(s).forEach(e => { try{e.click()}catch{} }));
            document.querySelectorAll('button,a').forEach(b => {
                if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test((b.textContent||'').trim()))
                    try{b.click()}catch{}
            });
            document.querySelectorAll('[role="tab"]').forEach(t => { try{t.click()}catch{} });
        }""")
    except Exception:
        pass
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
    """Find DSI content already visible on the page (e.g. expanded accordions).

    Only counts top-level documents (H1/H2 with DSI keywords).
    Sub-sections (H3/H4 like 'Cookies', 'Betroffenenrechte') are NOT counted
    as separate documents — their text is part of the parent document.
    """
    try:
        sections = await page.evaluate("""
            () => {
                const results = [];
                // Only H1 and H2 count as document-level headings
                const headings = document.querySelectorAll('h1, h2');
                const dsiKeywords = [
                    'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
                    'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
                    'databeskyttelse', 'ochrana', 'adatvédel', 'confidential',
                ];
                for (const h of headings) {
                    const text = (h.textContent || '').trim();
                    const textLower = text.toLowerCase();
                    if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;
                    // Get ALL content until the next H1/H2 (include sub-sections H3-H5)
                    let content = '';
                    let el = h.nextElementSibling;
                    let count = 0;
                    while (el && count < 200) {
                        // Stop at next H1 or H2 (next top-level document)
                        if (el.tagName === 'H1' || el.tagName === 'H2') break;
                        content += (el.textContent || '').trim() + '\\n';
                        el = el.nextElementSibling;
                        count++;
                    }
                    if (content.length > 100) {
                        results.push({
                            title: text.substring(0, 200),
                            text: content.substring(0, 50000),
                            id: h.id || '',
                        });
                    }
                }
                return results;
            }
        """)
        return sections or []
    except Exception:
        return []
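
The keyword matcher and domain filter above are pure functions, so the direct-URL and cross-domain cases can be sanity-checked without a browser. A few illustrative assertions follow; the import path is an assumption, and the expected values follow from the code above.

# Illustrative sanity checks for the pure helpers; import path assumed.
from services.dsi_discovery import _is_allowed_domain, _matches_dsi_keyword

# Direct-URL case: a /datenschutz path matches the German keyword list.
assert _matches_dsi_keyword("/datenschutz") == (True, "de")
assert _matches_dsi_keyword("Privacy Policy") == (True, "en")
assert _matches_dsi_keyword("Careers") == (False, "")

# Cross-domain handling: subdomains of the same registrable domain are allowed,
# unrelated domains are not.
assert _is_allowed_domain("https://help.instagram.com/", "www.instagram.com")
assert not _is_allowed_domain("https://example.org/privacy", "instagram.com")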