686834cea0
Build + Deploy / build-ai-sdk (push) Failing after 36s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 7s
Build + Deploy / build-admin-compliance (push) Successful in 8s
Build + Deploy / build-backend-compliance (push) Successful in 8s
CI / nodejs-build (push) Successful in 3m14s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 43s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 30s
CI / validate-canonical-controls (push) Successful in 16s
Build + Deploy / build-dsms-gateway (push) Successful in 8s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
Build + Deploy / trigger-orca (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 17s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
1. EU Institution Checks (Verordnung 2018/1725): - New doc_type "eu_institution" with 9 L1 + 15 L2 checks - Both German + English patterns (EU institutions are multilingual) - Auto-detection via "2018/1725", "EDSB", "EDPS" keywords - Correct article references (Art. 15 instead of 13, Art. 5 instead of 6) 2. Banner Check Integration: - banner_runner.py maps scan results to 36 L1/L2 structured checks - BannerCheckTab shows hierarchical ChecklistView with hints - 3-phase summary (cookies/scripts before/after consent) - /scan endpoint now includes structured_checks in response 3. JS-heavy Website Fixes (dm, Zalando, HWK): - dsi_helpers.py: goto_resilient (networkidle→domcontentloaded fallback) - try_dismiss_consent_banner before text extraction - PDF redirect detection (dm.de redirects to GCS PDF) 4. Caritas False Positive Fixes: - Phone regex allows parentheses: +49 (0)761 → now matches - "Recht auf Widerspruch" (3 words) + §23 KDG → matches Art. 21 - Church authorities: "Katholisches Datenschutzzentrum" recognized Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
581 lines
25 KiB
Python
581 lines
25 KiB
Python
"""
|
|
DSI Discovery — Generic privacy document finder and parser.
|
|
|
|
Finds all privacy/data protection documents on any website regardless of:
|
|
- Technology (static HTML, SPA, WordPress, Typo3, etc.)
|
|
- Structure (accordion, sidebar, footer, inline links, separate pages)
|
|
- Format (HTML sections, PDF downloads, cross-domain links)
|
|
- Language (all 26 EU/EEA official languages)
|
|
|
|
Flow:
|
|
1. Load page with Playwright (full JS rendering)
|
|
2. Find all links matching DSI keywords (26 languages)
|
|
3. Expand accordions, click tabs, open dropdowns
|
|
4. Follow cross-domain links (e.g. instagram.com → help.instagram.com)
|
|
5. Extract document text from each link target
|
|
6. Return structured list of discovered documents
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
from playwright.async_api import Page
|
|
|
|
from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Legal document keywords in all EU/EEA official languages.
|
|
# Covers: DSI (privacy), AGB (terms), Widerruf (cancellation),
|
|
# Cookie-Richtlinie, Impressum, NB (Nutzungsbedingungen).
|
|
DSI_KEYWORDS: dict[str, list[str]] = {
|
|
"de": [
|
|
# Datenschutz
|
|
"datenschutz", "datenschutzerklaerung", "datenschutzinformation",
|
|
"datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
|
|
"datenschutzbestimmung", "verarbeitung personenbezogener daten",
|
|
# AGB / Nutzungsbedingungen
|
|
"allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
|
|
"nutzungsordnung", "geschäftsbedingungen",
|
|
# Widerruf
|
|
"widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
|
|
"widerruf", "rücktrittsrecht",
|
|
# Cookie
|
|
"cookie-richtlinie", "cookie-policy", "cookie-hinweis",
|
|
],
|
|
"en": [
|
|
"privacy policy", "privacy notice", "data protection", "data policy",
|
|
"privacy statement", "gdpr", "personal data", "cookie policy",
|
|
"terms of service", "terms and conditions", "terms of use",
|
|
"cancellation policy", "right of withdrawal", "refund policy",
|
|
"cookie notice",
|
|
],
|
|
"fr": [
|
|
"politique de confidentialité", "protection des données",
|
|
"données personnelles", "vie privée", "rgpd",
|
|
"conditions générales", "conditions d'utilisation",
|
|
"droit de rétractation", "politique de cookies",
|
|
],
|
|
"es": [
|
|
"política de privacidad", "protección de datos",
|
|
"datos personales", "aviso de privacidad",
|
|
"términos y condiciones", "condiciones de uso",
|
|
"derecho de desistimiento", "política de cookies",
|
|
],
|
|
"it": [
|
|
"informativa sulla privacy", "protezione dei dati",
|
|
"dati personali", "privacy policy",
|
|
"termini e condizioni", "condizioni d'uso",
|
|
"diritto di recesso", "politica dei cookie",
|
|
],
|
|
"nl": [
|
|
"privacybeleid", "gegevensbescherming", "privacyverklaring",
|
|
"persoonsgegevens", "avg",
|
|
"algemene voorwaarden", "gebruiksvoorwaarden",
|
|
"herroepingsrecht", "cookiebeleid",
|
|
],
|
|
"pl": [
|
|
"polityka prywatności", "ochrona danych osobowych",
|
|
"dane osobowe", "rodo",
|
|
"regulamin", "warunki korzystania",
|
|
"prawo odstąpienia", "polityka cookies",
|
|
],
|
|
"pt": [
|
|
"política de privacidade", "proteção de dados",
|
|
"dados pessoais", "lgpd",
|
|
"termos e condições", "condições de utilização",
|
|
"direito de resolução", "política de cookies",
|
|
],
|
|
"sv": [
|
|
"integritetspolicy", "dataskydd", "personuppgifter",
|
|
"sekretesspolicy",
|
|
"allmänna villkor", "användarvillkor",
|
|
"ångerrätt", "cookiepolicy",
|
|
],
|
|
"da": [
|
|
"privatlivspolitik", "databeskyttelse", "personoplysninger",
|
|
"persondatapolitik",
|
|
"handelsbetingelser", "brugsbetingelser",
|
|
"fortrydelsesret", "cookiepolitik",
|
|
],
|
|
"fi": [
|
|
"tietosuojaseloste", "tietosuoja", "henkilötiedot",
|
|
"rekisteriseloste",
|
|
"yleiset ehdot", "käyttöehdot",
|
|
"peruutusoikeus", "evästekäytäntö",
|
|
],
|
|
"cs": ["zásady ochrany osobních údajů", "ochrana osobních údajů",
|
|
"zpracování osobních údajů", "obchodní podmínky", "zásady cookies"],
|
|
"el": ["πολιτική απορρήτου", "προστασία δεδομένων",
|
|
"προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies"],
|
|
"hu": ["adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
|
|
"általános szerződési feltételek", "cookie szabályzat"],
|
|
"ro": ["politica de confidențialitate", "protecția datelor",
|
|
"date cu caracter personal", "termeni și condiții", "politica cookies"],
|
|
"bg": ["политика за поверителност", "защита на данните",
|
|
"лични данни", "общи условия", "политика за бисквитки"],
|
|
"hr": ["politika privatnosti", "zaštita podataka", "osobni podaci",
|
|
"opći uvjeti", "politika kolačića"],
|
|
"sk": ["zásady ochrany osobných údajov", "ochrana osobných údajov",
|
|
"obchodné podmienky", "zásady cookies"],
|
|
"sl": ["politika zasebnosti", "varstvo podatkov", "osebni podatki",
|
|
"splošni pogoji", "politika piškotkov"],
|
|
"et": ["privaatsuspoliitika", "andmekaitse", "isikuandmed",
|
|
"kasutustingimused", "küpsiste poliitika"],
|
|
"lt": ["privatumo politika", "duomenų apsauga", "asmens duomenys",
|
|
"naudojimosi sąlygos", "slapukų politika"],
|
|
"lv": ["privātuma politika", "datu aizsardzība", "personas dati",
|
|
"lietošanas noteikumi", "sīkdatņu politika"],
|
|
"mt": ["politika tal-privatezza", "protezzjoni tad-data",
|
|
"termini u kundizzjonijiet"],
|
|
"ga": ["polasaí príobháideachais", "cosaint sonraí",
|
|
"téarmaí agus coinníollacha"],
|
|
"is": ["persónuverndarstefna", "persónuvernd",
|
|
"skilmálar og skilyrði"],
|
|
"no": ["personvernerklæring", "personvern", "personopplysninger",
|
|
"brukervilkår", "angrerett", "informasjonskapsler"],
|
|
}
|
|
|
|
# Flatten all keywords for quick matching
|
|
ALL_DSI_KEYWORDS: list[str] = []
|
|
for kw_list in DSI_KEYWORDS.values():
|
|
ALL_DSI_KEYWORDS.extend(kw_list)
|
|
|
|
@dataclass
|
|
class DiscoveredDSI:
|
|
"""A discovered privacy/data protection document."""
|
|
title: str
|
|
url: str
|
|
source_url: str # Page where the link was found
|
|
language: str = ""
|
|
doc_type: str = "" # "html_section", "html_page", "pdf", "accordion", "cross_domain"
|
|
text: str = "" # Extracted full text
|
|
sections: list[dict] = field(default_factory=list) # Parsed sections
|
|
word_count: int = 0
|
|
|
|
@dataclass
|
|
class DSIDiscoveryResult:
|
|
"""Result of DSI discovery scan."""
|
|
base_url: str
|
|
documents: list[DiscoveredDSI] = field(default_factory=list)
|
|
total_found: int = 0
|
|
languages_detected: list[str] = field(default_factory=list)
|
|
errors: list[str] = field(default_factory=list)
|
|
|
|
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
|
"""Check if text contains any DSI keyword. Returns (match, language)."""
|
|
text_lower = text.lower().strip()
|
|
for lang, keywords in DSI_KEYWORDS.items():
|
|
for kw in keywords:
|
|
if kw in text_lower:
|
|
return True, lang
|
|
return False, ""
|
|
|
|
def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
|
"""Allow same domain + known related domains (e.g. help.instagram.com)."""
|
|
try:
|
|
link_domain = urlparse(href).netloc.replace("www.", "")
|
|
base_clean = base_domain.replace("www.", "")
|
|
# Same domain
|
|
if link_domain == base_clean:
|
|
return True
|
|
# Subdomain (help.instagram.com for instagram.com)
|
|
if link_domain.endswith(f".{base_clean}"):
|
|
return True
|
|
# Parent domain (instagram.com links from about.instagram.com)
|
|
if base_clean.endswith(f".{link_domain}"):
|
|
return True
|
|
# Known related patterns
|
|
parts_base = base_clean.split(".")
|
|
parts_link = link_domain.split(".")
|
|
if len(parts_base) >= 2 and len(parts_link) >= 2:
|
|
if parts_base[-2] == parts_link[-2] and parts_base[-1] == parts_link[-1]:
|
|
return True # Same registrable domain
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
async def discover_dsi_documents(
|
|
page: Page,
|
|
url: str,
|
|
max_documents: int = 100,
|
|
timeout_seconds: int = 300,
|
|
) -> DSIDiscoveryResult:
|
|
"""Discover all privacy/data protection documents on a website.
|
|
|
|
Works generically regardless of website technology, structure, or language.
|
|
Searches exhaustively until no new documents are found — no arbitrary page limit.
|
|
Stops when: all discovered links have been visited OR timeout reached.
|
|
"""
|
|
import time
|
|
deadline = time.time() + timeout_seconds
|
|
|
|
result = DSIDiscoveryResult(base_url=url)
|
|
base_domain = urlparse(url).netloc
|
|
seen_urls: set[str] = set()
|
|
seen_titles: set[str] = set()
|
|
|
|
try:
|
|
# Step 1: Load the page (with networkidle → domcontentloaded fallback)
|
|
await goto_resilient(page, url, timeout=60000)
|
|
await page.wait_for_timeout(2000)
|
|
|
|
# Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
|
|
final_url = page.url
|
|
if is_pdf_redirect(url, final_url):
|
|
is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
|
|
if is_dsi_url:
|
|
result.documents.append(DiscoveredDSI(
|
|
title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
|
|
url=final_url,
|
|
source_url=url,
|
|
language=dsi_lang or "de",
|
|
doc_type="pdf",
|
|
text="[PDF — Textextraktion erforderlich]",
|
|
))
|
|
seen_urls.add(url)
|
|
seen_urls.add(final_url)
|
|
logger.info("PDF redirect detected: %s -> %s", url, final_url)
|
|
# Return early — a PDF redirect means no HTML content to scan
|
|
result.total_found = len(result.documents)
|
|
return result
|
|
|
|
# Step 1b: Try dismissing cookie consent banners before extraction.
|
|
# Many German sites (dm.de, Zalando, etc.) block page content behind
|
|
# a consent wall. Dismissing it reveals the actual DSI text.
|
|
await try_dismiss_consent_banner(page)
|
|
|
|
# Step 1c: Self-extraction — if the URL itself is a DSI page,
|
|
# extract its full text as the first document. This handles the
|
|
# case where the user provides the DSE URL directly (e.g.
|
|
# example.com/datenschutz) instead of the homepage.
|
|
current_url_path = urlparse(url).path.lower()
|
|
is_self_dsi, self_lang = _matches_dsi_keyword(current_url_path)
|
|
if not is_self_dsi:
|
|
# Also check the page title
|
|
page_title = await page.title() or ""
|
|
is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
|
|
if is_self_dsi:
|
|
try:
|
|
self_text = await page.evaluate("""() => {
|
|
const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
|
|
|| document.body;
|
|
return main ? main.innerText : document.body.innerText;
|
|
}""")
|
|
self_wc = len(self_text.split()) if self_text else 0
|
|
if self_wc >= 100:
|
|
page_title = await page.title() or url
|
|
result.documents.append(DiscoveredDSI(
|
|
title=page_title.strip(),
|
|
url=url,
|
|
source_url=url,
|
|
language=self_lang or "de",
|
|
doc_type="html_full_page",
|
|
text=self_text.strip(),
|
|
word_count=self_wc,
|
|
))
|
|
seen_urls.add(url)
|
|
logger.info("Self-extracted %d words from %s", self_wc, url)
|
|
else:
|
|
logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
|
|
except Exception as e:
|
|
logger.warning("Self-extraction failed for %s: %s", url, e)
|
|
|
|
# Step 2: Find DSI links in current page
|
|
links = await _find_dsi_links(page, base_domain)
|
|
logger.info("Found %d DSI links on %s", len(links), url)
|
|
|
|
# Step 3: Expand accordions, tabs, dropdowns to find hidden content
|
|
await _expand_all_interactive(page)
|
|
await page.wait_for_timeout(1000)
|
|
|
|
# Step 3b: Re-scan after expanding (may reveal new links)
|
|
links_after = await _find_dsi_links(page, base_domain)
|
|
for link in links_after:
|
|
if link["href"] not in [l["href"] for l in links]:
|
|
links.append(link)
|
|
|
|
# Step 4: Check for inline DSI sections (accordion content already visible)
|
|
inline_sections = await _find_inline_dsi_sections(page)
|
|
for section in inline_sections:
|
|
title_norm = section["title"].strip().lower()
|
|
if title_norm not in seen_titles:
|
|
seen_titles.add(title_norm)
|
|
is_dsi, lang = _matches_dsi_keyword(section["title"])
|
|
doc = DiscoveredDSI(
|
|
title=section["title"],
|
|
url=f"{url}#{section.get('id', '')}",
|
|
source_url=url,
|
|
language=lang,
|
|
doc_type="html_section",
|
|
text=section["text"],
|
|
word_count=len(section["text"].split()),
|
|
)
|
|
result.documents.append(doc)
|
|
|
|
# Step 5: Follow each DSI link and extract content.
|
|
# Exhaustive: processes ALL found links. On each visited page,
|
|
# searches for MORE links (recursive discovery). Stops only when
|
|
# all links visited or timeout reached.
|
|
pending_links = list(links)
|
|
pages_to_revisit: list[str] = [] # Pages where we found docs — may have more links
|
|
|
|
while pending_links and time.time() < deadline and len(result.documents) < max_documents:
|
|
link_info = pending_links.pop(0)
|
|
href = link_info["href"]
|
|
if href in seen_urls:
|
|
continue
|
|
seen_urls.add(href)
|
|
|
|
title = link_info["text"]
|
|
title_norm = title.strip().lower()
|
|
if title_norm in seen_titles:
|
|
continue
|
|
seen_titles.add(title_norm)
|
|
|
|
is_dsi, lang = _matches_dsi_keyword(title)
|
|
is_pdf = href.lower().endswith(".pdf")
|
|
|
|
if is_pdf:
|
|
result.documents.append(DiscoveredDSI(
|
|
title=title, url=href, source_url=url,
|
|
language=lang, doc_type="pdf",
|
|
text="[PDF — Textextraktion erforderlich]",
|
|
))
|
|
continue
|
|
|
|
try:
|
|
# Skip anchor links on same page — they are sections of the parent doc
|
|
is_anchor = "#" in href and href.split("#")[0] in (url.split("#")[0], page.url.split("#")[0])
|
|
if is_anchor:
|
|
continue
|
|
|
|
# Navigate to page — with networkidle/domcontentloaded fallback
|
|
await goto_resilient(page, href, timeout=45000)
|
|
resp_url = page.url
|
|
|
|
# Check for PDF redirect on followed links
|
|
if is_pdf_redirect(href, resp_url):
|
|
result.documents.append(DiscoveredDSI(
|
|
title=title, url=resp_url, source_url=url,
|
|
language=lang, doc_type="pdf",
|
|
text="[PDF — Textextraktion erforderlich]",
|
|
))
|
|
await goto_resilient(page, url, timeout=45000)
|
|
continue
|
|
|
|
await try_dismiss_consent_banner(page)
|
|
await _expand_all_interactive(page)
|
|
await page.wait_for_timeout(500)
|
|
|
|
# Extract text — try specific content areas, fall back to full body
|
|
text = await page.evaluate("""
|
|
() => {
|
|
// Try progressively broader content selectors
|
|
const selectors = [
|
|
'.article-content', '.page-content', '.entry-content',
|
|
'[class*="content-area"]', '[class*="main-content"]',
|
|
'main article', 'main', 'article',
|
|
'[role="main"]', '.content', '#content',
|
|
];
|
|
for (const sel of selectors) {
|
|
const el = document.querySelector(sel);
|
|
if (el && el.textContent.trim().length > 200) {
|
|
return el.textContent.trim();
|
|
}
|
|
}
|
|
// Fallback: full body minus nav/header/footer
|
|
const body = document.body.cloneNode(true);
|
|
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
|
|
return body.textContent?.trim() || '';
|
|
}
|
|
""")
|
|
if text and len(text) > 50:
|
|
result.documents.append(DiscoveredDSI(
|
|
title=title, url=href, source_url=url,
|
|
language=lang,
|
|
doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
|
|
text=text[:50000], word_count=len(text.split()),
|
|
))
|
|
|
|
# Recursive: search THIS page for more DSI links
|
|
new_links = await _find_dsi_links(page, base_domain)
|
|
for nl in new_links:
|
|
if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
|
|
pending_links.append(nl)
|
|
|
|
# Navigate back for next link
|
|
await goto_resilient(page, url, timeout=45000)
|
|
await page.wait_for_timeout(500)
|
|
await _expand_all_interactive(page)
|
|
|
|
except Exception as e:
|
|
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
|
|
try:
|
|
await goto_resilient(page, url, timeout=45000)
|
|
except Exception:
|
|
pass
|
|
|
|
except Exception as e:
|
|
result.errors.append(f"Discovery failed: {str(e)[:100]}")
|
|
logger.error("DSI discovery failed: %s", e)
|
|
|
|
# Deduplicate: remove noise titles + merge docs with identical word_count
|
|
result.documents = _deduplicate_documents(result.documents)
|
|
|
|
result.total_found = len(result.documents)
|
|
result.languages_detected = list(set(
|
|
d.language for d in result.documents if d.language
|
|
))
|
|
logger.info("DSI discovery complete: %d documents found in %s",
|
|
result.total_found, result.languages_detected)
|
|
return result
|
|
|
|
# Nav elements, not real documents
|
|
# NOTE: "datenschutz" was removed — it's a legitimate document title
|
|
NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
|
"kontakt", "contact", "suche", "search", "menü", "menu", "home"}
|
|
|
|
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
|
"""Remove duplicate and noise documents."""
|
|
# Step 1: Filter noise titles (nav elements, not real docs)
|
|
filtered = []
|
|
for d in docs:
|
|
title_lower = d.title.strip().lower()
|
|
# Skip very short titles that are nav elements
|
|
if title_lower in NOISE_TITLES:
|
|
continue
|
|
# Skip titles that are just URLs
|
|
if title_lower.startswith("http") or title_lower.startswith("www."):
|
|
continue
|
|
# Skip very short documents (< 50 words) — likely nav snippets
|
|
if d.word_count < 50 and d.doc_type != "pdf":
|
|
continue
|
|
filtered.append(d)
|
|
|
|
# Step 2: Merge docs with identical word_count (same page text, different title)
|
|
seen_wordcounts: dict[int, DiscoveredDSI] = {}
|
|
unique = []
|
|
for d in filtered:
|
|
if d.word_count > 200: # Only dedup substantial docs
|
|
if d.word_count in seen_wordcounts:
|
|
existing = seen_wordcounts[d.word_count]
|
|
# Prefer "Datenschutzinformation*" titles over section headings
|
|
d_is_dsi = d.title.lower().startswith("datenschutzinformation")
|
|
ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
|
|
if d_is_dsi and not ex_is_dsi:
|
|
unique = [x for x in unique if x is not existing]
|
|
unique.append(d)
|
|
seen_wordcounts[d.word_count] = d
|
|
continue
|
|
seen_wordcounts[d.word_count] = d
|
|
unique.append(d)
|
|
|
|
return unique
|
|
|
|
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
|
"""Find all links whose text or href matches DSI keywords."""
|
|
try:
|
|
all_links = await page.evaluate("""
|
|
() => [...document.querySelectorAll('a[href]')].map(a => ({
|
|
href: a.href,
|
|
text: (a.textContent || '').trim().substring(0, 200),
|
|
ariaLabel: a.getAttribute('aria-label') || '',
|
|
title: a.getAttribute('title') || '',
|
|
visible: a.getBoundingClientRect().width > 0,
|
|
}))
|
|
""")
|
|
dsi_links = []
|
|
for link in (all_links or []):
|
|
search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
|
|
href = link["href"]
|
|
href_lower = href.lower()
|
|
|
|
# Match by link text or href
|
|
is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
|
|
if not is_match:
|
|
continue
|
|
|
|
# Allow same domain + related domains + PDFs
|
|
if _is_allowed_domain(href, base_domain) or href.endswith(".pdf"):
|
|
dsi_links.append({
|
|
"href": href,
|
|
"text": link["text"],
|
|
"visible": link["visible"],
|
|
})
|
|
|
|
return dsi_links
|
|
except Exception as e:
|
|
logger.warning("DSI link scan failed: %s", e)
|
|
return []
|
|
|
|
async def _expand_all_interactive(page: Page) -> None:
|
|
"""Expand all accordions, tabs, details, dropdowns on the page."""
|
|
try:
|
|
await page.evaluate("""() => {
|
|
document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
|
|
const sels = ['button[aria-expanded="false"]','[data-toggle="collapse"]',
|
|
'[data-bs-toggle="collapse"]','[class*="accordion"] > button',
|
|
'[class*="collapse"] > button','.panel-heading a'];
|
|
sels.forEach(s => document.querySelectorAll(s).forEach(e => { try{e.click()}catch{} }));
|
|
document.querySelectorAll('button,a').forEach(b => {
|
|
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test((b.textContent||'').trim()))
|
|
try{b.click()}catch{}
|
|
});
|
|
document.querySelectorAll('[role="tab"]').forEach(t => { try{t.click()}catch{} });
|
|
}""")
|
|
except Exception:
|
|
pass
|
|
|
|
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
|
"""Find DSI content already visible on the page (e.g. expanded accordions).
|
|
|
|
Only counts top-level documents (H1/H2 with DSI keywords).
|
|
Sub-sections (H3/H4 like 'Cookies', 'Betroffenenrechte') are NOT counted
|
|
as separate documents — their text is part of the parent document.
|
|
"""
|
|
try:
|
|
sections = await page.evaluate("""
|
|
() => {
|
|
const results = [];
|
|
// Only H1 and H2 count as document-level headings
|
|
const headings = document.querySelectorAll('h1, h2');
|
|
const dsiKeywords = [
|
|
'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
|
|
'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
|
|
'databeskyttelse', 'ochrana', 'adatvédel', 'confidential',
|
|
];
|
|
for (const h of headings) {
|
|
const text = (h.textContent || '').trim();
|
|
const textLower = text.toLowerCase();
|
|
if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;
|
|
|
|
// Get ALL content until the next H1/H2 (include sub-sections H3-H5)
|
|
let content = '';
|
|
let el = h.nextElementSibling;
|
|
let count = 0;
|
|
while (el && count < 200) {
|
|
// Stop at next H1 or H2 (next top-level document)
|
|
if (el.tagName === 'H1' || el.tagName === 'H2') break;
|
|
content += (el.textContent || '').trim() + '\\n';
|
|
el = el.nextElementSibling;
|
|
count++;
|
|
}
|
|
|
|
if (content.length > 100) {
|
|
results.push({
|
|
title: text.substring(0, 200),
|
|
text: content.substring(0, 50000),
|
|
id: h.id || '',
|
|
});
|
|
}
|
|
}
|
|
return results;
|
|
}
|
|
""")
|
|
return sections or []
|
|
except Exception:
|
|
return []
|