feat: Generic legal document discovery (DSI, AGB, Widerruf, Cookie-Richtlinie)
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 22s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m35s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 52s
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 21s
CI / validate-canonical-controls (push) Successful in 14s
New service: dsi_discovery.py — finds ALL legal documents on any website:
- Technology-agnostic: HTML, SPA, WordPress, Typo3, custom CMS
- Structure-agnostic: accordions, sidebars, footers, inline links, tabs
- Format-agnostic: HTML pages, anchor sections, PDFs, cross-domain links
- Language-agnostic: 26 EU/EEA languages with document-type keywords
Document types discovered:
- Datenschutzinformationen / Privacy Policies (Art. 13/14 DSGVO)
- AGB / Terms of Service / Nutzungsbedingungen
- Widerrufsbelehrung / Right of Withdrawal (§355 BGB)
- Cookie-Richtlinie / Cookie Policy
- All cross-domain variants (e.g. help.instagram.com from instagram.com)
API: POST /dsi-discovery { url, max_documents }
Returns: list of documents with title, url, language, type, word_count, text_preview
Features:
- Expands all accordions, details, tabs, dropdowns before scanning
- Follows cross-domain links (same registrable domain)
- Re-expands after navigation back to source page
- Handles anchor links (#sections) separately from full pages
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
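
Example call for the new endpoint (illustrative sketch only, not part of the commit; it assumes the consent-tester service is running locally on its documented port 8094, that httpx is available as an HTTP client, and uses example.com as a placeholder):

import httpx

# Hypothetical local call to the /dsi-discovery endpoint described above; host and port are assumptions.
resp = httpx.post(
    "http://localhost:8094/dsi-discovery",
    json={"url": "https://example.com", "max_documents": 10},
    timeout=300.0,  # discovery navigates many pages, so allow a generous timeout
)
resp.raise_for_status()
for doc in resp.json()["documents"]:
    print(doc["doc_type"], doc["language"], doc["word_count"], doc["title"], doc["url"])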
@@ -0,0 +1,321 @@
"""
Consent Tester Service — Playwright-based 3-phase cookie consent test.

Tests what scripts/cookies load BEFORE consent, AFTER rejection, and AFTER acceptance.
Runs as independent microservice on port 8094.
"""

import logging
from datetime import datetime, timezone

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from services.consent_scanner import run_consent_test, ConsentTestResult
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
from services.playwright_scanner import scan_website_playwright
from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult

logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
logger = logging.getLogger(__name__)

app = FastAPI(title="BreakPilot Consent Tester", version="1.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ScanRequest(BaseModel):
    url: str
    timeout_per_phase: int = 10  # seconds to wait after page load


class ScanResponse(BaseModel):
    url: str
    banner_detected: bool
    banner_provider: str
    phases: dict
    summary: dict
    scanned_at: str
    category_tests: list = []
    banner_checks: dict = {}


@app.get("/health")
async def health():
    return {"status": "healthy", "service": "consent-tester"}


@app.post("/scan", response_model=ScanResponse)
async def scan_consent(req: ScanRequest):
    """Run 3-phase consent test on a URL."""
    logger.info("Starting consent test for %s", req.url)
    result = await run_consent_test(req.url, req.timeout_per_phase)

    return ScanResponse(
        url=req.url,
        banner_detected=result.banner_detected,
        banner_provider=result.banner_provider,
        phases={
            "before_consent": {
                "scripts": result.before_scripts,
                "cookies": result.before_cookies,
                "tracking_services": result.before_tracking,
                "violations": [v.__dict__ for v in result.before_violations],
            },
            "after_reject": {
                "scripts": result.reject_scripts,
                "cookies": result.reject_cookies,
                "new_tracking": result.reject_new_tracking,
                "violations": [v.__dict__ for v in result.reject_violations],
            },
            "after_accept": {
                "scripts": result.accept_scripts,
                "cookies": result.accept_cookies,
                "new_tracking": result.accept_new_tracking,
                "undocumented": result.accept_undocumented,
            },
        },
        summary={
            "critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"),
            "high": len(result.before_violations) + sum(1 for v in result.banner_text_violations if v.severity == "HIGH"),
            "undocumented": len(result.accept_undocumented),
            "total_violations": len(result.before_violations) + len(result.reject_violations) + len(result.banner_text_violations),
            "category_violations": sum(len(ct.violations) for ct in result.category_tests),
            "categories_tested": len(result.category_tests),
            "banner_text_issues": len(result.banner_text_violations),
        },
        banner_checks={
            "has_impressum_link": result.banner_has_impressum_link,
            "has_dse_link": result.banner_has_dse_link,
            "violations": [v.__dict__ for v in result.banner_text_violations],
        },
        scanned_at=datetime.now(timezone.utc).isoformat(),
        category_tests=[{
            "category": ct.category,
            "category_label": ct.category_label,
            "tracking_services": ct.tracking_services,
            "violations": ct.violations,
        } for ct in result.category_tests] if result.category_tests else [],
    )


class AuthScanRequest(BaseModel):
    url: str
    username: str
    password: str
    username_selector: str = ""
    password_selector: str = ""
    submit_selector: str = ""


class AuthCheckInfo(BaseModel):
    found: bool = False
    text: str = ""
    legal_ref: str = ""


class AuthScanResponse(BaseModel):
    url: str
    authenticated: bool
    login_error: str = ""
    checks: dict[str, AuthCheckInfo]
    findings_count: int
    scanned_at: str


LEGAL_REFS = {
    "cancel_subscription": "§312k BGB (Kuendigungsbutton)",
    "delete_account": "Art. 17 DSGVO (Recht auf Loeschung)",
    "export_data": "Art. 20 DSGVO (Datenportabilitaet)",
    "consent_settings": "Art. 7 Abs. 3 DSGVO (Widerruf der Einwilligung)",
    "profile_visible": "Art. 15 DSGVO (Auskunftsrecht)",
}


@app.post("/authenticated-scan", response_model=AuthScanResponse)
async def authenticated_scan(req: AuthScanRequest):
    """Test post-login functionality. Credentials are destroyed after test."""
    logger.info("Starting authenticated test for %s", req.url)

    result = await run_authenticated_test(
        url=req.url,
        username=req.username,
        password=req.password,
        username_selector=req.username_selector,
        password_selector=req.password_selector,
        submit_selector=req.submit_selector,
    )

    checks = {
        "cancel_subscription": AuthCheckInfo(
            found=result.cancel_subscription.found,
            text=result.cancel_subscription.text,
            legal_ref=LEGAL_REFS["cancel_subscription"],
        ),
        "delete_account": AuthCheckInfo(
            found=result.delete_account.found,
            text=result.delete_account.text,
            legal_ref=LEGAL_REFS["delete_account"],
        ),
        "export_data": AuthCheckInfo(
            found=result.export_data.found,
            text=result.export_data.text,
            legal_ref=LEGAL_REFS["export_data"],
        ),
        "consent_settings": AuthCheckInfo(
            found=result.consent_settings.found,
            text=result.consent_settings.text,
            legal_ref=LEGAL_REFS["consent_settings"],
        ),
        "profile_visible": AuthCheckInfo(
            found=result.profile_visible.found,
            text=result.profile_visible.text,
            legal_ref=LEGAL_REFS["profile_visible"],
        ),
    }

    missing = sum(1 for c in checks.values() if not c.found)

    return AuthScanResponse(
        url=req.url,
        authenticated=result.authenticated,
        login_error=result.login_error,
        checks=checks,
        findings_count=missing,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )


# ═══════════════════════════════════════════════════════════════
# PLAYWRIGHT WEBSITE SCAN (Phase 10 — replaces httpx scanner)
# ═══════════════════════════════════════════════════════════════


class WebsiteScanRequest(BaseModel):
    url: str
    max_pages: int = 15
    click_nav: bool = True


class PageInfo(BaseModel):
    url: str
    status: int
    title: str = ""
    error: str = ""


class WebsiteScanResponse(BaseModel):
    url: str
    pages: list[PageInfo]
    pages_count: int
    external_scripts: list[str]
    cookies: list[str]
    page_htmls: dict[str, str]  # url -> rendered HTML (for backend analysis)
    scanned_at: str


@app.post("/website-scan", response_model=WebsiteScanResponse)
async def website_scan(req: WebsiteScanRequest):
    """Scan website using Playwright — discovers pages via JS navigation + menu clicks."""
    logger.info("Starting Playwright website scan for %s (max %d pages)", req.url, req.max_pages)

    result = await scan_website_playwright(req.url, req.max_pages, req.click_nav)

    # Build page HTML map (only successful pages, truncated)
    page_htmls = {}
    for p in result.pages:
        if p.html and p.status < 400:
            page_htmls[p.url] = p.html[:50000]  # Cap at 50KB per page

    return WebsiteScanResponse(
        url=req.url,
        pages=[PageInfo(url=p.url, status=p.status, title=p.title, error=p.error) for p in result.pages],
        pages_count=len(result.pages),
        external_scripts=result.external_scripts[:50],
        cookies=result.all_cookies,
        page_htmls=page_htmls,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )


# ═══════════════════════════════════════════════════════════════
# DSI DISCOVERY (finds all privacy + legal documents on a website)
# ═══════════════════════════════════════════════════════════════


class DSIDiscoveryRequest(BaseModel):
    url: str
    max_documents: int = 30


class DSIDocumentInfo(BaseModel):
    title: str
    url: str
    source_url: str
    language: str = ""
    doc_type: str = ""
    word_count: int = 0
    text_preview: str = ""


class DSIDiscoveryResponse(BaseModel):
    url: str
    documents: list[DSIDocumentInfo]
    total_found: int
    languages_detected: list[str]
    errors: list[str]
    scanned_at: str


@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
async def dsi_discovery(req: DSIDiscoveryRequest):
    """Discover all privacy/data protection documents on a website.

    Generically finds DSI, AGB, Nutzungsbedingungen, Widerrufsbelehrung,
    Cookie-Richtlinien etc. regardless of website technology or language.
    Supports HTML pages, accordions, sidebars, PDFs, cross-domain links.
    """
    logger.info("Starting DSI discovery for %s (max %d docs)", req.url, req.max_documents)

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )
        page = await context.new_page()

        try:
            result = await discover_dsi_documents(page, req.url, req.max_documents)
        finally:
            await context.close()
            await browser.close()

    return DSIDiscoveryResponse(
        url=req.url,
        documents=[
            DSIDocumentInfo(
                title=d.title,
                url=d.url,
                source_url=d.source_url,
                language=d.language,
                doc_type=d.doc_type,
                word_count=d.word_count,
                text_preview=d.text[:500] if d.text else "",
            )
            for d in result.documents
        ],
        total_found=result.total_found,
        languages_detected=result.languages_detected,
        errors=result.errors,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )
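
For context, a minimal sketch of how the 3-phase /scan endpoint in the service above could be exercised (illustrative only, not part of the commit; localhost:8094 and httpx are assumptions, example.com is a placeholder):

import httpx

resp = httpx.post(
    "http://localhost:8094/scan",
    json={"url": "https://example.com", "timeout_per_phase": 10},
    timeout=120.0,
)
data = resp.json()
# The three phases mirror the ScanResponse built above.
print(data["banner_detected"], data["banner_provider"])
print("pre-consent tracking:", data["phases"]["before_consent"]["tracking_services"])
print("loaded despite reject:", data["phases"]["after_reject"]["new_tracking"])
print("critical violations:", data["summary"]["critical"])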
@@ -0,0 +1,469 @@
"""
DSI Discovery — Generic privacy document finder and parser.

Finds all privacy/data protection documents on any website regardless of:
- Technology (static HTML, SPA, WordPress, Typo3, etc.)
- Structure (accordion, sidebar, footer, inline links, separate pages)
- Format (HTML sections, PDF downloads, cross-domain links)
- Language (all 26 EU/EEA official languages)

Flow:
1. Load page with Playwright (full JS rendering)
2. Find all links matching DSI keywords (26 languages)
3. Expand accordions, click tabs, open dropdowns
4. Follow cross-domain links (e.g. instagram.com → help.instagram.com)
5. Extract document text from each link target
6. Return structured list of discovered documents
"""

import logging
import re
from dataclasses import dataclass, field
from urllib.parse import urlparse, urljoin

from playwright.async_api import Page

logger = logging.getLogger(__name__)

# Legal document keywords in all EU/EEA official languages.
# Covers: DSI (privacy), AGB (terms), Widerruf (cancellation),
# Cookie-Richtlinie, Impressum, NB (Nutzungsbedingungen).
DSI_KEYWORDS: dict[str, list[str]] = {
    "de": [
        # Datenschutz
        "datenschutz", "datenschutzerklaerung", "datenschutzinformation",
        "datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
        "datenschutzbestimmung", "verarbeitung personenbezogener daten",
        # AGB / Nutzungsbedingungen
        "allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
        "nutzungsordnung", "geschäftsbedingungen",
        # Widerruf
        "widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
        "widerruf", "rücktrittsrecht",
        # Cookie
        "cookie-richtlinie", "cookie-policy", "cookie-hinweis",
    ],
    "en": [
        "privacy policy", "privacy notice", "data protection", "data policy",
        "privacy statement", "gdpr", "personal data", "cookie policy",
        "terms of service", "terms and conditions", "terms of use",
        "cancellation policy", "right of withdrawal", "refund policy",
        "cookie notice",
    ],
    "fr": [
        "politique de confidentialité", "protection des données",
        "données personnelles", "vie privée", "rgpd",
        "conditions générales", "conditions d'utilisation",
        "droit de rétractation", "politique de cookies",
    ],
    "es": [
        "política de privacidad", "protección de datos",
        "datos personales", "aviso de privacidad",
        "términos y condiciones", "condiciones de uso",
        "derecho de desistimiento", "política de cookies",
    ],
    "it": [
        "informativa sulla privacy", "protezione dei dati",
        "dati personali", "privacy policy",
        "termini e condizioni", "condizioni d'uso",
        "diritto di recesso", "politica dei cookie",
    ],
    "nl": [
        "privacybeleid", "gegevensbescherming", "privacyverklaring",
        "persoonsgegevens", "avg",
        "algemene voorwaarden", "gebruiksvoorwaarden",
        "herroepingsrecht", "cookiebeleid",
    ],
    "pl": [
        "polityka prywatności", "ochrona danych osobowych",
        "dane osobowe", "rodo",
        "regulamin", "warunki korzystania",
        "prawo odstąpienia", "polityka cookies",
    ],
    "pt": [
        "política de privacidade", "proteção de dados",
        "dados pessoais", "lgpd",
        "termos e condições", "condições de utilização",
        "direito de resolução", "política de cookies",
    ],
    "sv": [
        "integritetspolicy", "dataskydd", "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor", "användarvillkor",
        "ångerrätt", "cookiepolicy",
    ],
    "da": [
        "privatlivspolitik", "databeskyttelse", "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser", "brugsbetingelser",
        "fortrydelsesret", "cookiepolitik",
    ],
    "fi": [
        "tietosuojaseloste", "tietosuoja", "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot", "käyttöehdot",
        "peruutusoikeus", "evästekäytäntö",
    ],
    "cs": ["zásady ochrany osobních údajů", "ochrana osobních údajů",
           "zpracování osobních údajů", "obchodní podmínky", "zásady cookies"],
    "el": ["πολιτική απορρήτου", "προστασία δεδομένων",
           "προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies"],
    "hu": ["adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
           "általános szerződési feltételek", "cookie szabályzat"],
    "ro": ["politica de confidențialitate", "protecția datelor",
           "date cu caracter personal", "termeni și condiții", "politica cookies"],
    "bg": ["политика за поверителност", "защита на данните",
           "лични данни", "общи условия", "политика за бисквитки"],
    "hr": ["politika privatnosti", "zaštita podataka", "osobni podaci",
           "opći uvjeti", "politika kolačića"],
    "sk": ["zásady ochrany osobných údajov", "ochrana osobných údajov",
           "obchodné podmienky", "zásady cookies"],
    "sl": ["politika zasebnosti", "varstvo podatkov", "osebni podatki",
           "splošni pogoji", "politika piškotkov"],
    "et": ["privaatsuspoliitika", "andmekaitse", "isikuandmed",
           "kasutustingimused", "küpsiste poliitika"],
    "lt": ["privatumo politika", "duomenų apsauga", "asmens duomenys",
           "naudojimosi sąlygos", "slapukų politika"],
    "lv": ["privātuma politika", "datu aizsardzība", "personas dati",
           "lietošanas noteikumi", "sīkdatņu politika"],
    "mt": ["politika tal-privatezza", "protezzjoni tad-data",
           "termini u kundizzjonijiet"],
    "ga": ["polasaí príobháideachais", "cosaint sonraí",
           "téarmaí agus coinníollacha"],
    "is": ["persónuverndarstefna", "persónuvernd",
           "skilmálar og skilyrði"],
    "no": ["personvernerklæring", "personvern", "personopplysninger",
           "brukervilkår", "angrerett", "informasjonskapsler"],
}

# Flatten all keywords for quick matching
ALL_DSI_KEYWORDS: list[str] = []
for kw_list in DSI_KEYWORDS.values():
    ALL_DSI_KEYWORDS.extend(kw_list)


@dataclass
class DiscoveredDSI:
    """A discovered privacy/data protection document."""
    title: str
    url: str
    source_url: str  # Page where the link was found
    language: str = ""
    doc_type: str = ""  # "html_section", "html_page", "pdf", "accordion", "cross_domain"
    text: str = ""  # Extracted full text
    sections: list[dict] = field(default_factory=list)  # Parsed sections
    word_count: int = 0


@dataclass
class DSIDiscoveryResult:
    """Result of DSI discovery scan."""
    base_url: str
    documents: list[DiscoveredDSI] = field(default_factory=list)
    total_found: int = 0
    languages_detected: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)


def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Check if text contains any DSI keyword. Returns (match, language)."""
    text_lower = text.lower().strip()
    for lang, keywords in DSI_KEYWORDS.items():
        for kw in keywords:
            if kw in text_lower:
                return True, lang
    return False, ""


def _is_allowed_domain(href: str, base_domain: str) -> bool:
    """Allow same domain + known related domains (e.g. help.instagram.com)."""
    try:
        link_domain = urlparse(href).netloc.replace("www.", "")
        base_clean = base_domain.replace("www.", "")
        # Same domain
        if link_domain == base_clean:
            return True
        # Subdomain (help.instagram.com for instagram.com)
        if link_domain.endswith(f".{base_clean}"):
            return True
        # Parent domain (instagram.com links from about.instagram.com)
        if base_clean.endswith(f".{link_domain}"):
            return True
        # Known related patterns
        parts_base = base_clean.split(".")
        parts_link = link_domain.split(".")
        if len(parts_base) >= 2 and len(parts_link) >= 2:
            if parts_base[-2] == parts_link[-2] and parts_base[-1] == parts_link[-1]:
                return True  # Same registrable domain
    except Exception:
        pass
    return False


async def discover_dsi_documents(
    page: Page,
    url: str,
    max_documents: int = 30,
) -> DSIDiscoveryResult:
    """Discover all privacy/data protection documents on a website.

    Works generically regardless of website technology, structure, or language.
    """
    result = DSIDiscoveryResult(base_url=url)
    base_domain = urlparse(url).netloc
    seen_urls: set[str] = set()
    seen_titles: set[str] = set()

    try:
        # Step 1: Load the page
        await page.goto(url, wait_until="networkidle", timeout=30000)
        await page.wait_for_timeout(2000)

        # Step 2: Find DSI links in current page
        links = await _find_dsi_links(page, base_domain)
        logger.info("Found %d DSI links on %s", len(links), url)

        # Step 3: Expand accordions, tabs, dropdowns to find hidden content
        await _expand_all_interactive(page)
        await page.wait_for_timeout(1000)

        # Step 3b: Re-scan after expanding (may reveal new links)
        links_after = await _find_dsi_links(page, base_domain)
        for link in links_after:
            if link["href"] not in [l["href"] for l in links]:
                links.append(link)

        # Step 4: Check for inline DSI sections (accordion content already visible)
        inline_sections = await _find_inline_dsi_sections(page)
        for section in inline_sections:
            title_norm = section["title"].strip().lower()
            if title_norm not in seen_titles:
                seen_titles.add(title_norm)
                is_dsi, lang = _matches_dsi_keyword(section["title"])
                doc = DiscoveredDSI(
                    title=section["title"],
                    url=f"{url}#{section.get('id', '')}",
                    source_url=url,
                    language=lang,
                    doc_type="html_section",
                    text=section["text"],
                    word_count=len(section["text"].split()),
                )
                result.documents.append(doc)

        # Step 5: Follow each DSI link and extract content
        for link_info in links[:max_documents]:
            href = link_info["href"]
            if href in seen_urls:
                continue
            seen_urls.add(href)

            title = link_info["text"]
            title_norm = title.strip().lower()
            if title_norm in seen_titles:
                continue
            seen_titles.add(title_norm)

            is_dsi, lang = _matches_dsi_keyword(title)
            is_pdf = href.lower().endswith(".pdf")

            if is_pdf:
                result.documents.append(DiscoveredDSI(
                    title=title, url=href, source_url=url,
                    language=lang, doc_type="pdf",
                    text="[PDF — Textextraktion erforderlich]",
                ))
                continue

            # Navigate to the link and extract text
            try:
                is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
                if is_anchor:
                    anchor = href.split("#")[1]
                    text = await page.evaluate(f"""
                        () => {{
                            const el = document.getElementById('{anchor}');
                            if (!el) return '';
                            return el.closest('section,article,div')?.textContent?.trim() || el.textContent?.trim() || '';
                        }}
                    """)
                    if text and len(text) > 50:
                        result.documents.append(DiscoveredDSI(
                            title=title, url=href, source_url=url,
                            language=lang, doc_type="anchor_section",
                            text=text[:50000], word_count=len(text.split()),
                        ))
                    continue

                # External or same-domain page
                resp = await page.goto(href, wait_until="networkidle", timeout=20000)
                if resp and resp.status < 400:
                    await page.wait_for_timeout(2000)
                    await _expand_all_interactive(page)  # Expand accordions on target page too
                    await page.wait_for_timeout(500)

                    text = await page.evaluate("""
                        () => {
                            const main = document.querySelector('main, article, [role="main"], .content, #content');
                            return (main || document.body).textContent?.trim() || '';
                        }
                    """)
                    if text and len(text) > 50:
                        result.documents.append(DiscoveredDSI(
                            title=title, url=href, source_url=url,
                            language=lang,
                            doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
                            text=text[:50000], word_count=len(text.split()),
                        ))

                # Navigate back to source page for next link
                await page.goto(url, wait_until="networkidle", timeout=20000)
                await page.wait_for_timeout(1000)
                await _expand_all_interactive(page)

            except Exception as e:
                result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                try:
                    await page.goto(url, wait_until="networkidle", timeout=20000)
                except Exception:
                    pass

    except Exception as e:
        result.errors.append(f"Discovery failed: {str(e)[:100]}")
        logger.error("DSI discovery failed: %s", e)

    result.total_found = len(result.documents)
    result.languages_detected = list(set(
        d.language for d in result.documents if d.language
    ))
    logger.info("DSI discovery complete: %d documents found in %s",
                result.total_found, result.languages_detected)
    return result


async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
    """Find all links whose text or href matches DSI keywords."""
    try:
        all_links = await page.evaluate("""
            () => [...document.querySelectorAll('a[href]')].map(a => ({
                href: a.href,
                text: (a.textContent || '').trim().substring(0, 200),
                ariaLabel: a.getAttribute('aria-label') || '',
                title: a.getAttribute('title') || '',
                visible: a.getBoundingClientRect().width > 0,
            }))
        """)
        dsi_links = []
        for link in (all_links or []):
            search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
            href = link["href"]
            href_lower = href.lower()

            # Match by link text or href
            is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
            if not is_match:
                continue

            # Allow same domain + related domains + PDFs
            if _is_allowed_domain(href, base_domain) or href.endswith(".pdf"):
                dsi_links.append({
                    "href": href,
                    "text": link["text"],
                    "visible": link["visible"],
                })

        return dsi_links
    except Exception as e:
        logger.warning("DSI link scan failed: %s", e)
        return []


async def _expand_all_interactive(page: Page) -> None:
    """Expand all accordions, tabs, details, dropdowns on the page."""
    try:
        await page.evaluate("""
            () => {
                // 1. Open all <details> elements
                document.querySelectorAll('details:not([open])').forEach(d => d.open = true);

                // 2. Click all accordion buttons
                const accSelectors = [
                    'button[aria-expanded="false"]',
                    '[class*="accordion"]:not([class*="open"]) > button',
                    '[class*="accordion"]:not([class*="open"]) > a',
                    '[class*="collapse"] > button',
                    '[class*="toggle"]:not(.active)',
                    '[data-toggle="collapse"]',
                    '[data-bs-toggle="collapse"]',
                    '.panel-heading:not(.active) a',
                ];
                for (const sel of accSelectors) {
                    document.querySelectorAll(sel).forEach(el => {
                        try { el.click(); } catch {}
                    });
                }

                // 3. Click all "show more" / "read more" buttons
                const moreButtons = document.querySelectorAll(
                    'button, a'
                );
                for (const btn of moreButtons) {
                    const text = (btn.textContent || '').toLowerCase().trim();
                    if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|details|alle anzeigen)/.test(text)) {
                        try { btn.click(); } catch {}
                    }
                }

                // 4. Expand all tab panels (click each tab)
                document.querySelectorAll('[role="tab"]').forEach(tab => {
                    try { tab.click(); } catch {}
                });
            }
        """)
    except Exception as e:
        logger.debug("Expand interactive elements: %s", e)


async def _find_inline_dsi_sections(page: Page) -> list[dict]:
    """Find DSI content already visible on the page (e.g. expanded accordions)."""
    try:
        sections = await page.evaluate("""
            () => {
                const results = [];
                // Find headings that match DSI keywords
                const headings = document.querySelectorAll('h1, h2, h3, h4, h5');
                const dsiKeywords = [
                    'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
                    'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
                    'databeskyttelse', 'ochrana', 'adatvédel', 'confidential',
                ];
                for (const h of headings) {
                    const text = (h.textContent || '').trim();
                    const textLower = text.toLowerCase();
                    if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;

                    // Get the section content following this heading
                    let content = '';
                    let el = h.nextElementSibling;
                    let count = 0;
                    while (el && count < 50) {
                        if (el.tagName.match(/^H[1-5]$/)) break;
                        content += (el.textContent || '').trim() + '\\n';
                        el = el.nextElementSibling;
                        count++;
                    }

                    if (content.length > 100) {
                        results.push({
                            title: text.substring(0, 200),
                            text: content.substring(0, 50000),
                            id: h.id || '',
                        });
                    }
                }
                return results;
            }
        """)
        return sections or []
    except Exception:
        return []
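
A small sketch of how the matching helpers above behave (illustrative only, not part of the commit; it assumes services.dsi_discovery is importable and reuses the help.instagram.com example named in the module):

from services.dsi_discovery import _matches_dsi_keyword, _is_allowed_domain

# Keyword matching is substring-based and case-insensitive and reports the language bucket.
print(_matches_dsi_keyword("Datenschutzerklärung"))   # (True, "de"), via the "datenschutz" keyword
print(_matches_dsi_keyword("Privacy Policy"))          # (True, "en")
print(_matches_dsi_keyword("Careers"))                 # (False, "")

# The domain check accepts subdomains, parent domains, and the same registrable domain.
print(_is_allowed_domain("https://help.instagram.com/privacy", "www.instagram.com"))  # True
print(_is_allowed_domain("https://example.org/privacy", "instagram.com"))             # False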