feat: 4 remaining tasks — EU institutions, banner integration, JS-sites, Caritas fixes
Build + Deploy / build-ai-sdk (push) Failing after 36s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 7s
Build + Deploy / build-admin-compliance (push) Successful in 8s
Build + Deploy / build-backend-compliance (push) Successful in 8s
CI / nodejs-build (push) Successful in 3m14s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 43s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 30s
CI / validate-canonical-controls (push) Successful in 16s
Build + Deploy / build-dsms-gateway (push) Successful in 8s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
Build + Deploy / trigger-orca (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 17s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
Build + Deploy / build-ai-sdk (push) Failing after 36s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 7s
Build + Deploy / build-admin-compliance (push) Successful in 8s
Build + Deploy / build-backend-compliance (push) Successful in 8s
CI / nodejs-build (push) Successful in 3m14s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 43s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 30s
CI / validate-canonical-controls (push) Successful in 16s
Build + Deploy / build-dsms-gateway (push) Successful in 8s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
Build + Deploy / trigger-orca (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 17s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
1. EU Institution Checks (Verordnung 2018/1725): - New doc_type "eu_institution" with 9 L1 + 15 L2 checks - Both German + English patterns (EU institutions are multilingual) - Auto-detection via "2018/1725", "EDSB", "EDPS" keywords - Correct article references (Art. 15 instead of 13, Art. 5 instead of 6) 2. Banner Check Integration: - banner_runner.py maps scan results to 36 L1/L2 structured checks - BannerCheckTab shows hierarchical ChecklistView with hints - 3-phase summary (cookies/scripts before/after consent) - /scan endpoint now includes structured_checks in response 3. JS-heavy Website Fixes (dm, Zalando, HWK): - dsi_helpers.py: goto_resilient (networkidle→domcontentloaded fallback) - try_dismiss_consent_banner before text extraction - PDF redirect detection (dm.de redirects to GCS PDF) 4. Caritas False Positive Fixes: - Phone regex allows parentheses: +49 (0)761 → now matches - "Recht auf Widerspruch" (3 words) + §23 KDG → matches Art. 21 - Church authorities: "Katholisches Datenschutzzentrum" recognized Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,19 +1,35 @@
|
||||
'use client'
|
||||
|
||||
import React, { useState } from 'react'
|
||||
import { ChecklistView } from './ChecklistView'
|
||||
|
||||
interface CheckItem {
|
||||
id: string
|
||||
label: string
|
||||
passed: boolean
|
||||
severity: string
|
||||
matched_text: string
|
||||
level?: number
|
||||
parent?: string | null
|
||||
skipped?: boolean
|
||||
hint?: string
|
||||
}
|
||||
|
||||
interface BannerResult {
|
||||
banner_detected: boolean
|
||||
banner_provider: string
|
||||
banner_text: string
|
||||
banner_checks?: {
|
||||
violations: { code: string; text: string; severity: string }[]
|
||||
passes: { code: string; text: string }[]
|
||||
has_impressum_link?: boolean
|
||||
has_dse_link?: boolean
|
||||
}
|
||||
structured_checks?: CheckItem[]
|
||||
completeness_pct?: number
|
||||
correctness_pct?: number
|
||||
phases?: {
|
||||
before_consent: { cookies: number; scripts: number; violations: string[] }
|
||||
after_reject: { cookies: number; scripts: number; violations: string[] }
|
||||
after_accept: { cookies: number; scripts: number; violations: string[] }
|
||||
before_consent: { cookies: string[]; scripts: string[]; tracking_services: string[]; violations: any[] }
|
||||
after_reject: { cookies: string[]; scripts: string[]; new_tracking: string[]; violations: any[] }
|
||||
after_accept: { cookies: string[]; scripts: string[]; new_tracking: string[]; undocumented: string[] }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,7 +59,6 @@ export function BannerCheckTab() {
|
||||
const data = await res.json()
|
||||
|
||||
if (data.scan_id) {
|
||||
// Async polling
|
||||
let attempts = 0
|
||||
while (attempts < 60) {
|
||||
await new Promise(r => setTimeout(r, 3000))
|
||||
@@ -69,9 +84,23 @@ export function BannerCheckTab() {
|
||||
}
|
||||
}
|
||||
|
||||
const violations = result?.banner_checks?.violations || []
|
||||
const passes = result?.banner_checks?.passes || []
|
||||
const total = violations.length + passes.length
|
||||
const structuredChecks = result?.structured_checks || []
|
||||
const hasStructured = structuredChecks.length > 0
|
||||
const compPct = result?.completeness_pct ?? 0
|
||||
const corrPct = result?.correctness_pct ?? 0
|
||||
|
||||
// Build ChecklistView-compatible result for structured checks
|
||||
const checklistResults = hasStructured ? [{
|
||||
label: `Cookie-Banner: ${result?.banner_provider || 'Unbekannt'}`,
|
||||
url: url,
|
||||
doc_type: 'banner',
|
||||
word_count: 0,
|
||||
completeness_pct: compPct,
|
||||
correctness_pct: corrPct,
|
||||
checks: structuredChecks,
|
||||
findings_count: structuredChecks.filter(c => !c.passed && !c.skipped).length,
|
||||
error: '',
|
||||
}] : []
|
||||
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
@@ -79,7 +108,7 @@ export function BannerCheckTab() {
|
||||
<h3 className="text-sm font-semibold text-blue-900">Cookie-Banner Compliance Check</h3>
|
||||
<p className="text-xs text-blue-700 mt-1">
|
||||
Playwright-basierter 3-Phasen-Test: Vor Interaktion, nach Ablehnen, nach Akzeptieren.
|
||||
Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 20+ weitere Kriterien.
|
||||
Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 36 weitere Kriterien.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
@@ -116,14 +145,14 @@ export function BannerCheckTab() {
|
||||
)}
|
||||
|
||||
{result && (
|
||||
<div className="bg-white border border-gray-200 rounded-xl shadow-sm overflow-hidden">
|
||||
{/* Header */}
|
||||
<div className="px-6 py-4 bg-gray-50 border-b border-gray-200">
|
||||
<div className="flex items-center justify-between">
|
||||
<div>
|
||||
<div className="space-y-4">
|
||||
{/* 3-Phase Summary Card */}
|
||||
{result.phases && (
|
||||
<div className="bg-white border border-gray-200 rounded-xl shadow-sm overflow-hidden">
|
||||
<div className="px-6 py-4 bg-gray-50 border-b border-gray-200">
|
||||
<div className="flex items-center gap-3">
|
||||
<span className={`text-2xl`}>
|
||||
{result.banner_detected ? '🛡️' : '⚠️'}
|
||||
<span className="text-2xl">
|
||||
{result.banner_detected ? '\u{1F6E1}\u{FE0F}' : '\u26A0\u{FE0F}'}
|
||||
</span>
|
||||
<div>
|
||||
<h3 className="text-sm font-semibold text-gray-900">
|
||||
@@ -131,98 +160,50 @@ export function BannerCheckTab() {
|
||||
? `Banner erkannt: ${result.banner_provider || 'Unbekannter Anbieter'}`
|
||||
: 'Kein Cookie-Banner erkannt'}
|
||||
</h3>
|
||||
{total > 0 && (
|
||||
<p className="text-xs text-gray-500 mt-0.5">
|
||||
{passes.length}/{total} Pruefungen bestanden
|
||||
</p>
|
||||
)}
|
||||
<p className="text-xs text-gray-500 mt-0.5">
|
||||
3-Phasen-Analyse: Cookies und Scripts vor/nach Interaktion
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{total > 0 && (
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="w-24 h-2 bg-gray-200 rounded-full overflow-hidden">
|
||||
<div
|
||||
className={`h-full rounded-full ${violations.length === 0 ? 'bg-green-500' : violations.length <= 3 ? 'bg-yellow-500' : 'bg-red-500'}`}
|
||||
style={{ width: `${Math.round(passes.length / total * 100)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<span className={`text-xs font-medium ${violations.length === 0 ? 'text-green-700' : 'text-red-700'}`}>
|
||||
{Math.round(passes.length / total * 100)}%
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* 3-Phase Summary */}
|
||||
{result.phases && (
|
||||
<div className="px-6 py-3 border-b border-gray-100 grid grid-cols-3 gap-4">
|
||||
{[
|
||||
{ label: 'Vor Consent', data: result.phases.before_consent, icon: '🔒' },
|
||||
{ label: 'Nach Ablehnen', data: result.phases.after_reject, icon: '🚫' },
|
||||
{ label: 'Nach Akzeptieren', data: result.phases.after_accept, icon: '✅' },
|
||||
].map(phase => (
|
||||
<div key={phase.label} className="text-center">
|
||||
<div className="text-lg">{phase.icon}</div>
|
||||
<div className="text-xs font-medium text-gray-700">{phase.label}</div>
|
||||
<div className="text-xs text-gray-500 mt-1">
|
||||
{phase.data.cookies} Cookies, {phase.data.scripts} Scripts
|
||||
</div>
|
||||
{phase.data.violations.length > 0 && (
|
||||
<div className="text-xs text-red-600 font-medium">
|
||||
{phase.data.violations.length} Verstoesse
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Violations */}
|
||||
{violations.length > 0 && (
|
||||
<div className="px-6 py-4">
|
||||
<h4 className="text-xs font-semibold text-red-700 uppercase tracking-wide mb-2">
|
||||
Verstoesse ({violations.length})
|
||||
</h4>
|
||||
<div className="space-y-2">
|
||||
{violations.map((v, i) => (
|
||||
<div key={i} className="flex items-start gap-2">
|
||||
<svg className="w-4 h-4 text-red-500 mt-0.5 shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
|
||||
</svg>
|
||||
<div>
|
||||
<div className="text-sm text-red-700">{v.text}</div>
|
||||
<div className="text-xs text-gray-400 mt-0.5">{v.code} | {v.severity}</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
<div className="px-6 py-3 grid grid-cols-3 gap-4">
  {/* Per-phase cookie/script/violation counts.
      Icons are passed as JS string expressions: a plain JSX string attribute
      does not process backslash escapes and would render the raw escape text
      instead of the emoji. */}
  <PhaseBox
    label="Vor Consent"
    icon={'\uD83D\uDD12'}
    cookies={result.phases.before_consent.cookies?.length ?? 0}
    scripts={result.phases.before_consent.scripts?.length ?? 0}
    violations={result.phases.before_consent.violations?.length ?? 0}
  />
  <PhaseBox
    label="Nach Ablehnen"
    icon={'\uD83D\uDEAB'}
    cookies={result.phases.after_reject.cookies?.length ?? 0}
    scripts={result.phases.after_reject.scripts?.length ?? 0}
    violations={result.phases.after_reject.violations?.length ?? 0}
  />
  {/* The after_accept phase has no violations list in BannerResult, so 0 is shown. */}
  <PhaseBox
    label="Nach Akzeptieren"
    icon={'\u2705'}
    cookies={result.phases.after_accept.cookies?.length ?? 0}
    scripts={result.phases.after_accept.scripts?.length ?? 0}
    violations={0}
  />
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Passes */}
|
||||
{passes.length > 0 && (
|
||||
<div className="px-6 py-4 border-t border-gray-100">
|
||||
<h4 className="text-xs font-semibold text-green-700 uppercase tracking-wide mb-2">
|
||||
Bestanden ({passes.length})
|
||||
</h4>
|
||||
<div className="space-y-1">
|
||||
{passes.map((p, i) => (
|
||||
<div key={i} className="flex items-start gap-2">
|
||||
<svg className="w-4 h-4 text-green-500 mt-0.5 shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 13l4 4L19 7" />
|
||||
</svg>
|
||||
<div className="text-sm text-gray-600">{p.text}</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
{/* Structured L1/L2 Checklist */}
|
||||
{hasStructured && (
|
||||
<div className="bg-white border border-gray-200 rounded-xl p-6 shadow-sm">
|
||||
<ChecklistView results={checklistResults} />
|
||||
</div>
|
||||
)}
|
||||
|
||||
{!result.banner_detected && violations.length === 0 && passes.length === 0 && (
|
||||
<div className="px-6 py-4 text-sm text-gray-500">
|
||||
Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach §25 TDDDG Pflicht.
|
||||
{!result.banner_detected && !hasStructured && (
  <div className="bg-white border border-gray-200 rounded-xl p-6 shadow-sm">
    {/* Shown only when no banner was detected and no structured checks exist.
        The section sign is written as an HTML entity so the source stays ASCII. */}
    <p className="text-sm text-gray-500">
      Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach &sect;25 TDDDG Pflicht.
    </p>
  </div>
)}
|
||||
</div>
|
||||
@@ -230,3 +211,22 @@ export function BannerCheckTab() {
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Compact summary tile for one scan phase.
 *
 * Renders the icon, the phase label and the cookie/script counts. A red
 * violation line is added only when `violations` is greater than zero.
 */
function PhaseBox(props: {
  label: string
  icon: string
  cookies: number
  scripts: number
  violations: number
}) {
  const { label, icon, cookies, scripts, violations } = props

  // Built up front so the returned markup stays flat; null renders nothing.
  const violationLine = violations > 0
    ? (
      <div className="text-xs text-red-600 font-medium">
        {violations} Verstoesse
      </div>
    )
    : null

  return (
    <div className="text-center">
      <div className="text-lg">{icon}</div>
      <div className="text-xs font-medium text-gray-700">{label}</div>
      <div className="text-xs text-gray-500 mt-1">
        {cookies} Cookies, {scripts} Scripts
      </div>
      {violationLine}
    </div>
  )
}
|
||||
|
||||
@@ -30,6 +30,7 @@ const DOC_TYPE_LABELS: Record<string, string> = {
|
||||
dse: 'DSI', agb: 'AGB', impressum: 'Impressum',
|
||||
cookie: 'Cookie', widerruf: 'Widerruf', other: 'Sonstiges',
|
||||
social_media: 'Social Media', dsfa: 'DSFA', joint_controller: 'Art. 26',
|
||||
eu_institution: 'EU-Inst.', banner: 'Banner',
|
||||
}
|
||||
|
||||
interface GroupedCheck {
|
||||
|
||||
@@ -329,6 +329,7 @@ SECTION_TYPE_MAP = [
|
||||
(r"datenschutzfolge|dsfa|risikoanalyse", "dsfa"),
|
||||
(r"^social\s*media$|^soziale\s+(?:medien|netzwerke)$", "social_media"),
|
||||
(r"datenschutzerkl(?:ae|ä)rung.*social|datenschutz\s+f(?:ue|ü)r\s+social", "social_media"),
|
||||
(r"(?:verordnung|regulation)\s*\(?eu\)?\s*2018\s*/?\s*1725", "eu_institution"),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
doc_checks — Legal document compliance checkers.
|
||||
|
||||
Provides checklists and functions for verifying legal documents
|
||||
(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA)
|
||||
(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA, EU Institution)
|
||||
against their mandatory content requirements.
|
||||
|
||||
Two check levels:
|
||||
@@ -18,6 +18,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST
|
||||
from .cookie_checks import COOKIE_CHECKLIST
|
||||
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
|
||||
from .dsfa_checks import DSFA_CHECKLIST
|
||||
from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
|
||||
|
||||
__all__ = [
|
||||
"check_document_completeness",
|
||||
@@ -29,4 +30,5 @@ __all__ = [
|
||||
"COOKIE_CHECKLIST",
|
||||
"JOINT_CONTROLLER_CHECKLIST",
|
||||
"DSFA_CHECKLIST",
|
||||
"EU_INSTITUTION_CHECKLIST",
|
||||
]
|
||||
|
||||
@@ -47,8 +47,9 @@ ART13_CHECKLIST = [
|
||||
"label": "Telefonnummer des Verantwortlichen",
|
||||
"level": 2, "parent": "controller",
|
||||
"patterns": [
|
||||
r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-]{6,}",
|
||||
r"\+49\s*[\d\s/\-]{8,}",
|
||||
r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-\(\)]{6,}",
|
||||
r"\+49\s*[\d\s/\-\(\)]{8,}",
|
||||
r"0\d{2,4}\s*[\(/\-\s]\s*\d{3,}",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": "EuGH (C-298/17, 'Verein fuer Konsumenteninformation') verlangt effektive Kontaktmoeglichkeit. Telefon ist nicht zwingend, aber empfohlen — fehlt sie, muss ein gleichwertiger Kanal (z.B. Chat, Rueckruf) angeboten werden.",
|
||||
@@ -345,7 +346,7 @@ ART13_CHECKLIST = [
|
||||
"id": "rights_art21",
|
||||
"label": "Widerspruchsrecht (Art. 21)",
|
||||
"level": 2, "parent": "rights",
|
||||
"patterns": [r"art\.\s*21", r"widerspruchsrecht", r"right\s+to\s+object"],
|
||||
"patterns": [r"art\.\s*21", r"widerspruchsrecht", r"recht\s+auf\s+widerspruch", r"§\s*23\s+kdg", r"right\s+to\s+object"],
|
||||
"severity": "LOW",
|
||||
"hint": "Art. 21(4) DSGVO: Der Widerspruchshinweis muss spaetestens zum Zeitpunkt der ersten Kommunikation GESONDERT und in klarer Sprache erfolgen. Haeufiger Fehler: Widerspruchsrecht nur im Fliesstext versteckt — eigener Abschnitt/Hervorhebung noetig.",
|
||||
},
|
||||
@@ -386,6 +387,9 @@ ART13_CHECKLIST = [
|
||||
r"l(?:an)?fdi\s+\w+",
|
||||
r"bfdi",
|
||||
r"(?:bayerische|hessische|s(?:ae|ä)chsische|berliner)\s+(?:datenschutz|aufsicht)",
|
||||
r"(?:katholisch|evangelisch|kirchlich)\w*\s+datenschutz",
|
||||
r"datenschutzzentrum",
|
||||
r"kd(?:oe|ö)r",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": "Vollstaendigen Namen, Adresse und Website der Aufsichtsbehoerde angeben. Haeufiger Fehler: 'die zustaendige Aufsichtsbehoerde' ohne Konkretisierung. Korrekt z.B.: 'LfDI BW, Koenigstrasse 10a, 70173 Stuttgart, www.baden-wuerttemberg.datenschutz.de'.",
|
||||
|
||||
@@ -0,0 +1,500 @@
|
||||
"""
|
||||
EU Institution checks — Verordnung (EU) 2018/1725.
|
||||
Applies to EU institutions, bodies, offices and agencies instead of DSGVO.
|
||||
Key differences: Art. 15 (not 13), Art. 5 (not 6), EDSB (not national DPAs).
|
||||
L1: Pflichtangabe erwaehnt? L2: Pflichtangabe korrekt/vollstaendig?
|
||||
"""
|
||||
|
||||
EU_INSTITUTION_CHECKLIST = [
|
||||
# == L1: Verantwortlicher (Controller) =================================
|
||||
{
|
||||
"id": "eu_controller",
|
||||
"label": "Verantwortlicher (Art. 15(1)(a) VO 2018/1725)",
|
||||
"level": 1, "parent": None,
|
||||
"patterns": [
|
||||
r"verantwortlich\w*\s+(?:ist|im sinne|fuer|f(?:ue|ü)r)",
|
||||
r"kontaktdaten\s+des\s+verantwortlichen",
|
||||
r"name\s+(?:und|&)\s+kontaktdaten\s+des",
|
||||
r"controller", r"verantwortliche\s+stelle",
|
||||
r"responsible\s+(?:party|for)",
|
||||
r"data\s+controller",
|
||||
r"identity\s+(?:of\s+)?(?:the\s+)?controller",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
"hint": (
|
||||
"Art. 15(1)(a) VO 2018/1725 verlangt die Identitaet des Verantwortlichen. "
|
||||
"Bei EU-Organen: Vollstaendiger Name der Institution (z.B. 'Europaeische Kommission, "
|
||||
"GD DIGIT'), Dienstadresse und funktionale E-Mail-Adresse. "
|
||||
"Haeufiger Fehler: Nur Abkuerzung ohne vollstaendigen Institutionsnamen."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_controller_address",
|
||||
"label": "Dienstadresse des Verantwortlichen",
|
||||
"level": 2, "parent": "eu_controller",
|
||||
"patterns": [
|
||||
r"(?:rue|avenue|boulevard|strasse|stra(?:ss|ß)e)\s+\w+",
|
||||
r"\d{4,5}\s+(?:bruxelles|brussels|br(?:ue|ü)ssel|luxembourg|luxemburg|strasbourg|stra(?:ss|ß)burg)",
|
||||
r"b[\-\s]?\d{4}\s+\w+",
|
||||
r"l[\-\s]?\d{4}\s+\w+",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Angabe der Dienstadresse der EU-Institution (typisch: Bruessel, Luxemburg "
|
||||
"oder Strassburg). Format z.B. 'Rue de la Loi 200, B-1049 Bruxelles'. "
|
||||
"Haeufiger Fehler: Nur Postfach ohne physische Adresse."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_controller_email",
|
||||
"label": "E-Mail-Adresse des Verantwortlichen",
|
||||
"level": 2, "parent": "eu_controller",
|
||||
"patterns": [
|
||||
r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.europa\.eu",
|
||||
r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Eine funktionale E-Mail-Adresse ist Pflicht (Art. 15(1)(a) VO 2018/1725). "
|
||||
"Bei EU-Organen typischerweise @ec.europa.eu, @europarl.europa.eu o.ae. "
|
||||
"Ein reines Kontaktformular genuegt nicht als unmittelbarer Kommunikationskanal."
|
||||
),
|
||||
},
|
||||
# == L1: Datenschutzbeauftragter (DPO) =================================
|
||||
{
    # L1 check: is a data protection officer mentioned at all?
    # NOTE(review): patterns are presumably applied case-insensitively with
    # re.search against the document text — confirm against the check runner.
    "id": "eu_dpo",
    "label": "Datenschutzbeauftragter (Art. 15(1)(b) / Art. 43 VO 2018/1725)",
    "level": 1, "parent": None,
    "patterns": [
        r"datenschutzbeauftragt",
        r"data\s+protection\s+officer",
        r"kontaktdaten\s+de[rs]\s+datenschutz",
        # Word boundaries keep the abbreviation from matching inside
        # unrelated words such as "endpoint".
        r"\bdpos?\b",
        # French: "délégué(e) à la protection des données", with or without accents.
        r"d(?:e|é)l(?:e|é)gu(?:e|é)e?\s+(?:a|à)\s+la\s+protection\s+des\s+donn(?:e|é)es",
    ],
    "severity": "HIGH",
    "hint": (
        "Art. 43-44 VO 2018/1725: Jedes EU-Organ MUSS einen DSB (DPO) benennen. "
        "Dies ist — anders als unter der DSGVO — keine Frage der Mitarbeiterzahl, "
        "sondern absolute Pflicht fuer alle EU-Organe. Die Kontaktdaten muessen in "
        "jeder Datenschutzerklaerung angegeben werden (Art. 15(1)(b))."
    ),
},
|
||||
{
|
||||
"id": "eu_dpo_contact",
|
||||
"label": "DPO-Kontaktdaten (E-Mail oder Adresse)",
|
||||
"level": 2, "parent": "eu_dpo",
|
||||
"patterns": [
|
||||
r"(?:data\s+protection\s+officer|dpo|datenschutzbeauftragt)[\s\S]{0,300}[a-z0-9._%+\-]+@",
|
||||
r"dpo[\s\S]{0,100}@",
|
||||
r"data[\-\.]?protection@",
|
||||
r"dpo@\w+\.europa\.eu",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Art. 44(7) VO 2018/1725: Die Kontaktdaten des DPO muessen veroeffentlicht werden. "
|
||||
"Mindestens eine funktionale E-Mail-Adresse angeben (z.B. DATA-PROTECTION-OFFICER@ec.europa.eu). "
|
||||
"Den Namen des DPO muessen Sie nicht nennen."
|
||||
),
|
||||
},
|
||||
|
||||
{
|
||||
"id": "eu_dpo_function",
|
||||
"label": "DPO-Funktion / -Rolle beschrieben",
|
||||
"level": 2, "parent": "eu_dpo",
|
||||
"patterns": [
|
||||
r"(?:aufgaben|role|function|zustaendig).*(?:dpo|datenschutzbeauftragt|data\s+protection\s+officer)",
|
||||
r"(?:dpo|datenschutzbeauftragt|data\s+protection\s+officer).*(?:aufgaben|role|function|zustaendig)",
|
||||
r"art(?:icle)?\s*44\s+(?:vo|regulation|verordnung)",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 44 VO 2018/1725 beschreibt die Aufgaben des DPO bei EU-Organen: "
|
||||
"Beratung, Ueberwachung, Zusammenarbeit mit dem EDSB. "
|
||||
"Es empfiehlt sich, kurz die Rolle des DPO zu erlaeutern, damit "
|
||||
"Betroffene wissen, wofuer der DPO zustaendig ist."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Zwecke und Rechtsgrundlage ====================================
|
||||
{
|
||||
"id": "eu_purposes",
|
||||
"label": "Zwecke der Verarbeitung (Art. 15(1)(c) VO 2018/1725)",
|
||||
"level": 1, "parent": None,
|
||||
"patterns": [
|
||||
r"zweck\w*\s+(?:der|und|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)",
|
||||
r"purpose\w*\s+(?:of|for)\s+(?:the\s+)?(?:processing|data)",
|
||||
r"zu\s+welch\w+\s+zweck",
|
||||
r"(?:data|personal\s+data)\s+(?:is|are)\s+(?:collected|processed)\s+(?:for|to)",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
"hint": (
|
||||
"Art. 15(1)(c) VO 2018/1725 verlangt konkrete Zweckangaben. "
|
||||
"EU-Organe muessen jeden Verarbeitungszweck einzeln auffuehren: z.B. "
|
||||
"'Verwaltung von Bewerbungen', 'Zugangsmanagement zum Gebaeude', "
|
||||
"'Webanalyse der Internetseite'. Pauschalformulierungen sind unzulaessig."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_purposes_specific",
|
||||
"label": "Konkrete Verarbeitungszwecke benannt",
|
||||
"level": 2, "parent": "eu_purposes",
|
||||
"patterns": [
|
||||
r"(?:recruitment|selection|verwaltung|management|administration|monitoring|evaluation)",
|
||||
r"(?:human\s+resources|hr|personal|bewerbung|grant|procurement|vergabe)",
|
||||
r"(?:access|zugang|building|gebaeude|website|webseite|intranet)",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Mindestens 2 konkrete Zwecke benennen, jeweils mit zugehoeriger "
|
||||
"Rechtsgrundlage. Typische EU-Organ-Zwecke: Personalverwaltung, "
|
||||
"Gebaeudezugang, IT-Sicherheitsmonitoring, Vergabeverfahren, "
|
||||
"Evaluierung von Foerderprogrammen. Pauschalformulierungen genuegen "
|
||||
"nicht dem Bestimmtheitsgrundsatz."
|
||||
),
|
||||
},
|
||||
# == L1: Rechtsgrundlage (Art. 5 statt Art. 6 DSGVO) ==================
|
||||
{
    # L1 check: is a legal basis under Art. 5 of Regulation (EU) 2018/1725 named?
    "id": "eu_legal_basis",
    "label": "Rechtsgrundlage (Art. 5 VO 2018/1725)",
    "level": 1, "parent": None,
    "patterns": [
        r"rechtsgrundlage",
        r"art\.\s*5\s*(?:abs|absatz)?\s*\.?\s*1",
        r"legal\s+basis",
        r"lawfulness\s+of\s+processing",
        r"art(?:icle)?\s*5\s*(?:\(1\))?\s*(?:\([a-d]\))?",
        r"auf\s+grundlage\s+(?:von|des|der)\s+art",
        r"regulation\s*\(eu\)\s*2018\s*/?\s*1725",
        r"verordnung\s*\(eu\)\s*2018\s*/?\s*1725",
    ],
    "severity": "HIGH",
    # Letter mapping follows Art. 5(1) of the regulation and agrees with the
    # L2 children of this check: (a) public interest, (d) consent.
    "hint": (
        "Art. 5(1) VO 2018/1725 enthaelt die Rechtsgrundlagen fuer EU-Organe: "
        "(a) oeffentliches Interesse/Ausuebung oeffentlicher Gewalt, "
        "(b) rechtliche Verpflichtung, (c) Vertrag, (d) Einwilligung, "
        "(e) lebenswichtige Interessen. "
        "WICHTIG: Art. 5(1)(a) ist der haeufigste Tatbestand bei EU-Organen — "
        "er entspricht etwa Art. 6(1)(e) DSGVO. Art. 6(1)(f) DSGVO "
        "(berechtigtes Interesse) existiert in der VO 2018/1725 NICHT."
    ),
},
|
||||
{
|
||||
"id": "eu_legal_basis_public_interest",
|
||||
"label": "Art. 5(1)(a) — Oeffentliches Interesse / oeffentliche Gewalt",
|
||||
"level": 2, "parent": "eu_legal_basis",
|
||||
"patterns": [
|
||||
r"art\.\s*5\s*(?:\(1\))?\s*\(?(?:1\s*)?(?:let(?:ter)?\.?\s*)?a\)?",
|
||||
r"(?:oeffentlich|öffentlich).*(?:interesse|gewalt|aufgabe)",
|
||||
r"public\s+interest",
|
||||
r"(?:exercise|performance)\s+of\s+(?:official|public)\s+(?:authority|task)",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 5(1)(a) VO 2018/1725 ist die Hauptrechtsgrundlage fuer EU-Organe. "
|
||||
"Verlangt einen konkreten Rechtsakt als Grundlage (z.B. Verordnung, "
|
||||
"Beschluss, Basisrechtsakt der Institution). Benennen Sie den spezifischen "
|
||||
"Rechtsakt, nicht nur pauschal 'oeffentliches Interesse'."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_legal_basis_consent",
|
||||
"label": "Art. 5(1)(d) — Einwilligung",
|
||||
"level": 2, "parent": "eu_legal_basis",
|
||||
"patterns": [
|
||||
r"art\.\s*5\s*(?:\(1\))?\s*\(?(?:1\s*)?(?:let(?:ter)?\.?\s*)?d\)?",
|
||||
r"einwilligung\s+(?:gem|nach|i\.?\s*s\.?\s*d\.?)",
|
||||
r"consent\s+(?:of|given\s+by)\s+the\s+data\s+subject",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Bei Einwilligung (Art. 5(1)(d) VO 2018/1725) muss auf das jederzeitige "
|
||||
"Widerrufsrecht hingewiesen werden (Art. 7(3) VO 2018/1725). "
|
||||
"Achtung: EU-Organe sollten Einwilligung nur als Rechtsgrundlage waehlen, "
|
||||
"wenn keine andere Grundlage greift — wegen des Machtungleichgewichts "
|
||||
"zwischen Institution und Einzelperson (EDSB-Leitlinien)."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Empfaenger ====================================================
|
||||
{
    # L1 check: are recipients or categories of recipients named?
    # Recipients are point (d) of Art. 15(1); point (e) covers third-country transfers.
    "id": "eu_recipients",
    "label": "Empfaenger (Art. 15(1)(d) VO 2018/1725)",
    "level": 1, "parent": None,
    "patterns": [
        r"empf(?:ae|ä)nger",
        r"(?:ueber|über|weiter)mitt(?:el|l)ung",
        r"recipient",
        r"weitergabe\s+(?:an|von)\s+daten",
        r"data\s+(?:will\s+be|are|is)\s+(?:shared|disclosed|transferred|transmitted)\s+to",
        r"auftragsverarbeit",
        r"processor",
    ],
    "severity": "MEDIUM",
    "hint": (
        "Art. 15(1)(d) VO 2018/1725: Empfaenger oder Empfaengerkategorien benennen. "
        "Typisch bei EU-Organen: andere EU-Institutionen (z.B. OLAF, Rechnungshof), "
        "Mitgliedstaaten-Behoerden, IT-Dienstleister. Auftragsverarbeiter muessen "
        "nach Art. 29 VO 2018/1725 vertraglich gebunden sein."
    ),
},
|
||||
{
|
||||
"id": "eu_recipients_processor",
|
||||
"label": "Auftragsverarbeiter / Processor (Art. 29 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_recipients",
|
||||
"patterns": [
|
||||
r"auftragsverarbeit(?:er|ung)",
|
||||
r"art\.\s*29\s+(?:vo|verordnung|regulation)",
|
||||
r"art(?:icle)?\s*29",
|
||||
r"processor",
|
||||
r"sub[\-\s]?processor",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 29 VO 2018/1725 (entspricht Art. 28 DSGVO): "
|
||||
"Auftragsverarbeiter muessen vertraglich gebunden werden. "
|
||||
"Erwaehnen Sie, dass ein Auftragsverarbeitungsvertrag besteht. "
|
||||
"Bei Cloud-Diensten (z.B. Microsoft 365, AWS): Vertrag muss "
|
||||
"die Vorgaben von Art. 29(3) VO 2018/1725 einhalten."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Drittlandtransfer =============================================
|
||||
{
|
||||
"id": "eu_third_country",
|
||||
"label": "Drittlandtransfer (Art. 46-50 VO 2018/1725)",
|
||||
"level": 1, "parent": None,
|
||||
"patterns": [
|
||||
r"drittland",
|
||||
r"dritt\s*staat",
|
||||
r"third\s+countr",
|
||||
r"angemessenheitsbeschluss",
|
||||
r"adequacy\s+decision",
|
||||
r"standard\s*(?:vertragsklausel|contractual\s+clause)",
|
||||
r"(?:transfer|uebermittlung|übermittlung).*(?:ausserhalb|außerhalb|outside)",
|
||||
r"(?:europ(?:ae|ä)ischen\s+wirtschaftsraum|ewr|eea)",
|
||||
r"art(?:icle)?\s*4[6-9]",
|
||||
r"art\.\s*50",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Art. 46-50 VO 2018/1725 (entspricht Art. 44-49 DSGVO): "
|
||||
"Drittlandtransfers erfordern Angemessenheitsbeschluss (Art. 47), "
|
||||
"geeignete Garantien (Art. 48) oder Ausnahmen (Art. 50). "
|
||||
"EDSB-Empfehlung: EU-Organe muessen besonders streng pruefen, "
|
||||
"da sie eine Vorbildfunktion fuer die Mitgliedstaaten haben."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_third_country_mechanism",
|
||||
"label": "Transfermechanismus benannt (Art. 47-48 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_third_country",
|
||||
"patterns": [
|
||||
r"standard\s*vertragsklausel|scc|standard\s+contractual",
|
||||
r"angemessenheitsbeschluss|adequacy\s+decision",
|
||||
r"art(?:icle)?\s*4[7-8]",
|
||||
r"data\s+privacy\s+framework|dpf",
|
||||
r"appropriate\s+safeguards",
|
||||
r"geeignete\s+garantien",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Art. 48 VO 2018/1725: Bei fehlender Angemessenheit koennen "
|
||||
"geeignete Garantien (z.B. SCC, verbindliche Verwaltungsvereinbarungen) "
|
||||
"den Transfer absichern. Der EDSB hat 2020 eigene Leitlinien zu "
|
||||
"Drittlandtransfers fuer EU-Organe veroeffentlicht."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Speicherdauer =================================================
|
||||
{
    # L1 check: is a storage period (or the criteria for it) stated?
    # The retention duty sits in Art. 15(2)(a); Art. 15(1) has no point (g).
    "id": "eu_retention",
    "label": "Speicherdauer (Art. 15(2)(a) VO 2018/1725)",
    "level": 1, "parent": None,
    "patterns": [
        r"speicherdauer",
        r"aufbewahrungsfrist",
        r"retention\s+period",
        r"(?:how\s+long|storage\s+period|data\s+retention)",
        r"l(?:oe|ö)sch(?:ung|frist)",
        r"daten\s+werden\s+gel(?:oe|ö)scht",
        r"(?:\d+\s+(?:tage|monate|jahre|days|months|years))",
        r"dauer\s+der\s+speicherung",
        r"data\s+will\s+be\s+(?:kept|stored|retained)\s+(?:for|until|during)",
    ],
    "severity": "HIGH",
    "hint": (
        "Art. 15(2)(a) VO 2018/1725 verlangt die Speicherdauer oder "
        "Kriterien zu deren Festlegung. EU-Organe haben oft interne "
        "Aufbewahrungsrichtlinien (retention schedules). Nennen Sie die "
        "konkreten Fristen oder verweisen Sie auf die interne Richtlinie "
        "mit Dokumentenreferenz."
    ),
},
|
||||
{
|
||||
"id": "eu_retention_periods",
|
||||
"label": "Konkrete Zeitangaben",
|
||||
"level": 2, "parent": "eu_retention",
|
||||
"patterns": [
|
||||
r"\d+\s+(?:tage?|monate?|jahre?|days?|months?|years?)",
|
||||
r"(?:after|nach)\s+(?:the\s+)?(?:end|closure|completion|ablauf|beendigung)",
|
||||
r"retention\s+(?:schedule|policy|period)\s+(?:of|for)\s+\d+",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Konkrete Fristen pro Datenkategorie nennen. EU-Organe folgen "
|
||||
"typischerweise der Common Retention List (CRL) der Kommission. "
|
||||
"Beispiel: Bewerbungsdaten 2 Jahre, Finanzunterlagen 7 Jahre, "
|
||||
"Gebaeudezugangslogs 6 Monate."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Betroffenenrechte (Art. 17-24 statt Art. 15-22 DSGVO) =========
|
||||
{
|
||||
"id": "eu_rights",
|
||||
"label": "Betroffenenrechte (Art. 17-24 VO 2018/1725)",
|
||||
"level": 1, "parent": None,
|
||||
"patterns": [
|
||||
r"recht\s+auf\s+auskunft",
|
||||
r"recht\s+auf\s+l(?:oe|ö)schung",
|
||||
r"recht\s+auf\s+berichtigung",
|
||||
r"widerspruchsrecht",
|
||||
r"right\s+to\s+(?:access|erasure|rectification|object|restrict)",
|
||||
r"betroffenenrecht",
|
||||
r"rechte\s+(?:des|der)\s+betroffenen",
|
||||
r"(?:your|data\s+subject)\s+rights",
|
||||
r"art(?:icle)?\s*(?:17|18|19|20|21|22|23|24)\s+(?:vo|regulation|verordnung)",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
"hint": (
|
||||
"Art. 15(1)(h) VO 2018/1725 verlangt Nennung der Betroffenenrechte: "
|
||||
"Auskunft (Art. 17), Berichtigung (Art. 18), Loeschung (Art. 19), "
|
||||
"Einschraenkung (Art. 20), Datenportabilitaet (Art. 22), "
|
||||
"Widerspruch (Art. 23). Achtung: Die Artikelnummern unterscheiden sich "
|
||||
"von der DSGVO (Art. 15-22)! Haeufiger Fehler: DSGVO-Artikel "
|
||||
"statt VO 2018/1725 Artikel zitieren."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_rights_access",
|
||||
"label": "Recht auf Auskunft (Art. 17 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_rights",
|
||||
"patterns": [
|
||||
r"art(?:icle)?\s*17\s+(?:vo|regulation|verordnung)",
|
||||
r"art\.\s*17",
|
||||
r"recht\s+auf\s+(?:\w+\s+)?auskunft",
|
||||
r"right\s+(?:of|to)\s+access",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 17 VO 2018/1725 (entspricht Art. 15 DSGVO): Betroffene koennen "
|
||||
"Auskunft und eine Kopie ihrer Daten verlangen. Antwortfrist: 1 Monat "
|
||||
"(Art. 14(3) VO 2018/1725). Anfragen gehen typischerweise an den DPO "
|
||||
"der Institution."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_rights_erasure",
|
||||
"label": "Recht auf Loeschung (Art. 19 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_rights",
|
||||
"patterns": [
|
||||
r"art(?:icle)?\s*19\s+(?:vo|regulation|verordnung)",
|
||||
r"art\.\s*19",
|
||||
r"recht\s+auf\s+(?:\w+\s+)?l(?:oe|ö)schung",
|
||||
r"right\s+to\s+erasure",
|
||||
r"right\s+to\s+be\s+forgotten",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 19 VO 2018/1725 (entspricht Art. 17 DSGVO): "
|
||||
"Recht auf Loeschung bei Zweckentfall, Widerruf der Einwilligung "
|
||||
"oder unrechtmaessiger Verarbeitung. Erwaehnen Sie auch die "
|
||||
"Ausnahmen fuer EU-Organe: Archivzwecke im oeffentlichen Interesse, "
|
||||
"gesetzliche Aufbewahrungspflichten."
|
||||
),
|
||||
},
|
||||
|
||||
{
|
||||
"id": "eu_rights_restriction",
|
||||
"label": "Recht auf Einschraenkung (Art. 20 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_rights",
|
||||
"patterns": [
|
||||
r"art(?:icle)?\s*20\s+(?:vo|regulation|verordnung)",
|
||||
r"art\.\s*20",
|
||||
r"einschr(?:ae|ä)nkung\s+der\s+verarbeitung",
|
||||
r"right\s+to\s+restrict(?:ion)?",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 20 VO 2018/1725 (entspricht Art. 18 DSGVO): "
|
||||
"Recht auf Einschraenkung der Verarbeitung bei bestrittener "
|
||||
"Richtigkeit, unrechtmaessiger Verarbeitung oder laufendem "
|
||||
"Widerspruch. Wird am haeufigsten vergessen."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_rights_automated",
|
||||
"label": "Automatisierte Entscheidungen (Art. 24 VO 2018/1725)",
|
||||
"level": 2, "parent": "eu_rights",
|
||||
"patterns": [
|
||||
r"art(?:icle)?\s*24\s+(?:vo|regulation|verordnung)",
|
||||
r"art\.\s*24",
|
||||
r"automatisierte\s+entscheidung",
|
||||
r"automated\s+(?:decision|individual)",
|
||||
r"profiling",
|
||||
],
|
||||
"severity": "LOW",
|
||||
"hint": (
|
||||
"Art. 24 VO 2018/1725 (entspricht Art. 22 DSGVO): "
|
||||
"Bei automatisierten Einzelentscheidungen muessen Logik, "
|
||||
"Tragweite und Auswirkungen erklaert werden. Falls kein "
|
||||
"Profiling stattfindet, explizit verneinen."
|
||||
),
|
||||
},
|
||||
|
||||
# == L1: Beschwerderecht beim EDSB =====================================
|
||||
{
|
||||
"id": "eu_complaint",
|
||||
"label": "Beschwerderecht beim EDSB (Art. 15(1)(i) VO 2018/1725)",
|
||||
"level": 1, "parent": None,
|
||||
"patterns": [
|
||||
r"beschwerderecht",
|
||||
r"right\s+to\s+lodge\s+a\s+complaint",
|
||||
r"beschwerde.*(?:edsb|edps)",
|
||||
r"edsb",
|
||||
r"edps",
|
||||
r"europ(?:ae|ä)isch\w*\s+datenschutzbeauftragt",
|
||||
r"european\s+data\s+protection\s+supervisor",
|
||||
r"contr(?:o|ô)leur\s+europ(?:e|é)en",
|
||||
r"art(?:icle)?\s*63",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
"hint": (
|
||||
"Art. 15(1)(i) VO 2018/1725: Bei EU-Organen ist der EDSB "
|
||||
"(Europaeischer Datenschutzbeauftragter / European Data Protection "
|
||||
"Supervisor) die zustaendige Aufsichtsbehoerde — NICHT die nationalen "
|
||||
"Datenschutzbehoerden. Kontakt: edps@edps.europa.eu, "
|
||||
"Rue Wiertz 60, B-1047 Bruxelles. Haeufiger Fehler: Verweis auf "
|
||||
"nationale Aufsichtsbehoerde statt EDSB."
|
||||
),
|
||||
},
|
||||
{
|
||||
"id": "eu_complaint_edps_contact",
|
||||
"label": "EDSB-Kontaktdaten angegeben",
|
||||
"level": 2, "parent": "eu_complaint",
|
||||
"patterns": [
|
||||
r"edps@edps\.europa\.eu",
|
||||
r"edps\.europa\.eu",
|
||||
r"edsb.*(?:kontakt|anschrift|adresse|e[\-\s]?mail|wiertz)",
|
||||
r"edps.*(?:contact|address|e[\-\s]?mail|wiertz)",
|
||||
r"rue\s+wiertz",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
"hint": (
|
||||
"Vollstaendige EDSB-Kontaktdaten angeben: "
|
||||
"Europaeischer Datenschutzbeauftragter (EDSB), "
|
||||
"Rue Wiertz 60, B-1047 Bruxelles/Bruessel, "
|
||||
"edps@edps.europa.eu, https://edps.europa.eu. "
|
||||
"Haeufiger Fehler: Nur 'EDSB' erwaehnt ohne Kontaktdaten."
|
||||
),
|
||||
},
|
||||
]
|
||||
@@ -15,6 +15,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST
|
||||
from .cookie_checks import COOKIE_CHECKLIST
|
||||
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
|
||||
from .dsfa_checks import DSFA_CHECKLIST
|
||||
from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -35,6 +36,7 @@ _CHECKLIST_MAP = {
|
||||
"social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
|
||||
"joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
|
||||
"dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"),
|
||||
"eu_institution": (EU_INSTITUTION_CHECKLIST, "VO (EU) 2018/1725"),
|
||||
}
|
||||
|
||||
|
||||
@@ -218,6 +220,11 @@ def classify_document_type(title: str, url: str) -> str:
|
||||
if any(kw in combined for kw in ["social media", "facebook", "instagram", "linkedin", "fanpage"]):
|
||||
if any(kw in combined for kw in ["datenschutzerkl", "datenschutz für", "datenschutzinformation"]):
|
||||
return "social_media"
|
||||
# EU institution check BEFORE generic privacy — 2018/1725 is more specific
|
||||
if any(kw in combined for kw in ["2018/1725", "2018 1725", "regulation (eu)",
|
||||
"verordnung (eu)", "edsb", "edps",
|
||||
"european data protection supervisor"]):
|
||||
return "eu_institution"
|
||||
if any(kw in combined for kw in ["datenschutz", "privacy", "dsgvo", "data protection", "données"]):
|
||||
return "dse"
|
||||
if any(kw in combined for kw in ["widerruf", "withdrawal", "rétractation", "desistimiento"]):
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Banner Runner — maps scan results to the L1/L2 check hierarchy.
|
||||
|
||||
Takes the raw ScanResponse dict and produces a structured_checks list
|
||||
compatible with ChecklistView (same format as document checks).
|
||||
"""
|
||||
|
||||
from checks.banner_checks import BANNER_CHECKLIST
|
||||
|
||||
|
||||
def map_scan_to_checks(scan_result: dict) -> dict:
    """Map a /scan response to the L1/L2 banner check hierarchy.

    Each entry of ``BANNER_CHECKLIST`` becomes one check item. The outcome of
    an entry is decided by its ``check_key``:

    * key reported as a violation      -> failed
    * key reported as an explicit pass -> passed (with evidence text)
    * key reported by neither          -> follows ``banner_detected``

    A level-2 check whose parent check failed is marked ``skipped`` and is
    reported as not passed.

    Args:
        scan_result: Raw scan dict. ``banner_detected`` is read here; the
            violation and pass signals are extracted by
            ``_collect_violation_codes`` and ``_collect_pass_codes``.

    Returns:
        dict with:
        - structured_checks: list of CheckItem dicts
        - completeness_pct: L1 pass rate (0-100)
        - correctness_pct: L2 pass rate (0-100) over non-skipped L2 checks
    """
    violation_codes = _collect_violation_codes(scan_result)
    # Some checks produce boolean signals rather than violations.
    pass_codes = _collect_pass_codes(scan_result)
    # Coerced to bool so a missing/None flag never leaks into "passed".
    banner_detected = bool(scan_result.get("banner_detected", False))

    checks: list[dict] = []
    # First item seen per id; gives O(1) parent lookup with the same
    # "first match wins" semantics as a linear scan over `checks`.
    first_by_id: dict = {}

    for defn in BANNER_CHECKLIST:
        key = defn["check_key"]
        level = defn["level"]
        parent = defn.get("parent")

        if key in violation_codes:
            passed = False
            # NOTE(review): the violation text is collected but not exposed,
            # because matched_text is only populated for passed checks.
            matched_text = ""
        elif key in pass_codes:
            passed = True
            matched_text = pass_codes.get(key, "")
        else:
            # Neither a violation nor an explicit pass was reported.
            # NOTE(review): the previous comment said "if banner not detected
            # -> only banner_detected fails", but the effective logic has
            # always been that every untested check follows banner_detected.
            # That behavior is kept; confirm which one is intended.
            passed = banner_detected
            matched_text = ""

        # L2 checks are skipped if their parent L1 check failed.
        skipped = False
        if level == 2 and parent:
            parent_check = first_by_id.get(parent)
            skipped = parent_check is not None and not parent_check["passed"]

        effective_pass = passed and not skipped
        item = {
            "id": defn["id"],
            "label": defn["label"],
            "passed": effective_pass,
            "severity": defn["severity"],
            "level": level,
            "parent": parent,
            "skipped": skipped,
            "hint": defn.get("hint", ""),
            # Evidence is only meaningful for checks that are reported as passed.
            "matched_text": matched_text if effective_pass else "",
        }
        checks.append(item)
        first_by_id.setdefault(item["id"], item)

    # Completeness: share of L1 checks that passed.
    l1_checks = [c for c in checks if c["level"] == 1]
    l1_passed = sum(1 for c in l1_checks if c["passed"])
    completeness_pct = round(l1_passed / len(l1_checks) * 100) if l1_checks else 0

    # Correctness: share of L2 checks that passed, ignoring skipped ones.
    l2_active = [c for c in checks if c["level"] == 2 and not c["skipped"]]
    l2_passed = sum(1 for c in l2_active if c["passed"])
    correctness_pct = round(l2_passed / len(l2_active) * 100) if l2_active else 0

    return {
        "structured_checks": checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    }
|
||||
|
||||
|
||||
def _collect_violation_codes(scan: dict) -> dict[str, str]:
|
||||
"""Collect check_key → violation text from all sources."""
|
||||
codes: dict[str, str] = {}
|
||||
|
||||
# Banner text violations
|
||||
banner_checks = scan.get("banner_checks", {})
|
||||
for v in banner_checks.get("violations", []):
|
||||
code = v.get("code", "")
|
||||
if code:
|
||||
codes[code] = v.get("text", "")[:120]
|
||||
|
||||
# Phase A violations (before consent)
|
||||
phase_a = scan.get("phases", {}).get("before_consent", {})
|
||||
for v in phase_a.get("violations", []):
|
||||
code = v.get("code", "")
|
||||
if code:
|
||||
codes[code] = v.get("text", "")[:120]
|
||||
|
||||
# Phase B violations (after reject)
|
||||
phase_b = scan.get("phases", {}).get("after_reject", {})
|
||||
for v in phase_b.get("violations", []):
|
||||
code = v.get("code", "")
|
||||
if code:
|
||||
codes[code] = v.get("text", "")[:120]
|
||||
|
||||
# Tracking services in phase A → tracking_before_consent
|
||||
tracking_a = phase_a.get("tracking_services", [])
|
||||
if tracking_a and "tracking_before_consent" not in codes:
|
||||
codes["tracking_before_consent"] = ", ".join(tracking_a[:5])
|
||||
|
||||
# Cookies before consent → cookies_before_consent
|
||||
cookies_a = phase_a.get("cookies", [])
|
||||
tracking_cookies = [c for c in cookies_a if _is_tracking_cookie(c)]
|
||||
if tracking_cookies and "cookies_before_consent" not in codes:
|
||||
codes["cookies_before_consent"] = ", ".join(tracking_cookies[:5])
|
||||
|
||||
# New tracking after reject → tracking_after_reject
|
||||
new_tracking_b = phase_b.get("new_tracking", [])
|
||||
if new_tracking_b and "tracking_after_reject" not in codes:
|
||||
codes["tracking_after_reject"] = ", ".join(new_tracking_b[:5])
|
||||
|
||||
return codes
|
||||
|
||||
|
||||
def _collect_pass_codes(scan: dict) -> dict[str, str]:
|
||||
"""Collect explicit pass signals from scan results."""
|
||||
passes: dict[str, str] = {}
|
||||
|
||||
# Banner detected
|
||||
if scan.get("banner_detected"):
|
||||
passes["banner_detected"] = scan.get("banner_provider", "detected")
|
||||
|
||||
# Provider named
|
||||
provider = scan.get("banner_provider", "")
|
||||
if provider:
|
||||
passes["banner_provider_named"] = provider
|
||||
|
||||
# Impressum link
|
||||
bc = scan.get("banner_checks", {})
|
||||
if bc.get("has_impressum_link"):
|
||||
passes["impressum_link"] = "Impressum-Link gefunden"
|
||||
if bc.get("has_dse_link"):
|
||||
passes["dse_link"] = "DSE-Link gefunden"
|
||||
|
||||
return passes
|
||||
|
||||
|
||||
_TRACKING_COOKIE_PREFIXES = (
|
||||
"_ga", "_gid", "_fbp", "_fbc", "IDE", "_gcl", "fr", "_pin",
|
||||
"_tt_", "li_sugr", "_hj", "mp_", "ajs_", "_clck", "_clsk",
|
||||
)
|
||||
|
||||
|
||||
def _is_tracking_cookie(name: str) -> bool:
|
||||
"""Check if a cookie name is a known tracking cookie."""
|
||||
return any(name.startswith(p) for p in _TRACKING_COOKIE_PREFIXES)
|
||||
+45
-25
@@ -16,6 +16,7 @@ from services.consent_scanner import run_consent_test, ConsentTestResult
|
||||
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
|
||||
from services.playwright_scanner import scan_website_playwright
|
||||
from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult
|
||||
from checks.banner_runner import map_scan_to_checks
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -44,6 +45,9 @@ class ScanResponse(BaseModel):
|
||||
scanned_at: str
|
||||
category_tests: list = []
|
||||
banner_checks: dict = {}
|
||||
structured_checks: list = []
|
||||
completeness_pct: int = 0
|
||||
correctness_pct: int = 0
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
@@ -57,30 +61,47 @@ async def scan_consent(req: ScanRequest):
|
||||
logger.info("Starting consent test for %s", req.url)
|
||||
result = await run_consent_test(req.url, req.timeout_per_phase)
|
||||
|
||||
# Build raw response dict for structured check mapping
|
||||
phases = {
|
||||
"before_consent": {
|
||||
"scripts": result.before_scripts,
|
||||
"cookies": result.before_cookies,
|
||||
"tracking_services": result.before_tracking,
|
||||
"violations": [v.__dict__ for v in result.before_violations],
|
||||
},
|
||||
"after_reject": {
|
||||
"scripts": result.reject_scripts,
|
||||
"cookies": result.reject_cookies,
|
||||
"new_tracking": result.reject_new_tracking,
|
||||
"violations": [v.__dict__ for v in result.reject_violations],
|
||||
},
|
||||
"after_accept": {
|
||||
"scripts": result.accept_scripts,
|
||||
"cookies": result.accept_cookies,
|
||||
"new_tracking": result.accept_new_tracking,
|
||||
"undocumented": result.accept_undocumented,
|
||||
},
|
||||
}
|
||||
banner_checks_data = {
|
||||
"has_impressum_link": result.banner_has_impressum_link,
|
||||
"has_dse_link": result.banner_has_dse_link,
|
||||
"violations": [v.__dict__ for v in result.banner_text_violations],
|
||||
}
|
||||
|
||||
# Map to L1/L2 hierarchy
|
||||
raw_for_mapping = {
|
||||
"banner_detected": result.banner_detected,
|
||||
"banner_provider": result.banner_provider,
|
||||
"phases": phases,
|
||||
"banner_checks": banner_checks_data,
|
||||
}
|
||||
check_result = map_scan_to_checks(raw_for_mapping)
|
||||
|
||||
return ScanResponse(
|
||||
url=req.url,
|
||||
banner_detected=result.banner_detected,
|
||||
banner_provider=result.banner_provider,
|
||||
phases={
|
||||
"before_consent": {
|
||||
"scripts": result.before_scripts,
|
||||
"cookies": result.before_cookies,
|
||||
"tracking_services": result.before_tracking,
|
||||
"violations": [v.__dict__ for v in result.before_violations],
|
||||
},
|
||||
"after_reject": {
|
||||
"scripts": result.reject_scripts,
|
||||
"cookies": result.reject_cookies,
|
||||
"new_tracking": result.reject_new_tracking,
|
||||
"violations": [v.__dict__ for v in result.reject_violations],
|
||||
},
|
||||
"after_accept": {
|
||||
"scripts": result.accept_scripts,
|
||||
"cookies": result.accept_cookies,
|
||||
"new_tracking": result.accept_new_tracking,
|
||||
"undocumented": result.accept_undocumented,
|
||||
},
|
||||
},
|
||||
phases=phases,
|
||||
summary={
|
||||
"critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"),
|
||||
"high": len(result.before_violations) + sum(1 for v in result.banner_text_violations if v.severity == "HIGH"),
|
||||
@@ -90,11 +111,10 @@ async def scan_consent(req: ScanRequest):
|
||||
"categories_tested": len(result.category_tests),
|
||||
"banner_text_issues": len(result.banner_text_violations),
|
||||
},
|
||||
banner_checks={
|
||||
"has_impressum_link": result.banner_has_impressum_link,
|
||||
"has_dse_link": result.banner_has_dse_link,
|
||||
"violations": [v.__dict__ for v in result.banner_text_violations],
|
||||
},
|
||||
banner_checks=banner_checks_data,
|
||||
structured_checks=check_result["structured_checks"],
|
||||
completeness_pct=check_result["completeness_pct"],
|
||||
correctness_pct=check_result["correctness_pct"],
|
||||
scanned_at=datetime.now(timezone.utc).isoformat(),
|
||||
category_tests=[{
|
||||
"category": ct.category,
|
||||
|
||||
@@ -23,6 +23,8 @@ from urllib.parse import urlparse, urljoin
|
||||
|
||||
from playwright.async_api import Page
|
||||
|
||||
from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Legal document keywords in all EU/EEA official languages.
|
||||
@@ -216,11 +218,36 @@ async def discover_dsi_documents(
|
||||
seen_titles: set[str] = set()
|
||||
|
||||
try:
|
||||
# Step 1: Load the page
|
||||
await page.goto(url, wait_until="networkidle", timeout=60000)
|
||||
# Step 1: Load the page (with networkidle → domcontentloaded fallback)
|
||||
await goto_resilient(page, url, timeout=60000)
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Step 1b: Self-extraction — if the URL itself is a DSI page,
|
||||
# Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
|
||||
final_url = page.url
|
||||
if is_pdf_redirect(url, final_url):
|
||||
is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
|
||||
if is_dsi_url:
|
||||
result.documents.append(DiscoveredDSI(
|
||||
title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
|
||||
url=final_url,
|
||||
source_url=url,
|
||||
language=dsi_lang or "de",
|
||||
doc_type="pdf",
|
||||
text="[PDF — Textextraktion erforderlich]",
|
||||
))
|
||||
seen_urls.add(url)
|
||||
seen_urls.add(final_url)
|
||||
logger.info("PDF redirect detected: %s -> %s", url, final_url)
|
||||
# Return early — a PDF redirect means no HTML content to scan
|
||||
result.total_found = len(result.documents)
|
||||
return result
|
||||
|
||||
# Step 1b: Try dismissing cookie consent banners before extraction.
|
||||
# Many German sites (dm.de, Zalando, etc.) block page content behind
|
||||
# a consent wall. Dismissing it reveals the actual DSI text.
|
||||
await try_dismiss_consent_banner(page)
|
||||
|
||||
# Step 1c: Self-extraction — if the URL itself is a DSI page,
|
||||
# extract its full text as the first document. This handles the
|
||||
# case where the user provides the DSE URL directly (e.g.
|
||||
# example.com/datenschutz) instead of the homepage.
|
||||
@@ -251,6 +278,8 @@ async def discover_dsi_documents(
|
||||
))
|
||||
seen_urls.add(url)
|
||||
logger.info("Self-extracted %d words from %s", self_wc, url)
|
||||
else:
|
||||
logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
|
||||
except Exception as e:
|
||||
logger.warning("Self-extraction failed for %s: %s", url, e)
|
||||
|
||||
@@ -323,58 +352,69 @@ async def discover_dsi_documents(
|
||||
if is_anchor:
|
||||
continue
|
||||
|
||||
# Navigate to page — wait for JS to load content
|
||||
resp = await page.goto(href, wait_until="networkidle", timeout=45000)
|
||||
if resp and resp.status < 400:
|
||||
await page.wait_for_timeout(2000)
|
||||
await _expand_all_interactive(page)
|
||||
await page.wait_for_timeout(500)
|
||||
# Navigate to page — with networkidle/domcontentloaded fallback
|
||||
await goto_resilient(page, href, timeout=45000)
|
||||
resp_url = page.url
|
||||
|
||||
# Extract text — try specific content areas, fall back to full body
|
||||
text = await page.evaluate("""
|
||||
() => {
|
||||
// Try progressively broader content selectors
|
||||
const selectors = [
|
||||
'.article-content', '.page-content', '.entry-content',
|
||||
'[class*="content-area"]', '[class*="main-content"]',
|
||||
'main article', 'main', 'article',
|
||||
'[role="main"]', '.content', '#content',
|
||||
];
|
||||
for (const sel of selectors) {
|
||||
const el = document.querySelector(sel);
|
||||
if (el && el.textContent.trim().length > 200) {
|
||||
return el.textContent.trim();
|
||||
}
|
||||
# Check for PDF redirect on followed links
|
||||
if is_pdf_redirect(href, resp_url):
|
||||
result.documents.append(DiscoveredDSI(
|
||||
title=title, url=resp_url, source_url=url,
|
||||
language=lang, doc_type="pdf",
|
||||
text="[PDF — Textextraktion erforderlich]",
|
||||
))
|
||||
await goto_resilient(page, url, timeout=45000)
|
||||
continue
|
||||
|
||||
await try_dismiss_consent_banner(page)
|
||||
await _expand_all_interactive(page)
|
||||
await page.wait_for_timeout(500)
|
||||
|
||||
# Extract text — try specific content areas, fall back to full body
|
||||
text = await page.evaluate("""
|
||||
() => {
|
||||
// Try progressively broader content selectors
|
||||
const selectors = [
|
||||
'.article-content', '.page-content', '.entry-content',
|
||||
'[class*="content-area"]', '[class*="main-content"]',
|
||||
'main article', 'main', 'article',
|
||||
'[role="main"]', '.content', '#content',
|
||||
];
|
||||
for (const sel of selectors) {
|
||||
const el = document.querySelector(sel);
|
||||
if (el && el.textContent.trim().length > 200) {
|
||||
return el.textContent.trim();
|
||||
}
|
||||
// Fallback: full body minus nav/header/footer
|
||||
const body = document.body.cloneNode(true);
|
||||
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
|
||||
return body.textContent?.trim() || '';
|
||||
}
|
||||
""")
|
||||
if text and len(text) > 50:
|
||||
result.documents.append(DiscoveredDSI(
|
||||
title=title, url=href, source_url=url,
|
||||
language=lang,
|
||||
doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
|
||||
text=text[:50000], word_count=len(text.split()),
|
||||
))
|
||||
// Fallback: full body minus nav/header/footer
|
||||
const body = document.body.cloneNode(true);
|
||||
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
|
||||
return body.textContent?.trim() || '';
|
||||
}
|
||||
""")
|
||||
if text and len(text) > 50:
|
||||
result.documents.append(DiscoveredDSI(
|
||||
title=title, url=href, source_url=url,
|
||||
language=lang,
|
||||
doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
|
||||
text=text[:50000], word_count=len(text.split()),
|
||||
))
|
||||
|
||||
# Recursive: search THIS page for more DSI links
|
||||
new_links = await _find_dsi_links(page, base_domain)
|
||||
for nl in new_links:
|
||||
if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
|
||||
pending_links.append(nl)
|
||||
# Recursive: search THIS page for more DSI links
|
||||
new_links = await _find_dsi_links(page, base_domain)
|
||||
for nl in new_links:
|
||||
if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
|
||||
pending_links.append(nl)
|
||||
|
||||
# Navigate back for next link
|
||||
await page.goto(url, wait_until="networkidle", timeout=45000)
|
||||
await goto_resilient(page, url, timeout=45000)
|
||||
await page.wait_for_timeout(500)
|
||||
await _expand_all_interactive(page)
|
||||
|
||||
except Exception as e:
|
||||
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
|
||||
try:
|
||||
await page.goto(url, wait_until="networkidle", timeout=45000)
|
||||
await goto_resilient(page, url, timeout=45000)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
|
||||
|
||||
Extracted from dsi_discovery.py to keep modules under 500 LOC.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from playwright.async_api import Page, TimeoutError as PlaywrightTimeout
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> None:
    """Open ``url`` in ``page``, degrading gracefully on busy pages.

    The first attempt waits for ``networkidle``. Pages that keep issuing
    background requests (SPAs such as Zalando) never reach that state, so on
    a Playwright timeout the navigation is repeated with
    ``domcontentloaded`` followed by a fixed pause that gives client-side
    JavaScript time to render the main content.

    Args:
        page: Playwright page to navigate.
        url: Target URL.
        timeout: Per-attempt navigation timeout in milliseconds. It applies
            to the first attempt and again to the fallback attempt.
    """
    js_render_pause_ms = 5000  # extra wait for JS rendering after fallback

    try:
        await page.goto(url, wait_until="networkidle", timeout=timeout)
    except PlaywrightTimeout:
        logger.info("networkidle timeout for %s, falling back to domcontentloaded", url)
        await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        await page.wait_for_timeout(js_render_pause_ms)
|
||||
|
||||
|
||||
async def try_dismiss_consent_banner(page: Page) -> bool:
    """Try to dismiss cookie consent banners that block page content.

    Three strategies are attempted in order, stopping at the first success:

    1. Usercentrics, whose buttons live in a shadow DOM.
    2. Known accept-button selectors of common consent tools in the regular
       DOM (OneTrust, Cookiebot, Didomi, Borlabs, and others).
    3. Any button whose text exactly equals a known "accept" label.

    Banners rendered inside an iframe are not handled by any of these
    strategies. The function is best-effort: errors raised while probing or
    clicking are logged at DEBUG level and never propagated.

    Returns:
        True if a banner was dismissed, otherwise False.
    """
    strategies = (
        _dismiss_usercentrics_banner,
        _dismiss_known_cmp_banner,
        _dismiss_banner_by_button_text,
    )
    for strategy in strategies:
        if await strategy(page):
            return True
    return False


# Pause after a successful click so the page can re-render without the banner.
_CONSENT_SETTLE_MS = 2000


async def _dismiss_usercentrics_banner(page: Page) -> bool:
    """Click an accept/agree button inside the Usercentrics shadow root."""
    try:
        uc_root = await page.query_selector("#usercentrics-root")
        if not uc_root:
            return False
        clicked = await page.evaluate("""() => {
            const root = document.querySelector('#usercentrics-root');
            if (!root || !root.shadowRoot) return false;
            const buttons = root.shadowRoot.querySelectorAll('button');
            for (const btn of buttons) {
                const t = btn.textContent.trim().toLowerCase();
                if (t.includes('akzeptieren') || t.includes('accept')
                    || t.includes('zustimmen') || t.includes('agree')) {
                    btn.click();
                    return true;
                }
            }
            return false;
        }""")
        if clicked:
            logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
            await page.wait_for_timeout(_CONSENT_SETTLE_MS)
            return True
    except Exception as exc:  # best-effort: fall through to the next strategy
        logger.debug("Usercentrics banner dismissal failed: %s", exc)
    return False


async def _dismiss_known_cmp_banner(page: Page) -> bool:
    """Click the first visible accept button matching a known CMP selector."""
    accept_selectors = [
        "#onetrust-accept-btn-handler",
        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
        "#didomi-notice-agree-button",
        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
        ".cmpboxbtn.cmpboxbtnyes",
        ".klaro .cm-btn-accept",
        ".cky-btn-accept",
        "[class*='qc-cmp2-summary-buttons'] button:first-child",
        "#tarteaucitronPersonalize2",
    ]
    for sel in accept_selectors:
        try:
            btn = page.locator(sel).first
            if await btn.count() > 0 and await btn.is_visible():
                await btn.click(timeout=3000)
                logger.info("Dismissed consent banner via %s", sel)
                await page.wait_for_timeout(_CONSENT_SETTLE_MS)
                return True
        except Exception as exc:  # best-effort: try the next selector
            logger.debug("Consent selector %s failed: %s", sel, exc)
            continue
    return False


async def _dismiss_banner_by_button_text(page: Page) -> bool:
    """Click a button whose trimmed text exactly equals a known accept label."""
    # Kept as a list: it is passed to page.evaluate as a JS array argument.
    accept_texts = [
        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
        "Einverstanden", "Ich stimme zu",
    ]
    try:
        clicked = await page.evaluate("""(texts) => {
            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
                const t = (btn.textContent || '').trim();
                for (const target of texts) {
                    if (t === target) { btn.click(); return true; }
                }
            }
            return false;
        }""", accept_texts)
        if clicked:
            logger.info("Dismissed consent banner via generic text match")
            await page.wait_for_timeout(_CONSENT_SETTLE_MS)
            return True
    except Exception as exc:  # best-effort: report "nothing dismissed"
        logger.debug("Generic consent text match failed: %s", exc)
    return False
|
||||
|
||||
|
||||
def is_pdf_redirect(original_url: str, final_url: str) -> bool:
    """Check if the page redirected to a PDF or external storage.

    The decision is based on ``final_url`` only:

    * its path ends in ``.pdf`` (case-insensitive; query string and fragment
      are ignored), or
    * its host is, or is a subdomain of, a known object-storage domain
      (Google Cloud Storage, Azure Blob Storage, Amazon S3).

    Args:
        original_url: URL that was requested. Currently unused; kept for
            interface compatibility.
        final_url: URL the browser ended up on after navigation.

    Returns:
        True if ``final_url`` looks like a PDF or object-storage download.
    """
    # Local import keeps this helper self-contained within its module.
    from urllib.parse import urlparse

    storage_domains = (
        "storage.googleapis.com",
        "blob.core.windows.net",
        "s3.amazonaws.com",
    )
    final_lower = final_url.lower()

    try:
        parsed = urlparse(final_lower)
        host = parsed.hostname
        path = parsed.path
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): use raw-string matching.
        host, path = None, final_lower

    if host is None:
        # No parsable host (scheme-less or malformed input): fall back to the
        # plain substring check so such inputs are classified as before.
        return final_lower.endswith(".pdf") or any(
            domain in final_lower for domain in storage_domains
        )

    if path.endswith(".pdf"):
        return True
    return any(
        host == domain or host.endswith("." + domain) for domain in storage_domains
    )
|
||||
Reference in New Issue
Block a user