diff --git a/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx b/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx
index fb8d623..5b73d43 100644
--- a/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx
+++ b/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx
@@ -1,19 +1,35 @@
'use client'
import React, { useState } from 'react'
+import { ChecklistView } from './ChecklistView'
+
+interface CheckItem { // one row of `structured_checks`; handed on unchanged as `checks` in the ChecklistView-compatible result
+ id: string // check identifier; a child row's `parent` refers to this value
+ label: string
+ passed: boolean
+ severity: string
+ matched_text: string // text backing the verdict; may be empty
+ level?: number // 1 = top-level (L1) check, 2 = detail (L2) check
+ parent?: string | null // id of the L1 parent for L2 rows
+ skipped?: boolean // skipped rows are excluded from findings_count; presumably set when the L1 parent failed — confirm against banner_runner
+ hint?: string
+}
interface BannerResult {
banner_detected: boolean
banner_provider: string
- banner_text: string
banner_checks?: {
violations: { code: string; text: string; severity: string }[]
- passes: { code: string; text: string }[]
+ has_impressum_link?: boolean
+ has_dse_link?: boolean
}
+ structured_checks?: CheckItem[] // L1/L2 checklist rows; the structured view is only built when this is non-empty
+ completeness_pct?: number // treated as 0 when absent; banner_runner documents it as the L1 pass rate (0-100)
+ correctness_pct?: number // treated as 0 when absent; banner_runner documents it as the L2 pass rate (0-100)
phases?: {
- before_consent: { cookies: number; scripts: number; violations: string[] }
- after_reject: { cookies: number; scripts: number; violations: string[] }
- after_accept: { cookies: number; scripts: number; violations: string[] }
+ before_consent: { cookies: string[]; scripts: string[]; tracking_services: string[]; violations: any[] } // NOTE(review): `any[]` disables checking; element shape is not visible here — consider `unknown[]` or a concrete type
+ after_reject: { cookies: string[]; scripts: string[]; new_tracking: string[]; violations: any[] } // NOTE(review): same `any[]` concern as before_consent
+ after_accept: { cookies: string[]; scripts: string[]; new_tracking: string[]; undocumented: string[] } // no `violations` field in this phase
}
}
@@ -43,7 +59,6 @@ export function BannerCheckTab() {
const data = await res.json()
if (data.scan_id) {
- // Async polling
let attempts = 0
while (attempts < 60) {
await new Promise(r => setTimeout(r, 3000))
@@ -69,9 +84,23 @@ export function BannerCheckTab() {
}
}
- const violations = result?.banner_checks?.violations || []
- const passes = result?.banner_checks?.passes || []
- const total = violations.length + passes.length
+ const structuredChecks = result?.structured_checks || []
+ const hasStructured = structuredChecks.length > 0
+ const compPct = result?.completeness_pct ?? 0
+ const corrPct = result?.correctness_pct ?? 0
+
+ // Build ChecklistView-compatible result for structured checks
+ const checklistResults = hasStructured ? [{
+ label: `Cookie-Banner: ${result?.banner_provider || 'Unbekannt'}`,
+ url: url,
+ doc_type: 'banner',
+ word_count: 0,
+ completeness_pct: compPct,
+ correctness_pct: corrPct,
+ checks: structuredChecks,
+ findings_count: structuredChecks.filter(c => !c.passed && !c.skipped).length,
+ error: '',
+ }] : []
return (
@@ -79,7 +108,7 @@ export function BannerCheckTab() {
Cookie-Banner Compliance Check
Playwright-basierter 3-Phasen-Test: Vor Interaktion, nach Ablehnen, nach Akzeptieren.
- Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 20+ weitere Kriterien.
+ Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 36 weitere Kriterien.
@@ -116,14 +145,14 @@ export function BannerCheckTab() {
)}
{result && (
-
- {/* Header */}
-
-
-
+
+ {/* 3-Phase Summary Card */}
+ {result.phases && (
+
+
-
- {result.banner_detected ? '🛡️' : '⚠️'}
+
+ {result.banner_detected ? '\u{1F6E1}\u{FE0F}' : '\u26A0\u{FE0F}'}
@@ -131,98 +160,50 @@ export function BannerCheckTab() {
? `Banner erkannt: ${result.banner_provider || 'Unbekannter Anbieter'}`
: 'Kein Cookie-Banner erkannt'}
- {total > 0 && (
-
- {passes.length}/{total} Pruefungen bestanden
-
- )}
+
+ 3-Phasen-Analyse: Cookies und Scripts vor/nach Interaktion
+
- {total > 0 && (
-
-
-
- {Math.round(passes.length / total * 100)}%
-
-
- )}
-
-
-
- {/* 3-Phase Summary */}
- {result.phases && (
-
- {[
- { label: 'Vor Consent', data: result.phases.before_consent, icon: 'đź”’' },
- { label: 'Nach Ablehnen', data: result.phases.after_reject, icon: 'đźš«' },
- { label: 'Nach Akzeptieren', data: result.phases.after_accept, icon: 'âś…' },
- ].map(phase => (
-
-
{phase.icon}
-
{phase.label}
-
- {phase.data.cookies} Cookies, {phase.data.scripts} Scripts
-
- {phase.data.violations.length > 0 && (
-
- {phase.data.violations.length} Verstoesse
-
- )}
-
- ))}
-
- )}
-
- {/* Violations */}
- {violations.length > 0 && (
-
-
- Verstoesse ({violations.length})
-
-
- {violations.map((v, i) => (
-
-
-
-
-
-
{v.text}
-
{v.code} | {v.severity}
-
-
- ))}
+
)}
- {/* Passes */}
- {passes.length > 0 && (
-
-
- Bestanden ({passes.length})
-
-
- {passes.map((p, i) => (
-
- ))}
-
+ {/* Structured L1/L2 Checklist */}
+ {hasStructured && (
+
+
)}
- {!result.banner_detected && violations.length === 0 && passes.length === 0 && (
-
- Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach §25 TDDDG Pflicht.
+ {!result.banner_detected && !hasStructured && (
+
+
+ Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach &sect;25 TDDDG Pflicht.
+
)}
@@ -230,3 +211,22 @@ export function BannerCheckTab() {
)
}
+
+function PhaseBox({ label, icon, cookies, scripts, violations }: {
+ label: string; icon: string; cookies: number; scripts: number; violations: number
+}) {
+ return (
+
+
{icon}
+
{label}
+
+ {cookies} Cookies, {scripts} Scripts
+
+ {violations > 0 && (
+
+ {violations} Verstoesse
+
+ )}
+
+ )
+}
diff --git a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx
index 9b74b54..4c99f87 100644
--- a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx
+++ b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx
@@ -30,6 +30,7 @@ const DOC_TYPE_LABELS: Record
= {
dse: 'DSI', agb: 'AGB', impressum: 'Impressum',
cookie: 'Cookie', widerruf: 'Widerruf', other: 'Sonstiges',
social_media: 'Social Media', dsfa: 'DSFA', joint_controller: 'Art. 26',
+ eu_institution: 'EU-Inst.', banner: 'Banner',
}
interface GroupedCheck {
diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py
index c66a612..9d79188 100644
--- a/backend-compliance/compliance/api/agent_doc_check_routes.py
+++ b/backend-compliance/compliance/api/agent_doc_check_routes.py
@@ -329,6 +329,7 @@ SECTION_TYPE_MAP = [
(r"datenschutzfolge|dsfa|risikoanalyse", "dsfa"),
(r"^social\s*media$|^soziale\s+(?:medien|netzwerke)$", "social_media"),
(r"datenschutzerkl(?:ae|ä)rung.*social|datenschutz\s+f(?:ue|ü)r\s+social", "social_media"),
+ (r"(?:verordnung|regulation)\s*\(?eu\)?\s*2018\s*/?\s*1725", "eu_institution"),
]
diff --git a/backend-compliance/compliance/services/doc_checks/__init__.py b/backend-compliance/compliance/services/doc_checks/__init__.py
index 5d69a0b..8710c7a 100644
--- a/backend-compliance/compliance/services/doc_checks/__init__.py
+++ b/backend-compliance/compliance/services/doc_checks/__init__.py
@@ -2,7 +2,7 @@
doc_checks — Legal document compliance checkers.
Provides checklists and functions for verifying legal documents
-(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA)
+(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA, EU Institution)
against their mandatory content requirements.
Two check levels:
@@ -18,6 +18,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST
from .cookie_checks import COOKIE_CHECKLIST
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
from .dsfa_checks import DSFA_CHECKLIST
+from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
__all__ = [
"check_document_completeness",
@@ -29,4 +30,5 @@ __all__ = [
"COOKIE_CHECKLIST",
"JOINT_CONTROLLER_CHECKLIST",
"DSFA_CHECKLIST",
+ "EU_INSTITUTION_CHECKLIST",
]
diff --git a/backend-compliance/compliance/services/doc_checks/dse_checks.py b/backend-compliance/compliance/services/doc_checks/dse_checks.py
index 99b3fc7..f5345c7 100644
--- a/backend-compliance/compliance/services/doc_checks/dse_checks.py
+++ b/backend-compliance/compliance/services/doc_checks/dse_checks.py
@@ -47,8 +47,9 @@ ART13_CHECKLIST = [
"label": "Telefonnummer des Verantwortlichen",
"level": 2, "parent": "controller",
"patterns": [
- r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-]{6,}",
- r"\+49\s*[\d\s/\-]{8,}",
+ r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-\(\)]{6,}",
+ r"\+49\s*[\d\s/\-\(\)]{8,}",
+ r"0\d{2,4}\s*[\(/\-\s]\s*\d{3,}",
],
"severity": "MEDIUM",
"hint": "EuGH (C-298/17, 'Verein fuer Konsumenteninformation') verlangt effektive Kontaktmoeglichkeit. Telefon ist nicht zwingend, aber empfohlen — fehlt sie, muss ein gleichwertiger Kanal (z.B. Chat, Rueckruf) angeboten werden.",
@@ -345,7 +346,7 @@ ART13_CHECKLIST = [
"id": "rights_art21",
"label": "Widerspruchsrecht (Art. 21)",
"level": 2, "parent": "rights",
- "patterns": [r"art\.\s*21", r"widerspruchsrecht", r"right\s+to\s+object"],
+ "patterns": [r"art\.\s*21", r"widerspruchsrecht", r"recht\s+auf\s+widerspruch", r"§\s*23\s+kdg", r"right\s+to\s+object"],
"severity": "LOW",
"hint": "Art. 21(4) DSGVO: Der Widerspruchshinweis muss spaetestens zum Zeitpunkt der ersten Kommunikation GESONDERT und in klarer Sprache erfolgen. Haeufiger Fehler: Widerspruchsrecht nur im Fliesstext versteckt — eigener Abschnitt/Hervorhebung noetig.",
},
@@ -386,6 +387,9 @@ ART13_CHECKLIST = [
r"l(?:an)?fdi\s+\w+",
r"bfdi",
r"(?:bayerische|hessische|s(?:ae|ä)chsische|berliner)\s+(?:datenschutz|aufsicht)",
+ r"(?:katholisch|evangelisch|kirchlich)\w*\s+datenschutz",
+ r"datenschutzzentrum",
+ r"kd(?:oe|ö)r",
],
"severity": "LOW",
"hint": "Vollstaendigen Namen, Adresse und Website der Aufsichtsbehoerde angeben. Haeufiger Fehler: 'die zustaendige Aufsichtsbehoerde' ohne Konkretisierung. Korrekt z.B.: 'LfDI BW, Koenigstrasse 10a, 70173 Stuttgart, www.baden-wuerttemberg.datenschutz.de'.",
diff --git a/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py b/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py
new file mode 100644
index 0000000..ed37d2f
--- /dev/null
+++ b/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py
@@ -0,0 +1,500 @@
+"""
+EU Institution checks — Verordnung (EU) 2018/1725.
+Applies to EU institutions, bodies, offices and agencies instead of DSGVO.
+Key differences: Art. 15 (not 13), Art. 5 (not 6), EDSB (not national DPAs).
+L1: Pflichtangabe erwaehnt? L2: Pflichtangabe korrekt/vollstaendig?
+"""
+
+EU_INSTITUTION_CHECKLIST = [  # entry keys: id, label, level (1 = mentioned?, 2 = correct/complete?), parent, patterns, severity, hint
+    # == L1: Controller == (NOTE(review): all patterns are lowercase; presumably matched case-insensitively by the runner — confirm)
+    {
+        "id": "eu_controller",
+        "label": "Verantwortlicher (Art. 15(1)(a) VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"verantwortlich\w*\s+(?:ist|im sinne|fuer|f(?:ue|ü)r)",
+            r"kontaktdaten\s+des\s+verantwortlichen",
+            r"name\s+(?:und|&)\s+kontaktdaten\s+des",
+            r"controller", r"verantwortliche\s+stelle",
+            r"responsible\s+(?:party|for)",
+            r"data\s+controller",
+            r"identity\s+(?:of\s+)?(?:the\s+)?controller",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 15(1)(a) VO 2018/1725 verlangt die Identitaet des Verantwortlichen. "
+            "Bei EU-Organen: Vollstaendiger Name der Institution (z.B. 'Europaeische Kommission, "
+            "GD DIGIT'), Dienstadresse und funktionale E-Mail-Adresse. "
+            "Haeufiger Fehler: Nur Abkuerzung ohne vollstaendigen Institutionsnamen."
+        ),
+    },
+    {
+        "id": "eu_controller_address",
+        "label": "Dienstadresse des Verantwortlichen",
+        "level": 2, "parent": "eu_controller",
+        "patterns": [
+            r"(?:\brue|\bavenue|\bboulevard|strasse|stra(?:ss|ß)e)\s+\w+",
+            r"\d{4,5}\s+(?:bruxelles|brussels|br(?:ue|ü)ssel|luxembourg|luxemburg|strasbourg|stra(?:ss|ß)burg)",
+            r"\bb[\-\s]?\d{4}\s+\w+",  # \b: must not match the tail of e.g. "feb 2024 ..."
+            r"\bl[\-\s]?\d{4}\s+\w+",  # \b: must not match the tail of e.g. "april 2024 ..."
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Angabe der Dienstadresse der EU-Institution (typisch: Bruessel, Luxemburg "
+            "oder Strassburg). Format z.B. 'Rue de la Loi 200, B-1049 Bruxelles'. "
+            "Haeufiger Fehler: Nur Postfach ohne physische Adresse."
+        ),
+    },
+    {
+        "id": "eu_controller_email",
+        "label": "E-Mail-Adresse des Verantwortlichen",
+        "level": 2, "parent": "eu_controller",
+        "patterns": [
+            r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.europa\.eu",
+            r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Eine funktionale E-Mail-Adresse ist Pflicht (Art. 15(1)(a) VO 2018/1725). "
+            "Bei EU-Organen typischerweise @ec.europa.eu, @europarl.europa.eu o.ae. "
+            "Ein reines Kontaktformular genuegt nicht als unmittelbarer Kommunikationskanal."
+        ),
+    },
+    # == L1: Data protection officer (DPO) =================================
+    {
+        "id": "eu_dpo",
+        "label": "Datenschutzbeauftragter (Art. 15(1)(b) / Art. 43 VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"datenschutzbeauftragt",
+            r"data\s+protection\s+officer",
+            r"kontaktdaten\s+de[rs]\s+datenschutz",
+            r"\bdpo\b",  # word-bounded so that "endpoint" does not count as a DPO mention
+            r"d(?:e|é)l(?:e|é)gu(?:e|é)\s+(?:a|à)\s+la\s+protection\s+des\s+donn(?:e|é)es",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 43-44 VO 2018/1725: Jedes EU-Organ MUSS einen DSB (DPO) benennen. "
+            "Dies ist — anders als unter der DSGVO — keine Frage der Mitarbeiterzahl, "
+            "sondern absolute Pflicht fuer alle EU-Organe. Die Kontaktdaten muessen in "
+            "jeder Datenschutzerklaerung angegeben werden (Art. 15(1)(b))."
+        ),
+    },
+    {
+        "id": "eu_dpo_contact",
+        "label": "DPO-Kontaktdaten (E-Mail oder Adresse)",
+        "level": 2, "parent": "eu_dpo",
+        "patterns": [
+            r"(?:data\s+protection\s+officer|\bdpo\b|datenschutzbeauftragt)[\s\S]{0,300}[a-z0-9._%+\-]+@",
+            r"\bdpo\b[\s\S]{0,100}@",
+            r"data[\-\.]?protection@",
+            r"dpo@\w+\.europa\.eu",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Art. 44(7) VO 2018/1725: Die Kontaktdaten des DPO muessen veroeffentlicht werden. "
+            "Mindestens eine funktionale E-Mail-Adresse angeben (z.B. DATA-PROTECTION-OFFICER@ec.europa.eu). "
+            "Den Namen des DPO muessen Sie nicht nennen."
+        ),
+    },
+
+    {
+        "id": "eu_dpo_function",
+        "label": "DPO-Funktion / -Rolle beschrieben",
+        "level": 2, "parent": "eu_dpo",
+        "patterns": [
+            r"(?:aufgaben|role|function|zustaendig).*(?:\bdpo\b|datenschutzbeauftragt|data\s+protection\s+officer)",
+            r"(?:\bdpo\b|datenschutzbeauftragt|data\s+protection\s+officer).*(?:aufgaben|role|function|zustaendig)",
+            r"art(?:icle)?\s*4[45]\s+(?:vo|regulation|verordnung)",  # Art. 45 = DPO tasks; 44 still accepted for backward compatibility
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 45 VO 2018/1725 beschreibt die Aufgaben des DPO bei EU-Organen: "
+            "Beratung, Ueberwachung, Zusammenarbeit mit dem EDSB. "
+            "Es empfiehlt sich, kurz die Rolle des DPO zu erlaeutern, damit "
+            "Betroffene wissen, wofuer der DPO zustaendig ist."
+        ),
+    },
+
+    # == L1: Purposes of processing ========================================
+    {
+        "id": "eu_purposes",
+        "label": "Zwecke der Verarbeitung (Art. 15(1)(c) VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"zweck\w*\s+(?:der|und|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)",
+            r"purpose\w*\s+(?:of|for)\s+(?:the\s+)?(?:processing|data)",
+            r"zu\s+welch\w+\s+zweck",
+            r"(?:data|personal\s+data)\s+(?:is|are)\s+(?:collected|processed)\s+(?:for|to)",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 15(1)(c) VO 2018/1725 verlangt konkrete Zweckangaben. "
+            "EU-Organe muessen jeden Verarbeitungszweck einzeln auffuehren: z.B. "
+            "'Verwaltung von Bewerbungen', 'Zugangsmanagement zum Gebaeude', "
+            "'Webanalyse der Internetseite'. Pauschalformulierungen sind unzulaessig."
+        ),
+    },
+    {
+        "id": "eu_purposes_specific",
+        "label": "Konkrete Verarbeitungszwecke benannt",
+        "level": 2, "parent": "eu_purposes",
+        "patterns": [
+            r"(?:recruitment|selection|verwaltung|management|administration|monitoring|evaluation)",
+            r"(?:human\s+resources|\bhr\b|personal|bewerbung|grant|procurement|vergabe)",  # bare "hr" also matched "ihre"/"jahre"
+            r"(?:access|zugang|building|gebaeude|website|webseite|intranet)",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Mindestens 2 konkrete Zwecke benennen, jeweils mit zugehoeriger "
+            "Rechtsgrundlage. Typische EU-Organ-Zwecke: Personalverwaltung, "
+            "Gebaeudezugang, IT-Sicherheitsmonitoring, Vergabeverfahren, "
+            "Evaluierung von Foerderprogrammen. Pauschalformulierungen genuegen "
+            "nicht dem Bestimmtheitsgrundsatz."
+        ),
+    },
+    # == L1: Legal basis (Art. 5 instead of Art. 6 GDPR) ===================
+    {
+        "id": "eu_legal_basis",
+        "label": "Rechtsgrundlage (Art. 5 VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"rechtsgrundlage",
+            r"art\.\s*5\s*(?:abs|absatz)?\s*\.?\s*1",
+            r"legal\s+basis",
+            r"lawfulness\s+of\s+processing",
+            r"art(?:icle)?\s*5\s*(?:\(1\))?\s*(?:\([a-e]\))?",
+            r"auf\s+grundlage\s+(?:von|des|der)\s+art",
+            r"regulation\s*\(eu\)\s*2018\s*/?\s*1725",
+            r"verordnung\s*\(eu\)\s*2018\s*/?\s*1725",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 5(1) VO 2018/1725 enthaelt die Rechtsgrundlagen fuer EU-Organe: "
+            "(a) oeffentliches Interesse/Ausuebung oeffentlicher Gewalt, (b) rechtliche "
+            "Verpflichtung, (c) Vertrag, (d) Einwilligung, (e) lebenswichtige Interessen. "
+            "WICHTIG: Art. 5(1)(a) ist der haeufigste Tatbestand bei EU-Organen — "
+            "er entspricht etwa Art. 6(1)(e) DSGVO. Art. 6(1)(f) DSGVO "
+            "(berechtigtes Interesse) existiert in der VO 2018/1725 NICHT."
+        ),
+    },
+    {
+        "id": "eu_legal_basis_public_interest",
+        "label": "Art. 5(1)(a) — Oeffentliches Interesse / oeffentliche Gewalt",
+        "level": 2, "parent": "eu_legal_basis",
+        "patterns": [
+            r"\bart(?:\.|icle|ikel)?\s*5\s*(?:\(1\)|abs(?:\.|atz)?\s*1)?\s*(?:\(?a\)|(?:lit|let(?:ter)?|buchst(?:abe)?)\.?\s*\(?a\)?(?![a-z]))",  # letter must be delimited: "art. 5 abs. 1 lit. d" must not pass as (a)
+            r"(?:oeffentlich|öffentlich).*(?:interesse|gewalt|aufgabe)",
+            r"public\s+interest",
+            r"(?:exercise|performance)\s+of\s+(?:official|public)\s+(?:authority|task)",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 5(1)(a) VO 2018/1725 ist die Hauptrechtsgrundlage fuer EU-Organe. "
+            "Verlangt einen konkreten Rechtsakt als Grundlage (z.B. Verordnung, "
+            "Beschluss, Basisrechtsakt der Institution). Benennen Sie den spezifischen "
+            "Rechtsakt, nicht nur pauschal 'oeffentliches Interesse'."
+        ),
+    },
+    {
+        "id": "eu_legal_basis_consent",
+        "label": "Art. 5(1)(d) — Einwilligung",
+        "level": 2, "parent": "eu_legal_basis",
+        "patterns": [
+            r"\bart(?:\.|icle|ikel)?\s*5\s*(?:\(1\)|abs(?:\.|atz)?\s*1)?\s*(?:\(?d\)|(?:lit|let(?:ter)?|buchst(?:abe)?)\.?\s*\(?d\)?(?![a-z]))",  # letter must be delimited: "art. 5 dsgvo" must not pass as (d)
+            r"einwilligung\s+(?:gem|nach|i\.?\s*s\.?\s*d\.?)",
+            r"consent\s+(?:of|given\s+by)\s+the\s+data\s+subject",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Bei Einwilligung (Art. 5(1)(d) VO 2018/1725) muss auf das jederzeitige "
+            "Widerrufsrecht hingewiesen werden (Art. 7(3) VO 2018/1725). "
+            "Achtung: EU-Organe sollten Einwilligung nur als Rechtsgrundlage waehlen, "
+            "wenn keine andere Grundlage greift — wegen des Machtungleichgewichts "
+            "zwischen Institution und Einzelperson (EDSB-Leitlinien)."
+        ),
+    },
+
+    # == L1: Recipients ====================================================
+    {
+        "id": "eu_recipients",
+        "label": "Empfaenger (Art. 15(1)(d) VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"empf(?:ae|ä)nger",
+            r"(?:ueber|über|weiter)mitt(?:el|l)ung",
+            r"recipient",
+            r"weitergabe\s+(?:an|von)\s+daten",
+            r"data\s+(?:will\s+be|are|is)\s+(?:shared|disclosed|transferred|transmitted)\s+to",
+            r"auftragsverarbeit",
+            r"processor",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Art. 15(1)(d) VO 2018/1725: Empfaenger oder Empfaengerkategorien benennen. "
+            "Typisch bei EU-Organen: andere EU-Institutionen (z.B. OLAF, Rechnungshof), "
+            "Mitgliedstaaten-Behoerden, IT-Dienstleister. Auftragsverarbeiter muessen "
+            "nach Art. 29 VO 2018/1725 vertraglich gebunden sein."
+        ),
+    },
+    {
+        "id": "eu_recipients_processor",
+        "label": "Auftragsverarbeiter / Processor (Art. 29 VO 2018/1725)",
+        "level": 2, "parent": "eu_recipients",
+        "patterns": [
+            r"auftragsverarbeit(?:er|ung)",
+            r"art\.\s*29\s+(?:vo|verordnung|regulation)",
+            r"art(?:icle)?\s*29",
+            r"processor",
+            r"sub[\-\s]?processor",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 29 VO 2018/1725 (entspricht Art. 28 DSGVO): "
+            "Auftragsverarbeiter muessen vertraglich gebunden werden. "
+            "Erwaehnen Sie, dass ein Auftragsverarbeitungsvertrag besteht. "
+            "Bei Cloud-Diensten (z.B. Microsoft 365, AWS): Vertrag muss "
+            "die Vorgaben von Art. 29(3) VO 2018/1725 einhalten."
+        ),
+    },
+
+    # == L1: Third-country transfers =======================================
+    {
+        "id": "eu_third_country",
+        "label": "Drittlandtransfer (Art. 46-50 VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"drittland",
+            r"dritt\s*staat",
+            r"third\s+countr",
+            r"angemessenheitsbeschluss",
+            r"adequacy\s+decision",
+            r"standard\s*(?:vertragsklausel|contractual\s+clause)",
+            r"(?:transfer|uebermittlung|übermittlung).*(?:ausserhalb|außerhalb|outside)",
+            r"(?:europ(?:ae|ä)ischen\s+wirtschaftsraum|ewr|eea)",
+            r"art(?:icle)?\s*4[6-9]",
+            r"art\.\s*50",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Art. 46-50 VO 2018/1725 (entspricht Art. 44-49 DSGVO): "
+            "Drittlandtransfers erfordern Angemessenheitsbeschluss (Art. 47), "
+            "geeignete Garantien (Art. 48) oder Ausnahmen (Art. 50). "
+            "EDSB-Empfehlung: EU-Organe muessen besonders streng pruefen, "
+            "da sie eine Vorbildfunktion fuer die Mitgliedstaaten haben."
+        ),
+    },
+    {
+        "id": "eu_third_country_mechanism",
+        "label": "Transfermechanismus benannt (Art. 47-48 VO 2018/1725)",
+        "level": 2, "parent": "eu_third_country",
+        "patterns": [
+            r"standard\s*vertragsklausel|scc|standard\s+contractual",
+            r"angemessenheitsbeschluss|adequacy\s+decision",
+            r"art(?:icle)?\s*4[7-8]",
+            r"data\s+privacy\s+framework|dpf",
+            r"appropriate\s+safeguards",
+            r"geeignete\s+garantien",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Art. 48 VO 2018/1725: Bei fehlender Angemessenheit koennen "
+            "geeignete Garantien (z.B. SCC, verbindliche Verwaltungsvereinbarungen) "
+            "den Transfer absichern. Der EDSB hat 2020 eigene Leitlinien zu "
+            "Drittlandtransfers fuer EU-Organe veroeffentlicht."
+        ),
+    },
+
+    # == L1: Retention period ==============================================
+    {
+        "id": "eu_retention",
+        "label": "Speicherdauer (Art. 15(2)(a) VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"speicherdauer",
+            r"aufbewahrungsfrist",
+            r"retention\s+period",
+            r"(?:how\s+long|storage\s+period|data\s+retention)",
+            r"l(?:oe|ö)sch(?:ung|frist)",
+            r"daten\s+werden\s+gel(?:oe|ö)scht",
+            r"(?:\d+\s+(?:tage|monate|jahre|days|months|years))",
+            r"dauer\s+der\s+speicherung",
+            r"data\s+will\s+be\s+(?:kept|stored|retained)\s+(?:for|until|during)",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 15(2)(a) VO 2018/1725 verlangt die Speicherdauer oder "
+            "Kriterien zu deren Festlegung. EU-Organe haben oft interne "
+            "Aufbewahrungsrichtlinien (retention schedules). Nennen Sie die "
+            "konkreten Fristen oder verweisen Sie auf die interne Richtlinie "
+            "mit Dokumentenreferenz."
+        ),
+    },
+    {
+        "id": "eu_retention_periods",
+        "label": "Konkrete Zeitangaben",
+        "level": 2, "parent": "eu_retention",
+        "patterns": [
+            r"\d+\s+(?:tage?|monate?|jahre?|days?|months?|years?)",
+            r"(?:after|nach)\s+(?:the\s+)?(?:end|closure|completion|ablauf|beendigung)",
+            r"retention\s+(?:schedule|policy|period)\s+(?:of|for)\s+\d+",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Konkrete Fristen pro Datenkategorie nennen. EU-Organe folgen "
+            "typischerweise der Common Retention List (CRL) der Kommission. "
+            "Beispiel: Bewerbungsdaten 2 Jahre, Finanzunterlagen 7 Jahre, "
+            "Gebaeudezugangslogs 6 Monate."
+        ),
+    },
+
+    # == L1: Data subject rights (Art. 17-24 instead of Art. 15-22 GDPR) ===
+    {
+        "id": "eu_rights",
+        "label": "Betroffenenrechte (Art. 17-24 VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"recht\s+auf\s+auskunft",
+            r"recht\s+auf\s+l(?:oe|ö)schung",
+            r"recht\s+auf\s+berichtigung",
+            r"widerspruchsrecht",
+            r"right\s+to\s+(?:access|erasure|rectification|object|restrict)",
+            r"betroffenenrecht",
+            r"rechte\s+(?:des|der)\s+betroffenen",
+            r"(?:your|data\s+subject)\s+rights",
+            r"art(?:icle)?\s*(?:17|18|19|20|21|22|23|24)\s+(?:vo|regulation|verordnung)",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 15(2)(b) VO 2018/1725 verlangt Nennung der Betroffenenrechte: "
+            "Auskunft (Art. 17), Berichtigung (Art. 18), Loeschung (Art. 19), "
+            "Einschraenkung (Art. 20), Datenportabilitaet (Art. 22), "
+            "Widerspruch (Art. 23). Achtung: Die Artikelnummern unterscheiden sich "
+            "von der DSGVO (Art. 15-22)! Haeufiger Fehler: DSGVO-Artikel "
+            "statt VO 2018/1725 Artikel zitieren."
+        ),
+    },
+    {
+        "id": "eu_rights_access",
+        "label": "Recht auf Auskunft (Art. 17 VO 2018/1725)",
+        "level": 2, "parent": "eu_rights",
+        "patterns": [
+            r"art(?:icle)?\s*17\s+(?:vo|regulation|verordnung)",
+            r"art\.\s*17",
+            r"recht\s+auf\s+(?:\w+\s+)?auskunft",
+            r"right\s+(?:of|to)\s+access",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 17 VO 2018/1725 (entspricht Art. 15 DSGVO): Betroffene koennen "
+            "Auskunft und eine Kopie ihrer Daten verlangen. Antwortfrist: 1 Monat "
+            "(Art. 14(3) VO 2018/1725). Anfragen gehen typischerweise an den DPO "
+            "der Institution."
+        ),
+    },
+    {
+        "id": "eu_rights_erasure",
+        "label": "Recht auf Loeschung (Art. 19 VO 2018/1725)",
+        "level": 2, "parent": "eu_rights",
+        "patterns": [
+            r"art(?:icle)?\s*19\s+(?:vo|regulation|verordnung)",
+            r"art\.\s*19",
+            r"recht\s+auf\s+(?:\w+\s+)?l(?:oe|ö)schung",
+            r"right\s+to\s+erasure",
+            r"right\s+to\s+be\s+forgotten",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 19 VO 2018/1725 (entspricht Art. 17 DSGVO): "
+            "Recht auf Loeschung bei Zweckentfall, Widerruf der Einwilligung "
+            "oder unrechtmaessiger Verarbeitung. Erwaehnen Sie auch die "
+            "Ausnahmen fuer EU-Organe: Archivzwecke im oeffentlichen Interesse, "
+            "gesetzliche Aufbewahrungspflichten."
+        ),
+    },
+
+    {
+        "id": "eu_rights_restriction",
+        "label": "Recht auf Einschraenkung (Art. 20 VO 2018/1725)",
+        "level": 2, "parent": "eu_rights",
+        "patterns": [
+            r"art(?:icle)?\s*20\s+(?:vo|regulation|verordnung)",
+            r"art\.\s*20",
+            r"einschr(?:ae|ä)nkung\s+der\s+verarbeitung",
+            r"right\s+to\s+restrict(?:ion)?",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 20 VO 2018/1725 (entspricht Art. 18 DSGVO): "
+            "Recht auf Einschraenkung der Verarbeitung bei bestrittener "
+            "Richtigkeit, unrechtmaessiger Verarbeitung oder laufendem "
+            "Widerspruch. Wird am haeufigsten vergessen."
+        ),
+    },
+    {
+        "id": "eu_rights_automated",
+        "label": "Automatisierte Entscheidungen (Art. 24 VO 2018/1725)",
+        "level": 2, "parent": "eu_rights",
+        "patterns": [
+            r"art(?:icle)?\s*24\s+(?:vo|regulation|verordnung)",
+            r"art\.\s*24",
+            r"automatisierte\s+entscheidung",
+            r"automated\s+(?:decision|individual)",
+            r"profiling",
+        ],
+        "severity": "LOW",
+        "hint": (
+            "Art. 24 VO 2018/1725 (entspricht Art. 22 DSGVO): "
+            "Bei automatisierten Einzelentscheidungen muessen Logik, "
+            "Tragweite und Auswirkungen erklaert werden. Falls kein "
+            "Profiling stattfindet, explizit verneinen."
+        ),
+    },
+
+    # == L1: Right to lodge a complaint with the EDPS ======================
+    {
+        "id": "eu_complaint",
+        "label": "Beschwerderecht beim EDSB (Art. 15(2)(d) VO 2018/1725)",
+        "level": 1, "parent": None,
+        "patterns": [
+            r"beschwerderecht",
+            r"right\s+to\s+lodge\s+a\s+complaint",
+            r"beschwerde.*(?:edsb|edps)",
+            r"edsb",
+            r"edps",
+            r"europ(?:ae|ä)isch\w*\s+datenschutzbeauftragt",
+            r"european\s+data\s+protection\s+supervisor",
+            r"contr(?:o|ô)leur\s+europ(?:e|é)en",
+            r"art(?:icle)?\s*63",
+        ],
+        "severity": "HIGH",
+        "hint": (
+            "Art. 15(2)(d) VO 2018/1725: Bei EU-Organen ist der EDSB "
+            "(Europaeischer Datenschutzbeauftragter / European Data Protection "
+            "Supervisor) die zustaendige Aufsichtsbehoerde — NICHT die nationalen "
+            "Datenschutzbehoerden. Kontakt: edps@edps.europa.eu, "
+            "Rue Wiertz 60, B-1047 Bruxelles. Haeufiger Fehler: Verweis auf "
+            "nationale Aufsichtsbehoerde statt EDSB."
+        ),
+    },
+    {
+        "id": "eu_complaint_edps_contact",
+        "label": "EDSB-Kontaktdaten angegeben",
+        "level": 2, "parent": "eu_complaint",
+        "patterns": [
+            r"edps@edps\.europa\.eu",
+            r"edps\.europa\.eu",
+            r"edsb.*(?:kontakt|anschrift|adresse|e[\-\s]?mail|wiertz)",
+            r"edps.*(?:contact|address|e[\-\s]?mail|wiertz)",
+            r"rue\s+wiertz",
+        ],
+        "severity": "MEDIUM",
+        "hint": (
+            "Vollstaendige EDSB-Kontaktdaten angeben: "
+            "Europaeischer Datenschutzbeauftragter (EDSB), "
+            "Rue Wiertz 60, B-1047 Bruxelles/Bruessel, "
+            "edps@edps.europa.eu, https://edps.europa.eu. "
+            "Haeufiger Fehler: Nur 'EDSB' erwaehnt ohne Kontaktdaten."
+        ),
+    },
+]
diff --git a/backend-compliance/compliance/services/doc_checks/runner.py b/backend-compliance/compliance/services/doc_checks/runner.py
index 70e1bd2..96ce71a 100644
--- a/backend-compliance/compliance/services/doc_checks/runner.py
+++ b/backend-compliance/compliance/services/doc_checks/runner.py
@@ -15,6 +15,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST
from .cookie_checks import COOKIE_CHECKLIST
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
from .dsfa_checks import DSFA_CHECKLIST
+from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
logger = logging.getLogger(__name__)
@@ -35,6 +36,7 @@ _CHECKLIST_MAP = {
"social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
"joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
"dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"),
+ "eu_institution": (EU_INSTITUTION_CHECKLIST, "VO (EU) 2018/1725"),
}
@@ -218,6 +220,11 @@ def classify_document_type(title: str, url: str) -> str:
if any(kw in combined for kw in ["social media", "facebook", "instagram", "linkedin", "fanpage"]):
if any(kw in combined for kw in ["datenschutzerkl", "datenschutz fĂĽr", "datenschutzinformation"]):
return "social_media"
+ # EU institution check BEFORE generic privacy — 2018/1725 is more specific
+ if any(kw in combined for kw in ["2018/1725", "2018 1725", "regulation (eu)",
+ "verordnung (eu)", "edsb", "edps",
+ "european data protection supervisor"]):
+ return "eu_institution"
if any(kw in combined for kw in ["datenschutz", "privacy", "dsgvo", "data protection", "données"]):
return "dse"
if any(kw in combined for kw in ["widerruf", "withdrawal", "rétractation", "desistimiento"]):
diff --git a/consent-tester/checks/banner_runner.py b/consent-tester/checks/banner_runner.py
new file mode 100644
index 0000000..fe0ef67
--- /dev/null
+++ b/consent-tester/checks/banner_runner.py
@@ -0,0 +1,175 @@
+"""
+Banner Runner — maps scan results to the L1/L2 check hierarchy.
+
+Takes the raw ScanResponse dict and produces a structured_checks list
+compatible with ChecklistView (same format as document checks).
+"""
+
+from checks.banner_checks import BANNER_CHECKLIST
+
+
+def map_scan_to_checks(scan_result: dict) -> dict:
+    """Map a /scan response onto the L1/L2 banner check hierarchy.
+
+    Each entry of BANNER_CHECKLIST is resolved in this order:
+      1. its check_key is a collected violation code  -> failed
+      2. its check_key is an explicit pass signal      -> passed
+      3. otherwise the check inherits ``banner_detected`` (a scan that saw
+         a banner and reported nothing counts as "no finding"; without a
+         banner every unmatched check fails).
+
+    A level-2 check is marked ``skipped`` (and reported as not passed) when
+    its level-1 parent, which must appear earlier in the checklist, failed.
+
+    Args:
+        scan_result: raw scan dict with ``banner_detected``,
+            ``banner_provider``, ``banner_checks`` and ``phases``.
+
+    Returns:
+        dict with:
+        - structured_checks: list of check item dicts (ChecklistView format)
+        - completeness_pct: L1 pass rate (0-100), 0 when there are no L1 checks
+        - correctness_pct: pass rate of non-skipped L2 checks (0-100),
+          0 when no L2 check is active
+    """
+    violation_codes = _collect_violation_codes(scan_result)
+    pass_codes = _collect_pass_codes(scan_result)
+    # Coerce once so "passed" is always a real bool in the JSON payload.
+    banner_detected = bool(scan_result.get("banner_detected", False))
+
+    checks: list[dict] = []
+    # First item seen per id; replaces a linear rescan of `checks` per L2 item.
+    first_by_id: dict[str, dict] = {}
+
+    for defn in BANNER_CHECKLIST:
+        key = defn["check_key"]
+        level = defn["level"]
+        parent = defn.get("parent")
+
+        if key in violation_codes:
+            # NOTE(review): the violation text collected for this key is not
+            # surfaced; matched_text is only populated for passed checks.
+            # Confirm whether failed items should show it.
+            passed, matched_text = False, ""
+        elif key in pass_codes:
+            passed, matched_text = True, pass_codes[key]
+        else:
+            # NOTE(review): checks backed by an explicit pass signal also land
+            # here when the signal is absent and then pass by default once a
+            # banner was detected. Confirm this is intended.
+            passed, matched_text = banner_detected, ""
+
+        parent_item = first_by_id.get(parent) if level == 2 and parent else None
+        skipped = parent_item is not None and not parent_item["passed"]
+
+        item = {
+            "id": defn["id"],
+            "label": defn["label"],
+            "passed": passed and not skipped,
+            "severity": defn["severity"],
+            "level": level,
+            "parent": parent,
+            "skipped": skipped,
+            "hint": defn.get("hint", ""),
+            "matched_text": matched_text,
+        }
+        checks.append(item)
+        first_by_id.setdefault(item["id"], item)
+
+    def _pass_rate(items: list[dict]) -> int:
+        """Percentage of passed items, 0 for an empty list."""
+        if not items:
+            return 0
+        return round(sum(1 for c in items if c["passed"]) / len(items) * 100)
+
+    l1_checks = [c for c in checks if c["level"] == 1]
+    l2_active = [c for c in checks if c["level"] == 2 and not c["skipped"]]
+
+    return {
+        "structured_checks": checks,
+        "completeness_pct": _pass_rate(l1_checks),
+        "correctness_pct": _pass_rate(l2_active),
+    }
+
+
+def _collect_violation_codes(scan: dict) -> dict[str, str]:
+    """Collect check_key -> short evidence text for every failed check.
+
+    Sources, in order (a later source overwrites an earlier one for the
+    same code):
+      1. banner text violations (``banner_checks.violations``)
+      2. phase A violations (``phases.before_consent.violations``)
+      3. phase B violations (``phases.after_reject.violations``)
+
+    Three codes are then derived from raw phase data, but only when no
+    explicit violation already used that code:
+      - tracking_before_consent: tracking services seen before consent
+      - cookies_before_consent: known tracking cookies set before consent
+      - tracking_after_reject: new tracking seen after rejecting
+
+    Violation texts are cut to 120 characters; derived entries list at most
+    five names. Missing or ``None`` sections and texts are treated as empty.
+    """
+    codes: dict[str, str] = {}
+
+    # `or {}` also covers keys that are present but explicitly None.
+    banner_checks = scan.get("banner_checks") or {}
+    phases = scan.get("phases") or {}
+    phase_a = phases.get("before_consent") or {}
+    phase_b = phases.get("after_reject") or {}
+
+    for source in (banner_checks, phase_a, phase_b):
+        for violation in source.get("violations") or []:
+            code = violation.get("code", "")
+            if code:
+                # A violation may carry text=None; slicing None would raise.
+                codes[code] = (violation.get("text") or "")[:120]
+
+    tracking_cookies = [
+        c for c in (phase_a.get("cookies") or []) if _is_tracking_cookie(c)
+    ]
+    derived = (
+        ("tracking_before_consent", phase_a.get("tracking_services") or []),
+        ("cookies_before_consent", tracking_cookies),
+        ("tracking_after_reject", phase_b.get("new_tracking") or []),
+    )
+    for code, names in derived:
+        if names and code not in codes:
+            codes[code] = ", ".join(names[:5])
+
+    return codes
+
+
+def _collect_pass_codes(scan: dict) -> dict[str, str]:
+    """Collect explicit pass signals (check_key -> evidence text) from a scan.
+
+    Signals read:
+      - banner_detected: set when a banner was detected; evidence is the
+        provider name, or "detected" when no provider is known
+      - banner_provider_named: set when a provider name is present
+      - impressum_link / dse_link: set from the ``has_impressum_link`` /
+        ``has_dse_link`` flags in ``banner_checks``
+    """
+    passes: dict[str, str] = {}
+    provider = scan.get("banner_provider") or ""
+
+    if scan.get("banner_detected"):
+        # `or` instead of a .get() default: the key is normally present but
+        # may be empty, in which case the default would never apply.
+        passes["banner_detected"] = provider or "detected"
+
+    if provider:
+        passes["banner_provider_named"] = provider
+
+    banner_checks = scan.get("banner_checks") or {}
+    if banner_checks.get("has_impressum_link"):
+        passes["impressum_link"] = "Impressum-Link gefunden"
+    if banner_checks.get("has_dse_link"):
+        passes["dse_link"] = "DSE-Link gefunden"
+
+    return passes
+
+
+# Cookie-name prefixes of known tracking cookies (matched with startswith).
+_TRACKING_COOKIE_PREFIXES = (
+    "_ga", "_gid", "_fbp", "_fbc", "_gcl", "_pin",
+    "_tt_", "li_sugr", "_hj", "mp_", "ajs_", "_clck", "_clsk",
+)
+
+# Tracking cookies whose full name is short and generic. They must match
+# exactly: as prefixes, "fr" and "IDE" would also flag unrelated cookies
+# such as "frontend_lang" or "IDENTITY_TOKEN".
+_TRACKING_COOKIE_EXACT = frozenset({"fr", "IDE"})
+
+
+def _is_tracking_cookie(name: str) -> bool:
+    """Return True if ``name`` is a known tracking cookie.
+
+    A name matches when it equals one of the exact names or starts with one
+    of the known prefixes. Matching is case-sensitive.
+    """
+    return name in _TRACKING_COOKIE_EXACT or name.startswith(_TRACKING_COOKIE_PREFIXES)
diff --git a/consent-tester/main.py b/consent-tester/main.py
index 64fc3ab..93cbc8e 100644
--- a/consent-tester/main.py
+++ b/consent-tester/main.py
@@ -16,6 +16,7 @@ from services.consent_scanner import run_consent_test, ConsentTestResult
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
from services.playwright_scanner import scan_website_playwright
from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult
+from checks.banner_runner import map_scan_to_checks
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
logger = logging.getLogger(__name__)
@@ -44,6 +45,9 @@ class ScanResponse(BaseModel):
scanned_at: str
category_tests: list = []
banner_checks: dict = {}
+ structured_checks: list = []
+ completeness_pct: int = 0
+ correctness_pct: int = 0
@app.get("/health")
@@ -57,30 +61,47 @@ async def scan_consent(req: ScanRequest):
logger.info("Starting consent test for %s", req.url)
result = await run_consent_test(req.url, req.timeout_per_phase)
+ # Build raw response dict for structured check mapping
+ phases = {
+ "before_consent": {
+ "scripts": result.before_scripts,
+ "cookies": result.before_cookies,
+ "tracking_services": result.before_tracking,
+ "violations": [v.__dict__ for v in result.before_violations],
+ },
+ "after_reject": {
+ "scripts": result.reject_scripts,
+ "cookies": result.reject_cookies,
+ "new_tracking": result.reject_new_tracking,
+ "violations": [v.__dict__ for v in result.reject_violations],
+ },
+ "after_accept": {
+ "scripts": result.accept_scripts,
+ "cookies": result.accept_cookies,
+ "new_tracking": result.accept_new_tracking,
+ "undocumented": result.accept_undocumented,
+ },
+ }
+ banner_checks_data = {
+ "has_impressum_link": result.banner_has_impressum_link,
+ "has_dse_link": result.banner_has_dse_link,
+ "violations": [v.__dict__ for v in result.banner_text_violations],
+ }
+
+ # Map to L1/L2 hierarchy
+ raw_for_mapping = {
+ "banner_detected": result.banner_detected,
+ "banner_provider": result.banner_provider,
+ "phases": phases,
+ "banner_checks": banner_checks_data,
+ }
+ check_result = map_scan_to_checks(raw_for_mapping)
+
return ScanResponse(
url=req.url,
banner_detected=result.banner_detected,
banner_provider=result.banner_provider,
- phases={
- "before_consent": {
- "scripts": result.before_scripts,
- "cookies": result.before_cookies,
- "tracking_services": result.before_tracking,
- "violations": [v.__dict__ for v in result.before_violations],
- },
- "after_reject": {
- "scripts": result.reject_scripts,
- "cookies": result.reject_cookies,
- "new_tracking": result.reject_new_tracking,
- "violations": [v.__dict__ for v in result.reject_violations],
- },
- "after_accept": {
- "scripts": result.accept_scripts,
- "cookies": result.accept_cookies,
- "new_tracking": result.accept_new_tracking,
- "undocumented": result.accept_undocumented,
- },
- },
+ phases=phases,
summary={
"critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"),
"high": len(result.before_violations) + sum(1 for v in result.banner_text_violations if v.severity == "HIGH"),
@@ -90,11 +111,10 @@ async def scan_consent(req: ScanRequest):
"categories_tested": len(result.category_tests),
"banner_text_issues": len(result.banner_text_violations),
},
- banner_checks={
- "has_impressum_link": result.banner_has_impressum_link,
- "has_dse_link": result.banner_has_dse_link,
- "violations": [v.__dict__ for v in result.banner_text_violations],
- },
+ banner_checks=banner_checks_data,
+ structured_checks=check_result["structured_checks"],
+ completeness_pct=check_result["completeness_pct"],
+ correctness_pct=check_result["correctness_pct"],
scanned_at=datetime.now(timezone.utc).isoformat(),
category_tests=[{
"category": ct.category,
diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index 75be505..e5aa4d0 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -23,6 +23,8 @@ from urllib.parse import urlparse, urljoin
from playwright.async_api import Page
+from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
+
logger = logging.getLogger(__name__)
# Legal document keywords in all EU/EEA official languages.
@@ -216,11 +218,36 @@ async def discover_dsi_documents(
seen_titles: set[str] = set()
try:
- # Step 1: Load the page
- await page.goto(url, wait_until="networkidle", timeout=60000)
+ # Step 1: Load the page (with networkidle → domcontentloaded fallback)
+ await goto_resilient(page, url, timeout=60000)
await page.wait_for_timeout(2000)
- # Step 1b: Self-extraction — if the URL itself is a DSI page,
+ # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
+ final_url = page.url
+ if is_pdf_redirect(url, final_url):
+ is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
+ if is_dsi_url:
+ result.documents.append(DiscoveredDSI(
+ title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
+ url=final_url,
+ source_url=url,
+ language=dsi_lang or "de",
+ doc_type="pdf",
+ text="[PDF — Textextraktion erforderlich]",
+ ))
+ seen_urls.add(url)
+ seen_urls.add(final_url)
+ logger.info("PDF redirect detected: %s -> %s", url, final_url)
+ # Return early — a PDF redirect means no HTML content to scan
+ result.total_found = len(result.documents)
+ return result
+
+ # Step 1b: Try dismissing cookie consent banners before extraction.
+ # Many German sites (dm.de, Zalando, etc.) block page content behind
+ # a consent wall. Dismissing it reveals the actual DSI text.
+ await try_dismiss_consent_banner(page)
+
+ # Step 1c: Self-extraction — if the URL itself is a DSI page,
# extract its full text as the first document. This handles the
# case where the user provides the DSE URL directly (e.g.
# example.com/datenschutz) instead of the homepage.
@@ -251,6 +278,8 @@ async def discover_dsi_documents(
))
seen_urls.add(url)
logger.info("Self-extracted %d words from %s", self_wc, url)
+ else:
+ logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
except Exception as e:
logger.warning("Self-extraction failed for %s: %s", url, e)
@@ -323,58 +352,69 @@ async def discover_dsi_documents(
if is_anchor:
continue
- # Navigate to page — wait for JS to load content
- resp = await page.goto(href, wait_until="networkidle", timeout=45000)
- if resp and resp.status < 400:
- await page.wait_for_timeout(2000)
- await _expand_all_interactive(page)
- await page.wait_for_timeout(500)
+ # Navigate to page — with networkidle/domcontentloaded fallback
+ await goto_resilient(page, href, timeout=45000)
+ resp_url = page.url
- # Extract text — try specific content areas, fall back to full body
- text = await page.evaluate("""
- () => {
- // Try progressively broader content selectors
- const selectors = [
- '.article-content', '.page-content', '.entry-content',
- '[class*="content-area"]', '[class*="main-content"]',
- 'main article', 'main', 'article',
- '[role="main"]', '.content', '#content',
- ];
- for (const sel of selectors) {
- const el = document.querySelector(sel);
- if (el && el.textContent.trim().length > 200) {
- return el.textContent.trim();
- }
+ # Check for PDF redirect on followed links
+ if is_pdf_redirect(href, resp_url):
+ result.documents.append(DiscoveredDSI(
+ title=title, url=resp_url, source_url=url,
+ language=lang, doc_type="pdf",
+ text="[PDF — Textextraktion erforderlich]",
+ ))
+ await goto_resilient(page, url, timeout=45000)
+ continue
+
+ await try_dismiss_consent_banner(page)
+ await _expand_all_interactive(page)
+ await page.wait_for_timeout(500)
+
+ # Extract text — try specific content areas, fall back to full body
+ text = await page.evaluate("""
+ () => {
+ // Try progressively broader content selectors
+ const selectors = [
+ '.article-content', '.page-content', '.entry-content',
+ '[class*="content-area"]', '[class*="main-content"]',
+ 'main article', 'main', 'article',
+ '[role="main"]', '.content', '#content',
+ ];
+ for (const sel of selectors) {
+ const el = document.querySelector(sel);
+ if (el && el.textContent.trim().length > 200) {
+ return el.textContent.trim();
}
- // Fallback: full body minus nav/header/footer
- const body = document.body.cloneNode(true);
- body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
- return body.textContent?.trim() || '';
}
- """)
- if text and len(text) > 50:
- result.documents.append(DiscoveredDSI(
- title=title, url=href, source_url=url,
- language=lang,
- doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
- text=text[:50000], word_count=len(text.split()),
- ))
+ // Fallback: full body minus nav/header/footer
+ const body = document.body.cloneNode(true);
+ body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+ return body.textContent?.trim() || '';
+ }
+ """)
+ if text and len(text) > 50:
+ result.documents.append(DiscoveredDSI(
+ title=title, url=href, source_url=url,
+ language=lang,
+ doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
+ text=text[:50000], word_count=len(text.split()),
+ ))
- # Recursive: search THIS page for more DSI links
- new_links = await _find_dsi_links(page, base_domain)
- for nl in new_links:
- if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
- pending_links.append(nl)
+ # Recursive: search THIS page for more DSI links
+ new_links = await _find_dsi_links(page, base_domain)
+ for nl in new_links:
+ if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+ pending_links.append(nl)
# Navigate back for next link
- await page.goto(url, wait_until="networkidle", timeout=45000)
+ await goto_resilient(page, url, timeout=45000)
await page.wait_for_timeout(500)
await _expand_all_interactive(page)
except Exception as e:
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
try:
- await page.goto(url, wait_until="networkidle", timeout=45000)
+ await goto_resilient(page, url, timeout=45000)
except Exception:
pass
diff --git a/consent-tester/services/dsi_helpers.py b/consent-tester/services/dsi_helpers.py
new file mode 100644
index 0000000..b5ad847
--- /dev/null
+++ b/consent-tester/services/dsi_helpers.py
@@ -0,0 +1,118 @@
+"""
+DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
+
+Extracted from dsi_discovery.py to keep modules under 500 LOC.
+"""
+
+import logging
+
+from playwright.async_api import Page, TimeoutError as PlaywrightTimeout
+
+logger = logging.getLogger(__name__)
+
+
+async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> None:
+    """Open ``url``, preferring a fully idle network but not insisting on it.
+
+    The first attempt waits for "networkidle". Pages that keep issuing
+    background requests never reach that state, so when the attempt times
+    out the navigation is repeated with "domcontentloaded" followed by a
+    fixed 5 s pause to let client-side rendering finish.
+
+    ``timeout`` (milliseconds) applies to each navigation attempt
+    separately. Errors other than the Playwright timeout propagate.
+    """
+    try:
+        await page.goto(url, wait_until="networkidle", timeout=timeout)
+        return
+    except PlaywrightTimeout:
+        logger.info("networkidle timeout for %s, falling back to domcontentloaded", url)
+
+    # Fallback: the DOM is enough, then give scripts time to render.
+    await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
+    await page.wait_for_timeout(5000)
+
+
+# Accept-button selectors of common consent managers (OneTrust, Cookiebot,
+# Didomi, Borlabs, consentmanager, Klaro, CookieYes, Quantcast, tarteaucitron).
+_ACCEPT_SELECTORS = (
+    "#onetrust-accept-btn-handler",
+    "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
+    "#didomi-notice-agree-button",
+    "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
+    ".cmpboxbtn.cmpboxbtnyes",
+    ".klaro .cm-btn-accept",
+    ".cky-btn-accept",
+    "[class*='qc-cmp2-summary-buttons'] button:first-child",
+    "#tarteaucitronPersonalize2",
+)
+
+# Button captions that count as "accept" in the generic text search
+# (compared exactly and case-sensitively after trimming).
+_ACCEPT_TEXTS = (
+    "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
+    "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
+    "Einverstanden", "Ich stimme zu",
+)
+
+
+async def _dismiss_usercentrics(page: Page) -> bool:
+    """Click an accept-like button inside the Usercentrics shadow root."""
+    try:
+        uc_root = await page.query_selector("#usercentrics-root")
+        if not uc_root:
+            return False
+        clicked = await page.evaluate("""() => {
+            const root = document.querySelector('#usercentrics-root');
+            if (!root || !root.shadowRoot) return false;
+            const buttons = root.shadowRoot.querySelectorAll('button');
+            for (const btn of buttons) {
+                const t = btn.textContent.trim().toLowerCase();
+                if (t.includes('akzeptieren') || t.includes('accept')
+                    || t.includes('zustimmen') || t.includes('agree')) {
+                    btn.click();
+                    return true;
+                }
+            }
+            return false;
+        }""")
+        if clicked:
+            logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
+            await page.wait_for_timeout(2000)
+            return True
+    except Exception as exc:
+        # Best effort: a failure here must not abort the scan.
+        logger.debug("Usercentrics dismissal failed: %s", exc)
+    return False
+
+
+async def _dismiss_by_selector(page: Page) -> bool:
+    """Click the first visible accept button matching a known selector."""
+    for sel in _ACCEPT_SELECTORS:
+        try:
+            btn = page.locator(sel).first
+            if await btn.count() > 0 and await btn.is_visible():
+                await btn.click(timeout=3000)
+                logger.info("Dismissed consent banner via %s", sel)
+                await page.wait_for_timeout(2000)
+                return True
+        except Exception as exc:
+            # Best effort: try the next selector.
+            logger.debug("Consent selector %s failed: %s", sel, exc)
+    return False
+
+
+async def _dismiss_by_text(page: Page) -> bool:
+    """Click a button or a[role=button] whose caption is a known accept text."""
+    try:
+        clicked = await page.evaluate("""(texts) => {
+            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
+                const t = (btn.textContent || '').trim();
+                for (const target of texts) {
+                    if (t === target) { btn.click(); return true; }
+                }
+            }
+            return false;
+        }""", list(_ACCEPT_TEXTS))
+        if clicked:
+            logger.info("Dismissed consent banner via generic text match")
+            await page.wait_for_timeout(2000)
+            return True
+    except Exception as exc:
+        # Best effort: a failure here must not abort the scan.
+        logger.debug("Generic consent text match failed: %s", exc)
+    return False
+
+
+async def try_dismiss_consent_banner(page: Page) -> bool:
+    """Try to dismiss a cookie consent banner that blocks page content.
+
+    Strategies are tried in order and the first success wins:
+      1. Usercentrics banner inside the ``#usercentrics-root`` shadow DOM
+      2. known accept-button selectors of common consent managers
+      3. any button whose caption exactly matches a known accept text
+
+    Only the main document is searched; banners rendered inside iframes
+    are not handled. After a successful click the page is given 2 s to
+    settle. All errors are swallowed (logged at debug level).
+
+    Returns:
+        True if a banner button was clicked, otherwise False.
+    """
+    for strategy in (_dismiss_usercentrics, _dismiss_by_selector, _dismiss_by_text):
+        if await strategy(page):
+            return True
+    return False
+
+
+def is_pdf_redirect(original_url: str, final_url: str) -> bool:
+    """Check whether navigation ended on a PDF or on external object storage.
+
+    Returns True when the final URL's path ends in ".pdf" (query string and
+    fragment are ignored, so "doc.pdf?token=..." is recognised) or when its
+    host is a known object-storage domain (Google Cloud Storage, Azure Blob
+    Storage, Amazon S3). The storage check looks at the host only, so a
+    storage domain that merely appears in a query parameter does not count.
+
+    Args:
+        original_url: the URL that was requested. Currently not consulted;
+            kept so callers can pass both ends of the redirect.
+        final_url: the URL the page ended up on.
+    """
+    from urllib.parse import urlparse  # local import: module imports no urllib
+
+    final_lower = final_url.lower()
+    # Keep the plain suffix test so URLs such as "?file=x.pdf" still match.
+    if final_lower.endswith(".pdf"):
+        return True
+
+    try:
+        parsed = urlparse(final_lower)
+    except ValueError:
+        # Malformed URL (e.g. broken IPv6 literal): nothing more to inspect.
+        return False
+
+    if parsed.path.endswith(".pdf"):
+        return True
+
+    storage_hosts = (
+        "storage.googleapis.com",
+        "blob.core.windows.net",
+        "s3.amazonaws.com",
+    )
+    return any(host in parsed.netloc for host in storage_hosts)