diff --git a/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx b/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx index fb8d623..5b73d43 100644 --- a/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx +++ b/admin-compliance/app/sdk/agent/_components/BannerCheckTab.tsx @@ -1,19 +1,35 @@ 'use client' import React, { useState } from 'react' +import { ChecklistView } from './ChecklistView' + +interface CheckItem { + id: string + label: string + passed: boolean + severity: string + matched_text: string + level?: number + parent?: string | null + skipped?: boolean + hint?: string +} interface BannerResult { banner_detected: boolean banner_provider: string - banner_text: string banner_checks?: { violations: { code: string; text: string; severity: string }[] - passes: { code: string; text: string }[] + has_impressum_link?: boolean + has_dse_link?: boolean } + structured_checks?: CheckItem[] + completeness_pct?: number + correctness_pct?: number phases?: { - before_consent: { cookies: number; scripts: number; violations: string[] } - after_reject: { cookies: number; scripts: number; violations: string[] } - after_accept: { cookies: number; scripts: number; violations: string[] } + before_consent: { cookies: string[]; scripts: string[]; tracking_services: string[]; violations: any[] } + after_reject: { cookies: string[]; scripts: string[]; new_tracking: string[]; violations: any[] } + after_accept: { cookies: string[]; scripts: string[]; new_tracking: string[]; undocumented: string[] } } } @@ -43,7 +59,6 @@ export function BannerCheckTab() { const data = await res.json() if (data.scan_id) { - // Async polling let attempts = 0 while (attempts < 60) { await new Promise(r => setTimeout(r, 3000)) @@ -69,9 +84,23 @@ export function BannerCheckTab() { } } - const violations = result?.banner_checks?.violations || [] - const passes = result?.banner_checks?.passes || [] - const total = violations.length + passes.length + const structuredChecks = 
result?.structured_checks || [] + const hasStructured = structuredChecks.length > 0 + const compPct = result?.completeness_pct ?? 0 + const corrPct = result?.correctness_pct ?? 0 + + // Build ChecklistView-compatible result for structured checks + const checklistResults = hasStructured ? [{ + label: `Cookie-Banner: ${result?.banner_provider || 'Unbekannt'}`, + url: url, + doc_type: 'banner', + word_count: 0, + completeness_pct: compPct, + correctness_pct: corrPct, + checks: structuredChecks, + findings_count: structuredChecks.filter(c => !c.passed && !c.skipped).length, + error: '', + }] : [] return (
@@ -79,7 +108,7 @@ export function BannerCheckTab() {

Cookie-Banner Compliance Check

Playwright-basierter 3-Phasen-Test: Vor Interaktion, nach Ablehnen, nach Akzeptieren. - Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 20+ weitere Kriterien. + Prueft Dark Patterns, Pre-Consent-Cookies, Farbkontrast, Klick-Paritaet und 36 weitere Kriterien.

@@ -116,14 +145,14 @@ export function BannerCheckTab() { )} {result && ( -
- {/* Header */} -
-
-
+
+ {/* 3-Phase Summary Card */} + {result.phases && ( +
+
- - {result.banner_detected ? '🛡️' : '⚠️'} + + {result.banner_detected ? '\u{1F6E1}\u{FE0F}' : '\u26A0\u{FE0F}'}

@@ -131,98 +160,50 @@ export function BannerCheckTab() { ? `Banner erkannt: ${result.banner_provider || 'Unbekannter Anbieter'}` : 'Kein Cookie-Banner erkannt'}

- {total > 0 && ( -

- {passes.length}/{total} Pruefungen bestanden -

- )} +

+ 3-Phasen-Analyse: Cookies und Scripts vor/nach Interaktion +

- {total > 0 && ( -
-
-
-
- - {Math.round(passes.length / total * 100)}% - -
- )} -
-
- - {/* 3-Phase Summary */} - {result.phases && ( -
- {[ - { label: 'Vor Consent', data: result.phases.before_consent, icon: 'đź”’' }, - { label: 'Nach Ablehnen', data: result.phases.after_reject, icon: 'đźš«' }, - { label: 'Nach Akzeptieren', data: result.phases.after_accept, icon: 'âś…' }, - ].map(phase => ( -
-
{phase.icon}
-
{phase.label}
-
- {phase.data.cookies} Cookies, {phase.data.scripts} Scripts -
- {phase.data.violations.length > 0 && ( -
- {phase.data.violations.length} Verstoesse -
- )} -
- ))} -
- )} - - {/* Violations */} - {violations.length > 0 && ( -
-

- Verstoesse ({violations.length}) -

-
- {violations.map((v, i) => ( -
- - - -
-
{v.text}
-
{v.code} | {v.severity}
-
-
- ))} +
+ + +
)} - {/* Passes */} - {passes.length > 0 && ( -
-

- Bestanden ({passes.length}) -

-
- {passes.map((p, i) => ( -
- - - -
{p.text}
-
- ))} -
+ {/* Structured L1/L2 Checklist */} + {hasStructured && ( +
+
)} - {!result.banner_detected && violations.length === 0 && passes.length === 0 && ( -
- Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach §25 TDDDG Pflicht. + {!result.banner_detected && !hasStructured && ( +
+

+ Kein Cookie-Banner auf dieser Seite gefunden. Falls Cookies gesetzt werden, ist ein Banner nach §25 TDDDG Pflicht.

)}
@@ -230,3 +211,22 @@ export function BannerCheckTab() {
) } + +function PhaseBox({ label, icon, cookies, scripts, violations }: { + label: string; icon: string; cookies: number; scripts: number; violations: number +}) { + return ( +
+
{icon}
+
{label}
+
+ {cookies} Cookies, {scripts} Scripts +
+ {violations > 0 && ( +
+ {violations} Verstoesse +
+ )} +
+ ) +} diff --git a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx index 9b74b54..4c99f87 100644 --- a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx +++ b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx @@ -30,6 +30,7 @@ const DOC_TYPE_LABELS: Record = { dse: 'DSI', agb: 'AGB', impressum: 'Impressum', cookie: 'Cookie', widerruf: 'Widerruf', other: 'Sonstiges', social_media: 'Social Media', dsfa: 'DSFA', joint_controller: 'Art. 26', + eu_institution: 'EU-Inst.', banner: 'Banner', } interface GroupedCheck { diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py index c66a612..9d79188 100644 --- a/backend-compliance/compliance/api/agent_doc_check_routes.py +++ b/backend-compliance/compliance/api/agent_doc_check_routes.py @@ -329,6 +329,7 @@ SECTION_TYPE_MAP = [ (r"datenschutzfolge|dsfa|risikoanalyse", "dsfa"), (r"^social\s*media$|^soziale\s+(?:medien|netzwerke)$", "social_media"), (r"datenschutzerkl(?:ae|ä)rung.*social|datenschutz\s+f(?:ue|ü)r\s+social", "social_media"), + (r"(?:verordnung|regulation)\s*\(?eu\)?\s*2018\s*/?\s*1725", "eu_institution"), ] diff --git a/backend-compliance/compliance/services/doc_checks/__init__.py b/backend-compliance/compliance/services/doc_checks/__init__.py index 5d69a0b..8710c7a 100644 --- a/backend-compliance/compliance/services/doc_checks/__init__.py +++ b/backend-compliance/compliance/services/doc_checks/__init__.py @@ -2,7 +2,7 @@ doc_checks — Legal document compliance checkers. Provides checklists and functions for verifying legal documents -(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA) +(DSI, AGB, Impressum, Cookie, Widerruf, Social Media, DSFA, EU Institution) against their mandatory content requirements. 
Two check levels: @@ -18,6 +18,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST from .cookie_checks import COOKIE_CHECKLIST from .social_media_checks import JOINT_CONTROLLER_CHECKLIST from .dsfa_checks import DSFA_CHECKLIST +from .eu_institution_checks import EU_INSTITUTION_CHECKLIST __all__ = [ "check_document_completeness", @@ -29,4 +30,5 @@ __all__ = [ "COOKIE_CHECKLIST", "JOINT_CONTROLLER_CHECKLIST", "DSFA_CHECKLIST", + "EU_INSTITUTION_CHECKLIST", ] diff --git a/backend-compliance/compliance/services/doc_checks/dse_checks.py b/backend-compliance/compliance/services/doc_checks/dse_checks.py index 99b3fc7..f5345c7 100644 --- a/backend-compliance/compliance/services/doc_checks/dse_checks.py +++ b/backend-compliance/compliance/services/doc_checks/dse_checks.py @@ -47,8 +47,9 @@ ART13_CHECKLIST = [ "label": "Telefonnummer des Verantwortlichen", "level": 2, "parent": "controller", "patterns": [ - r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-]{6,}", - r"\+49\s*[\d\s/\-]{8,}", + r"(?:tel(?:efon)?|phone|fon)\s*[.:]\s*[\+\d][\d\s/\-\(\)]{6,}", + r"\+49\s*[\d\s/\-\(\)]{8,}", + r"0\d{2,4}\s*[\(/\-\s]\s*\d{3,}", ], "severity": "MEDIUM", "hint": "EuGH (C-298/17, 'Verein fuer Konsumenteninformation') verlangt effektive Kontaktmoeglichkeit. Telefon ist nicht zwingend, aber empfohlen — fehlt sie, muss ein gleichwertiger Kanal (z.B. Chat, Rueckruf) angeboten werden.", @@ -345,7 +346,7 @@ ART13_CHECKLIST = [ "id": "rights_art21", "label": "Widerspruchsrecht (Art. 21)", "level": 2, "parent": "rights", - "patterns": [r"art\.\s*21", r"widerspruchsrecht", r"right\s+to\s+object"], + "patterns": [r"art\.\s*21", r"widerspruchsrecht", r"recht\s+auf\s+widerspruch", r"§\s*23\s+kdg", r"right\s+to\s+object"], "severity": "LOW", "hint": "Art. 21(4) DSGVO: Der Widerspruchshinweis muss spaetestens zum Zeitpunkt der ersten Kommunikation GESONDERT und in klarer Sprache erfolgen. 
Haeufiger Fehler: Widerspruchsrecht nur im Fliesstext versteckt — eigener Abschnitt/Hervorhebung noetig.", }, @@ -386,6 +387,9 @@ ART13_CHECKLIST = [ r"l(?:an)?fdi\s+\w+", r"bfdi", r"(?:bayerische|hessische|s(?:ae|ä)chsische|berliner)\s+(?:datenschutz|aufsicht)", + r"(?:katholisch|evangelisch|kirchlich)\w*\s+datenschutz", + r"datenschutzzentrum", + r"kd(?:oe|ö)r", ], "severity": "LOW", "hint": "Vollstaendigen Namen, Adresse und Website der Aufsichtsbehoerde angeben. Haeufiger Fehler: 'die zustaendige Aufsichtsbehoerde' ohne Konkretisierung. Korrekt z.B.: 'LfDI BW, Koenigstrasse 10a, 70173 Stuttgart, www.baden-wuerttemberg.datenschutz.de'.", diff --git a/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py b/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py new file mode 100644 index 0000000..ed37d2f --- /dev/null +++ b/backend-compliance/compliance/services/doc_checks/eu_institution_checks.py @@ -0,0 +1,500 @@ +""" +EU Institution checks — Verordnung (EU) 2018/1725. +Applies to EU institutions, bodies, offices and agencies instead of DSGVO. +Key differences: Art. 15 (not 13), Art. 5 (not 6), EDSB (not national DPAs). +L1: Pflichtangabe erwaehnt? L2: Pflichtangabe korrekt/vollstaendig? +""" + +EU_INSTITUTION_CHECKLIST = [ + # == L1: Verantwortlicher (Controller) ================================= + { + "id": "eu_controller", + "label": "Verantwortlicher (Art. 15(1)(a) VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"verantwortlich\w*\s+(?:ist|im sinne|fuer|f(?:ue|ü)r)", + r"kontaktdaten\s+des\s+verantwortlichen", + r"name\s+(?:und|&)\s+kontaktdaten\s+des", + r"controller", r"verantwortliche\s+stelle", + r"responsible\s+(?:party|for)", + r"data\s+controller", + r"identity\s+(?:of\s+)?(?:the\s+)?controller", + ], + "severity": "HIGH", + "hint": ( + "Art. 15(1)(a) VO 2018/1725 verlangt die Identitaet des Verantwortlichen. " + "Bei EU-Organen: Vollstaendiger Name der Institution (z.B. 
'Europaeische Kommission, " + "GD DIGIT'), Dienstadresse und funktionale E-Mail-Adresse. " + "Haeufiger Fehler: Nur Abkuerzung ohne vollstaendigen Institutionsnamen." + ), + }, + { + "id": "eu_controller_address", + "label": "Dienstadresse des Verantwortlichen", + "level": 2, "parent": "eu_controller", + "patterns": [ + r"(?:rue|avenue|boulevard|strasse|stra(?:ss|ß)e)\s+\w+", + r"\d{4,5}\s+(?:bruxelles|brussels|br(?:ue|ü)ssel|luxembourg|luxemburg|strasbourg|stra(?:ss|ß)burg)", + r"b[\-\s]?\d{4}\s+\w+", + r"l[\-\s]?\d{4}\s+\w+", + ], + "severity": "MEDIUM", + "hint": ( + "Angabe der Dienstadresse der EU-Institution (typisch: Bruessel, Luxemburg " + "oder Strassburg). Format z.B. 'Rue de la Loi 200, B-1049 Bruxelles'. " + "Haeufiger Fehler: Nur Postfach ohne physische Adresse." + ), + }, + { + "id": "eu_controller_email", + "label": "E-Mail-Adresse des Verantwortlichen", + "level": 2, "parent": "eu_controller", + "patterns": [ + r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.europa\.eu", + r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}", + ], + "severity": "MEDIUM", + "hint": ( + "Eine funktionale E-Mail-Adresse ist Pflicht (Art. 15(1)(a) VO 2018/1725). " + "Bei EU-Organen typischerweise @ec.europa.eu, @europarl.europa.eu o.ae. " + "Ein reines Kontaktformular genuegt nicht als unmittelbarer Kommunikationskanal." + ), + }, + # == L1: Datenschutzbeauftragter (DPO) ================================= + { + "id": "eu_dpo", + "label": "Datenschutzbeauftragter (Art. 15(1)(b) / Art. 43 VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"datenschutzbeauftragt", + r"data\s+protection\s+officer", + r"kontaktdaten\s+de[rs]\s+datenschutz", + r"dpo", + r"d(?:ae|ä)legu(?:e|é)\s+(?:a|à)\s+la\s+protection\s+des\s+donn(?:e|é)es", + ], + "severity": "HIGH", + "hint": ( + "Art. 43-44 VO 2018/1725: Jedes EU-Organ MUSS einen DSB (DPO) benennen. " + "Dies ist — anders als unter der DSGVO — keine Frage der Mitarbeiterzahl, " + "sondern absolute Pflicht fuer alle EU-Organe. 
Die Kontaktdaten muessen in " + "jeder Datenschutzerklaerung angegeben werden (Art. 15(1)(b))." + ), + }, + { + "id": "eu_dpo_contact", + "label": "DPO-Kontaktdaten (E-Mail oder Adresse)", + "level": 2, "parent": "eu_dpo", + "patterns": [ + r"(?:data\s+protection\s+officer|dpo|datenschutzbeauftragt)[\s\S]{0,300}[a-z0-9._%+\-]+@", + r"dpo[\s\S]{0,100}@", + r"data[\-\.]?protection@", + r"dpo@\w+\.europa\.eu", + ], + "severity": "MEDIUM", + "hint": ( + "Art. 44(7) VO 2018/1725: Die Kontaktdaten des DPO muessen veroeffentlicht werden. " + "Mindestens eine funktionale E-Mail-Adresse angeben (z.B. DATA-PROTECTION-OFFICER@ec.europa.eu). " + "Den Namen des DPO muessen Sie nicht nennen." + ), + }, + + { + "id": "eu_dpo_function", + "label": "DPO-Funktion / -Rolle beschrieben", + "level": 2, "parent": "eu_dpo", + "patterns": [ + r"(?:aufgaben|role|function|zustaendig).*(?:dpo|datenschutzbeauftragt|data\s+protection\s+officer)", + r"(?:dpo|datenschutzbeauftragt|data\s+protection\s+officer).*(?:aufgaben|role|function|zustaendig)", + r"art(?:icle)?\s*44\s+(?:vo|regulation|verordnung)", + ], + "severity": "LOW", + "hint": ( + "Art. 44 VO 2018/1725 beschreibt die Aufgaben des DPO bei EU-Organen: " + "Beratung, Ueberwachung, Zusammenarbeit mit dem EDSB. " + "Es empfiehlt sich, kurz die Rolle des DPO zu erlaeutern, damit " + "Betroffene wissen, wofuer der DPO zustaendig ist." + ), + }, + + # == L1: Zwecke und Rechtsgrundlage ==================================== + { + "id": "eu_purposes", + "label": "Zwecke der Verarbeitung (Art. 15(1)(c) VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"zweck\w*\s+(?:der|und|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)", + r"purpose\w*\s+(?:of|for)\s+(?:the\s+)?(?:processing|data)", + r"zu\s+welch\w+\s+zweck", + r"(?:data|personal\s+data)\s+(?:is|are)\s+(?:collected|processed)\s+(?:for|to)", + ], + "severity": "HIGH", + "hint": ( + "Art. 15(1)(c) VO 2018/1725 verlangt konkrete Zweckangaben. 
" + "EU-Organe muessen jeden Verarbeitungszweck einzeln auffuehren: z.B. " + "'Verwaltung von Bewerbungen', 'Zugangsmanagement zum Gebaeude', " + "'Webanalyse der Internetseite'. Pauschalformulierungen sind unzulaessig." + ), + }, + { + "id": "eu_purposes_specific", + "label": "Konkrete Verarbeitungszwecke benannt", + "level": 2, "parent": "eu_purposes", + "patterns": [ + r"(?:recruitment|selection|verwaltung|management|administration|monitoring|evaluation)", + r"(?:human\s+resources|hr|personal|bewerbung|grant|procurement|vergabe)", + r"(?:access|zugang|building|gebaeude|website|webseite|intranet)", + ], + "severity": "LOW", + "hint": ( + "Mindestens 2 konkrete Zwecke benennen, jeweils mit zugehoeriger " + "Rechtsgrundlage. Typische EU-Organ-Zwecke: Personalverwaltung, " + "Gebaeudezugang, IT-Sicherheitsmonitoring, Vergabeverfahren, " + "Evaluierung von Foerderprogrammen. Pauschalformulierungen genuegen " + "nicht dem Bestimmtheitsgrundsatz." + ), + }, + # == L1: Rechtsgrundlage (Art. 5 statt Art. 6 DSGVO) ================== + { + "id": "eu_legal_basis", + "label": "Rechtsgrundlage (Art. 5 VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"rechtsgrundlage", + r"art\.\s*5\s*(?:abs|absatz)?\s*\.?\s*1", + r"legal\s+basis", + r"lawfulness\s+of\s+processing", + r"art(?:icle)?\s*5\s*(?:\(1\))?\s*(?:\([a-d]\))?", + r"auf\s+grundlage\s+(?:von|des|der)\s+art", + r"regulation\s*\(eu\)\s*2018\s*/?\s*1725", + r"verordnung\s*\(eu\)\s*2018\s*/?\s*1725", + ], + "severity": "HIGH", + "hint": ( + "Art. 5(1) VO 2018/1725 enthaelt die Rechtsgrundlagen fuer EU-Organe: " + "(a) oeffentliches Interesse/Ausuebung oeffentlicher Gewalt, (b) rechtliche Verpflichtung, " + "(c) Vertrag, (d) Einwilligung, (e) lebenswichtige Interessen. " + "WICHTIG: Art. 5(1)(a) ist der haeufigste Tatbestand bei EU-Organen — " + "er entspricht etwa Art. 6(1)(e) DSGVO. Art. 6(1)(f) DSGVO " + "(berechtigtes Interesse) existiert in der VO 2018/1725 NICHT."
+ ), + }, + { + "id": "eu_legal_basis_public_interest", + "label": "Art. 5(1)(a) — Oeffentliches Interesse / oeffentliche Gewalt", + "level": 2, "parent": "eu_legal_basis", + "patterns": [ + r"art\.\s*5\s*(?:\(1\))?\s*\(?(?:1\s*)?(?:let(?:ter)?\.?\s*)?a\)?", + r"(?:oeffentlich|öffentlich).*(?:interesse|gewalt|aufgabe)", + r"public\s+interest", + r"(?:exercise|performance)\s+of\s+(?:official|public)\s+(?:authority|task)", + ], + "severity": "LOW", + "hint": ( + "Art. 5(1)(a) VO 2018/1725 ist die Hauptrechtsgrundlage fuer EU-Organe. " + "Verlangt einen konkreten Rechtsakt als Grundlage (z.B. Verordnung, " + "Beschluss, Basisrechtsakt der Institution). Benennen Sie den spezifischen " + "Rechtsakt, nicht nur pauschal 'oeffentliches Interesse'." + ), + }, + { + "id": "eu_legal_basis_consent", + "label": "Art. 5(1)(d) — Einwilligung", + "level": 2, "parent": "eu_legal_basis", + "patterns": [ + r"art\.\s*5\s*(?:\(1\))?\s*\(?(?:1\s*)?(?:let(?:ter)?\.?\s*)?d\)?", + r"einwilligung\s+(?:gem|nach|i\.?\s*s\.?\s*d\.?)", + r"consent\s+(?:of|given\s+by)\s+the\s+data\s+subject", + ], + "severity": "LOW", + "hint": ( + "Bei Einwilligung (Art. 5(1)(d) VO 2018/1725) muss auf das jederzeitige " + "Widerrufsrecht hingewiesen werden (Art. 7(3) VO 2018/1725). " + "Achtung: EU-Organe sollten Einwilligung nur als Rechtsgrundlage waehlen, " + "wenn keine andere Grundlage greift — wegen des Machtungleichgewichts " + "zwischen Institution und Einzelperson (EDSB-Leitlinien)." + ), + }, + + # == L1: Empfaenger ==================================================== + { + "id": "eu_recipients", + "label": "Empfaenger (Art. 15(1)(e) VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"empf(?:ae|ä)nger", + r"(?:ueber|über|weiter)mitt(?:el|l)ung", + r"recipient", + r"weitergabe\s+(?:an|von)\s+daten", + r"data\s+(?:will\s+be|are|is)\s+(?:shared|disclosed|transferred|transmitted)\s+to", + r"auftragsverarbeit", + r"processor", + ], + "severity": "MEDIUM", + "hint": ( + "Art. 
15(1)(e) VO 2018/1725: Empfaenger oder Empfaengerkategorien benennen. " + "Typisch bei EU-Organen: andere EU-Institutionen (z.B. OLAF, Rechnungshof), " + "Mitgliedstaaten-Behoerden, IT-Dienstleister. Auftragsverarbeiter muessen " + "nach Art. 29 VO 2018/1725 vertraglich gebunden sein." + ), + }, + { + "id": "eu_recipients_processor", + "label": "Auftragsverarbeiter / Processor (Art. 29 VO 2018/1725)", + "level": 2, "parent": "eu_recipients", + "patterns": [ + r"auftragsverarbeit(?:er|ung)", + r"art\.\s*29\s+(?:vo|verordnung|regulation)", + r"art(?:icle)?\s*29", + r"processor", + r"sub[\-\s]?processor", + ], + "severity": "LOW", + "hint": ( + "Art. 29 VO 2018/1725 (entspricht Art. 28 DSGVO): " + "Auftragsverarbeiter muessen vertraglich gebunden werden. " + "Erwaehnen Sie, dass ein Auftragsverarbeitungsvertrag besteht. " + "Bei Cloud-Diensten (z.B. Microsoft 365, AWS): Vertrag muss " + "die Vorgaben von Art. 29(3) VO 2018/1725 einhalten." + ), + }, + + # == L1: Drittlandtransfer ============================================= + { + "id": "eu_third_country", + "label": "Drittlandtransfer (Art. 46-50 VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"drittland", + r"dritt\s*staat", + r"third\s+countr", + r"angemessenheitsbeschluss", + r"adequacy\s+decision", + r"standard\s*(?:vertragsklausel|contractual\s+clause)", + r"(?:transfer|uebermittlung|übermittlung).*(?:ausserhalb|außerhalb|outside)", + r"(?:europ(?:ae|ä)ischen\s+wirtschaftsraum|ewr|eea)", + r"art(?:icle)?\s*4[6-9]", + r"art\.\s*50", + ], + "severity": "MEDIUM", + "hint": ( + "Art. 46-50 VO 2018/1725 (entspricht Art. 44-49 DSGVO): " + "Drittlandtransfers erfordern Angemessenheitsbeschluss (Art. 47), " + "geeignete Garantien (Art. 48) oder Ausnahmen (Art. 50). " + "EDSB-Empfehlung: EU-Organe muessen besonders streng pruefen, " + "da sie eine Vorbildfunktion fuer die Mitgliedstaaten haben." + ), + }, + { + "id": "eu_third_country_mechanism", + "label": "Transfermechanismus benannt (Art. 
47-48 VO 2018/1725)", + "level": 2, "parent": "eu_third_country", + "patterns": [ + r"standard\s*vertragsklausel|scc|standard\s+contractual", + r"angemessenheitsbeschluss|adequacy\s+decision", + r"art(?:icle)?\s*4[7-8]", + r"data\s+privacy\s+framework|dpf", + r"appropriate\s+safeguards", + r"geeignete\s+garantien", + ], + "severity": "MEDIUM", + "hint": ( + "Art. 48 VO 2018/1725: Bei fehlender Angemessenheit koennen " + "geeignete Garantien (z.B. SCC, verbindliche Verwaltungsvereinbarungen) " + "den Transfer absichern. Der EDSB hat 2020 eigene Leitlinien zu " + "Drittlandtransfers fuer EU-Organe veroeffentlicht." + ), + }, + + # == L1: Speicherdauer ================================================= + { + "id": "eu_retention", + "label": "Speicherdauer (Art. 15(1)(g) VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"speicherdauer", + r"aufbewahrungsfrist", + r"retention\s+period", + r"(?:how\s+long|storage\s+period|data\s+retention)", + r"l(?:oe|ö)sch(?:ung|frist)", + r"daten\s+werden\s+gel(?:oe|ö)scht", + r"(?:\d+\s+(?:tage|monate|jahre|days|months|years))", + r"dauer\s+der\s+speicherung", + r"data\s+will\s+be\s+(?:kept|stored|retained)\s+(?:for|until|during)", + ], + "severity": "HIGH", + "hint": ( + "Art. 15(1)(g) VO 2018/1725 verlangt die Speicherdauer oder " + "Kriterien zu deren Festlegung. EU-Organe haben oft interne " + "Aufbewahrungsrichtlinien (retention schedules). Nennen Sie die " + "konkreten Fristen oder verweisen Sie auf die interne Richtlinie " + "mit Dokumentenreferenz." + ), + }, + { + "id": "eu_retention_periods", + "label": "Konkrete Zeitangaben", + "level": 2, "parent": "eu_retention", + "patterns": [ + r"\d+\s+(?:tage?|monate?|jahre?|days?|months?|years?)", + r"(?:after|nach)\s+(?:the\s+)?(?:end|closure|completion|ablauf|beendigung)", + r"retention\s+(?:schedule|policy|period)\s+(?:of|for)\s+\d+", + ], + "severity": "MEDIUM", + "hint": ( + "Konkrete Fristen pro Datenkategorie nennen. 
EU-Organe folgen " + "typischerweise der Common Retention List (CRL) der Kommission. " + "Beispiel: Bewerbungsdaten 2 Jahre, Finanzunterlagen 7 Jahre, " + "Gebaeudezugangslogs 6 Monate." + ), + }, + + # == L1: Betroffenenrechte (Art. 17-24 statt Art. 15-22 DSGVO) ========= + { + "id": "eu_rights", + "label": "Betroffenenrechte (Art. 17-24 VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"recht\s+auf\s+auskunft", + r"recht\s+auf\s+l(?:oe|ö)schung", + r"recht\s+auf\s+berichtigung", + r"widerspruchsrecht", + r"right\s+to\s+(?:access|erasure|rectification|object|restrict)", + r"betroffenenrecht", + r"rechte\s+(?:des|der)\s+betroffenen", + r"(?:your|data\s+subject)\s+rights", + r"art(?:icle)?\s*(?:17|18|19|20|21|22|23|24)\s+(?:vo|regulation|verordnung)", + ], + "severity": "HIGH", + "hint": ( + "Art. 15(1)(h) VO 2018/1725 verlangt Nennung der Betroffenenrechte: " + "Auskunft (Art. 17), Berichtigung (Art. 18), Loeschung (Art. 19), " + "Einschraenkung (Art. 20), Datenportabilitaet (Art. 22), " + "Widerspruch (Art. 23). Achtung: Die Artikelnummern unterscheiden sich " + "von der DSGVO (Art. 15-22)! Haeufiger Fehler: DSGVO-Artikel " + "statt VO 2018/1725 Artikel zitieren." + ), + }, + { + "id": "eu_rights_access", + "label": "Recht auf Auskunft (Art. 17 VO 2018/1725)", + "level": 2, "parent": "eu_rights", + "patterns": [ + r"art(?:icle)?\s*17\s+(?:vo|regulation|verordnung)", + r"art\.\s*17", + r"recht\s+auf\s+(?:\w+\s+)?auskunft", + r"right\s+(?:of|to)\s+access", + ], + "severity": "LOW", + "hint": ( + "Art. 17 VO 2018/1725 (entspricht Art. 15 DSGVO): Betroffene koennen " + "Auskunft und eine Kopie ihrer Daten verlangen. Antwortfrist: 1 Monat " + "(Art. 14(3) VO 2018/1725). Anfragen gehen typischerweise an den DPO " + "der Institution." + ), + }, + { + "id": "eu_rights_erasure", + "label": "Recht auf Loeschung (Art. 
19 VO 2018/1725)", + "level": 2, "parent": "eu_rights", + "patterns": [ + r"art(?:icle)?\s*19\s+(?:vo|regulation|verordnung)", + r"art\.\s*19", + r"recht\s+auf\s+(?:\w+\s+)?l(?:oe|ö)schung", + r"right\s+to\s+erasure", + r"right\s+to\s+be\s+forgotten", + ], + "severity": "LOW", + "hint": ( + "Art. 19 VO 2018/1725 (entspricht Art. 17 DSGVO): " + "Recht auf Loeschung bei Zweckentfall, Widerruf der Einwilligung " + "oder unrechtmaessiger Verarbeitung. Erwaehnen Sie auch die " + "Ausnahmen fuer EU-Organe: Archivzwecke im oeffentlichen Interesse, " + "gesetzliche Aufbewahrungspflichten." + ), + }, + + { + "id": "eu_rights_restriction", + "label": "Recht auf Einschraenkung (Art. 20 VO 2018/1725)", + "level": 2, "parent": "eu_rights", + "patterns": [ + r"art(?:icle)?\s*20\s+(?:vo|regulation|verordnung)", + r"art\.\s*20", + r"einschr(?:ae|ä)nkung\s+der\s+verarbeitung", + r"right\s+to\s+restrict(?:ion)?", + ], + "severity": "LOW", + "hint": ( + "Art. 20 VO 2018/1725 (entspricht Art. 18 DSGVO): " + "Recht auf Einschraenkung der Verarbeitung bei bestrittener " + "Richtigkeit, unrechtmaessiger Verarbeitung oder laufendem " + "Widerspruch. Wird am haeufigsten vergessen." + ), + }, + { + "id": "eu_rights_automated", + "label": "Automatisierte Entscheidungen (Art. 24 VO 2018/1725)", + "level": 2, "parent": "eu_rights", + "patterns": [ + r"art(?:icle)?\s*24\s+(?:vo|regulation|verordnung)", + r"art\.\s*24", + r"automatisierte\s+entscheidung", + r"automated\s+(?:decision|individual)", + r"profiling", + ], + "severity": "LOW", + "hint": ( + "Art. 24 VO 2018/1725 (entspricht Art. 22 DSGVO): " + "Bei automatisierten Einzelentscheidungen muessen Logik, " + "Tragweite und Auswirkungen erklaert werden. Falls kein " + "Profiling stattfindet, explizit verneinen." + ), + }, + + # == L1: Beschwerderecht beim EDSB ===================================== + { + "id": "eu_complaint", + "label": "Beschwerderecht beim EDSB (Art. 
15(1)(i) VO 2018/1725)", + "level": 1, "parent": None, + "patterns": [ + r"beschwerderecht", + r"right\s+to\s+lodge\s+a\s+complaint", + r"beschwerde.*(?:edsb|edps)", + r"edsb", + r"edps", + r"europ(?:ae|ä)isch\w*\s+datenschutzbeauftragt", + r"european\s+data\s+protection\s+supervisor", + r"contr(?:o|ô)leur\s+europ(?:e|é)en", + r"art(?:icle)?\s*63", + ], + "severity": "HIGH", + "hint": ( + "Art. 15(1)(i) VO 2018/1725: Bei EU-Organen ist der EDSB " + "(Europaeischer Datenschutzbeauftragter / European Data Protection " + "Supervisor) die zustaendige Aufsichtsbehoerde — NICHT die nationalen " + "Datenschutzbehoerden. Kontakt: edps@edps.europa.eu, " + "Rue Wiertz 60, B-1047 Bruxelles. Haeufiger Fehler: Verweis auf " + "nationale Aufsichtsbehoerde statt EDSB." + ), + }, + { + "id": "eu_complaint_edps_contact", + "label": "EDSB-Kontaktdaten angegeben", + "level": 2, "parent": "eu_complaint", + "patterns": [ + r"edps@edps\.europa\.eu", + r"edps\.europa\.eu", + r"edsb.*(?:kontakt|anschrift|adresse|e[\-\s]?mail|wiertz)", + r"edps.*(?:contact|address|e[\-\s]?mail|wiertz)", + r"rue\s+wiertz", + ], + "severity": "MEDIUM", + "hint": ( + "Vollstaendige EDSB-Kontaktdaten angeben: " + "Europaeischer Datenschutzbeauftragter (EDSB), " + "Rue Wiertz 60, B-1047 Bruxelles/Bruessel, " + "edps@edps.europa.eu, https://edps.europa.eu. " + "Haeufiger Fehler: Nur 'EDSB' erwaehnt ohne Kontaktdaten." 
+ ), + }, +] diff --git a/backend-compliance/compliance/services/doc_checks/runner.py b/backend-compliance/compliance/services/doc_checks/runner.py index 70e1bd2..96ce71a 100644 --- a/backend-compliance/compliance/services/doc_checks/runner.py +++ b/backend-compliance/compliance/services/doc_checks/runner.py @@ -15,6 +15,7 @@ from .impressum_checks import IMPRESSUM_CHECKLIST from .cookie_checks import COOKIE_CHECKLIST from .social_media_checks import JOINT_CONTROLLER_CHECKLIST from .dsfa_checks import DSFA_CHECKLIST +from .eu_institution_checks import EU_INSTITUTION_CHECKLIST logger = logging.getLogger(__name__) @@ -35,6 +36,7 @@ _CHECKLIST_MAP = { "social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"), "joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"), "dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"), + "eu_institution": (EU_INSTITUTION_CHECKLIST, "VO (EU) 2018/1725"), } @@ -218,6 +220,11 @@ def classify_document_type(title: str, url: str) -> str: if any(kw in combined for kw in ["social media", "facebook", "instagram", "linkedin", "fanpage"]): if any(kw in combined for kw in ["datenschutzerkl", "datenschutz für", "datenschutzinformation"]): return "social_media" + # EU institution check BEFORE generic privacy — 2018/1725 is more specific + if any(kw in combined for kw in ["2018/1725", "2018 1725", "regulation (eu)", + "verordnung (eu)", "edsb", "edps", + "european data protection supervisor"]): + return "eu_institution" if any(kw in combined for kw in ["datenschutz", "privacy", "dsgvo", "data protection", "données"]): return "dse" if any(kw in combined for kw in ["widerruf", "withdrawal", "rétractation", "desistimiento"]): diff --git a/consent-tester/checks/banner_runner.py b/consent-tester/checks/banner_runner.py new file mode 100644 index 0000000..fe0ef67 --- /dev/null +++ b/consent-tester/checks/banner_runner.py @@ -0,0 +1,175 @@ +""" +Banner Runner — maps scan results to the L1/L2 check hierarchy. 
+ +Takes the raw ScanResponse dict and produces a structured_checks list +compatible with ChecklistView (same format as document checks). +""" + +from checks.banner_checks import BANNER_CHECKLIST + + +def map_scan_to_checks(scan_result: dict) -> dict: + """Map a /scan response to the L1/L2 banner check hierarchy. + + Returns dict with: + - structured_checks: list of CheckItem dicts + - completeness_pct: L1 pass rate (0-100) + - correctness_pct: L2 pass rate (0-100) + """ + # Collect all violation codes from every source + violation_codes = _collect_violation_codes(scan_result) + + # Collect pass codes — some checks produce boolean signals, not violations + pass_codes = _collect_pass_codes(scan_result) + + # Build structured checks + checks: list[dict] = [] + l1_checks: list[dict] = [] + l2_checks: list[dict] = [] + + for defn in BANNER_CHECKLIST: + key = defn["check_key"] + level = defn["level"] + parent = defn.get("parent") + + # Determine pass/fail + is_violation_key = key in violation_codes + is_pass_key = key in pass_codes + + # For checks whose check_key appears in violations → failed + # For checks whose check_key appears only in passes → passed + # For checks where neither → assume passed (not tested = no finding) + if is_violation_key: + passed = False + matched_text = violation_codes[key] + elif is_pass_key: + passed = True + matched_text = pass_codes.get(key, "") + else: + # Key not found in violations or explicit passes. + # If the scan ran (banner detected) → assume passed. + # If banner not detected → only banner_detected fails. 
+ passed = scan_result.get("banner_detected", False) or key == "banner_detected" + if key == "banner_detected": + passed = scan_result.get("banner_detected", False) + matched_text = "" + + # L2 checks are skipped if their parent L1 failed + skipped = False + if level == 2 and parent: + parent_check = next( + (c for c in checks if c["id"] == parent), None + ) + if parent_check and not parent_check["passed"]: + skipped = True + + item = { + "id": defn["id"], + "label": defn["label"], + "passed": passed and not skipped, + "severity": defn["severity"], + "level": level, + "parent": parent, + "skipped": skipped, + "hint": defn.get("hint", ""), + "matched_text": matched_text if passed else "", + } + checks.append(item) + + if level == 1: + l1_checks.append(item) + elif level == 2: + l2_checks.append(item) + + # Compute percentages + l1_total = len(l1_checks) + l1_passed = sum(1 for c in l1_checks if c["passed"]) + completeness_pct = round(l1_passed / l1_total * 100) if l1_total else 0 + + l2_active = [c for c in l2_checks if not c["skipped"]] + l2_passed = sum(1 for c in l2_active if c["passed"]) + correctness_pct = round(l2_passed / len(l2_active) * 100) if l2_active else 0 + + return { + "structured_checks": checks, + "completeness_pct": completeness_pct, + "correctness_pct": correctness_pct, + }
+
+
+def _collect_violation_codes(scan: dict) -> dict[str, str]:
+    """Collect check_key → violation text from all sources.
+
+    Explicit violations are read in a fixed order: banner text violations,
+    then phase A (``before_consent``), then phase B (``after_reject``). A
+    later source overwrites an earlier entry that uses the same code.
+
+    Derived signals (tracking services or tracking cookies seen before
+    consent, new tracking after reject) are only added when no explicit
+    violation already occupies that key.
+
+    Args:
+        scan: Raw scan dict; ``banner_checks`` and ``phases`` are optional
+            and may be missing or None.
+
+    Returns:
+        Mapping of check key to a short evidence string. Violation texts are
+        truncated to 120 characters; derived entries list at most 5 names.
+    """
+    codes: dict[str, str] = {}
+
+    # ``or {}`` / ``or []`` also covers keys that are present but None.
+    phases = scan.get("phases") or {}
+    phase_a = phases.get("before_consent") or {}
+    phase_b = phases.get("after_reject") or {}
+
+    # Explicit violations: banner text, then phase A, then phase B
+    for source in (scan.get("banner_checks") or {}, phase_a, phase_b):
+        for v in source.get("violations") or []:
+            code = v.get("code", "")
+            if code:
+                codes[code] = (v.get("text") or "")[:120]
+
+    # Tracking services in phase A → tracking_before_consent
+    tracking_a = phase_a.get("tracking_services") or []
+    if tracking_a and "tracking_before_consent" not in codes:
+        codes["tracking_before_consent"] = ", ".join(tracking_a[:5])
+
+    # Cookies before consent → cookies_before_consent
+    tracking_cookies = [
+        c for c in phase_a.get("cookies") or [] if _is_tracking_cookie(c)
+    ]
+    if tracking_cookies and "cookies_before_consent" not in codes:
+        codes["cookies_before_consent"] = ", ".join(tracking_cookies[:5])
+
+    # New tracking after reject → tracking_after_reject
+    new_tracking_b = phase_b.get("new_tracking") or []
+    if new_tracking_b and "tracking_after_reject" not in codes:
+        codes["tracking_after_reject"] = ", ".join(new_tracking_b[:5])
+
+    return codes
+
+
+def _collect_pass_codes(scan: dict) -> dict[str, str]:
+    """Collect explicit pass signals from scan results.
+
+    Args:
+        scan: Raw scan dict; reads ``banner_detected``, ``banner_provider``
+            and the ``has_impressum_link`` / ``has_dse_link`` flags inside
+            ``banner_checks``.
+
+    Returns:
+        Mapping of check key to the evidence text for every check that the
+        scan positively confirmed.
+    """
+    passes: dict[str, str] = {}
+    provider = scan.get("banner_provider") or ""
+
+    # Banner detected — the provider is the evidence; an empty provider
+    # (key present but "") falls back to the literal "detected".
+    if scan.get("banner_detected"):
+        passes["banner_detected"] = provider or "detected"
+
+    # Provider named
+    if provider:
+        passes["banner_provider_named"] = provider
+
+    # Impressum / DSE links
+    bc = scan.get("banner_checks") or {}
+    if bc.get("has_impressum_link"):
+        passes["impressum_link"] = "Impressum-Link gefunden"
+    if bc.get("has_dse_link"):
+        passes["dse_link"] = "DSE-Link gefunden"
+
+    return passes
+
+
+# Cookie-name prefixes; the vendor appends an id/suffix (e.g. "_ga_<id>").
+_TRACKING_COOKIE_PREFIXES = (
+    "_ga", "_gid", "_fbp", "_fbc", "_gcl", "_pin",
+    "_tt_", "li_sugr", "_hj", "mp_", "ajs_", "_clck", "_clsk",
+)
+
+# Complete cookie names. Matched exactly: used as prefixes they would also
+# flag unrelated cookies such as "frontend" or "IDENTITY".
+_TRACKING_COOKIE_EXACT = frozenset({"IDE", "fr"})
+
+
+def _is_tracking_cookie(name: str) -> bool:
+    """Check if a cookie name is a known tracking cookie.
+
+    Returns True when ``name`` equals one of ``_TRACKING_COOKIE_EXACT`` or
+    starts with one of ``_TRACKING_COOKIE_PREFIXES``. Matching is
+    case-sensitive.
+    """
+    return name in _TRACKING_COOKIE_EXACT or name.startswith(_TRACKING_COOKIE_PREFIXES)
diff --git a/consent-tester/main.py b/consent-tester/main.py index 64fc3ab..93cbc8e 100644 --- a/consent-tester/main.py +++ 
b/consent-tester/main.py @@ -16,6 +16,7 @@ from services.consent_scanner import run_consent_test, ConsentTestResult from services.authenticated_scanner import run_authenticated_test, AuthTestResult from services.playwright_scanner import scan_website_playwright from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult +from checks.banner_runner import map_scan_to_checks logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s") logger = logging.getLogger(__name__) @@ -44,6 +45,9 @@ class ScanResponse(BaseModel): scanned_at: str category_tests: list = [] banner_checks: dict = {} + structured_checks: list = [] + completeness_pct: int = 0 + correctness_pct: int = 0 @app.get("/health") @@ -57,30 +61,47 @@ async def scan_consent(req: ScanRequest): logger.info("Starting consent test for %s", req.url) result = await run_consent_test(req.url, req.timeout_per_phase) + # Build raw response dict for structured check mapping + phases = { + "before_consent": { + "scripts": result.before_scripts, + "cookies": result.before_cookies, + "tracking_services": result.before_tracking, + "violations": [v.__dict__ for v in result.before_violations], + }, + "after_reject": { + "scripts": result.reject_scripts, + "cookies": result.reject_cookies, + "new_tracking": result.reject_new_tracking, + "violations": [v.__dict__ for v in result.reject_violations], + }, + "after_accept": { + "scripts": result.accept_scripts, + "cookies": result.accept_cookies, + "new_tracking": result.accept_new_tracking, + "undocumented": result.accept_undocumented, + }, + } + banner_checks_data = { + "has_impressum_link": result.banner_has_impressum_link, + "has_dse_link": result.banner_has_dse_link, + "violations": [v.__dict__ for v in result.banner_text_violations], + } + + # Map to L1/L2 hierarchy + raw_for_mapping = { + "banner_detected": result.banner_detected, + "banner_provider": result.banner_provider, + "phases": phases, + "banner_checks": banner_checks_data, 
+ } + check_result = map_scan_to_checks(raw_for_mapping) + return ScanResponse( url=req.url, banner_detected=result.banner_detected, banner_provider=result.banner_provider, - phases={ - "before_consent": { - "scripts": result.before_scripts, - "cookies": result.before_cookies, - "tracking_services": result.before_tracking, - "violations": [v.__dict__ for v in result.before_violations], - }, - "after_reject": { - "scripts": result.reject_scripts, - "cookies": result.reject_cookies, - "new_tracking": result.reject_new_tracking, - "violations": [v.__dict__ for v in result.reject_violations], - }, - "after_accept": { - "scripts": result.accept_scripts, - "cookies": result.accept_cookies, - "new_tracking": result.accept_new_tracking, - "undocumented": result.accept_undocumented, - }, - }, + phases=phases, summary={ "critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"), "high": len(result.before_violations) + sum(1 for v in result.banner_text_violations if v.severity == "HIGH"), @@ -90,11 +111,10 @@ async def scan_consent(req: ScanRequest): "categories_tested": len(result.category_tests), "banner_text_issues": len(result.banner_text_violations), }, - banner_checks={ - "has_impressum_link": result.banner_has_impressum_link, - "has_dse_link": result.banner_has_dse_link, - "violations": [v.__dict__ for v in result.banner_text_violations], - }, + banner_checks=banner_checks_data, + structured_checks=check_result["structured_checks"], + completeness_pct=check_result["completeness_pct"], + correctness_pct=check_result["correctness_pct"], scanned_at=datetime.now(timezone.utc).isoformat(), category_tests=[{ "category": ct.category, diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index 75be505..e5aa4d0 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -23,6 +23,8 @@ from urllib.parse import urlparse, urljoin from playwright.async_api import Page +from 
services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect + logger = logging.getLogger(__name__) # Legal document keywords in all EU/EEA official languages. @@ -216,11 +218,36 @@ async def discover_dsi_documents( seen_titles: set[str] = set() try: - # Step 1: Load the page - await page.goto(url, wait_until="networkidle", timeout=60000) + # Step 1: Load the page (with networkidle → domcontentloaded fallback) + await goto_resilient(page, url, timeout=60000) await page.wait_for_timeout(2000) - # Step 1b: Self-extraction — if the URL itself is a DSI page, + # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF) + final_url = page.url + if is_pdf_redirect(url, final_url): + is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower()) + if is_dsi_url: + result.documents.append(DiscoveredDSI( + title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung", + url=final_url, + source_url=url, + language=dsi_lang or "de", + doc_type="pdf", + text="[PDF — Textextraktion erforderlich]", + )) + seen_urls.add(url) + seen_urls.add(final_url) + logger.info("PDF redirect detected: %s -> %s", url, final_url) + # Return early — a PDF redirect means no HTML content to scan + result.total_found = len(result.documents) + return result + + # Step 1b: Try dismissing cookie consent banners before extraction. + # Many German sites (dm.de, Zalando, etc.) block page content behind + # a consent wall. Dismissing it reveals the actual DSI text. + await try_dismiss_consent_banner(page) + + # Step 1c: Self-extraction — if the URL itself is a DSI page, # extract its full text as the first document. This handles the # case where the user provides the DSE URL directly (e.g. # example.com/datenschutz) instead of the homepage. 
@@ -251,6 +278,8 @@ async def discover_dsi_documents( )) seen_urls.add(url) logger.info("Self-extracted %d words from %s", self_wc, url) + else: + logger.info("Self-extraction too short (%d words) for %s", self_wc, url) except Exception as e: logger.warning("Self-extraction failed for %s: %s", url, e) @@ -323,58 +352,69 @@ async def discover_dsi_documents( if is_anchor: continue - # Navigate to page — wait for JS to load content - resp = await page.goto(href, wait_until="networkidle", timeout=45000) - if resp and resp.status < 400: - await page.wait_for_timeout(2000) - await _expand_all_interactive(page) - await page.wait_for_timeout(500) + # Navigate to page — with networkidle/domcontentloaded fallback + await goto_resilient(page, href, timeout=45000) + resp_url = page.url - # Extract text — try specific content areas, fall back to full body - text = await page.evaluate(""" - () => { - // Try progressively broader content selectors - const selectors = [ - '.article-content', '.page-content', '.entry-content', - '[class*="content-area"]', '[class*="main-content"]', - 'main article', 'main', 'article', - '[role="main"]', '.content', '#content', - ]; - for (const sel of selectors) { - const el = document.querySelector(sel); - if (el && el.textContent.trim().length > 200) { - return el.textContent.trim(); - } + # Check for PDF redirect on followed links + if is_pdf_redirect(href, resp_url): + result.documents.append(DiscoveredDSI( + title=title, url=resp_url, source_url=url, + language=lang, doc_type="pdf", + text="[PDF — Textextraktion erforderlich]", + )) + await goto_resilient(page, url, timeout=45000) + continue + + await try_dismiss_consent_banner(page) + await _expand_all_interactive(page) + await page.wait_for_timeout(500) + + # Extract text — try specific content areas, fall back to full body + text = await page.evaluate(""" + () => { + // Try progressively broader content selectors + const selectors = [ + '.article-content', '.page-content', '.entry-content', 
+ '[class*="content-area"]', '[class*="main-content"]', + 'main article', 'main', 'article', + '[role="main"]', '.content', '#content', + ]; + for (const sel of selectors) { + const el = document.querySelector(sel); + if (el && el.textContent.trim().length > 200) { + return el.textContent.trim(); } - // Fallback: full body minus nav/header/footer - const body = document.body.cloneNode(true); - body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove()); - return body.textContent?.trim() || ''; } - """) - if text and len(text) > 50: - result.documents.append(DiscoveredDSI( - title=title, url=href, source_url=url, - language=lang, - doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page", - text=text[:50000], word_count=len(text.split()), - )) + // Fallback: full body minus nav/header/footer + const body = document.body.cloneNode(true); + body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove()); + return body.textContent?.trim() || ''; + } + """) + if text and len(text) > 50: + result.documents.append(DiscoveredDSI( + title=title, url=href, source_url=url, + language=lang, + doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page", + text=text[:50000], word_count=len(text.split()), + )) - # Recursive: search THIS page for more DSI links - new_links = await _find_dsi_links(page, base_domain) - for nl in new_links: - if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]: - pending_links.append(nl) + # Recursive: search THIS page for more DSI links + new_links = await _find_dsi_links(page, base_domain) + for nl in new_links: + if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]: + pending_links.append(nl) # Navigate back for next link - await page.goto(url, wait_until="networkidle", timeout=45000) + await 
goto_resilient(page, url, timeout=45000) await page.wait_for_timeout(500) await _expand_all_interactive(page) except Exception as e: result.errors.append(f"Failed to load {href}: {str(e)[:80]}") try: - await page.goto(url, wait_until="networkidle", timeout=45000) + await goto_resilient(page, url, timeout=45000) except Exception: pass
diff --git a/consent-tester/services/dsi_helpers.py b/consent-tester/services/dsi_helpers.py
new file mode 100644
index 0000000..b5ad847
--- /dev/null
+++ b/consent-tester/services/dsi_helpers.py
@@ -0,0 +1,152 @@
+"""
+DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
+
+Extracted from dsi_discovery.py to keep modules under 500 LOC.
+"""
+
+import logging
+from typing import Optional
+from urllib.parse import urlparse
+
+from playwright.async_api import Page, Response, TimeoutError as PlaywrightTimeout
+
+logger = logging.getLogger(__name__)
+
+# Object-storage hosts (exact host or any subdomain) treated as document downloads.
+_STORAGE_HOSTS = ("storage.googleapis.com", "blob.core.windows.net", "s3.amazonaws.com")
+
+
+async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> Optional[Response]:
+    """Navigate to URL with fallback: try networkidle first, then domcontentloaded.
+
+    SPAs like Zalando never reach networkidle because of continuous background
+    requests. Falling back to domcontentloaded + a short wait gives JS time to
+    render the main content without waiting for every network request to finish.
+
+    Args:
+        page: Playwright page to navigate.
+        url: Target URL.
+        timeout: Timeout in milliseconds, applied to each navigation attempt.
+
+    Returns:
+        Whatever ``page.goto`` returned for the attempt that succeeded, so
+        callers can check the HTTP status. Ignoring the result is fine.
+
+    Raises:
+        PlaywrightTimeout: If the domcontentloaded fallback times out as well.
+    """
+    try:
+        return await page.goto(url, wait_until="networkidle", timeout=timeout)
+    except PlaywrightTimeout:
+        logger.info("networkidle timeout for %s, falling back to domcontentloaded", url)
+        response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
+        await page.wait_for_timeout(5000)  # extra wait for JS rendering
+        return response
+
+
+async def try_dismiss_consent_banner(page: Page) -> bool:
+    """Try to dismiss cookie consent banners that block page content.
+
+    Tries, in order: the Usercentrics shadow DOM, a list of known accept-button
+    selectors (OneTrust, Cookiebot, Didomi, Borlabs, ...), and finally a
+    case-insensitive exact match on common accept-button labels. Banners
+    rendered inside iframes are not handled here.
+
+    Every step is best-effort: errors are logged at debug level and the next
+    strategy is tried.
+
+    Returns:
+        True if a banner button was clicked, False otherwise.
+    """
+    # 1) Usercentrics shadow DOM — most common for German sites
+    try:
+        uc_root = await page.query_selector("#usercentrics-root")
+        if uc_root:
+            clicked = await page.evaluate("""() => {
+                const root = document.querySelector('#usercentrics-root');
+                if (!root || !root.shadowRoot) return false;
+                const buttons = root.shadowRoot.querySelectorAll('button');
+                for (const btn of buttons) {
+                    const t = btn.textContent.trim().toLowerCase();
+                    if (t.includes('akzeptieren') || t.includes('accept')
+                        || t.includes('zustimmen') || t.includes('agree')) {
+                        btn.click();
+                        return true;
+                    }
+                }
+                return false;
+            }""")
+            if clicked:
+                logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
+                await page.wait_for_timeout(2000)
+                return True
+    except Exception as exc:
+        logger.debug("Usercentrics dismissal failed: %s", exc)
+
+    # 2) Standard DOM banners — OneTrust, Cookiebot, Didomi, Borlabs, etc.
+    accept_selectors = [
+        "#onetrust-accept-btn-handler",
+        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
+        "#didomi-notice-agree-button",
+        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
+        ".cmpboxbtn.cmpboxbtnyes",
+        ".klaro .cm-btn-accept",
+        ".cky-btn-accept",
+        "[class*='qc-cmp2-summary-buttons'] button:first-child",
+        "#tarteaucitronPersonalize2",
+    ]
+    for sel in accept_selectors:
+        try:
+            btn = page.locator(sel).first
+            if await btn.count() > 0 and await btn.is_visible():
+                await btn.click(timeout=3000)
+                logger.info("Dismissed consent banner via %s", sel)
+                await page.wait_for_timeout(2000)
+                return True
+        except Exception as exc:
+            logger.debug("Consent selector %s failed: %s", sel, exc)
+
+    # 3) Generic text-based button search
+    accept_texts = [
+        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
+        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
+        "Einverstanden", "Ich stimme zu",
+    ]
+    try:
+        clicked = await page.evaluate("""(texts) => {
+            const wanted = new Set(texts.map((s) => s.toLowerCase()));
+            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
+                const t = (btn.textContent || '').trim().toLowerCase();
+                if (wanted.has(t)) { btn.click(); return true; }
+            }
+            return false;
+        }""", accept_texts)
+        if clicked:
+            logger.info("Dismissed consent banner via generic text match")
+            await page.wait_for_timeout(2000)
+            return True
+    except Exception as exc:
+        logger.debug("Generic consent text match failed: %s", exc)
+
+    return False
+
+
+def is_pdf_redirect(original_url: str, final_url: str) -> bool:
+    """Check if the page redirected to a PDF or external storage.
+
+    Only ``final_url`` is inspected; ``original_url`` is currently unused and
+    kept so existing call sites keep working.
+
+    The URL is parsed so that only the path is tested for a ``.pdf`` suffix
+    (query strings and fragments are ignored) and only the host is tested
+    against the storage hosts (a storage hostname inside a query parameter
+    does not count).
+    """
+    try:
+        parsed = urlparse(final_url.lower())
+        host = parsed.hostname or ""
+    except ValueError:
+        return False
+    if parsed.path.endswith(".pdf"):
+        return True
+    return any(host == h or host.endswith("." + h) for h in _STORAGE_HOSTS)