From b39c1d5dcedf00df6cac912096716dcb550acb3e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 28 Apr 2026 17:53:44 +0200 Subject: [PATCH 001/413] feat: DSR Prozessbeschreibungen Art. 15-21 mit Swim-Lane-Diagrammen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 7 vollstaendige Prozessbeschreibungen fuer den Document Generator: - Art. 15: Auskunftsrecht (30 Tage, 6 Schritte, Informationskatalog) - Art. 16: Berichtigungsrecht (14 Tage, inkl. Art. 19 Mitteilung) - Art. 17: Loeschungsrecht (14 Tage, Art. 17(3) Ausnahmen-Checkliste) - Art. 18: Einschraenkungsrecht (14 Tage, erlaubte Verarbeitung) - Art. 19: Mitteilungspflicht (automatisch bei Art. 16/17/18) - Art. 20: Datenuebertragbarkeit (30 Tage, JSON/CSV/XML Export) - Art. 21: Widerspruchsrecht (30 Tage, Sonderfall Direktwerbung) Jede Beschreibung enthaelt: - Mermaid Swim-Lane-Diagramm (Betroffener/Sachbearbeitung/Fachabteilung/DSB) - Detaillierte Schritt-Tabelle mit Verantwortlichkeiten und Fristen - Rechtsgrundlagen-Verweise - Firmen-Platzhalter (FIRMENNAME, VERSION, DATUM, DSB_NAME) Integration: - 7 neue Typen in VALID_DOCUMENT_TYPES (legal_template_routes.py) - Neue Kategorie "DSR-Prozesse" im Document Generator Frontend - DSR types-core.ts: templateType Feld verknuepft DSR → Document Generator - Migration 085 seeded die Templates in die legal_templates Tabelle [migration-approved] Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/sdk/document-generator/_constants.ts | 4 + admin-compliance/lib/sdk/dsr/types-core.ts | 20 +- .../compliance/api/legal_template_routes.py | 8 + .../migrations/085_dsr_process_templates.sql | 466 ++++++++++++++++++ 4 files changed, 492 insertions(+), 6 deletions(-) create mode 100644 backend-compliance/migrations/085_dsr_process_templates.sql diff --git a/admin-compliance/app/sdk/document-generator/_constants.ts b/admin-compliance/app/sdk/document-generator/_constants.ts index e009798..301350c 100644 --- a/admin-compliance/app/sdk/document-generator/_constants.ts +++ b/admin-compliance/app/sdk/document-generator/_constants.ts @@ -18,6 +18,10 @@ export const CATEGORIES: { key: string; label: string; types: string[] | null }[ { key: 'cloud', label: 'Cloud', types: ['cloud_service_agreement'] }, { key: 'misc', label: 'Weitere', types: ['community_guidelines', 'copyright_policy', 'data_usage_clause'] }, { key: 'dsfa', label: 'DSFA', types: ['dsfa'] }, + { key: 'dsr', label: 'DSR-Prozesse', types: [ + 'dsr_process_art15', 'dsr_process_art16', 'dsr_process_art17', + 'dsr_process_art18', 'dsr_process_art19', 'dsr_process_art20', 'dsr_process_art21', + ]}, ] // ============================================================================= diff --git a/admin-compliance/lib/sdk/dsr/types-core.ts b/admin-compliance/lib/sdk/dsr/types-core.ts index 6c20701..ac99417 100644 --- a/admin-compliance/lib/sdk/dsr/types-core.ts +++ b/admin-compliance/lib/sdk/dsr/types-core.ts @@ -52,6 +52,8 @@ export interface DSRTypeInfo { color: string bgColor: string processDocument?: string + /** Document type in the legal_templates table for the Document Generator */ + templateType?: string } export const DSR_TYPE_INFO: Record = { @@ -60,41 +62,47 @@ export const DSR_TYPE_INFO: Record = { description: 'Recht auf Auskunft ueber gespeicherte personenbezogene Daten', defaultDeadlineDays: 30, maxExtensionMonths: 2, color: 'text-blue-700', bgColor: 'bg-blue-100', - processDocument: 'Prozessbeschreibung Art. 15 DSGVO_v02.pdf' + processDocument: 'Prozessbeschreibung Art. 
15 DSGVO_v02.pdf', + templateType: 'dsr_process_art15', }, rectification: { type: 'rectification', article: 'Art. 16', label: 'Berichtigungsrecht', labelShort: 'Berichtigung', description: 'Recht auf Berichtigung unrichtiger personenbezogener Daten', defaultDeadlineDays: 14, maxExtensionMonths: 2, color: 'text-yellow-700', bgColor: 'bg-yellow-100', - processDocument: 'Prozessbeschriebung Art. 16 DSGVO_v02.pdf' + processDocument: 'Prozessbeschreibung Art. 16 DSGVO_v02.pdf', + templateType: 'dsr_process_art16', }, erasure: { type: 'erasure', article: 'Art. 17', label: 'Loeschungsrecht', labelShort: 'Loeschung', description: 'Recht auf Loeschung personenbezogener Daten ("Recht auf Vergessenwerden")', defaultDeadlineDays: 14, maxExtensionMonths: 2, color: 'text-red-700', bgColor: 'bg-red-100', - processDocument: 'Prozessbeschreibung Art. 17 DSGVO_v03.pdf' + processDocument: 'Prozessbeschreibung Art. 17 DSGVO_v03.pdf', + templateType: 'dsr_process_art17', }, restriction: { type: 'restriction', article: 'Art. 18', label: 'Einschraenkungsrecht', labelShort: 'Einschraenkung', description: 'Recht auf Einschraenkung der Verarbeitung', defaultDeadlineDays: 14, maxExtensionMonths: 2, color: 'text-orange-700', bgColor: 'bg-orange-100', - processDocument: 'Prozessbeschreibung Art. 18 DSGVO_v01.pdf' + processDocument: 'Prozessbeschreibung Art. 18 DSGVO_v01.pdf', + templateType: 'dsr_process_art18', }, portability: { type: 'portability', article: 'Art. 20', label: 'Datenuebertragbarkeit', labelShort: 'Uebertragung', description: 'Recht auf Datenuebertragbarkeit in maschinenlesbarem Format', defaultDeadlineDays: 30, maxExtensionMonths: 2, color: 'text-purple-700', bgColor: 'bg-purple-100', - processDocument: 'Prozessbeschreibung Art. 20 DSGVO_v02.pdf' + processDocument: 'Prozessbeschreibung Art. 20 DSGVO_v02.pdf', + templateType: 'dsr_process_art20', }, objection: { type: 'objection', article: 'Art. 21', label: 'Widerspruchsrecht', labelShort: 'Widerspruch', description: 'Recht auf Widerspruch gegen die Verarbeitung', defaultDeadlineDays: 30, maxExtensionMonths: 0, - color: 'text-gray-700', bgColor: 'bg-gray-100' + color: 'text-gray-700', bgColor: 'bg-gray-100', + templateType: 'dsr_process_art21', } } diff --git a/backend-compliance/compliance/api/legal_template_routes.py b/backend-compliance/compliance/api/legal_template_routes.py index d16b8ef..01e9f1f 100644 --- a/backend-compliance/compliance/api/legal_template_routes.py +++ b/backend-compliance/compliance/api/legal_template_routes.py @@ -101,6 +101,14 @@ VALID_DOCUMENT_TYPES = { "tom_documentation", "loeschkonzept", "pflichtenregister", + # DSR Process Documents (Migration 085) + "dsr_process_art15", + "dsr_process_art16", + "dsr_process_art17", + "dsr_process_art18", + "dsr_process_art19", + "dsr_process_art20", + "dsr_process_art21", } VALID_STATUSES = {"published", "draft", "archived"} diff --git a/backend-compliance/migrations/085_dsr_process_templates.sql b/backend-compliance/migrations/085_dsr_process_templates.sql new file mode 100644 index 0000000..fa35986 --- /dev/null +++ b/backend-compliance/migrations/085_dsr_process_templates.sql @@ -0,0 +1,466 @@ +-- Migration 085: DSR Process Document Templates (Art. 15-21 DSGVO) +-- Prozessbeschreibungen mit Swim-Lane-Diagrammen fuer den Document Generator + +INSERT INTO compliance_legal_templates (id, tenant_id, document_type, title, language, status, content, placeholders, created_at, updated_at) +VALUES + +-- ============================================================================ +-- Art. 
15 DSGVO — Auskunftsrecht +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art15', +'Prozessbeschreibung: Auskunftsrecht (Art. 15 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Auskunftsrecht nach Art. 15 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck und Geltungsbereich

+

Diese Prozessbeschreibung regelt den Ablauf bei Auskunftsersuchen betroffener Personen nach Art. 15 DSGVO. Jede natuerliche Person hat das Recht, eine Bestaetigung darueber zu verlangen, ob personenbezogene Daten verarbeitet werden, und ggf. Auskunft ueber diese Daten zu erhalten.

+ +

2. Rechtsgrundlage

+
    +
  • Art. 15 DSGVO — Auskunftsrecht der betroffenen Person
  • +
  • Art. 12 DSGVO — Transparente Information und Kommunikation
  • +
  • Erwaegungsgrund 63 — Recht auf Zugang zu personenbezogenen Daten
  • +
+ +

3. Frist

+

30 Tage nach Eingang der Anfrage. Verlaengerung um weitere 2 Monate bei komplexen Anfragen moeglich (Art. 12 Abs. 3 DSGVO), mit Begruendung innerhalb der ersten 30 Tage.

+ +

4. Prozessablauf (Swim Lane)

+
+graph TD
+    subgraph Betroffener
+        A1[Auskunftsersuchen einreichen]
+        A7[Auskunft erhalten und pruefen]
+    end
+    subgraph Empfang/Sachbearbeitung
+        B1[Anfrage erfassen und dokumentieren]
+        B2[Identitaet pruefen]
+        B3{Identitaet bestaetigt?}
+    end
+    subgraph Fachabteilung
+        C1[Alle Verarbeitungstaetigkeiten identifizieren]
+        C2[Personenbezogene Daten zusammenstellen]
+        C3[Daten auf Drittrechte pruefen]
+        C4[Auskunft erstellen]
+    end
+    subgraph Datenschutzbeauftragter
+        D1[Auskunft auf Vollstaendigkeit pruefen]
+        D2[Freigabe erteilen]
+    end
+
+    A1 --> B1 --> B2 --> B3
+    B3 -- Nein --> B2
+    B3 -- Ja --> C1 --> C2 --> C3 --> C4 --> D1 --> D2 --> A7
+
+ +

5. Detaillierte Schritte

+ + + + + + + + +
<tr><th>Schritt</th><th>Verantwortlich</th><th>Beschreibung</th><th>Frist</th></tr>
<tr><td>1. Eingang</td><td>Empfang</td><td>Anfrage dokumentieren, Eingangsbestaetigung senden</td><td>1 Tag</td></tr>
<tr><td>2. ID-Pruefung</td><td>Sachbearbeitung</td><td>Identitaet des Antragstellers verifizieren</td><td>3 Tage</td></tr>
<tr><td>3. Datenerhebung</td><td>Fachabteilung</td><td>Alle verarbeiteten Daten zusammenstellen</td><td>14 Tage</td></tr>
<tr><td>4. Pruefung Drittrechte</td><td>Fachabteilung</td><td>Rechte Dritter pruefen und ggf. schwaerzen</td><td>5 Tage</td></tr>
<tr><td>5. QS + Freigabe</td><td>DSB</td><td>Vollstaendigkeit und Richtigkeit pruefen</td><td>3 Tage</td></tr>
<tr><td>6. Versand</td><td>Sachbearbeitung</td><td>Auskunft an Betroffenen uebermitteln</td><td>1 Tag</td></tr>
+ +

6. Zu liefernde Informationen (Art. 15 Abs. 1)

+
    +
  • Verarbeitungszwecke
  • +
  • Kategorien personenbezogener Daten
  • +
  • Empfaenger oder Kategorien von Empfaengern
  • +
  • Speicherdauer oder Kriterien fuer die Festlegung
  • +
  • Bestehen eines Rechts auf Berichtigung, Loeschung, Einschraenkung, Widerspruch
  • +
  • Beschwerderecht bei einer Aufsichtsbehoerde
  • +
  • Herkunft der Daten (wenn nicht beim Betroffenen erhoben)
  • +
  • Bestehen einer automatisierten Entscheidungsfindung inkl. Profiling
  • +
  • Bei Drittlanduebermittlung: geeignete Garantien
  • +
+ +

7. Dokumentation

+

Alle Schritte werden im DSR-Modul von {{FIRMENNAME}} protokolliert inkl. Zeitstempel, Bearbeiter und Ergebnis.

', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 16 DSGVO — Recht auf Berichtigung +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art16', +'Prozessbeschreibung: Recht auf Berichtigung (Art. 16 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Recht auf Berichtigung nach Art. 16 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Regelung des Ablaufs bei Antraegen auf Berichtigung unrichtiger personenbezogener Daten nach Art. 16 DSGVO.

+ +

2. Rechtsgrundlage

+
    +
  • Art. 16 DSGVO — Recht auf Berichtigung
  • +
  • Art. 19 DSGVO — Mitteilungspflicht an Empfaenger
  • +
  • Art. 5 Abs. 1 lit. d DSGVO — Grundsatz der Richtigkeit
  • +
+ +

3. Frist

+

Unverzueglich, spaetestens innerhalb von 14 Tagen.

+ +

4. Prozessablauf

+
+graph TD
+    subgraph Betroffener
+        A1[Berichtigungsantrag stellen]
+        A5[Bestaetigung erhalten]
+    end
+    subgraph Sachbearbeitung
+        B1[Antrag erfassen]
+        B2[Identitaet pruefen]
+        B3[Unrichtigkeit pruefen]
+        B4{Berechtigt?}
+    end
+    subgraph Fachabteilung
+        C1[Daten in allen Systemen berichtigen]
+        C2[Empfaenger benachrichtigen Art. 19]
+    end
+    subgraph DSB
+        D1[Berichtigung dokumentieren]
+    end
+
+    A1 --> B1 --> B2 --> B3 --> B4
+    B4 -- Nein --> A5
+    B4 -- Ja --> C1 --> C2 --> D1 --> A5
+
+ +

5. Schritte

+ + + + + + + + +
<tr><th>Schritt</th><th>Verantwortlich</th><th>Beschreibung</th><th>Frist</th></tr>
<tr><td>1. Eingang</td><td>Empfang</td><td>Antrag dokumentieren</td><td>1 Tag</td></tr>
<tr><td>2. ID-Pruefung</td><td>Sachbearbeitung</td><td>Identitaet verifizieren</td><td>2 Tage</td></tr>
<tr><td>3. Sachpruefung</td><td>Sachbearbeitung</td><td>Unrichtigkeit der Daten bestaetigen</td><td>3 Tage</td></tr>
<tr><td>4. Berichtigung</td><td>Fachabteilung</td><td>Daten in allen Systemen korrigieren</td><td>3 Tage</td></tr>
<tr><td>5. Mitteilung Art. 19</td><td>Fachabteilung</td><td>Alle Empfaenger ueber Berichtigung informieren</td><td>3 Tage</td></tr>
<tr><td>6. Dokumentation</td><td>DSB</td><td>Berichtigung protokollieren</td><td>1 Tag</td></tr>
', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 17 DSGVO — Recht auf Loeschung +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art17', +'Prozessbeschreibung: Recht auf Loeschung (Art. 17 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Recht auf Loeschung nach Art. 17 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Regelung des Ablaufs bei Loeschungsersuchen (Recht auf Vergessenwerden) nach Art. 17 DSGVO.

+ +

2. Rechtsgrundlage

+
    +
  • Art. 17 DSGVO — Recht auf Loeschung
  • +
  • Art. 17 Abs. 3 DSGVO — Ausnahmen (Meinungsfreiheit, rechtliche Verpflichtung, oeffentliches Interesse, Rechtsansprueche)
  • +
  • Art. 19 DSGVO — Mitteilungspflicht an Empfaenger
  • +
+ +

3. Frist

+

Unverzueglich, spaetestens innerhalb von 14 Tagen.

+ +

4. Prozessablauf

+
+graph TD
+    subgraph Betroffener
+        A1[Loeschungsantrag stellen]
+        A6[Bestaetigung erhalten]
+    end
+    subgraph Sachbearbeitung
+        B1[Antrag erfassen + ID pruefen]
+        B2[Loeschungsgrund pruefen]
+    end
+    subgraph DSB
+        C1[Art. 17 Abs. 3 Ausnahmen pruefen]
+        C2{Ausnahme greift?}
+        C3[Ablehnung mit Begruendung]
+    end
+    subgraph Fachabteilung
+        D1[Daten in allen Systemen loeschen]
+        D2[Backups markieren]
+        D3[Empfaenger benachrichtigen Art. 19]
+        D4[Loeschprotokoll erstellen]
+    end
+
+    A1 --> B1 --> B2 --> C1 --> C2
+    C2 -- Ja --> C3 --> A6
+    C2 -- Nein --> D1 --> D2 --> D3 --> D4 --> A6
+
+ +

5. Art. 17 Abs. 3 Ausnahmen (Checkliste)

+ + + + + + + +
<tr><th>Ausnahme</th><th>Beschreibung</th></tr>
<tr><td>a) Meinungsfreiheit</td><td>Ausuebung des Rechts auf freie Meinungsaeusserung und Information</td></tr>
<tr><td>b) Rechtliche Verpflichtung</td><td>Erfuellung einer rechtlichen Verpflichtung (z.B. Aufbewahrungspflichten)</td></tr>
<tr><td>c) Oeffentliches Interesse</td><td>Gruende des oeffentlichen Interesses im Bereich Gesundheit</td></tr>
<tr><td>d) Archivzwecke</td><td>Im oeffentlichen Interesse liegende Archivzwecke, Forschung, Statistik</td></tr>
<tr><td>e) Rechtsansprueche</td><td>Geltendmachung, Ausuebung oder Verteidigung von Rechtsanspruechen</td></tr>
+ +

6. Schritte

+ + + + + + + +
<tr><th>Schritt</th><th>Verantwortlich</th><th>Beschreibung</th><th>Frist</th></tr>
<tr><td>1. Eingang + ID</td><td>Sachbearbeitung</td><td>Antrag dokumentieren, Identitaet verifizieren</td><td>3 Tage</td></tr>
<tr><td>2. Ausnahmepruefung</td><td>DSB</td><td>Art. 17 Abs. 3 Checkliste durchgehen</td><td>3 Tage</td></tr>
<tr><td>3. Loeschung</td><td>Fachabteilung</td><td>Daten in allen Systemen loeschen</td><td>3 Tage</td></tr>
<tr><td>4. Backup-Handling</td><td>IT</td><td>Backups markieren, bei naechstem Zyklus loeschen</td><td>3 Tage</td></tr>
<tr><td>5. Mitteilung Art. 19</td><td>Fachabteilung</td><td>Empfaenger ueber Loeschung informieren</td><td>2 Tage</td></tr>
', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 18 DSGVO — Recht auf Einschraenkung der Verarbeitung +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art18', +'Prozessbeschreibung: Einschraenkung der Verarbeitung (Art. 18 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Einschraenkung der Verarbeitung nach Art. 18 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Regelung des Ablaufs bei Antraegen auf Einschraenkung der Verarbeitung personenbezogener Daten nach Art. 18 DSGVO.

+ +

2. Rechtsgrundlage und Voraussetzungen

+

Die betroffene Person kann die Einschraenkung verlangen, wenn:

+
    +
  • a) Richtigkeit der Daten bestritten wird (fuer die Dauer der Ueberpruefung)
  • +
  • b) Verarbeitung unrechtmaessig ist und Betroffener Loeschung ablehnt
  • +
  • c) Verantwortlicher benoetigt die Daten nicht mehr, Betroffener benoetigt sie aber fuer Rechtsansprueche
  • +
  • d) Widerspruch nach Art. 21 eingelegt wurde (fuer die Dauer der Pruefung)
  • +
+ +

3. Frist

+

Unverzueglich, spaetestens innerhalb von 14 Tagen.

+ +

4. Prozessablauf

+
+graph TD
+    subgraph Betroffener
+        A1[Antrag auf Einschraenkung]
+        A5[Bestaetigung erhalten]
+    end
+    subgraph Sachbearbeitung
+        B1[Antrag erfassen + ID pruefen]
+        B2[Voraussetzung nach Art. 18 Abs. 1 pruefen]
+        B3{Berechtigt?}
+    end
+    subgraph IT/Fachabteilung
+        C1[Daten markieren/sperren]
+        C2[Verarbeitung einschraenken]
+        C3[Empfaenger benachrichtigen Art. 19]
+    end
+    subgraph DSB
+        D1[Dokumentation + Wiedervorlage]
+    end
+
+    A1 --> B1 --> B2 --> B3
+    B3 -- Nein --> A5
+    B3 -- Ja --> C1 --> C2 --> C3 --> D1 --> A5
+
+ +

5. Erlaubte Verarbeitung bei Einschraenkung (Art. 18 Abs. 2)

+

Eingeschraenkte Daten duerfen nur noch verarbeitet werden fuer:

+
    +
  • Speicherung
  • +
  • Mit Einwilligung der betroffenen Person
  • +
  • Zur Geltendmachung von Rechtsanspruechen
  • +
  • Zum Schutz der Rechte einer anderen Person
  • +
  • Aus Gruenden eines wichtigen oeffentlichen Interesses
  • +
', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 19 DSGVO — Mitteilungspflicht +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art19', +'Prozessbeschreibung: Mitteilungspflicht (Art. 19 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Mitteilungspflicht nach Art. 19 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Art. 19 DSGVO verpflichtet den Verantwortlichen, alle Empfaenger, denen personenbezogene Daten offengelegt wurden, ueber jede Berichtigung (Art. 16), Loeschung (Art. 17) oder Einschraenkung (Art. 18) zu informieren — es sei denn, dies ist unverhaeltnismaessig.

+ +

2. Rechtsgrundlage

+
    +
  • Art. 19 DSGVO — Mitteilungspflicht im Zusammenhang mit der Berichtigung oder Loeschung personenbezogener Daten oder der Einschraenkung der Verarbeitung
  • +
+ +

3. Ausloeser

+

Art. 19 wird automatisch ausgeloest bei:

+
    +
  • Erfolgreicher Berichtigung nach Art. 16
  • +
  • Erfolgreicher Loeschung nach Art. 17
  • +
  • Einschraenkung der Verarbeitung nach Art. 18
  • +
+ +

4. Prozessablauf

+
+graph TD
+    subgraph Ausloeser
+        A1[Berichtigung/Loeschung/Einschraenkung durchgefuehrt]
+    end
+    subgraph Sachbearbeitung
+        B1[Alle Empfaenger aus VVT identifizieren]
+        B2[Auftragsverarbeiter identifizieren]
+        B3{Mitteilung verhaeltnismaessig?}
+        B4[Mitteilung an jeden Empfaenger senden]
+        B5[Unverhaeltnismaessigkeit dokumentieren]
+    end
+    subgraph DSB
+        C1[Mitteilung dokumentieren]
+        C2[Betroffenen auf Verlangen ueber Empfaenger informieren]
+    end
+
+    A1 --> B1 --> B2 --> B3
+    B3 -- Nein --> B5 --> C1
+    B3 -- Ja --> B4 --> C1 --> C2
+
+ +

5. Pflicht zur Auskunft ueber Empfaenger

+

Der Verantwortliche muss dem Betroffenen auf Verlangen die Empfaenger mitteilen, an die die Daten offengelegt wurden.

', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 20 DSGVO — Recht auf Datenuebertragbarkeit +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art20', +'Prozessbeschreibung: Datenuebertragbarkeit (Art. 20 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Recht auf Datenuebertragbarkeit nach Art. 20 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Regelung des Ablaufs bei Antraegen auf Datenuebertragbarkeit. Die betroffene Person hat das Recht, die sie betreffenden personenbezogenen Daten in einem strukturierten, gaengigen und maschinenlesbaren Format zu erhalten.

+ +

2. Rechtsgrundlage und Voraussetzungen

+
    +
  • Art. 20 DSGVO — Recht auf Datenuebertragbarkeit
  • +
  • Gilt nur fuer Daten, die auf Einwilligung (Art. 6 Abs. 1 lit. a) oder Vertrag (Art. 6 Abs. 1 lit. b) basieren
  • +
  • Gilt nur fuer automatisiert verarbeitete Daten
  • +
  • Nur vom Betroffenen selbst bereitgestellte Daten
  • +
+ +

3. Frist

+

30 Tage nach Eingang der Anfrage.

+ +

4. Prozessablauf

+
+graph TD
+    subgraph Betroffener
+        A1[Antrag auf Datenuebertragung]
+        A5[Daten erhalten / Direktuebertragung bestaetigt]
+    end
+    subgraph Sachbearbeitung
+        B1[Antrag erfassen + ID pruefen]
+        B2[Rechtsgrundlage pruefen - Einwilligung oder Vertrag?]
+        B3{Voraussetzungen erfuellt?}
+    end
+    subgraph IT
+        C1[Daten identifizieren und extrahieren]
+        C2[In maschinenlesbares Format konvertieren - JSON/CSV/XML]
+        C3{Direktuebertragung gewuenscht?}
+        C4[Daten an Betroffenen uebergeben]
+        C5[Daten an anderen Verantwortlichen uebertragen]
+    end
+
+    A1 --> B1 --> B2 --> B3
+    B3 -- Nein --> A5
+    B3 -- Ja --> C1 --> C2 --> C3
+    C3 -- Nein --> C4 --> A5
+    C3 -- Ja --> C5 --> A5
+
+ +

5. Exportformate

+
    +
  • JSON (bevorzugt)
  • +
  • CSV (fuer tabellarische Daten)
  • +
  • XML (bei Bedarf)
  • +
', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()), + +-- ============================================================================ +-- Art. 21 DSGVO — Widerspruchsrecht +-- ============================================================================ +(gen_random_uuid(), '__default__', 'dsr_process_art21', +'Prozessbeschreibung: Widerspruchsrecht (Art. 21 DSGVO)', 'de', 'published', +'

Prozessbeschreibung: Widerspruchsrecht nach Art. 21 DSGVO

+

Verantwortliche Stelle: {{FIRMENNAME}}
+Version: {{VERSION}}
+Stand: {{DATUM}}
+Genehmigt durch: {{DSB_NAME}}

+ +

1. Zweck

+

Regelung des Ablaufs bei Widerspruechen gegen die Verarbeitung personenbezogener Daten nach Art. 21 DSGVO.

+ +

2. Rechtsgrundlage

+
    +
  • Art. 21 Abs. 1 DSGVO — Allgemeines Widerspruchsrecht (bei Verarbeitung nach Art. 6 Abs. 1 lit. e oder f)
  • +
  • Art. 21 Abs. 2 DSGVO — Widerspruch gegen Direktwerbung (absolutes Recht, keine Abwaegung)
  • +
  • Art. 21 Abs. 6 DSGVO — Widerspruch gegen wissenschaftliche/historische Forschung
  • +
+ +

3. Frist

+

30 Tage nach Eingang des Widerspruchs.

+ +

4. Prozessablauf

+
+graph TD
+    subgraph Betroffener
+        A1[Widerspruch einlegen]
+        A6[Ergebnis erhalten]
+    end
+    subgraph Sachbearbeitung
+        B1[Widerspruch erfassen + ID pruefen]
+        B2{Art des Widerspruchs?}
+    end
+    subgraph Fall: Direktwerbung
+        C1[Verarbeitung fuer Direktwerbung SOFORT einstellen]
+        C2[Bestaetigung senden]
+    end
+    subgraph Fall: Allgemeiner Widerspruch
+        D1[Zwingende schutzwuerdige Gruende pruefen]
+        D2{Zwingende Gruende vorhanden?}
+        D3[Verarbeitung einstellen]
+        D4[Widerspruch ablehnen mit Begruendung]
+    end
+
+    A1 --> B1 --> B2
+    B2 -- Direktwerbung Art. 21 Abs. 2 --> C1 --> C2 --> A6
+    B2 -- Allgemein Art. 21 Abs. 1 --> D1 --> D2
+    D2 -- Nein --> D3 --> A6
+    D2 -- Ja --> D4 --> A6
+
+ +

5. Besonderheit: Widerspruch gegen Direktwerbung

+

Absolutes Recht — keine Interessenabwaegung erforderlich. Bei Widerspruch gegen Direktwerbung muss die Verarbeitung sofort eingestellt werden. Dies umfasst auch Profiling, soweit es mit Direktwerbung zusammenhaengt.

+ +

6. Interessenabwaegung bei allgemeinem Widerspruch

+

Bei Widerspruch nach Art. 21 Abs. 1 muss der Verantwortliche nachweisen, dass zwingende schutzwuerdige Gruende fuer die Verarbeitung vorliegen, die die Interessen des Betroffenen ueberwiegen.

', +'["FIRMENNAME", "VERSION", "DATUM", "DSB_NAME"]'::jsonb, +NOW(), NOW()) + +ON CONFLICT DO NOTHING; From 3c9ac03ccce03c9a53a4dea7dd18261825ba6abb Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 28 Apr 2026 23:06:28 +0200 Subject: [PATCH 002/413] fix: show ComplianceAdvisor + PipelineSidebar without project selection Widgets were hidden behind projectId guard. Removed condition so new users can ask questions (e.g. "Wie lege ich ein Projekt an?") before creating a project. Co-Authored-By: Claude Opus 4.6 (1M context) --- admin-compliance/app/sdk/layout.tsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/admin-compliance/app/sdk/layout.tsx b/admin-compliance/app/sdk/layout.tsx index e125162..5bd3054 100644 --- a/admin-compliance/app/sdk/layout.tsx +++ b/admin-compliance/app/sdk/layout.tsx @@ -208,10 +208,10 @@ function SDKInnerLayout({ children }: { children: React.ReactNode }) { {isCommandBarOpen && setCommandBarOpen(false)} />} {/* Pipeline Sidebar (FAB on mobile/tablet, fixed on desktop xl+) */} - {projectId && } + - {/* Compliance Advisor Widget */} - {projectId && } + {/* Compliance Advisor Widget — immer sichtbar, auch ohne Projekt */} + ) } From ac8eb1bf991454ff3869353dc55aea8a275cf91d Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 28 Apr 2026 23:17:13 +0200 Subject: [PATCH 003/413] feat: "Als Email senden" Button im Compliance Advisor Chat-Verlauf wird als strukturiertes Beratungsprotokoll per Email an den DSB gesendet. Button erscheint im Header sobald Nachrichten vorhanden sind. Zeigt Checkmark nach erfolgreichem Versand. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/api/sdk/v1/agent/notify/route.ts | 30 ++++++++ .../sdk/ComplianceAdvisorWidget.tsx | 73 +++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 admin-compliance/app/api/sdk/v1/agent/notify/route.ts diff --git a/admin-compliance/app/api/sdk/v1/agent/notify/route.ts b/admin-compliance/app/api/sdk/v1/agent/notify/route.ts new file mode 100644 index 0000000..896f517 --- /dev/null +++ b/admin-compliance/app/api/sdk/v1/agent/notify/route.ts @@ -0,0 +1,30 @@ +/** + * Agent Notify API Proxy + * POST /api/sdk/v1/agent/notify → backend-compliance /api/compliance/agent/notify + */ + +import { NextRequest, NextResponse } from 'next/server' + +const BACKEND_URL = process.env.BACKEND_API_URL || 'http://backend-compliance:8002' + +export async function POST(request: NextRequest) { + try { + const body = await request.text() + const response = await fetch(`${BACKEND_URL}/api/compliance/agent/notify`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body, + signal: AbortSignal.timeout(15000), + }) + + if (!response.ok) { + const errorText = await response.text() + return NextResponse.json({ error: errorText }, { status: response.status }) + } + + return NextResponse.json(await response.json()) + } catch (error) { + console.error('Agent notify proxy error:', error) + return NextResponse.json({ error: 'Email-Versand fehlgeschlagen' }, { status: 503 }) + } +} diff --git a/admin-compliance/components/sdk/ComplianceAdvisorWidget.tsx b/admin-compliance/components/sdk/ComplianceAdvisorWidget.tsx index b2057c2..22b6208 100644 --- a/admin-compliance/components/sdk/ComplianceAdvisorWidget.tsx +++ b/admin-compliance/components/sdk/ComplianceAdvisorWidget.tsx @@ -141,6 +141,54 @@ export function ComplianceAdvisorWidget({ currentStep = 'default' }: ComplianceA setIsTyping(false) }, []) + const [emailSending, setEmailSending] = useState(false) + const 
[emailSent, setEmailSent] = useState(false) + + const handleSendAsEmail = useCallback(async () => { + if (messages.length === 0 || emailSending) return + setEmailSending(true) + try { + // Build HTML from chat messages + const qaPairs = messages.reduce<{ q: string; a: string }[]>((acc, m, i) => { + if (m.role === 'user') { + const next = messages[i + 1] + acc.push({ q: m.content, a: next?.role === 'agent' ? next.content : '(keine Antwort)' }) + } + return acc + }, []) + + const qaHtml = qaPairs.map(({ q, a }) => + `

Frage: ${q}

${a}

` + ).join('') + + const bodyHtml = ` +

Compliance Advisor — Beratungsprotokoll

+

Datum: ${new Date().toLocaleString('de-DE')} | Land: ${selectedCountry} | Kontext: ${currentStep}

+
+ ${qaHtml} +
+

Automatisch erstellt vom BreakPilot Compliance Advisor (Qwen)

+ ` + + await fetch('/api/sdk/v1/agent/notify', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + recipient: 'dsb@breakpilot.local', + subject: `Compliance Advisor — ${qaPairs.length} Fragen (${currentStep})`, + body_html: bodyHtml, + role: 'Datenschutzbeauftragter', + }), + }) + setEmailSent(true) + setTimeout(() => setEmailSent(false), 3000) + } catch (e) { + console.error('Email send failed:', e) + } finally { + setEmailSending(false) + } + }, [messages, emailSending, selectedCountry, currentStep]) + const handleKeyDown = (e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault() @@ -188,6 +236,31 @@ export function ComplianceAdvisorWidget({ currentStep = 'default' }: ComplianceA
+ {/* Send as Email */} + {messages.length > 0 && ( + + )} - ))} + {/* Tab Selection + Info */} +
+
+ {TABS.map(t => ( + + ))} +
+

{currentTab.info}

{/* URL Input */} @@ -136,10 +143,32 @@ export default function AgentPage() {
)} - {/* History (quick only) */} + {/* History */} {tab === 'quick' && ( { setUrl(r.url); analyze(r.url, mode) }} /> )} + {tab === 'scan' && scanHistory.length > 0 && ( +
+

Letzte Scans

+
+ {scanHistory.map((item, i) => ( + + ))} +
+
+ )} ) } diff --git a/backend-compliance/compliance/api/agent_analyze_routes.py b/backend-compliance/compliance/api/agent_analyze_routes.py index 51399b0..ac6ceb5 100644 --- a/backend-compliance/compliance/api/agent_analyze_routes.py +++ b/backend-compliance/compliance/api/agent_analyze_routes.py @@ -105,7 +105,7 @@ async def analyze_url(req: AnalyzeRequest): email_result = send_email( recipient=req.recipient, subject=f"[{mode_label}] Compliance-Finding: {classification} — {req.url[:60]}", - body_html=f"
{summary}
", + body_html=summary, ) return AnalyzeResponse( @@ -349,53 +349,77 @@ def _risk_to_escalation(risk_level: str) -> str: return mapping.get(risk_level.upper() if risk_level else "", "E0") +DOC_TYPE_LABELS = { + "privacy_policy": "Datenschutzerklaerung", + "cookie_banner": "Cookie-Banner", + "terms_of_service": "AGB", + "imprint": "Impressum", + "dpa": "Auftragsverarbeitung (AVV)", + "other": "Sonstiges", +} + +RISK_COLORS = { + "MINIMAL": ("#16a34a", "Niedrig"), + "LOW": ("#ca8a04", "Gering"), + "LIMITED": ("#ea580c", "Mittel"), + "HIGH": ("#dc2626", "Hoch"), + "UNACCEPTABLE": ("#991b1b", "Kritisch"), +} + + def _build_summary( url: str, classification: str, assessment: dict, role: str, findings_str: list[str], controls_str: list[str], mode: str = "post_launch", ) -> str: - """Build a German manager summary, adapted to pre/post-launch context.""" + """Build HTML summary for email and frontend.""" risk = assessment.get("risk_level", "unbekannt") score = assessment.get("risk_score", 0) recommendation = assessment.get("recommendation", "") dsfa = assessment.get("dsfa_recommended", False) is_live = mode == "post_launch" + risk_color, risk_label = RISK_COLORS.get(risk, ("#6b7280", risk)) + doc_label = DOC_TYPE_LABELS.get(classification, classification) - findings_text = "\n".join(f"- {f}" for f in findings_str[:5]) if findings_str else "Keine" - controls_text = "\n".join(f"- {c}" for c in controls_str[:5]) if controls_str else "Keine" - - mode_header = ( - "PRUEFUNG LIVE-WEBSITE — Das Dokument ist bereits oeffentlich zugaenglich." + mode_banner = ( + '
' + 'LIVE-WEBSITE — Das Dokument ist bereits oeffentlich zugaenglich.
' if is_live else - "INTERNE PRUEFUNG — Das Dokument ist noch nicht veroeffentlicht." + '
' + 'INTERNE PRUEFUNG — Dokument noch nicht veroeffentlicht.
' ) - parts = [ - mode_header, - "", - f"Dokumenttyp: {classification}", - f"Quelle: {url}", - f"Risikobewertung: {risk} ({score}/100)", - f"Zustaendig: {role}", - f"DSFA empfohlen: {'Ja' if dsfa else 'Nein'}", - "", - f"Findings:\n{findings_text}", - "", - f"Erforderliche Massnahmen:\n{controls_text}", - ] + findings_html = "".join(f'
  • {f}
  • ' for f in findings_str[:8]) if findings_str else '
  • Keine
  • ' + controls_html = "".join(f'
  • {c}
  • ' for c in controls_str[:8]) if controls_str else '
  • Keine
  • ' + warning = "" if is_live and findings_str: - parts.extend([ - "", - "ACHTUNG: Diese Maengel sind bereits oeffentlich sichtbar. " - "Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.", - ]) + warning = ( + '
    ' + '⚠ ACHTUNG: Diese Maengel sind bereits oeffentlich sichtbar. ' + 'Sofortige Nachbesserung empfohlen um Abmahnrisiken zu minimieren.
    ' + ) elif not is_live and controls_str: - parts.extend([ - "", - "Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.", - ]) + warning = ( + '
    ' + 'Empfehlung: Implementieren Sie die erforderlichen Kontrollen vor der Veroeffentlichung.
    ' + ) - if recommendation: - parts.extend(["", f"Weitere Empfehlung: {recommendation}"]) - return "\n".join(parts) + rec_html = f'

    {recommendation}

    ' if recommendation else "" + + return f""" + {mode_banner} + + + + + + +
    Dokumenttyp{doc_label}
    Quelle{url}
    Risikobewertung{risk_label} ({score}/100)
    Zustaendig{role}
    DSFA empfohlen{'Ja' if dsfa else 'Nein'}
    +

    Findings

    +
      {findings_html}
    +

    Erforderliche Massnahmen

    +
      {controls_html}
    + {warning} + {rec_html} + """ From 0266dfd0110ce82090730bb29d9b11a81ef25051 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 11:32:27 +0200 Subject: [PATCH 007/413] =?UTF-8?q?docs:=20Compliance=20Agent=20product=20?= =?UTF-8?q?roadmap=20=E2=80=94=208=20phases,=20PoC=20to=20production?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0: UCCA score calibration + control relevance filter P1: Headless browser consent test (before/after cookie banner) + 80+ services P2: Scan acceleration, DB persistence, PDF export P3: Recurring scans, multi-website comparison Investor demo scenario included. Co-Authored-By: Claude Opus 4.6 (1M context) --- zeroclaw/PLAN-compliance-agent-product.md | 488 ++++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 zeroclaw/PLAN-compliance-agent-product.md diff --git a/zeroclaw/PLAN-compliance-agent-product.md b/zeroclaw/PLAN-compliance-agent-product.md new file mode 100644 index 0000000..1709d2f --- /dev/null +++ b/zeroclaw/PLAN-compliance-agent-product.md @@ -0,0 +1,488 @@ +# Plan: Compliance Agent — Vom PoC zum Produkt + +## Kontext + +Der Compliance Agent PoC funktioniert: URL scannen, Qwen klassifiziert, +UCCA bewertet, Dienstleister erkannt, Korrekturvorschlaege generiert, Email gesendet. + +Aber: Scores sind zu niedrig, zu viele False-Positive Controls, kein Consent-Test, +keine Persistenz, keine PDF-Exports. Dieser Plan macht das PoC produktreif. + +**Strategische Bedeutung:** Erstmalig wird das RAG (166k Controls) gegen echte +Webseiten getestet. Der Consent-Test (vor/nach Cookie-Einwilligung) waere ein +Alleinstellungsmerkmal das kein Wettbewerber hat. + +--- + +## Phase 0: UCCA Score-Kalibrierung (P0, 2-3 Tage) + +### Problem + +Der UCCA-Score fuer Opodo war LOW (20/100). Realistisch waere MEDIUM (40-50). +Die Intake-Flags werden aus dem Text extrahiert, aber zu wenige werden gesetzt. + +### Loesung + +**0.1 Intelligentere Intake-Flag-Erkennung** + +Aktuell: einfache Keyword-Suche (`"werbung" in text.lower()`). +Neu: LLM-gestuetzte Extraktion der Intake-Flags. + +```python +# Statt: +"marketing": "werbung" in text.lower() + +# Neu: Qwen extrahiert strukturiert +prompt = """ +Analysiere diesen Text und setze folgende Flags auf true/false: +- personal_data: Werden personenbezogene Daten verarbeitet? +- customer_data: Werden Kundendaten gespeichert? +- marketing: Werden Daten fuer Werbung/Marketing genutzt? +- profiling: Findet Profiling oder Personalisierung statt? +- minor_data: Werden Daten von Minderjaehrigen verarbeitet? +- biometric_data: Werden biometrische Daten verarbeitet? +- location_data: Werden Standortdaten erhoben? +- third_party_sharing: Werden Daten an Dritte weitergegeben? +- automated_decisions: Werden automatisierte Entscheidungen getroffen? +- cross_border_transfer: Findet Drittlandtransfer statt? +Antworte als JSON. +""" +``` + +**0.2 Score-Gewichtung anpassen** + +Die UCCA-Engine (Go, `ai-compliance-sdk/internal/ucca/`) hat die Score-Berechnung. 
+Pruefen ob die Gewichte realistisch sind: +- Personenbezogene Daten allein = 10 Punkte (zu wenig) +- Marketing + Drittlandtransfer + Profiling sollte mindestens +30 geben +- Zahlungsdaten + Passdaten sollte +20 geben + +**Dateien:** +- `backend-compliance/compliance/api/agent_analyze_routes.py` — Flag-Extraktion +- `ai-compliance-sdk/internal/ucca/engine.go` — Score-Berechnung (Go) +- `ai-compliance-sdk/internal/ucca/rules.go` — Regel-Definitionen + +**Testfaelle:** +- Opodo: Soll MEDIUM (40-50) ergeben +- Google: Soll HIGH (60-70) ergeben +- Einfacher Blog ohne Tracking: Soll MINIMAL (0-10) ergeben + +--- + +## Phase 1: Control Relevance Filter (P0, 1 Tag) + +### Bereits geplant in PLAN-control-relevance-filter.md + +Nur Phase 1 (regelbasiert) hier umsetzen: + +1. Neues `relevance_conditions` JSONB-Feld auf `canonical_controls` +2. Top-20 generische Controls mit Keywords versehen +3. Filter-Funktion im Agent: Control nur empfehlen wenn Keywords im Text vorkommen +4. Test: C_TRANSPARENCY faellt bei Opodo weg (kein KI-Nachweis) + +**Datei:** `backend-compliance/compliance/services/relevance_filter.py` (~200 LOC) + +--- + +## Phase 2: Headless Browser Consent-Test (P1, 2-3 Tage) + +### Das Killer-Feature + +Automatischer 3-Phasen-Test: + +``` +Phase A: Erster Besuch (ohne Interaktion) + → Welche Scripts/Cookies laden VOR dem Consent-Banner? + → Finding: "Script X laedt ohne Einwilligung" + +Phase B: Consent ablehnen ("Nur notwendige") + → Button klicken, 3 Sek warten + → Welche Scripts/Cookies laden NACH Ablehnung? + → Finding: "Google Analytics laedt trotz Ablehnung" = schwerer Verstoss + +Phase C: Consent akzeptieren ("Alle akzeptieren") + → Neuer Browser-Kontext, akzeptieren klicken + → Welche Scripts/Cookies laden NACH Zustimmung? + → Abgleich mit Cookie-Policy: "TikTok Pixel laedt, ist aber nicht dokumentiert" +``` + +### Technische Implementierung + +**2.1 Playwright im Backend-Container** + +```dockerfile +# Ergaenzung im backend-compliance Dockerfile +RUN pip install playwright && playwright install chromium +``` + +Alternativ: eigener `consent-tester` Service (besser isoliert, ~200MB Image). + +**2.2 Consent Test Service** + +```python +# backend-compliance/compliance/services/consent_tester.py (~250 LOC) + +class ConsentTester: + async def test(self, url: str) -> ConsentTestResult: + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + + # Phase A: Ohne Consent + pre = await self._scan_phase(browser, url, action=None) + + # Phase B: Ablehnen + reject = await self._scan_phase(browser, url, action="reject") + + # Phase C: Akzeptieren + accept = await self._scan_phase(browser, url, action="accept") + + await browser.close() + + return ConsentTestResult( + banner_detected=pre.banner_visible, + banner_type=pre.banner_provider, # Didomi, OneTrust, Cookiebot etc. 
+ scripts_before_consent=pre.scripts, + cookies_before_consent=pre.cookies, + violations_before_consent=self._find_violations(pre), + scripts_after_reject=reject.scripts, + cookies_after_reject=reject.cookies, + violations_after_reject=self._find_violations(reject), + scripts_after_accept=accept.scripts, + cookies_after_accept=accept.cookies, + undocumented_after_accept=self._find_undocumented(accept, dse_text), + ) +``` + +**2.3 Banner-Button-Erkennung** + +```python +# Typische Consent-Banner Button-Patterns +ACCEPT_PATTERNS = [ + 'button:has-text("Alle akzeptieren")', + 'button:has-text("Alles akzeptieren")', + 'button:has-text("Accept all")', + 'button:has-text("Alle Cookies akzeptieren")', + '[class*="accept-all"]', + '[data-action="accept-all"]', + '#didomi-notice-agree-button', # Didomi + '.cky-btn-accept', # CookieYes + '#onetrust-accept-btn-handler', # OneTrust + '#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll', # Cookiebot +] + +REJECT_PATTERNS = [ + 'button:has-text("Nur notwendige")', + 'button:has-text("Ablehnen")', + 'button:has-text("Reject")', + 'button:has-text("Nur essentielle")', + '[class*="reject"]', + '#didomi-notice-disagree-button', + '#onetrust-reject-all-handler', + '#CybotCookiebotDialogBodyButtonDecline', +] +``` + +**2.4 Violation-Erkennung** + +```python +def _find_violations(self, phase: ScanPhase) -> list[Violation]: + violations = [] + + for script in phase.scripts: + service = match_service(script) # Gegen SERVICE_REGISTRY + if service and service.requires_consent: + if phase.action is None: + # Script laedt VOR Consent → Verstoss + violations.append(Violation( + severity="HIGH", + service=service.name, + legal_ref="§25 TDDDG", + text=f"{service.name} laedt OHNE vorherige Einwilligung", + )) + elif phase.action == "reject": + # Script laedt NACH Ablehnung → schwerer Verstoss + violations.append(Violation( + severity="CRITICAL", + service=service.name, + legal_ref="§25 TDDDG, Art. 5(3) ePrivacy", + text=f"{service.name} laedt TROTZ Ablehnung — Dark Pattern", + )) + + return violations +``` + +**2.5 Frontend: Consent-Test Tab** + +Dritter Tab im Agent: "Schnellanalyse | Website-Scan | Cookie-Test" + +Anzeige: +``` +Cookie-Consent-Test: opodo.de +═══════════════════════════════ + +Banner erkannt: Ja (Didomi) + +Phase A: Vor Einwilligung + ✗ Google Analytics — laedt ohne Einwilligung (§25 TDDDG) + ✓ Didomi CMP — notwendig, OK + ✗ Google Tag Manager — laedt ohne Einwilligung + +Phase B: Nach Ablehnung ("Nur notwendige") + ✗ Google Analytics — laedt TROTZ Ablehnung (KRITISCH!) + ✓ Keine neuen Tracking-Cookies gesetzt + +Phase C: Nach Zustimmung ("Alle akzeptieren") + ✓ Google Analytics — jetzt aktiv (mit Consent OK) + ✓ Didomi Consent Cookie gesetzt + ✗ TikTok Pixel — nicht in Cookie-Policy dokumentiert + +Zusammenfassung: + 2 kritische Verstoesse (Tracking ohne/trotz Ablehnung) + 1 Dokumentationsluecke (TikTok nicht in Policy) +``` + +**2.6 Neuer Endpoint** + +``` +POST /api/compliance/agent/consent-test +Body: { "url": "https://www.opodo.de" } +Response: ConsentTestResult (3 Phasen + Violations) +``` + +**Dateien:** +- `backend-compliance/compliance/services/consent_tester.py` — Playwright Test (~250 LOC) +- `backend-compliance/compliance/api/agent_consent_routes.py` — Endpoint (~100 LOC) +- `admin-compliance/app/sdk/agent/_components/ConsentTestResult.tsx` — UI (~150 LOC) +- `admin-compliance/app/api/sdk/v1/agent/consent-test/route.ts` — Proxy (~35 LOC) + +**Aufwand:** 2-3 Tage (inkl. 
Playwright Setup + Button-Erkennung) + +--- + +## Phase 3: Dienstleister-Registry erweitern (P1, 1 Tag) + +### Aktuell: ~20 Services in website_scanner.py + +### Ziel: 80+ Services + +Neue Kategorien: +- **Newsletter/Email Marketing** — Mailchimp, Brevo, CleverReach, ActiveCampaign, HubSpot, Rapidmail +- **Social Media Embeds** — Twitter/X, Instagram, LinkedIn, Pinterest, TikTok +- **A/B Testing** — Optimizely, VWO, Google Optimize (Legacy) +- **Heatmaps/Session Recording** — FullStory, Mouseflow, Crazy Egg, Lucky Orange +- **Werbenetwerke** — Google Ads, Meta Ads, TikTok Ads, Criteo, Taboola, Outbrain +- **Tag Manager** — Google, Tealium, Segment +- **CRM** — HubSpot, Salesforce Pardot, Pipedrive +- **Push Notifications** — OneSignal, Pushwoosh, Firebase +- **Customer Support** — Freshdesk, Zendesk (erweitern), HelpScout +- **Cloud/CDN** — AWS CloudFront, Azure CDN, Vercel, Netlify +- **Error Tracking** — Sentry, Bugsnag, Datadog RUM, New Relic + +Jeder Eintrag: Regex, Provider, Land, EU-Adaequanz, Consent-Pflicht, Rechtsgrundlage. + +**Datei:** `backend-compliance/compliance/services/website_scanner.py` — SERVICE_REGISTRY erweitern +Evtl. in eigene Datei auslagern: `service_registry.py` (~300 LOC, reine Daten) + +--- + +## Phase 4: Scan beschleunigen (P2, 1 Tag) + +### Problem + +Aktuell: Seiten sequentiell fetchen + 3-4 LLM-Calls = 3-5 Minuten. + +### Loesung + +**4.1 Parallel Fetchen** +```python +# Statt sequentiell: +for url in pages: + html = await fetch(url) + +# Parallel: +htmls = await asyncio.gather(*[fetch(url) for url in pages]) +``` + +**4.2 Qwen-Calls reduzieren** +- DSE-Extraktion: Nur wenn Datenschutzseite gefunden +- Korrekturvorschlaege: Nur fuer HIGH-Severity Findings (nicht fuer alle) +- Batch: Alle Korrekturen in einem LLM-Call statt einzeln + +**4.3 Kleineres Modell fuer Klassifizierung** +- Qwen 2.5:14b statt 3.5:35b fuer einfache Klassifizierung (~5x schneller) +- 3.5:35b nur fuer Korrekturvorschlaege und DSE-Extraktion + +**Ziel:** Scan in <60 Sekunden statt 3-5 Minuten. + +--- + +## Phase 5: Persistenz — Ergebnisse in DB speichern (P2, 1 Tag) + +### Problem + +Ergebnisse sind aktuell nur im Browser-Session-State und in Mailpit-Emails. +Bei Seitenreload oder neuem Tab: alles weg. + +### Loesung + +**5.1 Neue DB-Tabelle** + +```sql +CREATE TABLE compliance_agent_scans ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id UUID NOT NULL, + user_id TEXT NOT NULL, + url TEXT NOT NULL, + mode TEXT NOT NULL, -- 'quick', 'scan', 'consent_test' + analysis_mode TEXT NOT NULL, -- 'pre_launch', 'post_launch' + classification TEXT, + risk_level TEXT, + risk_score FLOAT, + escalation_level TEXT, + services JSONB DEFAULT '[]', + findings JSONB DEFAULT '[]', + corrections JSONB DEFAULT '[]', + consent_test JSONB, -- Phase 2 Ergebnisse + summary_html TEXT, + email_sent BOOLEAN DEFAULT FALSE, + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE INDEX idx_agent_scans_tenant ON compliance_agent_scans(tenant_id); +CREATE INDEX idx_agent_scans_url ON compliance_agent_scans(url); +``` + +**5.2 Frontend: Scan-Verlauf aus DB laden** + +Statt Session-basierter History → API-Call: `GET /api/compliance/agent/scans` +Sortiert nach Datum, filterbar nach URL/Risiko/Typ. 
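
Zur Illustration eine minimale Skizze, wie der Listen-Endpoint fuer 5.2 aussehen koennte (Annahmen: FastAPI wie in den uebrigen Agent-Routen, Tabellen- und Spaltennamen aus 5.1; der Helfer `get_pool` ist hier nur ein Platzhalter und existiert so nicht im Backend):

```python
# Skizze, nicht final: Scan-Verlauf aus compliance_agent_scans lesen.
# Annahme: asyncpg-Pool ueber einen (hypothetischen) Helfer get_pool().
from fastapi import APIRouter, Query

router = APIRouter(prefix="/api/compliance/agent")


@router.get("/scans")
async def list_scans(tenant_id: str, url: str | None = None,
                     limit: int = Query(50, le=200)):
    pool = await get_pool()  # Platzhalter fuer den DB-Zugriff
    sql = (
        "SELECT id, url, mode, risk_level, risk_score, created_at "
        "FROM compliance_agent_scans WHERE tenant_id = $1 "
    )
    args = [tenant_id]
    if url:
        sql += "AND url = $2 "
        args.append(url)
    sql += f"ORDER BY created_at DESC LIMIT {int(limit)}"
    rows = await pool.fetch(sql, *args)
    return [dict(r) for r in rows]
```

Damit bleibt der Verlauf auch nach einem Seitenreload erhalten; das Frontend ersetzt lediglich den Session-State durch diesen Aufruf.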
+ +**Dateien:** +- `backend-compliance/compliance/api/agent_scan_routes.py` — DB-Save nach jedem Scan +- `backend-compliance/compliance/db/agent_scan_models.py` — SQLAlchemy Model (~40 LOC) +- Migration SQL + +--- + +## Phase 6: PDF-Export (P2, 0.5 Tage) + +### Nutzen + +Manager wollen druckbare Reports, nicht nur Emails. + +### Implementierung + +ReportLab oder WeasyPrint (bereits im Backend vorhanden fuer andere PDF-Exports). + +```python +# Neuer Endpoint +GET /api/compliance/agent/scans/{id}/pdf + +# Generiert PDF mit: +# - Deckblatt (Firmenlogo, Datum, URL) +# - Executive Summary (Risiko-Ampel, Rolle) +# - Findings-Tabelle +# - Dienstleister-Abgleich (SOLL/IST) +# - Consent-Test Ergebnisse (wenn vorhanden) +# - Korrekturvorschlaege +# - Anhang: Gescannte Seiten, Rechtsgrundlagen +``` + +**Datei:** `backend-compliance/compliance/services/agent_pdf_export.py` (~200 LOC) + +--- + +## Phase 7: Recurring Scans / ZeroClaw (P3, 1 Tag) + +### Nutzen + +Website aendert sich → neue Dienstleister eingebunden → automatisches Alert. + +### Implementierung + +ZeroClaw SOP mit Cron-Trigger: + +```toml +[[triggers]] +type = "cron" +schedule = "0 6 * * 1" # Jeden Montag 06:00 +``` + +Der Agent: +1. Laedt alle gespeicherten URLs aus der DB +2. Scannt jede URL +3. Vergleicht mit letztem Scan-Ergebnis +4. Bei Aenderungen: Email an DSB mit Diff + +**Oder:** Einfacher Cron-Job im Backend (kein ZeroClaw noetig): +```python +# backend-compliance/compliance/services/recurring_scan.py +async def run_weekly_scans(): + scans = await db.get_all_monitored_urls() + for scan in scans: + result = await analyze(scan.url, scan.mode) + if has_changes(result, scan.last_result): + await send_change_alert(scan, result) +``` + +--- + +## Phase 8: Multi-Website Vergleich (P3, 1 Tag) + +### Nutzen + +"Wie steht mein Unternehmen im Vergleich zu 5 Wettbewerbern?" + +### Implementierung + +Frontend: Mehrere URLs eingeben → paralleler Scan → Vergleichstabelle: + +``` + | Meine Firma | Opodo | Booking | Expedia | +Datenschutzerkl. | ✓ | ✓ | ✓ | ✓ | +Impressum | ✓ | ✗ (404) | ✓ | ✓ | +Cookie-Banner | ✓ | ✓ | ✓ | ✗ | +Google Fonts lokal | ✓ | ✗ | ✓ | ✗ | +Kuendigungsbutton | ✓ | ✗ | n/a | n/a | +Tracking vor Consent| ✗ | ✗ | ✓ | ✗ | +Risiko-Score | 15/100 | 45/100 | 20/100 | 55/100 | +``` + +**Endpoint:** `POST /api/compliance/agent/compare` +**Body:** `{ "urls": ["url1", "url2", "url3"] }` + +--- + +## Implementierungsreihenfolge + +| Woche | Phase | Was | Ergebnis | +|-------|-------|-----|----------| +| 1 | Phase 0 | UCCA Score-Kalibrierung | Realistische Risiko-Scores | +| 1 | Phase 1 | Control Relevance Filter | Keine False Positives mehr | +| 2 | Phase 2 | Consent-Test (Playwright) | Killer-Feature, vor/nach Einwilligung | +| 2 | Phase 3 | Registry 80+ Services | Umfassende Dienstleister-Erkennung | +| 3 | Phase 4 | Scan beschleunigen | <60 Sekunden statt 3-5 Minuten | +| 3 | Phase 5 | DB-Persistenz | Scan-Verlauf, keine verlorenen Ergebnisse | +| 4 | Phase 6 | PDF-Export | Druckbare Reports fuer Management | +| 4 | Phase 7 | Recurring Scans | Automatische Ueberwachung | +| 5 | Phase 8 | Multi-Website Vergleich | Wettbewerber-Benchmark | + +## Investoren-Demo Szenario + +Nach Phase 2 (Woche 2) koennen wir folgende Demo zeigen: + +1. **URL eingeben:** `https://www.opodo.de` +2. **Website-Scan:** 6 Seiten gescannt, Google Analytics + Fonts + GTM erkannt +3. **SOLL/IST:** 3 Dienste NICHT in DSE dokumentiert → Art. 13 Verstoss +4. **Consent-Test:** Google Analytics laedt VOR Einwilligung → §25 TDDDG Verstoss +5. 
**Consent ablehnen:** Analytics laedt TROTZ Ablehnung → KRITISCH +6. **Score:** MEDIUM (45/100) mit 5 Findings +7. **Korrekturvorschlag:** Einbaufertige DSE-Textbausteine per Qwen +8. **Email an DSB:** Formatierter HTML-Report mit Handlungsanweisung +9. **Vergleich:** Opodo vs. Booking.com — wer ist besser aufgestellt? + +Das demonstriert: +- RAG funktioniert gegen echte Systeme (166k Controls) +- LLM generiert juristische Textbausteine +- Automatisierte Compliance-Pruefung in <60 Sekunden +- Consent-Test den kein Wettbewerber hat From 4298ae17abf63da448fde30dcaf38f7eaf6289d0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 11:36:24 +0200 Subject: [PATCH 008/413] =?UTF-8?q?feat:=20Phase=200+1=20=E2=80=94=20LLM?= =?UTF-8?q?=20intake=20extraction=20+=20control=20relevance=20filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0: Qwen extracts 14 structured intake flags (personal_data, marketing, profiling, ai_usage, etc.) instead of keyword matching. Fallback to keywords if LLM unavailable. Flags feed into UCCA for accurate scoring. Phase 1: Control relevance filter removes false positives. C_TRANSPARENCY only recommended if AI/ML keywords found in text. 7 control rules with keyword lists + intake flag fallback. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_analyze_routes.py | 53 +++--- .../compliance/services/intake_extractor.py | 125 ++++++++++++++ .../compliance/services/relevance_filter.py | 152 ++++++++++++++++++ 3 files changed, 301 insertions(+), 29 deletions(-) create mode 100644 backend-compliance/compliance/services/intake_extractor.py create mode 100644 backend-compliance/compliance/services/relevance_filter.py diff --git a/backend-compliance/compliance/api/agent_analyze_routes.py b/backend-compliance/compliance/api/agent_analyze_routes.py index ac6ceb5..833fa38 100644 --- a/backend-compliance/compliance/api/agent_analyze_routes.py +++ b/backend-compliance/compliance/api/agent_analyze_routes.py @@ -15,6 +15,8 @@ from fastapi import APIRouter from pydantic import BaseModel from compliance.services.smtp_sender import send_email +from compliance.services.intake_extractor import extract_intake_flags, flags_to_ucca_intake +from compliance.services.relevance_filter import filter_controls logger = logging.getLogger(__name__) @@ -77,21 +79,24 @@ async def analyze_url(req: AnalyzeRequest): # Step 2: Classify via SDK LLM classification = await _classify(client, text) - # Step 3: Assess via UCCA - assessment = await _assess(client, text, classification) + # Step 3: Extract intake flags via LLM (better than keyword matching) + intake_flags = await extract_intake_flags(text) - # Step 4: Determine role + # Step 4: Assess via UCCA with LLM-extracted flags + assessment = await _assess(client, text, classification, intake_flags) + + # Step 5: Determine role esc_level = assessment.get("escalation_level", "E0") role = ESCALATION_ROLES.get(esc_level, ESCALATION_ROLES["E0"]) - # Step 5: Website compliance checks (§312k BGB etc.) + # Step 6: Website compliance checks (§312k BGB etc.) 
site_findings, follow_ups = await _check_website_compliance(client, req.url, raw_html) - # Step 6: Merge findings + # Step 7: Merge and filter findings/controls findings = assessment.get("triggered_rules", []) controls = assessment.get("required_controls", []) findings_str = _to_string_list(findings) + site_findings - controls_str = _to_string_list(controls) + controls_str = filter_controls(_to_string_list(controls), text, intake_flags) # Escalate if website checks found issues if site_findings and esc_level == "E0": @@ -179,34 +184,24 @@ async def _classify(client: httpx.AsyncClient, text: str) -> str: return "other" -async def _assess(client: httpx.AsyncClient, text: str, classification: str) -> dict: +async def _assess(client: httpx.AsyncClient, text: str, classification: str, intake_flags: dict | None = None) -> dict: """Run UCCA assessment via SDK. Returns flattened result dict.""" try: - # UCCA expects boolean intake flags, not string categories + # Use LLM-extracted flags if available, otherwise minimal defaults + if intake_flags: + ucca_intake = flags_to_ucca_intake(intake_flags) + else: + ucca_intake = { + "data_types": {"personal_data": True}, + "purpose": {}, + "automation": "manual", + "outputs": {}, + } + resp = await client.post(f"{SDK_URL}/sdk/v1/ucca/assess", headers=SDK_HEADERS, json={ "use_case_text": text[:3000], "domain": classification, - "data_types": { - "personal_data": True, - "customer_data": True, - "location_data": "tracking" in text.lower() or "standort" in text.lower(), - "images": False, - "biometric_data": "biometrisch" in text.lower(), - "minor_data": "kinder" in text.lower() or "minderjährig" in text.lower(), - }, - "purpose": { - "marketing": "werbung" in text.lower() or "marketing" in text.lower(), - "analytics": "analyse" in text.lower() or "analytics" in text.lower(), - "profiling": "profil" in text.lower() or "personalis" in text.lower(), - "automation": False, - "customer_support": False, - }, - "automation": "partially_automated", - "outputs": { - "content_generation": False, - "recommendations_to_users": "empfehl" in text.lower(), - "data_export": "export" in text.lower() or "uebertrag" in text.lower(), - }, + **ucca_intake, }) data = resp.json() # Flatten: UCCA wraps result under "assessment" and "result" diff --git a/backend-compliance/compliance/services/intake_extractor.py b/backend-compliance/compliance/services/intake_extractor.py new file mode 100644 index 0000000..4c3fb90 --- /dev/null +++ b/backend-compliance/compliance/services/intake_extractor.py @@ -0,0 +1,125 @@ +""" +Intake Extractor — LLM-based extraction of UCCA intake flags from document text. + +Replaces simple keyword matching with structured LLM analysis for more +accurate risk scoring. +""" + +import json +import logging +import os +import re + +import httpx + +logger = logging.getLogger(__name__) + +OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434") +OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:35b-a3b") + +EXTRACTION_PROMPT = """/no_think +Analysiere den folgenden Text (Datenschutzerklaerung oder Website-Inhalt) und +bestimme fuer JEDES der folgenden Flags ob es zutrifft (true/false). + +Flags: +- personal_data: Werden personenbezogene Daten verarbeitet? +- customer_data: Werden Kundendaten (Name, Email, Adresse) gespeichert? +- payment_data: Werden Zahlungsdaten (Kreditkarte, IBAN, PayPal) verarbeitet? +- location_data: Werden Standort-/GPS-Daten erhoben? +- biometric_data: Werden biometrische Daten verarbeitet? 
+- minor_data: Werden Daten von Kindern/Minderjaehrigen verarbeitet? +- health_data: Werden Gesundheitsdaten verarbeitet? +- marketing: Werden Daten fuer Werbung/Marketing/Newsletter genutzt? +- profiling: Findet Profiling, Scoring oder Personalisierung statt? +- automated_decisions: Werden automatisierte Einzelentscheidungen getroffen (Art. 22)? +- third_party_sharing: Werden Daten an Dritte/Partner weitergegeben? +- cross_border_transfer: Findet Datentransfer ausserhalb EU/EWR statt? +- tracking: Werden Cookies/Tracking-Pixel/Analytics eingesetzt? +- ai_usage: Wird KI/Machine Learning/Algorithmen eingesetzt? + +Antworte NUR mit einem JSON-Objekt, keine Erklaerung: +{"personal_data": true, "customer_data": true, ...} +""" + + +async def extract_intake_flags(text: str) -> dict: + """Extract structured intake flags from text via LLM.""" + try: + async with httpx.AsyncClient(timeout=90.0) as client: + resp = await client.post(f"{OLLAMA_URL}/api/generate", json={ + "model": OLLAMA_MODEL, + "prompt": f"{EXTRACTION_PROMPT}\n\nTEXT:\n{text[:2500]}", + "stream": False, + }) + raw = resp.json().get("response", "") + raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + + # Extract JSON from response + match = re.search(r"\{[^}]+\}", raw, re.DOTALL) + if match: + flags = json.loads(match.group()) + logger.info("Extracted intake flags: %s", {k: v for k, v in flags.items() if v}) + return flags + except Exception as e: + logger.warning("Intake extraction failed, using keyword fallback: %s", e) + + # Fallback: keyword-based extraction + return _keyword_fallback(text) + + +def _keyword_fallback(text: str) -> dict: + """Simple keyword-based fallback when LLM is unavailable.""" + t = text.lower() + return { + "personal_data": True, # Always assume for websites + "customer_data": any(w in t for w in ["kunde", "customer", "nutzerkonto", "registrier"]), + "payment_data": any(w in t for w in ["zahlung", "kreditkarte", "paypal", "stripe", "klarna", "iban"]), + "location_data": any(w in t for w in ["standort", "gps", "location", "geo"]), + "biometric_data": any(w in t for w in ["biometrisch", "fingerabdruck", "gesichtserkennung"]), + "minor_data": any(w in t for w in ["kinder", "minderjährig", "under 16", "unter 16"]), + "health_data": any(w in t for w in ["gesundheit", "medizin", "patient", "health"]), + "marketing": any(w in t for w in ["werbung", "marketing", "newsletter", "werbe"]), + "profiling": any(w in t for w in ["profil", "personalis", "scoring", "empfehl"]), + "automated_decisions": any(w in t for w in ["automatisiert", "automated decision", "scoring"]), + "third_party_sharing": any(w in t for w in ["dritte", "partner", "dienstleister", "third part"]), + "cross_border_transfer": any(w in t for w in ["usa", "drittland", "drittst", "third countr"]), + "tracking": any(w in t for w in ["cookie", "tracking", "analytics", "pixel"]), + "ai_usage": any(w in t for w in ["künstliche intelligenz", "machine learning", "ki-", "ai-powered"]), + } + + +def flags_to_ucca_intake(flags: dict) -> dict: + """Convert extracted flags to UCCA intake format.""" + return { + "data_types": { + "personal_data": flags.get("personal_data", False), + "customer_data": flags.get("customer_data", False), + "location_data": flags.get("location_data", False), + "biometric_data": flags.get("biometric_data", False), + "minor_data": flags.get("minor_data", False), + "images": False, + "audio": False, + "financial_data": flags.get("payment_data", False), + "employee_data": False, + "article_9_data": flags.get("health_data", False) or 
flags.get("biometric_data", False), + }, + "purpose": { + "marketing": flags.get("marketing", False), + "analytics": flags.get("tracking", False), + "profiling": flags.get("profiling", False), + "automation": flags.get("ai_usage", False), + "customer_support": False, + "evaluation_scoring": flags.get("automated_decisions", False), + "decision_making": flags.get("automated_decisions", False), + }, + "automation": "fully_automated" if flags.get("automated_decisions") else + "partially_automated" if flags.get("ai_usage") else "manual", + "outputs": { + "recommendations_to_users": flags.get("profiling", False), + "data_export": flags.get("cross_border_transfer", False), + "legal_effects": flags.get("automated_decisions", False), + }, + "hosting": { + "region": "non_eu" if flags.get("cross_border_transfer") else "eu", + }, + } diff --git a/backend-compliance/compliance/services/relevance_filter.py b/backend-compliance/compliance/services/relevance_filter.py new file mode 100644 index 0000000..413c227 --- /dev/null +++ b/backend-compliance/compliance/services/relevance_filter.py @@ -0,0 +1,152 @@ +""" +Control Relevance Filter — filters out controls that are not relevant +for the analyzed document based on keyword matching. + +Prevents false positives like C_TRANSPARENCY being recommended when +no AI usage is evident. +""" + +import logging +import re + +logger = logging.getLogger(__name__) + +# Top controls with their relevance conditions. +# A control is only relevant if ANY keyword from 'requires_any' matches the text. +# If 'requires_any' is empty, the control is always relevant. +CONTROL_RELEVANCE: dict[str, dict] = { + "C_TRANSPARENCY": { + "description": "KI-Transparenz-Hinweis (Art. 52 AI Act)", + "requires_any": [ + "künstliche intelligenz", "kuenstliche intelligenz", + "artificial intelligence", "machine learning", "maschinelles lernen", + "ki-gestützt", "ki-gestuetzt", "ai-powered", "ai system", + "chatbot", "neural", "deep learning", "algorithmus", "algorithmen", + "automatisierte entscheidung", "automated decision", + ], + "reason": "Nur relevant wenn KI/ML tatsaechlich eingesetzt wird", + }, + "C_DSFA_REQUIRED": { + "description": "Datenschutz-Folgenabschaetzung durchfuehren", + "requires_any": [ + "gesundheit", "biometrisch", "genetisch", "health", "biometric", + "scoring", "profiling", "systematisch", "umfangreich", + "videoüberwachung", "videoueberwachung", "kamera", + "minderjährig", "minderjaehrig", "kinder", + ], + "reason": "Nur bei hohem Risiko (Art. 9 Daten, Profiling, Ueberwachung)", + }, + "C_ART22_INFO": { + "description": "Info ueber automatisierte Einzelentscheidung (Art. 
22 DSGVO)",
+        "requires_any": [
+            "automatisierte entscheidung", "automated decision", "scoring",
+            "bonitaet", "kredit", "rating", "algorithmische entscheidung",
+            "profiling", "klarna", "ratenzahlung",
+        ],
+        "reason": "Nur bei automatisierten Einzelentscheidungen mit Rechtswirkung",
+    },
+    "C_DPO_REQUIRED": {
+        "description": "Datenschutzbeauftragten bestellen",
+        "requires_any": [],  # Always relevant — empty means no filter
+        "reason": "Generell relevant fuer Unternehmen",
+    },
+    "C_EXPLICIT_CONSENT": {
+        "description": "Explizite Einwilligung einholen",
+        "requires_any": [
+            "cookie", "tracking", "analytics", "pixel", "marketing",
+            "werbung", "newsletter", "remarketing", "retargeting",
+            "einwilligung", "consent", "opt-in",
+        ],
+        "reason": "Nur bei Tracking/Marketing, das eine Einwilligung erfordert",
+    },
+    "C_CHILD_PROTECTION": {
+        "description": "Besonderer Schutz fuer Minderjaehrige",
+        "requires_any": [
+            "kinder", "minderjährig", "minderjaehrig", "jugend",
+            "under 16", "unter 16", "schüler", "schueler", "child",
+        ],
+        "reason": "Nur wenn Daten von Minderjaehrigen verarbeitet werden",
+    },
+    "C_THIRD_COUNTRY_SAFEGUARDS": {
+        "description": "Drittlandtransfer absichern (Art. 44-49 DSGVO)",
+        "requires_any": [
+            "usa", "united states", "drittland", "drittst", "third countr",
+            "standardvertragsklausel", "sccs", "binding corporate",
+            "angemessenheitsbeschluss", "adequacy",
+            "google", "meta", "facebook", "amazon", "microsoft", "apple",
+            "cloudflare", "stripe", "paypal",
+        ],
+        "reason": "Nur bei Datentransfer in Drittlaender",
+    },
+}
+
+
+def filter_controls(
+    controls: list[str],
+    source_text: str,
+    intake_flags: dict | None = None,
+) -> list[str]:
+    """Filter controls based on relevance to the analyzed text.
+
+    Returns only controls that are relevant (keyword match or no filter defined).
+    """
+    if not controls:
+        return controls
+
+    text_lower = source_text.lower()
+    filtered = []
+    removed = []
+
+    for control in controls:
+        # Extract control ID from string like "[C_TRANSPARENCY] Nutzer informieren..."
+ control_id = _extract_control_id(control) + + if control_id and control_id in CONTROL_RELEVANCE: + rules = CONTROL_RELEVANCE[control_id] + keywords = rules["requires_any"] + + if not keywords: + # No filter = always relevant + filtered.append(control) + continue + + # Check if any keyword matches + if any(kw in text_lower for kw in keywords): + filtered.append(control) + else: + # Also check intake flags as fallback + if intake_flags and _check_flags(control_id, intake_flags): + filtered.append(control) + else: + removed.append((control_id, rules["reason"])) + else: + # Unknown control — keep it (don't filter what we don't understand) + filtered.append(control) + + if removed: + logger.info( + "Relevance filter removed %d controls: %s", + len(removed), + ", ".join(f"{cid} ({reason})" for cid, reason in removed), + ) + + return filtered + + +def _extract_control_id(control: str) -> str | None: + """Extract control ID from '[C_XXX] description' format.""" + match = re.match(r"\[([A-Z_0-9]+)\]", control) + return match.group(1) if match else None + + +def _check_flags(control_id: str, flags: dict) -> bool: + """Check if intake flags make a control relevant.""" + flag_map = { + "C_TRANSPARENCY": flags.get("ai_usage", False), + "C_DSFA_REQUIRED": flags.get("health_data", False) or flags.get("biometric_data", False), + "C_ART22_INFO": flags.get("automated_decisions", False), + "C_EXPLICIT_CONSENT": flags.get("tracking", False) or flags.get("marketing", False), + "C_CHILD_PROTECTION": flags.get("minor_data", False), + "C_THIRD_COUNTRY_SAFEGUARDS": flags.get("cross_border_transfer", False), + } + return flag_map.get(control_id, False) From 0ba76d041ac30d649ef0643500c4402423357c07 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 11:55:26 +0200 Subject: [PATCH 009/413] =?UTF-8?q?feat:=20DSE=20parser=20+=20matcher=20?= =?UTF-8?q?=E2=80=94=20textblock=20references=20in=20scan=20findings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - dse_parser.py: HTML → structured sections (heading, number, content, parent) Uses heading hierarchy (h1-h4) with regex fallback - dse_matcher.py: matches detected services against DSE sections Exact name → provider → category matching with insertion point suggestion - agent_scan_routes: TextReference model in findings (original text, section, paragraph, correction type, insert_after) Enables showing: "Google Analytics not found in DSE, insert after Section 2.4 Cookies und Tracking" Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_scan_routes.py | 70 +++++- .../compliance/services/dse_matcher.py | 189 +++++++++++++++ .../compliance/services/dse_parser.py | 224 ++++++++++++++++++ 3 files changed, 478 insertions(+), 5 deletions(-) create mode 100644 backend-compliance/compliance/services/dse_matcher.py create mode 100644 backend-compliance/compliance/services/dse_parser.py diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py index 7b00bfc..935656c 100644 --- a/backend-compliance/compliance/api/agent_scan_routes.py +++ b/backend-compliance/compliance/api/agent_scan_routes.py @@ -16,6 +16,8 @@ from pydantic import BaseModel from compliance.services.website_scanner import scan_website, DetectedService from compliance.services.dse_service_extractor import extract_dse_services, compare_services from compliance.services.smtp_sender import send_email +from compliance.services.dse_parser import parse_dse +from 
compliance.services.dse_matcher import build_text_references, TextReference logger = logging.getLogger(__name__) @@ -49,11 +51,27 @@ class ServiceInfo(BaseModel): status: str # "ok", "undocumented", "outdated" +class TextReferenceModel(BaseModel): + found: bool = False + source_url: str = "" + document_type: str = "Datenschutzerklaerung" + section_heading: str = "" + section_number: str = "" + parent_section: str = "" + paragraph_index: int = 0 + original_text: str = "" + issue: str = "" + correction_type: str = "" + correction_text: str = "" + insert_after: str = "" + + class ScanFinding(BaseModel): code: str severity: str text: str correction: str = "" + text_reference: TextReferenceModel | None = None class ScanResponse(BaseModel): @@ -87,14 +105,22 @@ async def scan_website_endpoint(req: ScanRequest): dse_services = await extract_dse_services(dse_text) if dse_text else [] logger.info("DSE mentions %d services", len(dse_services)) - # Step 4: SOLL/IST comparison + # Step 4: Parse DSE into structured sections + dse_html = await _fetch_dse_html(req.url, scan.pages_scanned) + dse_sections = parse_dse(dse_html, req.url) if dse_html else [] + logger.info("Parsed %d DSE sections", len(dse_sections)) + + # Step 5: SOLL/IST comparison detected_dicts = [_service_to_dict(s) for s in scan.detected_services] comparison = compare_services(detected_dicts, dse_services) - # Step 5: Generate findings - services_info, findings = _build_findings(comparison, scan, is_live) + # Step 6: Build TextReferences for each detected service + text_refs = build_text_references(detected_dicts, dse_services, dse_sections, req.url) - # Step 6: Generate corrections for pre-launch mode + # Step 7: Generate findings with text references + services_info, findings = _build_findings(comparison, scan, is_live, text_refs) + + # Step 8: Generate corrections for pre-launch mode if not is_live and findings: await _add_corrections(findings, dse_text) @@ -149,6 +175,24 @@ async def _fetch_dse_text(url: str, scanned_pages: list[str]) -> str: return "" +async def _fetch_dse_html(url: str, scanned_pages: list[str]) -> str: + """Fetch the raw HTML of the privacy policy page (for structured parsing).""" + import re + dse_url = None + for page in scanned_pages: + if re.search(r"datenschutz|privacy|dsgvo", page, re.IGNORECASE): + dse_url = page + break + if not dse_url: + dse_url = url + try: + async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: + resp = await client.get(dse_url, headers={"User-Agent": "BreakPilot-Compliance-Agent/1.0"}) + return resp.text + except Exception: + return "" + + def _service_to_dict(svc: DetectedService) -> dict: return { "id": svc.id, "name": svc.name, "category": svc.category, @@ -159,11 +203,25 @@ def _service_to_dict(svc: DetectedService) -> dict: def _build_findings( - comparison: dict, scan, is_live: bool, + comparison: dict, scan, is_live: bool, text_refs: dict | None = None, ) -> tuple[list[ServiceInfo], list[ScanFinding]]: """Build service info list and findings from comparison.""" services = [] findings = [] + text_refs = text_refs or {} + + def _get_ref(svc_id: str) -> TextReferenceModel | None: + ref = text_refs.get(svc_id) + if not ref: + return None + return TextReferenceModel( + found=ref.found, source_url=ref.source_url, + document_type=ref.document_type, section_heading=ref.section_heading, + section_number=ref.section_number, parent_section=ref.parent_section, + paragraph_index=ref.paragraph_index, original_text=ref.original_text, + issue=ref.issue, 
correction_type=ref.correction_type, + correction_text=ref.correction_text, insert_after=ref.insert_after, + ) # Undocumented services (on website, NOT in DSE) for svc in comparison["undocumented"]: @@ -175,12 +233,14 @@ def _build_findings( legal_ref=svc.get("legal_ref", ""), in_dse=False, status="undocumented", )) severity = "HIGH" if is_live else "MEDIUM" + ref = _get_ref(svc.get("id", "")) findings.append(ScanFinding( code=f"DSE-MISSING-{svc['id'].upper()}", severity=severity, text=f"{svc['name']} ({svc.get('provider', '')}, {svc.get('country', '')}) " f"ist auf der Website eingebunden aber NICHT in der Datenschutzerklaerung " f"dokumentiert (Art. 13 DSGVO).", + text_reference=ref, )) # Documented services (OK) diff --git a/backend-compliance/compliance/services/dse_matcher.py b/backend-compliance/compliance/services/dse_matcher.py new file mode 100644 index 0000000..61c51b4 --- /dev/null +++ b/backend-compliance/compliance/services/dse_matcher.py @@ -0,0 +1,189 @@ +""" +DSE Matcher — matches detected services against DSE sections and +generates TextReferences with original text, position, and corrections. +""" + +import logging +import re +from dataclasses import dataclass + +from compliance.services.dse_parser import DSESection, find_section_by_content, find_section_by_category + +logger = logging.getLogger(__name__) + +# Category → typical DSE section heading keywords +CATEGORY_SECTION_MAP = { + "tracking": ["cookie", "tracking", "webanalyse", "analytics", "statistik", "reichweitenmessung"], + "marketing": ["marketing", "werbung", "newsletter", "remarketing", "werbe"], + "payment": ["zahlung", "payment", "bezahl", "zahlungsabwicklung", "zahlungsdienst"], + "chatbot": ["chat", "kommunikation", "kundenservice", "kontakt", "livechat"], + "cdn": ["hosting", "bereitstellung", "technisch", "infrastruktur", "content delivery"], + "other": ["sonstig", "weitere", "dritte", "extern", "dienstleister"], +} + + +@dataclass +class TextReference: + """Reference to a specific text block in the DSE.""" + found: bool + source_url: str = "" + document_type: str = "Datenschutzerklaerung" + section_heading: str = "" + section_number: str = "" + parent_section: str = "" + paragraph_index: int = 0 + original_text: str = "" + issue: str = "" # "missing", "incomplete", "incorrect" + correction_type: str = "" # "insert", "replace", "append" + correction_text: str = "" + insert_after: str = "" + + +def match_service_to_dse( + service_name: str, + service_category: str, + sections: list[DSESection], + url: str = "", +) -> TextReference: + """Find where a service is mentioned in the DSE and build a TextReference.""" + # Step 1: Search for exact service name + section = find_section_by_content(sections, service_name) + + if section: + # Found — extract the relevant paragraph + original = _extract_relevant_paragraph(section.content, service_name) + return TextReference( + found=True, + source_url=url, + section_heading=section.heading, + section_number=section.section_number, + parent_section=section.parent_heading, + paragraph_index=_find_paragraph_index(section.content, service_name), + original_text=original, + issue="", # Found and present — caller determines if complete + ) + + # Step 2: Search for provider name (e.g., "Google" for "Google Analytics") + provider = service_name.split()[0] if " " in service_name else service_name + section = find_section_by_content(sections, provider) + + if section: + original = _extract_relevant_paragraph(section.content, provider) + return TextReference( + found=True, + 
source_url=url, + section_heading=section.heading, + section_number=section.section_number, + parent_section=section.parent_heading, + paragraph_index=_find_paragraph_index(section.content, provider), + original_text=original, + issue="incomplete", # Provider mentioned but not specific service + ) + + # Step 3: Not found — suggest insertion point + insert_section = find_section_by_category(sections, service_category) + insert_after = insert_section.heading if insert_section else "" + + # If no category match, find the last "Cookies"/"Tracking" or "Sonstiges" section + if not insert_after: + for s in reversed(sections): + h = s.heading.lower() + if any(kw in h for kw in ["cookie", "datenschutz", "daten"]): + insert_after = s.heading + break + + return TextReference( + found=False, + source_url=url, + document_type="Datenschutzerklaerung", + issue="missing", + correction_type="insert", + insert_after=insert_after, + ) + + +def build_text_references( + detected_services: list[dict], + dse_services: list[dict], + sections: list[DSESection], + url: str = "", +) -> dict[str, TextReference]: + """Build TextReferences for all detected services. + + Returns dict: service_id → TextReference + """ + refs: dict[str, TextReference] = {} + + for svc in detected_services: + service_id = svc.get("id", svc.get("name", "")) + service_name = svc.get("name", "") + category = svc.get("category", "other") + + ref = match_service_to_dse(service_name, category, sections, url) + + # Check if service is in the DSE SOLL list + dse_match = _find_in_dse_list(service_name, dse_services) + + if ref.found and dse_match: + ref.issue = "" # All good — documented and present + elif ref.found and not dse_match: + # Found in text but not in LLM extraction — still OK + ref.issue = "" + elif not ref.found: + ref.issue = "missing" + ref.correction_type = "insert" + + refs[service_id] = ref + + return refs + + +def _extract_relevant_paragraph(content: str, search_term: str) -> str: + """Extract the paragraph containing the search term.""" + search_lower = search_term.lower() + content_lower = content.lower() + + # Find position of search term + pos = content_lower.find(search_lower) + if pos == -1: + return content[:300] + + # Find sentence/paragraph boundaries + # Look backwards for paragraph break + start = max(0, content.rfind(".", 0, pos)) + if start > 0: + start += 2 # Skip ". 
" + else: + start = max(0, pos - 100) + + # Look forward for end of paragraph + end = content.find(".", pos + len(search_term)) + if end == -1 or end - pos > 500: + end = min(len(content), pos + 300) + else: + end += 1 # Include the period + + return content[start:end].strip() + + +def _find_paragraph_index(content: str, search_term: str) -> int: + """Find which paragraph (1-based) contains the search term.""" + paragraphs = re.split(r"\n\n|\n(?=[A-Z])", content) + search_lower = search_term.lower() + for i, para in enumerate(paragraphs, 1): + if search_lower in para.lower(): + return i + return 0 + + +def _find_in_dse_list(service_name: str, dse_services: list[dict]) -> dict | None: + """Check if a service appears in the LLM-extracted DSE service list.""" + name_lower = service_name.lower() + for svc in dse_services: + dse_name = svc.get("name", "").lower() + if name_lower in dse_name or dse_name in name_lower: + return svc + # Check first word (provider match) + if name_lower.split()[0] in dse_name: + return svc + return None diff --git a/backend-compliance/compliance/services/dse_parser.py b/backend-compliance/compliance/services/dse_parser.py new file mode 100644 index 0000000..f10a201 --- /dev/null +++ b/backend-compliance/compliance/services/dse_parser.py @@ -0,0 +1,224 @@ +""" +DSE Parser — parses privacy policy HTML into structured sections. + +Extracts headings, section numbers, content blocks and builds a +hierarchical structure that enables precise text references. +""" + +import logging +import re +from dataclasses import dataclass, field +from html.parser import HTMLParser + +logger = logging.getLogger(__name__) + + +@dataclass +class DSESection: + """A section in a privacy policy.""" + heading: str + heading_level: int # 1-4 + section_number: str # "2.5" or "" if no number + content: str # Plain text content + html: str # Original HTML content + parent_heading: str = "" + url: str = "" + element_id: str = "" + paragraph_count: int = 0 + + +class _HeadingExtractor(HTMLParser): + """Extract headings and their content from HTML.""" + + def __init__(self): + super().__init__() + self.sections: list[dict] = [] + self._current_tag = "" + self._in_heading = False + self._heading_level = 0 + self._heading_text = "" + self._heading_id = "" + self._content_parts: list[str] = [] + self._html_parts: list[str] = [] + self._skip_tags = {"script", "style", "nav", "footer", "header"} + self._skip_depth = 0 + self._p_count = 0 + + def handle_starttag(self, tag, attrs): + attrs_dict = dict(attrs) + if tag in self._skip_tags: + self._skip_depth += 1 + return + if self._skip_depth > 0: + return + + if tag in ("h1", "h2", "h3", "h4"): + # Save previous section + if self._heading_text: + self._save_section() + self._in_heading = True + self._heading_level = int(tag[1]) + self._heading_text = "" + self._heading_id = attrs_dict.get("id", "") + self._content_parts = [] + self._html_parts = [] + self._p_count = 0 + + if tag == "p": + self._p_count += 1 + + # Reconstruct HTML + attr_str = " ".join(f'{k}="{v}"' for k, v in attrs) + self._html_parts.append(f"<{tag}{' ' + attr_str if attr_str else ''}>") + + def handle_endtag(self, tag): + if tag in self._skip_tags and self._skip_depth > 0: + self._skip_depth -= 1 + return + if self._skip_depth > 0: + return + + if tag in ("h1", "h2", "h3", "h4"): + self._in_heading = False + + self._html_parts.append(f"") + + def handle_data(self, data): + if self._skip_depth > 0: + return + if self._in_heading: + self._heading_text += data.strip() + else: + 
self._content_parts.append(data)
+        self._html_parts.append(data)
+
+    def _save_section(self):
+        if not self._heading_text:
+            return
+        content = " ".join(self._content_parts)
+        content = re.sub(r"\s+", " ", content).strip()
+        self.sections.append({
+            "heading": self._heading_text.strip(),
+            "heading_level": self._heading_level,
+            "element_id": self._heading_id,
+            "content": content,
+            "html": "".join(self._html_parts),
+            "paragraph_count": self._p_count,
+        })
+
+    def finalize(self):
+        """Call after feeding all data to save the last section."""
+        if self._heading_text:
+            self._save_section()
+
+
+def parse_dse(html: str, url: str = "") -> list[DSESection]:
+    """Parse privacy policy HTML into structured sections."""
+    extractor = _HeadingExtractor()
+    try:
+        extractor.feed(html)
+        extractor.finalize()
+    except Exception as e:
+        logger.warning("HTML parsing failed, falling back to regex: %s", e)
+        return _regex_fallback(html, url)
+
+    if not extractor.sections:
+        return _regex_fallback(html, url)
+
+    # Build parent hierarchy
+    sections: list[DSESection] = []
+    parent_stack: list[str] = [""]  # Stack of parent headings by level
+
+    for raw in extractor.sections:
+        heading = raw["heading"]
+        level = raw["heading_level"]
+
+        # Extract section number (e.g., "2.5" from "2.5 Webanalyse")
+        num_match = re.match(r"^(\d+(?:\.\d+)*)\s*[.:]?\s*", heading)
+        section_number = num_match.group(1) if num_match else ""
+
+        # Track parent headings
+        while len(parent_stack) > level:
+            parent_stack.pop()
+        parent = parent_stack[-1] if parent_stack else ""
+        parent_stack.append(heading)
+
+        sections.append(DSESection(
+            heading=heading,
+            heading_level=level,
+            section_number=section_number,
+            content=raw["content"][:2000],  # Cap content length
+            html=raw["html"][:3000],
+            parent_heading=parent,
+            url=url,
+            element_id=raw["element_id"],
+            paragraph_count=raw["paragraph_count"],
+        ))
+
+    logger.info("Parsed DSE: %d sections from %s", len(sections), url)
+    return sections
+
+
+def _regex_fallback(html: str, url: str) -> list[DSESection]:
+    """Fallback parser using regex when HTML parsing fails."""
+    # Strip scripts and styles
+    clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+
+    sections = []
+    # Find all headings
+    for match in re.finditer(r"<h([1-4])[^>]*(?:id=[\"']([^\"']*)[\"'])?[^>]*>(.*?)</h\1>", clean, re.DOTALL | re.IGNORECASE):
+        level = int(match.group(1))
+        elem_id = match.group(2) or ""
+        heading = re.sub(r"<[^>]+>", "", match.group(3)).strip()
+
+        # Get content until next heading
+        start = match.end()
+        next_heading = re.search(r"<h[1-4][^>]*>", clean[start:], re.IGNORECASE)
+        content = clean[start:start + next_heading.start()] if next_heading else clean[start:]
+        content = re.sub(r"<[^>]+>", " ", content)
+        content = re.sub(r"\s+", " ", content).strip()
+
+        num_match = re.match(r"^(\d+(?:\.\d+)*)", heading)
+
+        sections.append(DSESection(
+            heading=heading,
+            heading_level=level,
+            section_number=num_match.group(1) if num_match else "",
+            content=content[:2000],
+            html="",
+            url=url,
+            element_id=elem_id,
+        ))
+
+    return sections
+
+
+def find_section_by_content(sections: list[DSESection], search_text: str) -> DSESection | None:
+    """Find the section that contains specific text."""
+    search_lower = search_text.lower()
+    for section in sections:
+        if search_lower in section.content.lower():
+            return section
+    return None
+
+
+def find_section_by_category(sections: list[DSESection], category: str) -> DSESection | None:
+    """Find the section most likely to contain a service category."""
+    category_keywords = {
+        "tracking": ["cookie", "tracking", "webanalyse", "analytics", "statistik"],
+        "marketing": ["marketing", "werbung", "newsletter", 
"remarketing"], + "payment": ["zahlung", "payment", "bezahlung", "zahlungsabwicklung"], + "chatbot": ["chat", "kommunikation", "kundenservice", "kontakt"], + "cdn": ["hosting", "bereitstellung", "technisch", "infrastruktur", "cdn"], + "other": ["sonstig", "weitere", "dritte", "extern"], + } + keywords = category_keywords.get(category, category_keywords["other"]) + + for section in sections: + heading_lower = section.heading.lower() + content_lower = section.content.lower()[:500] + for kw in keywords: + if kw in heading_lower or kw in content_lower: + return section + return None From 15d1e118ed070bae982dbb610f0f92c5457fc4e2 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 11:59:55 +0200 Subject: [PATCH 010/413] =?UTF-8?q?feat:=20TextReference=20component=20?= =?UTF-8?q?=E2=80=94=20original=20text,=20position,=20correction=20in=20fi?= =?UTF-8?q?ndings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shows for each finding: - Original text block from DSE (or "missing" indicator) - Position: section heading, number, parent section, paragraph index - Correction: insert/append/replace with copy button Falls back to plain correction view if no text reference available. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/sdk/agent/_components/ScanResult.tsx | 24 +++- .../sdk/agent/_components/TextReference.tsx | 108 ++++++++++++++++++ 2 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 admin-compliance/app/sdk/agent/_components/TextReference.tsx diff --git a/admin-compliance/app/sdk/agent/_components/ScanResult.tsx b/admin-compliance/app/sdk/agent/_components/ScanResult.tsx index e19d01f..0bf9664 100644 --- a/admin-compliance/app/sdk/agent/_components/ScanResult.tsx +++ b/admin-compliance/app/sdk/agent/_components/ScanResult.tsx @@ -1,6 +1,7 @@ 'use client' import React, { useState } from 'react' +import { TextReference } from './TextReference' interface ServiceInfo { name: string @@ -14,11 +15,27 @@ interface ServiceInfo { status: string } +interface TextRef { + found: boolean + source_url: string + document_type: string + section_heading: string + section_number: string + parent_section: string + paragraph_index: number + original_text: string + issue: string + correction_type: string + correction_text: string + insert_after: string +} + interface ScanFinding { code: string severity: string text: string correction: string + text_reference: TextRef | null } interface ScanData { @@ -157,7 +174,12 @@ export function ScanResult({ data }: { data: ScanData }) {

    {f.text}

    - {f.correction && ( + {/* Text Reference (original text + position + correction) */} + {f.text_reference && ( + + )} + {/* Fallback: correction without text reference */} + {!f.text_reference && f.correction && (
    + {showCorrection && ( +
    + {issue && ( + + {CORRECTION_LABELS[ref.correction_type] || issue.label} + + )} +
    {correctionText}
    + +
    + )} +
    + )} + + ) +} From d105842bf26c7b4fbb06068c1394f31d02d0a8b9 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 12:14:41 +0200 Subject: [PATCH 011/413] =?UTF-8?q?feat:=20consent-tester=20microservice?= =?UTF-8?q?=20=E2=80=94=20Playwright=203-phase=20cookie=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New independent service (port 8094) with headless Chromium: - Phase A: What loads BEFORE any consent interaction - Phase B: What loads AFTER rejecting consent (CRITICAL if tracking persists) - Phase C: What loads AFTER accepting (check against cookie policy) - 10 CMP-specific selectors (Didomi, OneTrust, Cookiebot, Usercentrics, etc.) - Generic fallback via button text matching - 18 tracking service patterns for script classification Co-Authored-By: Claude Opus 4.6 (1M context) --- consent-tester/Dockerfile | 23 +++ consent-tester/main.py | 86 +++++++++++ consent-tester/requirements.txt | 3 + consent-tester/services/__init__.py | 0 consent-tester/services/banner_detector.py | 149 ++++++++++++++++++ consent-tester/services/consent_scanner.py | 171 +++++++++++++++++++++ consent-tester/services/script_analyzer.py | 157 +++++++++++++++++++ 7 files changed, 589 insertions(+) create mode 100644 consent-tester/Dockerfile create mode 100644 consent-tester/main.py create mode 100644 consent-tester/requirements.txt create mode 100644 consent-tester/services/__init__.py create mode 100644 consent-tester/services/banner_detector.py create mode 100644 consent-tester/services/consent_scanner.py create mode 100644 consent-tester/services/script_analyzer.py diff --git a/consent-tester/Dockerfile b/consent-tester/Dockerfile new file mode 100644 index 0000000..58c2333 --- /dev/null +++ b/consent-tester/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.12-slim-bookworm + +WORKDIR /app + +# Install system dependencies for Playwright/Chromium +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 \ + libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 \ + libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +RUN playwright install chromium + +COPY . . + +RUN useradd --create-home appuser +USER appuser + +EXPOSE 8094 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8094"] diff --git a/consent-tester/main.py b/consent-tester/main.py new file mode 100644 index 0000000..50eae88 --- /dev/null +++ b/consent-tester/main.py @@ -0,0 +1,86 @@ +""" +Consent Tester Service — Playwright-based 3-phase cookie consent test. + +Tests what scripts/cookies load BEFORE consent, AFTER rejection, and AFTER acceptance. +Runs as independent microservice on port 8094. 
+""" + +import logging +from datetime import datetime, timezone + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel + +from services.consent_scanner import run_consent_test, ConsentTestResult + +logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s") +logger = logging.getLogger(__name__) + +app = FastAPI(title="BreakPilot Consent Tester", version="1.0.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + + +class ScanRequest(BaseModel): + url: str + timeout_per_phase: int = 10 # seconds to wait after page load + + +class ScanResponse(BaseModel): + url: str + banner_detected: bool + banner_provider: str + phases: dict + summary: dict + scanned_at: str + + +@app.get("/health") +async def health(): + return {"status": "healthy", "service": "consent-tester"} + + +@app.post("/scan", response_model=ScanResponse) +async def scan_consent(req: ScanRequest): + """Run 3-phase consent test on a URL.""" + logger.info("Starting consent test for %s", req.url) + result = await run_consent_test(req.url, req.timeout_per_phase) + + return ScanResponse( + url=req.url, + banner_detected=result.banner_detected, + banner_provider=result.banner_provider, + phases={ + "before_consent": { + "scripts": result.before_scripts, + "cookies": result.before_cookies, + "tracking_services": result.before_tracking, + "violations": [v.__dict__ for v in result.before_violations], + }, + "after_reject": { + "scripts": result.reject_scripts, + "cookies": result.reject_cookies, + "new_tracking": result.reject_new_tracking, + "violations": [v.__dict__ for v in result.reject_violations], + }, + "after_accept": { + "scripts": result.accept_scripts, + "cookies": result.accept_cookies, + "new_tracking": result.accept_new_tracking, + "undocumented": result.accept_undocumented, + }, + }, + summary={ + "critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"), + "high": len(result.before_violations), + "undocumented": len(result.accept_undocumented), + "total_violations": len(result.before_violations) + len(result.reject_violations), + }, + scanned_at=datetime.now(timezone.utc).isoformat(), + ) diff --git a/consent-tester/requirements.txt b/consent-tester/requirements.txt new file mode 100644 index 0000000..894be98 --- /dev/null +++ b/consent-tester/requirements.txt @@ -0,0 +1,3 @@ +fastapi==0.115.12 +uvicorn==0.34.2 +playwright==1.52.0 diff --git a/consent-tester/services/__init__.py b/consent-tester/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/consent-tester/services/banner_detector.py b/consent-tester/services/banner_detector.py new file mode 100644 index 0000000..396c5dd --- /dev/null +++ b/consent-tester/services/banner_detector.py @@ -0,0 +1,149 @@ +""" +Banner Detector — identifies Consent Management Platforms and their buttons. + +Supports 10+ CMPs with specific selectors + generic fallback. 
+""" + +from dataclasses import dataclass + +from playwright.async_api import Page, Locator + + +@dataclass +class BannerInfo: + detected: bool + provider: str + accept_selector: str + reject_selector: str + + +# CMP-specific selectors (ordered by market share) +CMP_SELECTORS = [ + { + "name": "Didomi", + "detect": "#didomi-host, [class*='didomi']", + "accept": "#didomi-notice-agree-button", + "reject": "#didomi-notice-disagree-button, .didomi-components-button--secondary", + }, + { + "name": "OneTrust", + "detect": "#onetrust-banner-sdk, [class*='onetrust']", + "accept": "#onetrust-accept-btn-handler", + "reject": "#onetrust-reject-all-handler, .onetrust-close-btn-handler", + }, + { + "name": "Cookiebot", + "detect": "#CybotCookiebotDialog, [class*='CybotCookiebot']", + "accept": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll", + "reject": "#CybotCookiebotDialogBodyButtonDecline", + }, + { + "name": "Usercentrics", + "detect": "#usercentrics-root, [data-testid='uc-banner']", + "accept": "[data-testid='uc-accept-all-button']", + "reject": "[data-testid='uc-deny-all-button']", + }, + { + "name": "CookieYes", + "detect": ".cky-consent-container, [class*='cky-']", + "accept": ".cky-btn-accept", + "reject": ".cky-btn-reject, .cky-btn-customize", + }, + { + "name": "Quantcast", + "detect": ".qc-cmp2-container, [class*='qc-cmp']", + "accept": "[class*='qc-cmp2-summary-buttons'] button:first-child", + "reject": "[class*='qc-cmp2-summary-buttons'] button:last-child", + }, + { + "name": "Borlabs", + "detect": "#BorlabsCookieBox, [class*='BorlabsCookie']", + "accept": "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]", + "reject": "#BorlabsCookieBox .cookie-refuse, [data-cookie-refuse]", + }, + { + "name": "Consentmanager", + "detect": "#cmpbox, [class*='cmpbox']", + "accept": ".cmpboxbtn.cmpboxbtnyes", + "reject": ".cmpboxbtn.cmpboxbtnno", + }, + { + "name": "Klaro", + "detect": ".klaro, [class*='klaro']", + "accept": ".klaro .cm-btn-accept", + "reject": ".klaro .cm-btn-decline", + }, + { + "name": "TarteAuCitron", + "detect": "#tarteaucitronRoot, [class*='tarteaucitron']", + "accept": "#tarteaucitronPersonalize2", + "reject": "#tarteaucitronAllDenied2", + }, +] + +# Generic fallback patterns (text-based) +GENERIC_ACCEPT_TEXTS = [ + "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren", + "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen", + "Einverstanden", "Ich stimme zu", "Ja, einverstanden", +] + +GENERIC_REJECT_TEXTS = [ + "Nur notwendige", "Nur essentielle", "Ablehnen", "Alle ablehnen", + "Reject", "Reject all", "Nur erforderliche", "Nur technisch notwendige", + "Decline", "Nein", "Nicht einverstanden", +] + + +async def detect_banner(page: Page) -> BannerInfo: + """Detect which CMP is used and return button selectors.""" + # Try CMP-specific selectors first + for cmp in CMP_SELECTORS: + try: + count = await page.locator(cmp["detect"]).count() + if count > 0: + return BannerInfo( + detected=True, + provider=cmp["name"], + accept_selector=cmp["accept"], + reject_selector=cmp["reject"], + ) + except Exception: + continue + + # Generic fallback — search for buttons by text + for text in GENERIC_ACCEPT_TEXTS: + try: + btn = page.get_by_text(text, exact=False) + if await btn.count() > 0: + accept = f'button:has-text("{text}")' + # Try to find reject button nearby + reject = "" + for rtext in GENERIC_REJECT_TEXTS: + rbtn = page.get_by_text(rtext, exact=False) + if await rbtn.count() > 0: + reject = f'button:has-text("{rtext}")' + break + return BannerInfo( + 
detected=True, + provider="Generic", + accept_selector=accept, + reject_selector=reject, + ) + except Exception: + continue + + return BannerInfo(detected=False, provider="", accept_selector="", reject_selector="") + + +async def click_button(page: Page, selector: str, timeout: int = 5000) -> bool: + """Try to click a consent button. Returns True if clicked successfully.""" + if not selector: + return False + try: + locator = page.locator(selector).first + await locator.wait_for(state="visible", timeout=timeout) + await locator.click() + return True + except Exception: + return False diff --git a/consent-tester/services/consent_scanner.py b/consent-tester/services/consent_scanner.py new file mode 100644 index 0000000..caa1c32 --- /dev/null +++ b/consent-tester/services/consent_scanner.py @@ -0,0 +1,171 @@ +""" +Consent Scanner — Playwright-based 3-phase cookie consent test. + +Phase A: Before consent (first visit) +Phase B: After rejecting consent +Phase C: After accepting consent +""" + +import logging +from dataclasses import dataclass, field + +from playwright.async_api import async_playwright, Page, BrowserContext + +from services.banner_detector import detect_banner, click_button, BannerInfo +from services.script_analyzer import ( + classify_scripts, find_tracking_services, + find_violations_before_consent, find_violations_after_reject, Violation, +) + +logger = logging.getLogger(__name__) + +USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +) + + +@dataclass +class ConsentTestResult: + banner_detected: bool = False + banner_provider: str = "" + # Phase A: Before consent + before_scripts: list[str] = field(default_factory=list) + before_cookies: list[str] = field(default_factory=list) + before_tracking: list[str] = field(default_factory=list) + before_violations: list[Violation] = field(default_factory=list) + # Phase B: After reject + reject_scripts: list[str] = field(default_factory=list) + reject_cookies: list[str] = field(default_factory=list) + reject_new_tracking: list[str] = field(default_factory=list) + reject_violations: list[Violation] = field(default_factory=list) + # Phase C: After accept + accept_scripts: list[str] = field(default_factory=list) + accept_cookies: list[str] = field(default_factory=list) + accept_new_tracking: list[str] = field(default_factory=list) + accept_undocumented: list[str] = field(default_factory=list) + + +async def run_consent_test(url: str, wait_secs: int = 10) -> ConsentTestResult: + """Run 3-phase consent test on a URL.""" + result = ConsentTestResult() + wait_ms = wait_secs * 1000 + + async with async_playwright() as p: + browser = await p.chromium.launch( + headless=True, + args=["--no-sandbox", "--disable-dev-shm-usage"], + ) + + try: + # ── Phase A: Before consent ────────────────────────── + logger.info("Phase A: First visit (no interaction)") + ctx_a = await browser.new_context(user_agent=USER_AGENT) + page_a = await ctx_a.new_page() + scripts_a = [] + page_a.on("request", lambda req: _collect_script(req, scripts_a)) + + await page_a.goto(url, wait_until="networkidle", timeout=30000) + await page_a.wait_for_timeout(wait_ms) + + result.before_scripts = _get_page_scripts(scripts_a) + result.before_cookies = _get_cookie_names(await ctx_a.cookies()) + result.before_tracking = find_tracking_services(result.before_scripts) + result.before_violations = find_violations_before_consent(result.before_scripts) + + # Detect banner + banner = await 
detect_banner(page_a) + result.banner_detected = banner.detected + result.banner_provider = banner.provider + + await ctx_a.close() + + if not banner.detected: + logger.info("No consent banner detected — skipping Phase B/C") + await browser.close() + return result + + # ── Phase B: After rejecting ───────────────────────── + logger.info("Phase B: Reject consent (%s)", banner.provider) + ctx_b = await browser.new_context(user_agent=USER_AGENT) + page_b = await ctx_b.new_page() + scripts_b = [] + page_b.on("request", lambda req: _collect_script(req, scripts_b)) + + await page_b.goto(url, wait_until="networkidle", timeout=30000) + await page_b.wait_for_timeout(3000) + + clicked = await click_button(page_b, banner.reject_selector) + if clicked: + logger.info("Reject button clicked, waiting %ds", wait_secs) + await page_b.wait_for_timeout(wait_ms) + else: + logger.warning("Could not click reject button") + + result.reject_scripts = _get_page_scripts(scripts_b) + result.reject_cookies = _get_cookie_names(await ctx_b.cookies()) + reject_tracking = find_tracking_services(result.reject_scripts) + result.reject_new_tracking = [t for t in reject_tracking if t not in result.before_tracking] + result.reject_violations = find_violations_after_reject( + result.before_scripts, result.reject_scripts, + ) + + await ctx_b.close() + + # ── Phase C: After accepting ───────────────────────── + logger.info("Phase C: Accept consent (%s)", banner.provider) + ctx_c = await browser.new_context(user_agent=USER_AGENT) + page_c = await ctx_c.new_page() + scripts_c = [] + page_c.on("request", lambda req: _collect_script(req, scripts_c)) + + await page_c.goto(url, wait_until="networkidle", timeout=30000) + await page_c.wait_for_timeout(3000) + + clicked = await click_button(page_c, banner.accept_selector) + if clicked: + logger.info("Accept button clicked, waiting %ds", wait_secs) + await page_c.wait_for_timeout(wait_ms) + else: + logger.warning("Could not click accept button") + + result.accept_scripts = _get_page_scripts(scripts_c) + result.accept_cookies = _get_cookie_names(await ctx_c.cookies()) + accept_tracking = find_tracking_services(result.accept_scripts) + result.accept_new_tracking = [t for t in accept_tracking if t not in result.before_tracking] + + await ctx_c.close() + + except Exception as e: + logger.error("Consent test failed: %s", e) + finally: + await browser.close() + + logger.info( + "Consent test complete: banner=%s, violations_before=%d, violations_reject=%d", + result.banner_provider, len(result.before_violations), len(result.reject_violations), + ) + return result + + +def _collect_script(request, scripts: list[str]): + """Collect script request URLs.""" + if request.resource_type in ("script", "image", "xhr", "fetch"): + scripts.append(request.url) + + +def _get_page_scripts(collected: list[str]) -> list[str]: + """Deduplicate and filter script URLs.""" + seen = set() + result = [] + for url in collected: + domain = url.split("/")[2] if "/" in url and len(url.split("/")) > 2 else url + if domain not in seen: + seen.add(domain) + result.append(url) + return result[:50] # Cap at 50 + + +def _get_cookie_names(cookies: list[dict]) -> list[str]: + """Extract cookie names from Playwright cookie list.""" + return sorted(set(c.get("name", "") for c in cookies if c.get("name"))) diff --git a/consent-tester/services/script_analyzer.py b/consent-tester/services/script_analyzer.py new file mode 100644 index 0000000..4079362 --- /dev/null +++ b/consent-tester/services/script_analyzer.py @@ -0,0 +1,157 @@ 
+""" +Script Analyzer — classifies detected scripts and cookies against known services. +""" + +import re +from dataclasses import dataclass + +SERVICE_PATTERNS: dict[str, dict] = { + r"google.?analytics|gtag|UA-\d|G-\w{5}": { + "name": "Google Analytics", "requires_consent": True, + "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", + }, + r"googletagmanager|gtm\.js": { + "name": "Google Tag Manager", "requires_consent": True, + "legal_ref": "§25 TDDDG", + }, + r"facebook\.net|fbevents|fbq": { + "name": "Meta/Facebook Pixel", "requires_consent": True, + "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", + }, + r"hotjar\.com|_hjSettings": { + "name": "Hotjar", "requires_consent": True, + "legal_ref": "§25 TDDDG (Session Recording)", + }, + r"clarity\.ms": { + "name": "Microsoft Clarity", "requires_consent": True, + "legal_ref": "§25 TDDDG (Session Replay)", + }, + r"tiktok\.com/i18n|analytics\.tiktok": { + "name": "TikTok Pixel", "requires_consent": True, + "legal_ref": "§25 TDDDG, Drittlandtransfer China", + }, + r"linkedin\.com/insight|snap\.licdn": { + "name": "LinkedIn Insight", "requires_consent": True, + "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", + }, + r"pinterest\.com/ct|pinimg\.com/ct": { + "name": "Pinterest Tag", "requires_consent": True, + "legal_ref": "§25 TDDDG", + }, + r"criteo\.com|criteo\.net": { + "name": "Criteo", "requires_consent": True, + "legal_ref": "§25 TDDDG", + }, + r"doubleclick\.net|googlesyndication": { + "name": "Google Ads/DoubleClick", "requires_consent": True, + "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", + }, + r"fonts\.googleapis\.com|fonts\.gstatic": { + "name": "Google Fonts", "requires_consent": True, + "legal_ref": "LG Muenchen I, Az. 3 O 17493/20", + }, + r"recaptcha|grecaptcha": { + "name": "Google reCAPTCHA", "requires_consent": True, + "legal_ref": "§25 TDDDG", + }, + r"youtube\.com/embed|ytimg": { + "name": "YouTube", "requires_consent": True, + "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", + }, + r"maps\.googleapis|maps\.google": { + "name": "Google Maps", "requires_consent": True, + "legal_ref": "§25 TDDDG", + }, + r"intercom\.io|intercomcdn": { + "name": "Intercom", "requires_consent": True, + "legal_ref": "Art. 44-49 DSGVO", + }, + r"zendesk\.com|zdassets": { + "name": "Zendesk", "requires_consent": True, + "legal_ref": "Art. 
44-49 DSGVO", + }, + r"sentry\.io|sentry-cdn": { + "name": "Sentry", "requires_consent": False, + "legal_ref": "Berechtigtes Interesse (Error Tracking)", + }, + r"cdn\.cloudflare\.com": { + "name": "Cloudflare CDN", "requires_consent": False, + "legal_ref": "Berechtigtes Interesse (CDN)", + }, + r"didomi|cookiebot|onetrust|usercentrics|consentmanager": { + "name": "Consent Management", "requires_consent": False, + "legal_ref": "Notwendig (CMP)", + }, +} + + +@dataclass +class Violation: + service: str + severity: str # "HIGH", "CRITICAL" + text: str + legal_ref: str + + +def classify_scripts(scripts: list[str]) -> list[str]: + """Classify script URLs into known service names.""" + services = set() + for script in scripts: + for pattern, meta in SERVICE_PATTERNS.items(): + if re.search(pattern, script, re.IGNORECASE): + services.add(meta["name"]) + break + return sorted(services) + + +def find_tracking_services(scripts: list[str]) -> list[str]: + """Find services that require consent.""" + tracking = [] + for script in scripts: + for pattern, meta in SERVICE_PATTERNS.items(): + if re.search(pattern, script, re.IGNORECASE) and meta["requires_consent"]: + tracking.append(meta["name"]) + break + return sorted(set(tracking)) + + +def find_violations_before_consent(scripts: list[str]) -> list[Violation]: + """Find tracking scripts that load without consent (HIGH).""" + violations = [] + seen = set() + for script in scripts: + for pattern, meta in SERVICE_PATTERNS.items(): + if re.search(pattern, script, re.IGNORECASE) and meta["requires_consent"]: + name = meta["name"] + if name not in seen: + seen.add(name) + violations.append(Violation( + service=name, severity="HIGH", + text=f"{name} laedt OHNE vorherige Einwilligung", + legal_ref=meta["legal_ref"], + )) + break + return violations + + +def find_violations_after_reject( + before_scripts: list[str], after_scripts: list[str], +) -> list[Violation]: + """Find tracking scripts that still load after rejection (CRITICAL).""" + violations = [] + after_tracking = find_tracking_services(after_scripts) + before_tracking = find_tracking_services(before_scripts) + + for service in after_tracking: + if service in before_tracking: + # Was already loading before AND still loads after reject = CRITICAL + for pattern, meta in SERVICE_PATTERNS.items(): + if meta["name"] == service: + violations.append(Violation( + service=service, severity="CRITICAL", + text=f"{service} laedt TROTZ Ablehnung — moegliches Dark Pattern", + legal_ref=meta["legal_ref"] + ", Art. 
5(3) ePrivacy", + )) + break + + return violations From f3c0481631261078eac0d94744d7bea0d76135ea Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 12:33:20 +0200 Subject: [PATCH 012/413] feat: add consent-tester service to docker-compose (port 8094, 2GB mem limit) Co-Authored-By: Claude Opus 4.6 (1M context) --- docker-compose.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 22397e5..3af0dc3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -244,6 +244,28 @@ services: # ========================================================= # DOCUMENT CRAWLER & AUTO-ONBOARDING # ========================================================= + consent-tester: + build: + context: ./consent-tester + dockerfile: Dockerfile + container_name: bp-compliance-consent-tester + platform: linux/arm64 + ports: + - "8094:8094" + mem_limit: 2g + depends_on: + core-health-check: + condition: service_completed_successfully + healthcheck: + test: ["CMD", "curl", "-f", "http://127.0.0.1:8094/health"] + interval: 30s + timeout: 10s + start_period: 30s + retries: 3 + restart: unless-stopped + networks: + - breakpilot-network + document-crawler: build: context: ./document-crawler From b7f9099ad9b8a1d87a10320842b62a171aaaf896 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 12:38:15 +0200 Subject: [PATCH 013/413] =?UTF-8?q?feat:=20Cookie-Test=20tab=20=E2=80=94?= =?UTF-8?q?=203-phase=20consent=20test=20UI=20+=20API=20proxy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third tab "Cookie-Test" in Compliance Agent: - Phase A: Before consent (tracking without permission) - Phase B: After rejection (CRITICAL if tracking persists) - Phase C: After acceptance (undocumented services) - CMP badge (Didomi, OneTrust, etc.) 
- Violation cards with severity badges and legal references Co-Authored-By: Claude Opus 4.6 (1M context) --- .../api/sdk/v1/agent/consent-test/route.ts | 37 ++++ .../agent/_components/ConsentTestResult.tsx | 166 ++++++++++++++++++ admin-compliance/app/sdk/agent/page.tsx | 47 ++++- 3 files changed, 241 insertions(+), 9 deletions(-) create mode 100644 admin-compliance/app/api/sdk/v1/agent/consent-test/route.ts create mode 100644 admin-compliance/app/sdk/agent/_components/ConsentTestResult.tsx diff --git a/admin-compliance/app/api/sdk/v1/agent/consent-test/route.ts b/admin-compliance/app/api/sdk/v1/agent/consent-test/route.ts new file mode 100644 index 0000000..4440458 --- /dev/null +++ b/admin-compliance/app/api/sdk/v1/agent/consent-test/route.ts @@ -0,0 +1,37 @@ +/** + * Consent Test API Proxy + * POST /api/sdk/v1/agent/consent-test → consent-tester:8094/scan + */ + +import { NextRequest, NextResponse } from 'next/server' + +const CONSENT_TESTER_URL = process.env.CONSENT_TESTER_URL || 'http://bp-compliance-consent-tester:8094' + +export async function POST(request: NextRequest) { + try { + const body = await request.text() + + const response = await fetch(`${CONSENT_TESTER_URL}/scan`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body, + signal: AbortSignal.timeout(180000), // 3 min — 3 browser phases + }) + + if (!response.ok) { + const errorText = await response.text() + return NextResponse.json( + { error: `Consent-Tester: ${response.status}`, detail: errorText }, + { status: response.status } + ) + } + + return NextResponse.json(await response.json()) + } catch (error) { + console.error('Consent test proxy error:', error) + return NextResponse.json( + { error: 'Cookie-Test fehlgeschlagen oder Timeout' }, + { status: 503 } + ) + } +} diff --git a/admin-compliance/app/sdk/agent/_components/ConsentTestResult.tsx b/admin-compliance/app/sdk/agent/_components/ConsentTestResult.tsx new file mode 100644 index 0000000..57385ae --- /dev/null +++ b/admin-compliance/app/sdk/agent/_components/ConsentTestResult.tsx @@ -0,0 +1,166 @@ +'use client' + +import React from 'react' + +interface Violation { + service: string + severity: string + text: string + legal_ref: string +} + +interface PhaseData { + scripts: string[] + cookies: string[] + tracking_services?: string[] + new_tracking?: string[] + violations?: Violation[] + undocumented?: string[] +} + +interface ConsentData { + banner_detected: boolean + banner_provider: string + phases: { + before_consent: PhaseData + after_reject: PhaseData + after_accept: PhaseData + } + summary: { + critical: number + high: number + undocumented: number + total_violations: number + } +} + +const SEV = { + CRITICAL: { bg: 'bg-red-100 border-red-300', text: 'text-red-800', badge: 'bg-red-600' }, + HIGH: { bg: 'bg-orange-100 border-orange-300', text: 'text-orange-800', badge: 'bg-orange-500' }, +} + +function PhaseCard({ title, icon, data, type }: { + title: string; icon: string; data: PhaseData; type: 'before' | 'reject' | 'accept' +}) { + const violations = data.violations || [] + const tracking = data.tracking_services || data.new_tracking || [] + const undocumented = data.undocumented || [] + const hasProblem = violations.length > 0 || undocumented.length > 0 + + return ( +
    +

    + {icon} {title} +

    + + {/* Violations */} + {violations.map((v, i) => ( +
    +
    + + {v.severity} + + + {v.service} + +
    +

    {v.text}

    +

    {v.legal_ref}

    +
    + ))} + + {/* Undocumented (Phase C only) */} + {undocumented.map((s, i) => ( +
    + ✗ {s} — nicht in Cookie-Policy dokumentiert +
    + ))} + + {/* Tracking services (no violations) */} + {violations.length === 0 && undocumented.length === 0 && tracking.length > 0 && ( +
    + {tracking.map((t, i) =>
    ✓ {t} — {type === 'accept' ? 'mit Consent OK' : 'erkannt'}
    )} +
    + )} + + {violations.length === 0 && undocumented.length === 0 && tracking.length === 0 && ( +

    ✓ Keine Tracking-Dienste erkannt

    + )} + + {/* Cookie/Script count */} +
    + {data.scripts?.length || 0} Scripts + {data.cookies?.length || 0} Cookies +
    +
    + ) +} + +export function ConsentTestResult({ data }: { data: ConsentData }) { + const s = data.summary + + return ( +
    + {/* Header */} +
    +
    + + + Cookie-Banner: {data.banner_detected ? data.banner_provider : 'Nicht erkannt'} + +
    +
    + {s.critical > 0 && ( + + {s.critical} Kritisch + + )} + {s.high > 0 && ( + + {s.high} Hoch + + )} + {s.total_violations === 0 && ( + + Keine Verstoesse + + )} +
    +
    + + {/* Three Phases */} +
    + + {data.banner_detected && ( + <> + + + + )} +
    + + {/* No banner warning */} + {!data.banner_detected && ( +
    + Kein Cookie-Banner erkannt. Alle erkannten Tracking-Dienste laden ohne + Einwilligung — dies ist ein Verstoss gegen §25 TDDDG. +
    + )} +
    + ) +} diff --git a/admin-compliance/app/sdk/agent/page.tsx b/admin-compliance/app/sdk/agent/page.tsx index 6c5391a..404fee3 100644 --- a/admin-compliance/app/sdk/agent/page.tsx +++ b/admin-compliance/app/sdk/agent/page.tsx @@ -6,9 +6,10 @@ import { AnalysisResult } from './_components/AnalysisResult' import { AnalysisHistory } from './_components/AnalysisHistory' import { FollowUpQuestions } from './_components/FollowUpQuestions' import { ScanResult } from './_components/ScanResult' +import { ConsentTestResult } from './_components/ConsentTestResult' type AnalysisMode = 'pre_launch' | 'post_launch' -type AnalysisTab = 'quick' | 'scan' +type AnalysisTab = 'quick' | 'scan' | 'consent' const MODES: { id: AnalysisMode; label: string; desc: string; icon: string }[] = [ { id: 'pre_launch', label: 'Internes Dokument', desc: 'Vor Veroeffentlichung pruefen', icon: '📋' }, @@ -17,7 +18,8 @@ const MODES: { id: AnalysisMode; label: string; desc: string; icon: string }[] = const TABS: { id: AnalysisTab; label: string; info: string }[] = [ { id: 'quick', label: 'Schnellanalyse', info: 'Analysiert nur die eingegebene URL. Fuer einen umfassenden Check nutzen Sie den Website-Scan.' }, - { id: 'scan', label: 'Website-Scan', info: 'Scannt automatisch 5-10 Unterseiten (Startseite, Datenschutz, Impressum, AGB, Cookies) und gleicht erkannte Dienste mit der Datenschutzerklaerung ab.' }, + { id: 'scan', label: 'Website-Scan', info: 'Scannt automatisch 5-10 Unterseiten und gleicht erkannte Dienste mit der Datenschutzerklaerung ab.' }, + { id: 'consent', label: 'Cookie-Test', info: 'Testet mit echtem Browser was VOR und NACH Cookie-Einwilligung geladen wird. Erkennt Verstoesse gegen §25 TDDDG.' }, ] export default function AgentPage() { @@ -28,6 +30,9 @@ export default function AgentPage() { const [scanError, setScanError] = useState(null) const [scanData, setScanData] = useState(null) const [scanHistory, setScanHistory] = useState([]) + const [consentLoading, setConsentLoading] = useState(false) + const [consentError, setConsentError] = useState(null) + const [consentData, setConsentData] = useState(null) const { analyze, answerFollowUp, loading, error, result, history } = useAgentAnalysis() const handleSubmit = async (e: React.FormEvent) => { @@ -36,7 +41,7 @@ export default function AgentPage() { if (tab === 'quick') { analyze(url.trim(), mode) - } else { + } else if (tab === 'scan') { setScanLoading(true) setScanError(null) setScanData(null) @@ -55,11 +60,28 @@ export default function AgentPage() { } finally { setScanLoading(false) } + } else { + setConsentLoading(true) + setConsentError(null) + setConsentData(null) + try { + const res = await fetch('/api/sdk/v1/agent/consent-test', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ url: url.trim() }), + }) + if (!res.ok) throw new Error(`Cookie-Test fehlgeschlagen: ${res.status}`) + setConsentData(await res.json()) + } catch (e) { + setConsentError(e instanceof Error ? e.message : 'Unbekannter Fehler') + } finally { + setConsentLoading(false) + } } } - const isLoading = tab === 'quick' ? loading : scanLoading - const currentError = tab === 'quick' ? error : scanError + const isLoading = tab === 'quick' ? loading : tab === 'scan' ? scanLoading : consentLoading + const currentError = tab === 'quick' ? error : tab === 'scan' ? scanError : consentError const currentTab = TABS.find(t => t.id === tab)! return ( @@ -105,7 +127,7 @@ export default function AgentPage() { {/* URL Input */}
            setUrl(e.target.value)}
-            placeholder={tab === 'scan' ? 'https://www.example.com/' : 'https://example.com/datenschutz'}
+            placeholder={tab === 'consent' ? 'https://www.example.com/' : tab === 'scan' ? 'https://www.example.com/' : 'https://example.com/datenschutz'}
            className="flex-1 px-4 py-3 border border-gray-300 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent text-sm"
            disabled={isLoading} required />
@@ -143,6 +165,13 @@ export default function AgentPage() {
         )}
 
+        {/* Consent Test Result */}
+        {tab === 'consent' && consentData && (
+
+
+
+        )}
+
         {/* History */}
         {tab === 'quick' && (
           { setUrl(r.url); analyze(r.url, mode) }} />
@@ -152,7 +181,7 @@ export default function AgentPage() {

           Letzte Scans
 
           {scanHistory.map((item, i) => (
-
             )}
+      {/* PDF Export Button */}
+
+
+
   )
 }
diff --git a/admin-compliance/app/sdk/agent/page.tsx b/admin-compliance/app/sdk/agent/page.tsx
index 404fee3..ff88614 100644
--- a/admin-compliance/app/sdk/agent/page.tsx
+++ b/admin-compliance/app/sdk/agent/page.tsx
@@ -7,82 +7,93 @@ import { AnalysisHistory } from './_components/AnalysisHistory'
 import { FollowUpQuestions } from './_components/FollowUpQuestions'
 import { ScanResult } from './_components/ScanResult'
 import { ConsentTestResult } from './_components/ConsentTestResult'
+import { CompareResult } from './_components/CompareResult'
+import { AuthTestResult } from './_components/AuthTestResult'
 
-type AnalysisMode = 'pre_launch' | 'post_launch'
-type AnalysisTab = 'quick' | 'scan' | 'consent'
+type Mode = 'pre_launch' | 'post_launch'
+type Tab = 'quick' | 'scan' | 'consent' | 'compare' | 'auth'
 
-const MODES: { id: AnalysisMode; label: string; desc: string; icon: string }[] = [
-  { id: 'pre_launch', label: 'Internes Dokument', desc: 'Vor Veroeffentlichung pruefen', icon: '📋' },
-  { id: 'post_launch', label: 'Live-Website', desc: 'Bereits online analysieren', icon: '🌐' },
+const MODES = [
+  { id: 'pre_launch' as Mode, label: 'Internes Dokument', desc: 'Vor Veroeffentlichung', icon: '📋' },
+  { id: 'post_launch' as Mode, label: 'Live-Website', desc: 'Bereits online', icon: '🌐' },
 ]
 
-const TABS: { id: AnalysisTab; label: string; info: string }[] = [
-  { id: 'quick', label: 'Schnellanalyse', info: 'Analysiert nur die eingegebene URL. Fuer einen umfassenden Check nutzen Sie den Website-Scan.' },
-  { id: 'scan', label: 'Website-Scan', info: 'Scannt automatisch 5-10 Unterseiten und gleicht erkannte Dienste mit der Datenschutzerklaerung ab.' },
-  { id: 'consent', label: 'Cookie-Test', info: 'Testet mit echtem Browser was VOR und NACH Cookie-Einwilligung geladen wird. Erkennt Verstoesse gegen §25 TDDDG.' },
+const TABS = [
+  { id: 'quick' as Tab, label: 'Schnellanalyse', info: 'Einzelne URL klassifizieren und bewerten.' },
+  { id: 'scan' as Tab, label: 'Website-Scan', info: '5-10 Seiten scannen, Dienstleister abgleichen, Pflichtinhalte pruefen.' },
+  { id: 'consent' as Tab, label: 'Cookie-Test', info: 'Testet mit Browser was VOR und NACH Cookie-Einwilligung geladen wird.' },
+  { id: 'compare' as Tab, label: 'Vergleich', info: '2-5 Websites parallel scannen und Compliance vergleichen.' },
+  { id: 'auth' as Tab, label: 'Login-Test', info: 'Nach Login pruefen: Kuendigung, Daten loeschen, Export, Einwilligungen.' },
 ]
 
 export default function AgentPage() {
   const [url, setUrl] = useState('')
-  const [mode, setMode] = useState('post_launch')
-  const [tab, setTab] = useState('quick')
+  const [urls, setUrls] = useState('')
+  const [mode, setMode] = useState('post_launch')
+  const [tab, setTab] = useState('quick')
   const [scanLoading, setScanLoading] = useState(false)
   const [scanError, setScanError] = useState(null)
   const [scanData, setScanData] = useState(null)
   const [scanHistory, setScanHistory] = useState([])
-  const [consentLoading, setConsentLoading] = useState(false)
-  const [consentError, setConsentError] = useState(null)
   const [consentData, setConsentData] = useState(null)
+  const [compareData, setCompareData] = useState(null)
+  const [authData, setAuthData] = useState(null)
+  const [authUser, setAuthUser] = useState('')
+  const [authPass, setAuthPass] = useState('')
   const { analyze, answerFollowUp, loading, error, result, history } = useAgentAnalysis()
 
   const handleSubmit = async (e: React.FormEvent) => {
     e.preventDefault()
-    if (!url.trim()) return
+    setScanLoading(true)
+    setScanError(null)
 
-    if (tab === 'quick') {
-      analyze(url.trim(), mode)
-    } else if (tab === 'scan') {
-      setScanLoading(true)
-      setScanError(null)
-      setScanData(null)
-      try {
-        const res = await fetch('/api/sdk/v1/agent/scan', {
-          method: 'POST',
-          headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({ url: url.trim(), mode }),
-        })
-        if (!res.ok) throw new Error(`Scan fehlgeschlagen: ${res.status}`)
-        const data = await res.json()
+    try {
+      if (tab === 'quick') {
+        setScanLoading(false)
+        analyze(url.trim(), mode)
+        return
+      }
+
+      let endpoint = ''
+      let body: any = {}
+
+      if (tab === 'scan') {
+        endpoint = '/api/sdk/v1/agent/scan'
+        body = { url: url.trim(), mode }
+      } else if (tab === 'consent') {
+        endpoint = '/api/sdk/v1/agent/consent-test'
+        body = { url: url.trim() }
+      } else if (tab === 'compare') {
+        endpoint = '/api/sdk/v1/agent/compare'
+        body = { urls: urls.split('\n').map(u => u.trim()).filter(Boolean), mode }
+      } else if (tab === 'auth') {
+        endpoint = '/api/sdk/v1/agent/authenticated-scan'
+        body = { url: url.trim(), username: authUser, password: authPass }
+      }
+
+      const res = await fetch(endpoint, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(body),
+      })
+      if (!res.ok) throw new Error(`Fehlgeschlagen: ${res.status}`)
+      const data = await res.json()
+
+      if (tab === 'scan') {
         setScanData(data)
         setScanHistory(prev => [{ url: url.trim(), ...data, scanned_at: new Date().toISOString() }, ...prev].slice(0, 20))
-      } catch (e) {
-        setScanError(e instanceof Error ? e.message : 'Unbekannter Fehler')
-      } finally {
-        setScanLoading(false)
-      }
-    } else {
-      setConsentLoading(true)
-      setConsentError(null)
-      setConsentData(null)
-      try {
-        const res = await fetch('/api/sdk/v1/agent/consent-test', {
-          method: 'POST',
-          headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({ url: url.trim() }),
-        })
-        if (!res.ok) throw new Error(`Cookie-Test fehlgeschlagen: ${res.status}`)
-        setConsentData(await res.json())
-      } catch (e) {
-        setConsentError(e instanceof Error ? e.message : 'Unbekannter Fehler')
-      } finally {
-        setConsentLoading(false)
-      }
+      } else if (tab === 'consent') setConsentData(data)
+      else if (tab === 'compare') setCompareData(data)
+      else if (tab === 'auth') setAuthData(data)
+    } catch (e) {
+      setScanError(e instanceof Error ? e.message : 'Fehler')
+    } finally {
+      setScanLoading(false)
     }
   }
 
-  const isLoading = tab === 'quick' ? loading : tab === 'scan' ? scanLoading : consentLoading
-  const currentError = tab === 'quick' ? error : tab === 'scan' ? scanError : consentError
-  const currentTab = TABS.find(t => t.id === tab)!
+  const isLoading = tab === 'quick' ? loading : scanLoading
+  const currentError = tab === 'quick' ? error : scanError
 
   return (
@@ -91,12 +102,11 @@ export default function AgentPage() {
 
           Analysiere Dokumente und Webseiten auf DSGVO-Konformitaet.
 
-        {/* Mode Selection */}
+        {/* Mode */}
           {MODES.map(m => (
           ))}
-
 
           {currentTab.info}
 
+
 
           {TABS.find(t => t.id === tab)?.info}
 
-        {/* URL Input */}
-
-          setUrl(e.target.value)}
-          placeholder={tab === 'consent' ? 'https://www.example.com/' : tab === 'scan' ? 'https://www.example.com/' : 'https://example.com/datenschutz'}
-          className="flex-1 px-4 py-3 border border-gray-300 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent text-sm"
-          disabled={isLoading} required />
-