From e636b8cef8e0ad8705a8f96074e4e626b88cae58 Mon Sep 17 00:00:00 2001 From: BreakPilot Dev Date: Tue, 10 Feb 2026 23:26:18 +0100 Subject: [PATCH] feat(rag): Migrate national DPA laws from bp_dsfa_corpus to bp_legal_corpus Move 23 sources (18 national data protection laws + 5 EDPB guidelines/SCC) from bp_dsfa_corpus to bp_legal_corpus with vector preservation. Extend REGULATIONS array with national_law and eu_guideline types. Mark migrated sources in dsfa_corpus_ingestion.py to prevent re-ingestion. Co-Authored-By: Claude Opus 4.6 --- admin-v2/app/(admin)/ai/rag/page.tsx | 673 +++++++++++++++++- .../backend/dsfa_corpus_ingestion.py | 333 ++++++++- klausur-service/backend/migrate_rag_chunks.py | 307 ++++++++ 3 files changed, 1289 insertions(+), 24 deletions(-) create mode 100644 klausur-service/backend/migrate_rag_chunks.py diff --git a/admin-v2/app/(admin)/ai/rag/page.tsx b/admin-v2/app/(admin)/ai/rag/page.tsx index 93b63eb..ebe138f 100644 --- a/admin-v2/app/(admin)/ai/rag/page.tsx +++ b/admin-v2/app/(admin)/ai/rag/page.tsx @@ -14,6 +14,7 @@ import { AIModuleSidebarResponsive } from '@/components/ai/AIModuleSidebar' // API uses local proxy route to klausur-service const API_PROXY = '/api/legal-corpus' +const DSFA_API_PROXY = '/api/dsfa-corpus' // Types interface RegulationStatus { @@ -45,6 +46,32 @@ interface SearchResult { score: number } +// DSFA source type (from /api/dsfa-corpus) +interface DsfaSource { + source_code: string + name: string + full_name?: string + organization?: string + source_url?: string + license_code: string + attribution_text: string + document_type: string + language: string + chunk_count?: number +} + +interface DsfaCorpusStatus { + qdrant_collection: string + total_sources: number + total_documents: number + total_chunks: number + qdrant_points_count: number + qdrant_status: string +} + +// RAG category filter for Regulations tab +type RegulationCategory = 'regulations' | 'dsfa' | 'nibis' | 'templates' + // Tab definitions type TabId = 'overview' | 'regulations' | 'map' | 'search' | 'data' | 'ingestion' | 'pipeline' @@ -366,13 +393,331 @@ const REGULATIONS = [ keyTopics: ['Patientenakte (MyHealth@EU)', 'Sekundaernutzung', 'Datenzugangsorgane', 'Gesundheitsdatenstandards', 'Forschungszugang'], effectiveDate: '2025 (gestaffelt bis 2029)' }, + // National Data Protection Laws (migrated from bp_dsfa_corpus) + { + code: 'AT_DSG', + name: 'DSG Oesterreich', + fullName: 'Datenschutzgesetz Oesterreich (DSG)', + type: 'national_law', + expected: 50, + description: 'Oesterreichisches Datenschutzgesetz zur Ergaenzung der DSGVO. Regelt nationale Besonderheiten wie Bildverarbeitung, Datenschutzbehoerde und Strafbestimmungen.', + relevantFor: ['Unternehmen in Oesterreich', 'DACH-Unternehmen', 'Auftragsverarbeiter'], + keyTopics: ['Nationale DSGVO-Ergaenzung', 'Bildverarbeitung', 'Datenschutzbehoerde', 'Strafbestimmungen'], + effectiveDate: '25. Mai 2018' + }, + { + code: 'BDSG_FULL', + name: 'BDSG', + fullName: 'Bundesdatenschutzgesetz (BDSG) - Volltext', + type: 'de_law', + expected: 9, + description: 'Deutsches Bundesdatenschutzgesetz als nationale Ergaenzung zur DSGVO. Regelt Beschaeftigtendatenschutz, Videoueberachung, Scoring und Datenschutzbeauftragte.', + relevantFor: ['Deutsche Unternehmen', 'Arbeitgeber', 'Auskunfteien', 'Oeffentliche Stellen'], + keyTopics: ['Beschaeftigtendatenschutz', 'Videoueberachung', 'Scoring', 'Datenschutzbeauftragter'], + effectiveDate: '25. Mai 2018' + }, + { + code: 'CH_DSG', + name: 'DSG Schweiz', + fullName: 'Datenschutzgesetz Schweiz (revDSG 2023)', + type: 'national_law', + expected: 2, + description: 'Revidiertes Schweizer Datenschutzgesetz mit DSGVO-nahen Anforderungen. Gilt fuer Schweizer Unternehmen und solche, die Schweizer Daten verarbeiten.', + relevantFor: ['Schweizer Unternehmen', 'DACH-Unternehmen', 'Internationale Dienstleister'], + keyTopics: ['Datenschutz-Folgenabschaetzung', 'Meldepflichten', 'Profiling', 'Strafbestimmungen'], + effectiveDate: '1. September 2023' + }, + { + code: 'LI_DSG', + name: 'DSG Liechtenstein', + fullName: 'Datenschutzgesetz Liechtenstein', + type: 'national_law', + expected: 1, + description: 'Liechtensteinisches Datenschutzgesetz als EWR-Umsetzung der DSGVO.', + relevantFor: ['Unternehmen in Liechtenstein', 'EWR-Dienstleister'], + keyTopics: ['EWR-Datenschutz', 'Nationale Ergaenzung', 'Datenschutzstelle'], + effectiveDate: '2018' + }, + { + code: 'BE_DPA_LAW', + name: 'Datenschutzgesetz Belgien', + fullName: 'Loi relative a la protection des donnees (Belgien)', + type: 'national_law', + expected: 153, + description: 'Belgisches Datenschutzgesetz zur nationalen Umsetzung der DSGVO. Regelt die Autorite de protection des donnees (APD).', + relevantFor: ['Unternehmen in Belgien', 'EU-Hauptsitz Bruessel'], + keyTopics: ['APD', 'Nationale DSGVO-Umsetzung', 'Strafbestimmungen', 'Sektorale Regeln'], + effectiveDate: '2018' + }, + { + code: 'NL_UAVG', + name: 'UAVG Niederlande', + fullName: 'Uitvoeringswet AVG (UAVG) Niederlande', + type: 'national_law', + expected: 138, + description: 'Niederlaendisches Ausfuehrungsgesetz zur DSGVO. Regelt nationale Besonderheiten wie BSN-Verarbeitung und Gesundheitsdaten.', + relevantFor: ['Unternehmen in den Niederlanden', 'Gesundheitssektor NL'], + keyTopics: ['BSN-Nummer', 'Gesundheitsdaten', 'Autoriteit Persoonsgegevens', 'Nationale Ergaenzung'], + effectiveDate: '25. Mai 2018' + }, + { + code: 'FR_CNIL_GUIDE', + name: 'CNIL Guide RGPD', + fullName: 'Guide pratique RGPD (CNIL Frankreich)', + type: 'national_law', + expected: 14, + description: 'Praktischer DSGVO-Leitfaden der franzoesischen Datenschutzbehoerde CNIL. Wichtig fuer alle Unternehmen mit franzoesischen Kunden.', + relevantFor: ['Unternehmen in Frankreich', 'Franzoesischsprachige Maerkte'], + keyTopics: ['CNIL-Guidance', 'Cookies', 'Einwilligung', 'Sanktionen'], + effectiveDate: '2018' + }, + { + code: 'ES_LOPDGDD', + name: 'LOPDGDD Spanien', + fullName: 'Ley Organica de Proteccion de Datos (LOPDGDD) Spanien', + type: 'national_law', + expected: 154, + description: 'Spanisches organisches Datenschutzgesetz mit Garantien digitaler Rechte. Umfassende DSGVO-Umsetzung mit digitalen Grundrechten.', + relevantFor: ['Unternehmen in Spanien', 'Spanischsprachige Maerkte'], + keyTopics: ['Digitale Rechte', 'AEPD', 'Recht auf Vergessenwerden', 'Beschaeftigtendatenschutz'], + effectiveDate: '7. Dezember 2018' + }, + { + code: 'IT_CODICE_PRIVACY', + name: 'Codice Privacy Italien', + fullName: 'Codice in materia di protezione dei dati personali (Italien)', + type: 'national_law', + expected: 3, + description: 'Italienisches Datenschutzgesetzbuch, aktualisiert gemaess DSGVO. Umfassende nationale Regelung durch den Garante.', + relevantFor: ['Unternehmen in Italien', 'Garante-regulierte Sektoren'], + keyTopics: ['Garante Privacy', 'Codice Privacy', 'Gesundheitsdaten', 'Strafrecht'], + effectiveDate: '2018 (aktualisiert)' + }, + { + code: 'IE_DPA_2018', + name: 'DPA 2018 Ireland', + fullName: 'Data Protection Act 2018 (Ireland)', + type: 'national_law', + expected: 28, + description: 'Irisches Datenschutzgesetz. Besonders relevant da viele Tech-Konzerne (Google, Meta, Apple) ihren EU-Hauptsitz in Irland haben.', + relevantFor: ['Tech-Konzerne mit EU-Sitz Irland', 'Irische Unternehmen', 'DPC-reguliert'], + keyTopics: ['DPC Ireland', 'Big Tech Aufsicht', 'Nationale Ergaenzung', 'Strafbestimmungen'], + effectiveDate: '24. Mai 2018' + }, + { + code: 'UK_DPA_2018', + name: 'DPA 2018 UK', + fullName: 'Data Protection Act 2018 (United Kingdom)', + type: 'national_law', + expected: 94, + description: 'Britisches Datenschutzgesetz nach dem Brexit. Ergaenzt die UK GDPR mit nationalen Bestimmungen, reguliert durch das ICO.', + relevantFor: ['Unternehmen mit UK-Kunden', 'UK-Datentransfers', 'ICO-regulierte Unternehmen'], + keyTopics: ['ICO', 'UK Adequacy', 'Post-Brexit Datenschutz', 'Law Enforcement'], + effectiveDate: '23. Mai 2018' + }, + { + code: 'UK_GDPR', + name: 'UK GDPR', + fullName: 'UK General Data Protection Regulation (retained EU law)', + type: 'national_law', + expected: 24, + description: 'In UK-Recht ueberfuehrte DSGVO nach dem Brexit. Weitgehend identisch mit EU-DSGVO, aber unter britischer Aufsicht (ICO).', + relevantFor: ['UK-Unternehmen', 'EU-UK Datentransfers', 'Internationale Konzerne'], + keyTopics: ['Retained EU Law', 'UK-EU Adequacy', 'ICO Enforcement', 'UK-spezifische Anpassungen'], + effectiveDate: '1. Januar 2021' + }, + { + code: 'NO_PERSONOPPLYSNINGSLOVEN', + name: 'Personopplysningsloven', + fullName: 'Personopplysningsloven (Norwegen)', + type: 'national_law', + expected: 18, + description: 'Norwegisches Datenschutzgesetz als EWR-Umsetzung der DSGVO. Reguliert durch Datatilsynet.', + relevantFor: ['Unternehmen in Norwegen', 'EWR-Dienstleister', 'Skandinavische Maerkte'], + keyTopics: ['Datatilsynet', 'EWR-Datenschutz', 'Nationale Ergaenzung', 'Kameras'], + effectiveDate: '20. Juli 2018' + }, + { + code: 'SE_DATASKYDDSLAG', + name: 'Dataskyddslag Schweden', + fullName: 'Dataskyddslag (2018:218) Schweden', + type: 'national_law', + expected: 30, + description: 'Schwedisches Datenschutzgesetz als ergaenzende Bestimmungen zur DSGVO. Reguliert durch IMY.', + relevantFor: ['Unternehmen in Schweden', 'Skandinavische Maerkte'], + keyTopics: ['IMY', 'Personnummer', 'Forschungsdaten', 'Pressefreiheit'], + effectiveDate: '25. Mai 2018' + }, + { + code: 'FI_TIETOSUOJALAKI', + name: 'Tietosuojalaki Finnland', + fullName: 'Tietosuojalaki (1050/2018) Finnland', + type: 'national_law', + expected: 1, + description: 'Finnisches Datenschutzgesetz als nationale Ergaenzung zur DSGVO.', + relevantFor: ['Unternehmen in Finnland', 'Nordische Maerkte'], + keyTopics: ['Nationale Ergaenzung', 'Tietosuojavaltuutettu', 'Forschungsdaten'], + effectiveDate: '1. Januar 2019' + }, + { + code: 'PL_UODO', + name: 'UODO Polen', + fullName: 'Ustawa o ochronie danych osobowych (Polen)', + type: 'national_law', + expected: 1, + description: 'Polnisches Datenschutzgesetz als DSGVO-Umsetzung. Reguliert durch den UODO (Praesident des Amtes fuer den Schutz personenbezogener Daten).', + relevantFor: ['Unternehmen in Polen', 'Osteuropaeische Maerkte'], + keyTopics: ['UODO', 'Nationale Ergaenzung', 'Strafbestimmungen', 'Oeffentlicher Sektor'], + effectiveDate: '25. Mai 2018' + }, + { + code: 'CZ_ZOU', + name: 'Zakon Tschechien', + fullName: 'Zakon o zpracovani osobnich udaju (Tschechien)', + type: 'national_law', + expected: 135, + description: 'Tschechisches Datenschutzgesetz zur DSGVO-Umsetzung. Reguliert durch das UOOU.', + relevantFor: ['Unternehmen in Tschechien', 'Mitteleuropaeische Maerkte'], + keyTopics: ['UOOU', 'Nationale Ergaenzung', 'Kamerasysteme', 'Strafbestimmungen'], + effectiveDate: '24. April 2019' + }, + { + code: 'HU_INFOTV', + name: 'Infotv. Ungarn', + fullName: 'Informacios torvenye (Infotv.) Ungarn', + type: 'national_law', + expected: 156, + description: 'Ungarisches Informationsgesetz ueber Selbstbestimmung und Informationsfreiheit als DSGVO-Ergaenzung. Reguliert durch NAIH.', + relevantFor: ['Unternehmen in Ungarn', 'Mitteleuropaeische Maerkte'], + keyTopics: ['NAIH', 'Informationsfreiheit', 'Nationale Ergaenzung', 'Datensicherheit'], + effectiveDate: '2018 (aktualisiert)' + }, + { + code: 'SCC_FULL_TEXT', + name: 'SCC Volltext', + fullName: 'Standardvertragsklauseln Volltext (2021/914/EU)', + type: 'eu_regulation', + expected: 154, + description: 'Vollstaendiger Text der EU-Standardvertragsklauseln fuer internationale Datentransfers. Alle Module (C2C, C2P, P2C, P2P) mit Annexen.', + relevantFor: ['Alle mit Drittlandtransfers', 'Cloud-Nutzer', 'Auftragsverarbeiter'], + keyTopics: ['Module 1-4', 'TIA', 'Annexe', 'Technische Massnahmen'], + effectiveDate: '27. Juni 2021' + }, + { + code: 'EDPB_GUIDELINES_2_2019', + name: 'EDPB GL Art. 6(1)(b)', + fullName: 'EDPB Leitlinien 2/2019 zu Art. 6(1)(b) DSGVO', + type: 'eu_guideline', + expected: 3, + description: 'EDPB-Leitlinien zur Verarbeitung personenbezogener Daten auf Grundlage der Vertragserfullung gemaess Art. 6 Abs. 1 lit. b DSGVO.', + relevantFor: ['Alle Verantwortlichen', 'Vertragsdatenverarbeitung', 'Online-Dienste'], + keyTopics: ['Vertragserfullung', 'Art. 6(1)(b)', 'Erforderlichkeit', 'Online-Dienste'], + effectiveDate: '2019' + }, + { + code: 'EDPB_GUIDELINES_3_2019', + name: 'EDPB GL Videoueberwachung', + fullName: 'EDPB Leitlinien 3/2019 Videoueberwachung', + type: 'eu_guideline', + expected: 3, + description: 'EDPB-Leitlinien zur Verarbeitung personenbezogener Daten durch Videoueberwachungsgeraete.', + relevantFor: ['Videoueberwachung', 'Sicherheitsdienste', 'Einzelhandel', 'Oeffentliche Stellen'], + keyTopics: ['Videoueberwachung', 'Kameras', 'Speicherfristen', 'Hinweisschilder'], + effectiveDate: '2020' + }, + { + code: 'EDPB_GUIDELINES_5_2020', + name: 'EDPB GL Einwilligung', + fullName: 'EDPB Leitlinien 5/2020 zur Einwilligung', + type: 'eu_guideline', + expected: 2, + description: 'EDPB-Leitlinien zur Einwilligung gemaess DSGVO. Klaert Anforderungen an gueltige Einwilligungen, Widerruf und Cookie-Consent.', + relevantFor: ['Website-Betreiber', 'Marketing', 'App-Entwickler', 'Consent-Management'], + keyTopics: ['Einwilligung', 'Cookie-Consent', 'Widerruf', 'Freiwilligkeit'], + effectiveDate: '2020' + }, + { + code: 'EDPB_GUIDELINES_7_2020', + name: 'EDPB GL Controller/Processor', + fullName: 'EDPB Leitlinien 7/2020 Controller und Processor', + type: 'eu_guideline', + expected: 2, + description: 'EDPB-Leitlinien zu den Begriffen Verantwortlicher und Auftragsverarbeiter. Klaert Rollen, Pflichten und Joint Controllership.', + relevantFor: ['Alle Verantwortlichen', 'Auftragsverarbeiter', 'Joint Controller'], + keyTopics: ['Verantwortlicher', 'Auftragsverarbeiter', 'Joint Controller', 'AVV'], + effectiveDate: '2021' + }, ] +// License info for each regulation +const REGULATION_LICENSES: Record = { + GDPR: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk der EU — frei verwendbar' }, + EPRIVACY: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + TDDDG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Deutsches Bundesgesetz — amtliches Werk (§5 UrhG)' }, + SCC: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Durchfuehrungsbeschluss — amtliches Werk' }, + DPF: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Angemessenheitsbeschluss — amtliches Werk' }, + AIACT: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + CRA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + NIS2: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + EUCSA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + DATAACT: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + DGA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + DSA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + EAA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + DSM: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + PLD: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + GPSR: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + 'BSI-TR-03161-1': { license: 'DL-DE-BY-2.0', licenseNote: 'Datenlizenz Deutschland — Namensnennung 2.0' }, + 'BSI-TR-03161-2': { license: 'DL-DE-BY-2.0', licenseNote: 'Datenlizenz Deutschland — Namensnennung 2.0' }, + 'BSI-TR-03161-3': { license: 'DL-DE-BY-2.0', licenseNote: 'Datenlizenz Deutschland — Namensnennung 2.0' }, + DORA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + PSD2: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Richtlinie — amtliches Werk' }, + AMLR: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + MiCA: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + EHDS: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Verordnung — amtliches Werk' }, + // National Data Protection Laws + AT_DSG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Oesterreich — frei verwendbar' }, + BDSG_FULL: { license: 'PUBLIC_DOMAIN', licenseNote: 'Deutsches Bundesgesetz — amtliches Werk (§5 UrhG)' }, + CH_DSG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Schweiz — frei verwendbar' }, + LI_DSG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Liechtenstein — frei verwendbar' }, + BE_DPA_LAW: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Belgien — frei verwendbar' }, + NL_UAVG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Niederlande — frei verwendbar' }, + FR_CNIL_GUIDE: { license: 'PUBLIC_DOMAIN', licenseNote: 'CNIL — oeffentliches Dokument' }, + ES_LOPDGDD: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Spanien (BOE) — frei verwendbar' }, + IT_CODICE_PRIVACY: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Italien — frei verwendbar' }, + IE_DPA_2018: { license: 'OGL-3.0', licenseNote: 'Open Government Licence v3.0 — Ireland' }, + UK_DPA_2018: { license: 'OGL-3.0', licenseNote: 'Open Government Licence v3.0 — UK' }, + UK_GDPR: { license: 'OGL-3.0', licenseNote: 'Open Government Licence v3.0 — UK' }, + NO_PERSONOPPLYSNINGSLOVEN: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Norwegen — frei verwendbar' }, + SE_DATASKYDDSLAG: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Schweden — frei verwendbar' }, + FI_TIETOSUOJALAKI: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Finnland — frei verwendbar' }, + PL_UODO: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Polen — frei verwendbar' }, + CZ_ZOU: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Tschechien — frei verwendbar' }, + HU_INFOTV: { license: 'PUBLIC_DOMAIN', licenseNote: 'Amtliches Werk Ungarn — frei verwendbar' }, + SCC_FULL_TEXT: { license: 'PUBLIC_DOMAIN', licenseNote: 'EU-Durchfuehrungsbeschluss — amtliches Werk' }, + EDPB_GUIDELINES_2_2019: { license: 'EDPB-LICENSE', licenseNote: 'EDPB Document License' }, + EDPB_GUIDELINES_3_2019: { license: 'EDPB-LICENSE', licenseNote: 'EDPB Document License' }, + EDPB_GUIDELINES_5_2020: { license: 'EDPB-LICENSE', licenseNote: 'EDPB Document License' }, + EDPB_GUIDELINES_7_2020: { license: 'EDPB-LICENSE', licenseNote: 'EDPB Document License' }, +} + +// License display labels +const LICENSE_LABELS: Record = { + PUBLIC_DOMAIN: 'Public Domain', + 'DL-DE-BY-2.0': 'DL-DE-BY 2.0', + 'CC-BY-4.0': 'CC BY 4.0', + 'EDPB-LICENSE': 'EDPB License', + 'OGL-3.0': 'OGL v3.0', + PROPRIETARY: 'Proprietaer', +} + const TYPE_COLORS: Record = { eu_regulation: 'bg-blue-100 text-blue-700', eu_directive: 'bg-purple-100 text-purple-700', de_law: 'bg-yellow-100 text-yellow-700', bsi_standard: 'bg-green-100 text-green-700', + national_law: 'bg-orange-100 text-orange-700', + eu_guideline: 'bg-teal-100 text-teal-700', } const TYPE_LABELS: Record = { @@ -380,6 +725,8 @@ const TYPE_LABELS: Record = { eu_directive: 'EU-RL', de_law: 'DE-Gesetz', bsi_standard: 'BSI', + national_law: 'Nat. Gesetz', + eu_guideline: 'EDPB-GL', } // Industry/Sector definitions for the regulation map @@ -693,6 +1040,13 @@ export default function RAGPage() { const [autoRefresh, setAutoRefresh] = useState(true) const [elapsedTime, setElapsedTime] = useState('') + // DSFA corpus state + const [dsfaSources, setDsfaSources] = useState([]) + const [dsfaStatus, setDsfaStatus] = useState(null) + const [dsfaLoading, setDsfaLoading] = useState(false) + const [regulationCategory, setRegulationCategory] = useState('regulations') + const [expandedDsfaSource, setExpandedDsfaSource] = useState(null) + // Data tab state const [customDocuments, setCustomDocuments] = useState([]) const [uploadFile, setUploadFile] = useState(null) @@ -734,6 +1088,28 @@ export default function RAGPage() { } }, []) + const fetchDsfaStatus = useCallback(async () => { + setDsfaLoading(true) + try { + const [statusRes, sourcesRes] = await Promise.all([ + fetch(`${DSFA_API_PROXY}?action=status`), + fetch(`${DSFA_API_PROXY}?action=sources`), + ]) + if (statusRes.ok) { + const data = await statusRes.json() + setDsfaStatus(data) + } + if (sourcesRes.ok) { + const data = await sourcesRes.json() + setDsfaSources(data.sources || data || []) + } + } catch (error) { + console.error('Failed to fetch DSFA status:', error) + } finally { + setDsfaLoading(false) + } + }, []) + const fetchCustomDocuments = useCallback(async () => { try { const res = await fetch(`${API_PROXY}?action=custom-documents`) @@ -848,7 +1224,8 @@ export default function RAGPage() { useEffect(() => { fetchStatus() - }, [fetchStatus]) + fetchDsfaStatus() + }, [fetchStatus, fetchDsfaStatus]) useEffect(() => { if (activeTab === 'pipeline') { @@ -1023,47 +1400,45 @@ export default function RAGPage() { {/* Page Purpose */} {/* AI Module Sidebar - Desktop: Fixed, Mobile: FAB + Drawer */} - {/* Stats Cards */} + {/* RAG Collections Stats */}
-

Regulierungen

-

{REGULATIONS.length}

+

Legal Corpus

+

{loading ? '-' : getTotalChunks().toLocaleString()}

+

Chunks · {REGULATIONS.length} Regulierungen

-

Chunks Total

-

{loading ? '-' : getTotalChunks().toLocaleString()}

+

DSFA Corpus

+

{dsfaLoading ? '-' : (dsfaStatus?.total_chunks || 0).toLocaleString()}

+

Chunks · {dsfaSources.length || '~70'} Quellen

-

Vector Size

-

{collectionStatus?.vectorSize || 1024}

+

NiBiS EH

+

28.662

+

Chunks · Bildungs-Erwartungshorizonte

-
-

Status

-

- {collectionStatus?.status === 'green' ? '✓ Ready' : loading ? '-' : collectionStatus?.status || 'N/A'} -

+
+

Legal Templates

+

824

+

Chunks · Dokumentvorlagen

@@ -1088,6 +1463,39 @@ export default function RAGPage() { {/* Tab Content */} {activeTab === 'overview' && (
+ {/* RAG Categories Overview */} +
+

RAG-Kategorien

+
+ + +
+

NiBiS EH

+

28.662

+

Chunks · Bildungs-Erwartungshorizonte

+
+
+

Legal Templates

+

824

+

Chunks · Dokumentvorlagen (VVT, TOM, DSFA)

+
+
+
+ {/* Quick Stats per Type */}
{Object.entries(TYPE_LABELS).map(([type, label]) => { @@ -1134,6 +1542,53 @@ export default function RAGPage() { )} {activeTab === 'regulations' && ( +
+ {/* Category Filter */} +
+ + + + +
+ + {/* Regulations Table (existing) */} + {regulationCategory === 'regulations' && (

Alle {REGULATIONS.length} Regulierungen

@@ -1224,7 +1679,17 @@ export default function RAGPage() {
- In Kraft seit: {reg.effectiveDate} +
+ In Kraft seit: {reg.effectiveDate} + {REGULATION_LICENSES[reg.code] && ( + + + {LICENSE_LABELS[REGULATION_LICENSES[reg.code].license] || REGULATION_LICENSES[reg.code].license} + + {REGULATION_LICENSES[reg.code].licenseNote} + + )} +
+ )} + + {/* DSFA Sources */} + {regulationCategory === 'dsfa' && ( +
+
+
+

DSFA Quellen ({dsfaSources.length || '~70'})

+

WP248, DSK Kurzpapiere, Muss-Listen, nationale Datenschutzgesetze

+
+ +
+ {dsfaLoading ? ( +
Lade DSFA-Quellen...
+ ) : dsfaSources.length === 0 ? ( +
+

Keine DSFA-Quellen vom Backend geladen.

+

Endpunkt: /api/dsfa-corpus?action=sources

+
+ ) : ( +
+ {dsfaSources.map((source) => { + const isExpanded = expandedDsfaSource === source.source_code + const typeColors: Record = { + regulation: 'bg-blue-100 text-blue-700', + legislation: 'bg-indigo-100 text-indigo-700', + guideline: 'bg-teal-100 text-teal-700', + checklist: 'bg-yellow-100 text-yellow-700', + standard: 'bg-green-100 text-green-700', + methodology: 'bg-purple-100 text-purple-700', + specification: 'bg-orange-100 text-orange-700', + catalog: 'bg-pink-100 text-pink-700', + guidance: 'bg-cyan-100 text-cyan-700', + } + return ( + +
setExpandedDsfaSource(isExpanded ? null : source.source_code)} + className="px-4 py-3 hover:bg-slate-50 cursor-pointer transition-colors flex items-center justify-between" + > +
+ + {source.source_code} + + {source.document_type} + + {source.name} +
+
+ + {source.language} + + {source.chunk_count != null && ( + {source.chunk_count} Chunks + )} +
+
+ {isExpanded && ( +
+
+
+

{source.full_name || source.name}

+ {source.organization && ( +

Organisation: {source.organization}

+ )} +
+
+ + + {LICENSE_LABELS[source.license_code] || source.license_code} + + {source.attribution_text} + +
+ {source.source_url && ( + + )} +
+
+ )} +
+ ) + })} +
+ )} +
+ )} + + {/* NiBiS Dokumente (info only) */} + {regulationCategory === 'nibis' && ( +
+
+
📚
+
+

NiBiS Erwartungshorizonte

+

Collection: bp_nibis_eh

+
+
+
+
+

Chunks

+

28.662

+
+
+

Vector Size

+

1024

+
+
+

Typ

+

BGE-M3

+
+
+

+ Bildungsinhalte aus dem Niedersaechsischen Bildungsserver (NiBiS). Enthaelt Erwartungshorizonte fuer + verschiedene Faecher und Schulformen. Wird ueber die Klausur-Korrektur fuer EH-Matching genutzt. + Diese Daten sind nicht direkt compliance-relevant. +

+
+ )} + + {/* Templates (info only) */} + {regulationCategory === 'templates' && ( +
+
+
📋
+
+

Legal Templates & Vorlagen

+

Collection: bp_legal_templates

+
+
+
+
+

Chunks

+

824

+
+
+

Vector Size

+

1024

+
+
+

Typ

+

BGE-M3

+
+
+

+ Vorlagen fuer VVT (Verzeichnis von Verarbeitungstaetigkeiten), TOM (Technisch-Organisatorische Massnahmen), + DSFA-Berichte und weitere Compliance-Dokumente. Werden vom AI Compliance SDK fuer die Dokumentgenerierung genutzt. +

+
+ )} +
)} {activeTab === 'map' && ( diff --git a/klausur-service/backend/dsfa_corpus_ingestion.py b/klausur-service/backend/dsfa_corpus_ingestion.py index f634951..a7fbc06 100644 --- a/klausur-service/backend/dsfa_corpus_ingestion.py +++ b/klausur-service/backend/dsfa_corpus_ingestion.py @@ -766,6 +766,326 @@ DSFA_SOURCES = [ "document_type": "standard", "language": "de" }, + + # === EDPB Ergaenzende Leitlinien === + # MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10) + { + "source_code": "EDPB_GUIDELINES_2_2019", + "name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)", + "full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO", + "organization": "European Data Protection Board", + "source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en", + "license_code": "EDPB-LICENSE", + "attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board", + "document_type": "guideline", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "EDPB_GUIDELINES_3_2019", + "name": "EDPB Leitlinien 3/2019 Videoueberwachung", + "full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung", + "organization": "European Data Protection Board", + "source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en", + "license_code": "EDPB-LICENSE", + "attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board", + "document_type": "guideline", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "EDPB_GUIDELINES_5_2020", + "name": "EDPB Leitlinien 5/2020 Einwilligung", + "full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679", + "organization": "European Data Protection Board", + "source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en", + "license_code": "EDPB-LICENSE", + "attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board", + "document_type": "guideline", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "EDPB_GUIDELINES_7_2020", + "name": "EDPB Leitlinien 7/2020 Controller/Processor", + "full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter", + "organization": "European Data Protection Board", + "source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en", + "license_code": "EDPB-LICENSE", + "attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board", + "document_type": "guideline", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "EDPB_GUIDELINES_1_2022", + "name": "EDPB Leitlinien 1/2022 Bussgelder", + "full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO", + "organization": "European Data Protection Board", + "source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en", + "license_code": "EDPB-LICENSE", + "attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board", + "document_type": "guideline", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "SCC_FULL_TEXT", + "name": "Standard Contractual Clauses Volltext", + "full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)", + "organization": "Europaeische Kommission", + "source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)", + "document_type": "regulation", + "language": "de", + "migrated_to": "bp_legal_corpus" + }, + + # === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) === + # MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10) + # These sources are kept here for reference but will be skipped during ingestion. + # Ingestion should target bp_legal_corpus for these source codes. + { + "source_code": "BDSG_FULL", + "name": "BDSG Volltext (Deutschland)", + "full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile", + "organization": "Bundesrepublik Deutschland", + "source_url": "https://www.gesetze-im-internet.de/bdsg_2018/", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)", + "document_type": "legislation", + "language": "de", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "AT_DSG", + "name": "DSG Oesterreich", + "full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)", + "organization": "Republik Oesterreich", + "source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)", + "document_type": "legislation", + "language": "de", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "CH_DSG", + "name": "DSG Schweiz (revDSG 2023)", + "full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023", + "organization": "Schweizerische Eidgenossenschaft", + "source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)", + "document_type": "legislation", + "language": "de", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "LI_DSG", + "name": "DSG Liechtenstein", + "full_name": "Datenschutzgesetz (DSG) Liechtenstein", + "organization": "Fuerstentum Liechtenstein", + "source_url": "https://www.gesetze.li/konso/2018.272", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)", + "document_type": "legislation", + "language": "de", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "FR_CNIL_GUIDE", + "name": "CNIL Guide RGPD", + "full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes", + "organization": "CNIL (France)", + "source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes", + "document_type": "guideline", + "language": "fr", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "ES_LOPDGDD", + "name": "LOPDGDD Spanien", + "full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales", + "organization": "Reino de Espana", + "source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)", + "document_type": "legislation", + "language": "es", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "IT_CODICE_PRIVACY", + "name": "Codice Privacy Italien", + "full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)", + "organization": "Repubblica Italiana", + "source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali", + "document_type": "legislation", + "language": "it", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "NL_UAVG", + "name": "UAVG Niederlande", + "full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)", + "organization": "Koninkrijk der Nederlanden", + "source_url": "https://wetten.overheid.nl/BWBR0040940/", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)", + "document_type": "legislation", + "language": "nl", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "BE_DPA_LAW", + "name": "Datenschutzgesetz Belgien", + "full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel", + "organization": "Royaume de Belgique", + "source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)", + "document_type": "legislation", + "language": "fr", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "LU_DPA_LAW", + "name": "Datenschutzgesetz Luxemburg", + "full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees", + "organization": "Grand-Duche de Luxembourg", + "source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)", + "document_type": "legislation", + "language": "fr", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "IE_DPA_2018", + "name": "Data Protection Act 2018 Ireland", + "full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland", + "organization": "Government of Ireland", + "source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html", + "license_code": "OGL-3.0", + "attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland", + "document_type": "legislation", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "UK_DPA_2018", + "name": "Data Protection Act 2018 UK", + "full_name": "Data Protection Act 2018 (c. 12) - United Kingdom", + "organization": "Government of the United Kingdom", + "source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted", + "license_code": "OGL-3.0", + "attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK", + "document_type": "legislation", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "UK_GDPR", + "name": "UK GDPR (retained EU law)", + "full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law", + "organization": "Government of the United Kingdom", + "source_url": "https://www.legislation.gov.uk/eur/2016/679/contents", + "license_code": "OGL-3.0", + "attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk", + "document_type": "legislation", + "language": "en", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "NO_PERSONOPPLYSNINGSLOVEN", + "name": "Personopplysningsloven Norwegen", + "full_name": "Lov om behandling av personopplysninger (personopplysningsloven)", + "organization": "Kongeriket Norge", + "source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)", + "document_type": "legislation", + "language": "no", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "SE_DATASKYDDSLAG", + "name": "Dataskyddslag Schweden", + "full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning", + "organization": "Konungariket Sverige", + "source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)", + "document_type": "legislation", + "language": "sv", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "DK_DATABESKYTTELSESLOVEN", + "name": "Databeskyttelsesloven Daenemark", + "full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger", + "organization": "Kongeriget Danmark", + "source_url": "https://www.retsinformation.dk/eli/lta/2018/502", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)", + "document_type": "legislation", + "language": "da", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "FI_TIETOSUOJALAKI", + "name": "Tietosuojalaki Finnland", + "full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland", + "organization": "Suomen tasavalta", + "source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)", + "document_type": "legislation", + "language": "fi", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "PL_UODO", + "name": "UODO Polen", + "full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen", + "organization": "Rzeczpospolita Polska", + "source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)", + "document_type": "legislation", + "language": "pl", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "CZ_ZOU", + "name": "Zakon o ochrane osobnich udaju Tschechien", + "full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju", + "organization": "Ceska republika", + "source_url": "https://www.zakonyprolidi.cz/cs/2019-110", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)", + "document_type": "legislation", + "language": "cs", + "migrated_to": "bp_legal_corpus" + }, + { + "source_code": "HU_INFOTV", + "name": "Informacios torvenye Ungarn", + "full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)", + "organization": "Magyarorszag", + "source_url": "https://njt.hu/jogszabaly/2011-112-00-00", + "license_code": "PUBLIC_DOMAIN", + "attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)", + "document_type": "legislation", + "language": "hu", + "migrated_to": "bp_legal_corpus" + }, ] @@ -1100,7 +1420,7 @@ class DSFAQdrantService: @property def client(self) -> QdrantClient: if self._client is None: - self._client = QdrantClient(url=self.url) + self._client = QdrantClient(url=self.url, check_compatibility=False) return self._client async def ensure_collection(self) -> bool: @@ -1408,14 +1728,21 @@ async def init_dsfa_tables(pool: asyncpg.Pool): async def register_all_sources(pool: asyncpg.Pool): - """Register all DSFA sources in the database.""" + """Register all DSFA sources in the database (skips migrated sources).""" store = DSFACorpusStore(pool) + registered = 0 + skipped = 0 for source in DSFA_SOURCES: + if source.get("migrated_to"): + print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}") + skipped += 1 + continue source_id = await store.register_source(source) print(f"Registered source: {source['source_code']} -> {source_id}") + registered += 1 - print(f"\nTotal sources registered: {len(DSFA_SOURCES)}") + print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)") async def get_ingestion_status(pool: asyncpg.Pool): diff --git a/klausur-service/backend/migrate_rag_chunks.py b/klausur-service/backend/migrate_rag_chunks.py new file mode 100644 index 0000000..c9f76af --- /dev/null +++ b/klausur-service/backend/migrate_rag_chunks.py @@ -0,0 +1,307 @@ +""" +RAG Chunk Migration: bp_dsfa_corpus -> bp_legal_corpus + +Verschiebt nationale Datenschutzgesetze und EU-Dokumente aus bp_dsfa_corpus +nach bp_legal_corpus. Vektoren werden 1:1 uebernommen (kein Re-Embedding). + +Usage: + python migrate_rag_chunks.py # Dry run (default) + python migrate_rag_chunks.py --execute # Actually migrate + python migrate_rag_chunks.py --verify # Verify after migration +""" + +import os +import sys +import argparse +from datetime import datetime, timezone +from typing import List, Dict, Any + +from qdrant_client import QdrantClient +from qdrant_client.models import ( + PointStruct, Filter, FieldCondition, MatchAny, ScrollRequest +) + +# Configuration +QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333") +SOURCE_COLLECTION = "bp_dsfa_corpus" +TARGET_COLLECTION = "bp_legal_corpus" + +# Source codes to migrate from bp_dsfa_corpus -> bp_legal_corpus +SOURCES_TO_MIGRATE = [ + # Nationale Datenschutzgesetze + "AT_DSG", + "BDSG_FULL", + "BE_DPA_LAW", + "CH_DSG", + "CZ_ZOU", + "ES_LOPDGDD", + "FI_TIETOSUOJALAKI", + "FR_CNIL_GUIDE", + "HU_INFOTV", + "IE_DPA_2018", + "IT_CODICE_PRIVACY", + "LI_DSG", + "NL_UAVG", + "NO_PERSONOPPLYSNINGSLOVEN", + "PL_UODO", + "SE_DATASKYDDSLAG", + "UK_DPA_2018", + "UK_GDPR", + # EU-Dokumente + "SCC_FULL_TEXT", + "EDPB_GUIDELINES_2_2019", + "EDPB_GUIDELINES_3_2019", + "EDPB_GUIDELINES_5_2020", + "EDPB_GUIDELINES_7_2020", +] + +# Mapping: source_code -> regulation_type for bp_legal_corpus +REGULATION_TYPE_MAP = { + "AT_DSG": "national_law", + "BDSG_FULL": "de_law", + "BE_DPA_LAW": "national_law", + "CH_DSG": "national_law", + "CZ_ZOU": "national_law", + "ES_LOPDGDD": "national_law", + "FI_TIETOSUOJALAKI": "national_law", + "FR_CNIL_GUIDE": "national_law", + "HU_INFOTV": "national_law", + "IE_DPA_2018": "national_law", + "IT_CODICE_PRIVACY": "national_law", + "LI_DSG": "national_law", + "NL_UAVG": "national_law", + "NO_PERSONOPPLYSNINGSLOVEN": "national_law", + "PL_UODO": "national_law", + "SE_DATASKYDDSLAG": "national_law", + "UK_DPA_2018": "national_law", + "UK_GDPR": "national_law", + "SCC_FULL_TEXT": "eu_regulation", + "EDPB_GUIDELINES_2_2019": "eu_guideline", + "EDPB_GUIDELINES_3_2019": "eu_guideline", + "EDPB_GUIDELINES_5_2020": "eu_guideline", + "EDPB_GUIDELINES_7_2020": "eu_guideline", +} + +# Mapping: source_code -> regulation_name for bp_legal_corpus +REGULATION_NAME_MAP = { + "AT_DSG": "DSG Oesterreich", + "BDSG_FULL": "BDSG", + "BE_DPA_LAW": "Datenschutzgesetz Belgien", + "CH_DSG": "DSG Schweiz", + "CZ_ZOU": "Zakon Tschechien", + "ES_LOPDGDD": "LOPDGDD Spanien", + "FI_TIETOSUOJALAKI": "Tietosuojalaki Finnland", + "FR_CNIL_GUIDE": "CNIL Guide RGPD", + "HU_INFOTV": "Infotv. Ungarn", + "IE_DPA_2018": "DPA 2018 Ireland", + "IT_CODICE_PRIVACY": "Codice Privacy Italien", + "LI_DSG": "DSG Liechtenstein", + "NL_UAVG": "UAVG Niederlande", + "NO_PERSONOPPLYSNINGSLOVEN": "Personopplysningsloven", + "PL_UODO": "UODO Polen", + "SE_DATASKYDDSLAG": "Dataskyddslag Schweden", + "UK_DPA_2018": "DPA 2018 UK", + "UK_GDPR": "UK GDPR", + "SCC_FULL_TEXT": "Standardvertragsklauseln", + "EDPB_GUIDELINES_2_2019": "EDPB GL 2/2019", + "EDPB_GUIDELINES_3_2019": "EDPB GL 3/2019", + "EDPB_GUIDELINES_5_2020": "EDPB GL 5/2020", + "EDPB_GUIDELINES_7_2020": "EDPB GL 7/2020", +} + + +def transform_payload(dsfa_payload: Dict[str, Any]) -> Dict[str, Any]: + """Transform bp_dsfa_corpus payload to bp_legal_corpus format.""" + source_code = dsfa_payload.get("source_code", "") + now = datetime.now(timezone.utc).isoformat() + + return { + "text": dsfa_payload.get("content", ""), + "regulation_code": source_code, + "regulation_name": REGULATION_NAME_MAP.get(source_code, dsfa_payload.get("source_name", "")), + "regulation_full_name": dsfa_payload.get("source_name", ""), + "regulation_type": REGULATION_TYPE_MAP.get(source_code, "national_law"), + "source_url": dsfa_payload.get("source_url", ""), + "chunk_index": dsfa_payload.get("chunk_index", 0), + "chunk_position": dsfa_payload.get("chunk_position", 0), + "article": dsfa_payload.get("article", None), + "paragraph": dsfa_payload.get("paragraph", None), + "language": dsfa_payload.get("language", "de"), + "indexed_at": now, + "training_allowed": False, + } + + +def scroll_all_points(client: QdrantClient, collection: str, source_codes: List[str]) -> List: + """Scroll through all points matching the source codes.""" + all_points = [] + offset = None + batch_size = 100 + + scroll_filter = Filter( + must=[ + FieldCondition( + key="source_code", + match=MatchAny(any=source_codes), + ) + ] + ) + + while True: + results, next_offset = client.scroll( + collection_name=collection, + scroll_filter=scroll_filter, + limit=batch_size, + offset=offset, + with_vectors=True, + with_payload=True, + ) + + all_points.extend(results) + + if next_offset is None: + break + offset = next_offset + + return all_points + + +def migrate(execute: bool = False): + """Run the migration.""" + print(f"{'=' * 60}") + print(f"RAG Chunk Migration: {SOURCE_COLLECTION} -> {TARGET_COLLECTION}") + print(f"Mode: {'EXECUTE' if execute else 'DRY RUN'}") + print(f"{'=' * 60}") + print() + + client = QdrantClient(url=QDRANT_URL) + + # Get initial counts + source_info = client.get_collection(SOURCE_COLLECTION) + target_info = client.get_collection(TARGET_COLLECTION) + print(f"Before migration:") + print(f" {SOURCE_COLLECTION}: {source_info.points_count} points") + print(f" {TARGET_COLLECTION}: {target_info.points_count} points") + print() + + # Scroll all points to migrate + print(f"Scrolling points for {len(SOURCES_TO_MIGRATE)} source codes...") + points = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE) + print(f" Found {len(points)} points to migrate") + print() + + if not points: + print("No points found to migrate. Exiting.") + return + + # Group by source_code for reporting + by_source: Dict[str, int] = {} + for p in points: + sc = p.payload.get("source_code", "UNKNOWN") + by_source[sc] = by_source.get(sc, 0) + 1 + + print("Points per source:") + for sc in sorted(by_source.keys()): + print(f" {sc}: {by_source[sc]} chunks") + print() + + if not execute: + print("DRY RUN complete. Use --execute to actually migrate.") + return + + # Transform and upsert in batches + batch_size = 50 + upserted = 0 + for i in range(0, len(points), batch_size): + batch = points[i:i + batch_size] + new_points = [] + for p in batch: + new_payload = transform_payload(p.payload) + new_points.append(PointStruct( + id=p.id, + vector=p.vector, + payload=new_payload, + )) + + client.upsert( + collection_name=TARGET_COLLECTION, + points=new_points, + ) + upserted += len(new_points) + print(f" Upserted {upserted}/{len(points)} points...") + + print(f"\nUpsert complete: {upserted} points added to {TARGET_COLLECTION}") + + # Delete from source collection + point_ids = [p.id for p in points] + for i in range(0, len(point_ids), 100): + batch_ids = point_ids[i:i + 100] + client.delete( + collection_name=SOURCE_COLLECTION, + points_selector=batch_ids, + ) + print(f" Deleted {min(i + 100, len(point_ids))}/{len(point_ids)} from {SOURCE_COLLECTION}...") + + print(f"\nDelete complete: {len(point_ids)} points removed from {SOURCE_COLLECTION}") + + # Final counts + source_info = client.get_collection(SOURCE_COLLECTION) + target_info = client.get_collection(TARGET_COLLECTION) + print(f"\nAfter migration:") + print(f" {SOURCE_COLLECTION}: {source_info.points_count} points") + print(f" {TARGET_COLLECTION}: {target_info.points_count} points") + print(f"\nMigration complete!") + + +def verify(): + """Verify migration results.""" + print(f"Verifying migration...") + client = QdrantClient(url=QDRANT_URL) + + source_info = client.get_collection(SOURCE_COLLECTION) + target_info = client.get_collection(TARGET_COLLECTION) + print(f" {SOURCE_COLLECTION}: {source_info.points_count} points") + print(f" {TARGET_COLLECTION}: {target_info.points_count} points") + + # Check that migrated sources are gone from dsfa + remaining = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE) + if remaining: + print(f"\n WARNING: {len(remaining)} points still in {SOURCE_COLLECTION}!") + by_source: Dict[str, int] = {} + for p in remaining: + sc = p.payload.get("source_code", "UNKNOWN") + by_source[sc] = by_source.get(sc, 0) + 1 + for sc, cnt in sorted(by_source.items()): + print(f" {sc}: {cnt}") + else: + print(f"\n OK: No migrated sources remaining in {SOURCE_COLLECTION}") + + # Check that migrated sources exist in legal + for code in SOURCES_TO_MIGRATE: + results, _ = client.scroll( + collection_name=TARGET_COLLECTION, + scroll_filter=Filter( + must=[FieldCondition(key="regulation_code", match=MatchAny(any=[code]))] + ), + limit=1, + with_payload=True, + with_vectors=False, + ) + status = f"{len(results)}+ chunks" if results else "MISSING" + print(f" {TARGET_COLLECTION}/{code}: {status}") + + +def main(): + parser = argparse.ArgumentParser(description="Migrate RAG chunks between collections") + parser.add_argument("--execute", action="store_true", help="Actually execute the migration (default: dry run)") + parser.add_argument("--verify", action="store_true", help="Verify migration results") + args = parser.parse_args() + + if args.verify: + verify() + else: + migrate(execute=args.execute) + + +if __name__ == "__main__": + main()