refactor: Consolidate standalone services into admin-v2, add new SDK modules
Remove standalone services (ai-compliance-sdk root, developer-portal, dsms-gateway, dsms-node, night-scheduler) and legacy compliance/dsgvo pages. Add new SDK pipeline modules (academy, document-crawler, dsb-portal, incidents, whistleblower, reporting, sso, multi-tenant, industry-templates). Add drafting engine, legal corpus files (AT/CH/DE), pitch-deck, blog and Förderantrag pages. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
204
klausur-service/backend/country_metadata.py
Normal file
204
klausur-service/backend/country_metadata.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
Country Metadata for EU/EWR Compliance Coverage.
|
||||
|
||||
Provides entry points (DPA URLs, legal portals, languages) for selected EU/EWR countries, plus CH and UK.
The EU entry and the DACH countries have full RAG coverage; others have entry-point metadata for system prompt injection.
|
||||
|
||||
Usage:
|
||||
from country_metadata import COUNTRY_METADATA, get_country_context, DACH_COUNTRIES
|
||||
"""
|
||||
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Per-jurisdiction entry points, keyed by an upper-case two-letter code
# ("EU" is a pseudo-entry for the Union itself).
#
# Every entry carries the same fields:
#   country_name     - display name (German, ASCII-transliterated)
#   language_codes   - list of two-letter language codes
#   legal_portal     - {"name", "url"} of the official legislation portal
#   dpa              - {"name", "url"} of the data protection authority
#   rag_coverage     - "full" or "entry_point"; get_country_context() branches on it
#   priority_modules - compliance module identifiers listed for this entry
COUNTRY_METADATA: Dict[str, dict] = {
    "EU": {
        "country_name": "Europaeische Union",
        "language_codes": ["de", "en", "fr"],
        "legal_portal": {"name": "EUR-Lex", "url": "https://eur-lex.europa.eu/"},
        "dpa": {"name": "European Data Protection Board (EDPB)", "url": "https://edpb.europa.eu/"},
        "rag_coverage": "full",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
    },
    "DE": {
        "country_name": "Deutschland",
        "language_codes": ["de"],
        "legal_portal": {"name": "Gesetze im Internet", "url": "https://www.gesetze-im-internet.de/"},
        "dpa": {"name": "Datenschutzkonferenz (DSK)", "url": "https://www.datenschutzkonferenz-online.de/"},
        "rag_coverage": "full",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
    },
    "AT": {
        "country_name": "Oesterreich",
        "language_codes": ["de"],
        "legal_portal": {"name": "Rechtsinformationssystem (RIS)", "url": "https://www.ris.bka.gv.at/"},
        "dpa": {"name": "Datenschutzbehoerde (DSB)", "url": "https://www.dsb.gv.at/"},
        "rag_coverage": "full",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
    },
    "CH": {
        "country_name": "Schweiz",
        "language_codes": ["de", "fr", "it"],
        "legal_portal": {"name": "Fedlex", "url": "https://www.fedlex.admin.ch/"},
        "dpa": {"name": "EDOEB", "url": "https://www.edoeb.admin.ch/"},
        "rag_coverage": "full",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
    },
    "FR": {
        "country_name": "Frankreich",
        "language_codes": ["fr"],
        "legal_portal": {"name": "Legifrance", "url": "https://www.legifrance.gouv.fr/"},
        "dpa": {"name": "CNIL", "url": "https://www.cnil.fr/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "IT": {
        "country_name": "Italien",
        "language_codes": ["it"],
        "legal_portal": {"name": "Normattiva", "url": "https://www.normattiva.it/"},
        "dpa": {"name": "Garante per la protezione dei dati personali", "url": "https://www.garanteprivacy.it/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "ES": {
        "country_name": "Spanien",
        "language_codes": ["es"],
        "legal_portal": {"name": "BOE", "url": "https://www.boe.es/"},
        "dpa": {"name": "AEPD", "url": "https://www.aepd.es/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "NL": {
        "country_name": "Niederlande",
        "language_codes": ["nl"],
        "legal_portal": {"name": "Overheid.nl", "url": "https://wetten.overheid.nl/"},
        "dpa": {"name": "Autoriteit Persoonsgegevens", "url": "https://www.autoriteitpersoonsgegevens.nl/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "BE": {
        "country_name": "Belgien",
        "language_codes": ["fr", "nl", "de"],
        "legal_portal": {"name": "eJustice Belgium", "url": "https://www.ejustice.just.fgov.be/"},
        "dpa": {"name": "Autorite de protection des donnees (APD)", "url": "https://www.autoriteprotectiondonnees.be/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "LU": {
        "country_name": "Luxemburg",
        "language_codes": ["fr", "de"],
        "legal_portal": {"name": "Legilux", "url": "https://legilux.public.lu/"},
        "dpa": {"name": "CNPD", "url": "https://cnpd.public.lu/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "IE": {
        "country_name": "Irland",
        "language_codes": ["en"],
        "legal_portal": {"name": "Irish Statute Book", "url": "https://www.irishstatutebook.ie/"},
        "dpa": {"name": "Data Protection Commission (DPC)", "url": "https://www.dataprotection.ie/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
    "DK": {
        "country_name": "Daenemark",
        "language_codes": ["da"],
        "legal_portal": {"name": "Retsinformation", "url": "https://www.retsinformation.dk/"},
        "dpa": {"name": "Datatilsynet", "url": "https://www.datatilsynet.dk/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs"],
    },
    "SE": {
        "country_name": "Schweden",
        "language_codes": ["sv"],
        "legal_portal": {"name": "Riksdagen", "url": "https://www.riksdagen.se/"},
        "dpa": {"name": "IMY (Integritetsskyddsmyndigheten)", "url": "https://www.imy.se/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs"],
    },
    "FI": {
        "country_name": "Finnland",
        "language_codes": ["fi", "sv"],
        "legal_portal": {"name": "Finlex", "url": "https://www.finlex.fi/"},
        "dpa": {"name": "Tietosuojavaltuutetun toimisto", "url": "https://tietosuoja.fi/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs"],
    },
    "NO": {
        "country_name": "Norwegen",
        "language_codes": ["no"],
        "legal_portal": {"name": "Lovdata", "url": "https://lovdata.no/"},
        "dpa": {"name": "Datatilsynet", "url": "https://www.datatilsynet.no/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs"],
    },
    "IS": {
        "country_name": "Island",
        "language_codes": ["is"],
        "legal_portal": {"name": "Althingi", "url": "https://www.althingi.is/"},
        "dpa": {"name": "Personuvernd", "url": "https://www.personuvernd.is/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT"],
    },
    "LI": {
        "country_name": "Liechtenstein",
        "language_codes": ["de"],
        "legal_portal": {"name": "Gesetze.li", "url": "https://www.gesetze.li/"},
        "dpa": {"name": "Datenschutzstelle", "url": "https://www.datenschutzstelle.li/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs"],
    },
    "UK": {
        "country_name": "Vereinigtes Koenigreich",
        "language_codes": ["en"],
        "legal_portal": {"name": "Legislation.gov.uk", "url": "https://www.legislation.gov.uk/"},
        "dpa": {"name": "Information Commissioner's Office (ICO)", "url": "https://ico.org.uk/"},
        "rag_coverage": "entry_point",
        "priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
    },
}
|
||||
|
||||
# Codes of the DACH countries (DE, AT, CH). get_all_countries_summary() lists
# exactly these under its "VOLLSTAENDIG (DACH)" heading.
DACH_COUNTRIES = {"DE", "AT", "CH"}
|
||||
|
||||
|
||||
def get_country_context(country_code: Optional[str]) -> Optional[str]:
    """
    Generate a context string for system prompt injection.

    Entries whose ``rag_coverage`` is ``"full"`` get a RAG search hint plus
    their priority compliance modules. All other entries get an entry-point
    hint that refers to the legal portal and the data protection authority.

    Args:
        country_code: Key into COUNTRY_METADATA. Matched case-insensitively;
            surrounding whitespace is ignored.

    Returns:
        The newline-joined context block, or None when the code is missing,
        blank, not a string, or has no entry in COUNTRY_METADATA.
    """
    # Guard before normalising: calling .upper() on None (e.g. an unset
    # profile field) would raise AttributeError instead of yielding "no context".
    if not isinstance(country_code, str):
        return None
    code = country_code.strip().upper()
    if not code:
        return None

    meta = COUNTRY_METADATA.get(code)
    if not meta:
        return None

    portal = meta["legal_portal"]
    dpa = meta["dpa"]
    lines = [
        f"Land: {meta['country_name']} ({code})",
        f"Sprachen: {', '.join(meta['language_codes'])}",
        f"Rechtsportal: {portal['name']} — {portal['url']}",
        f"Datenschutzbehoerde: {dpa['name']} — {dpa['url']}",
    ]

    if meta["rag_coverage"] == "full":
        lines.append("RAG-Abdeckung: VOLLSTAENDIG — Suche in bp_legal_corpus fuer nationale Gesetze verfuegbar.")
        lines.append(f"Compliance-Module: {', '.join(meta['priority_modules'])}")
    else:
        lines.append("RAG-Abdeckung: Einstiegspunkt — Keine nationalen Gesetze im RAG. Verweise auf das Rechtsportal und die Datenschutzbehoerde.")
        lines.append(f"Hinweis: Fuer detaillierte rechtliche Informationen zu {meta['country_name']} bitte das Rechtsportal oder die DPA konsultieren.")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def get_all_countries_summary() -> str:
    """Build the plain-text coverage overview of every listed country.

    The DACH codes come first under their own heading; every remaining entry
    except the "EU" pseudo-entry follows under the entry-point heading. Both
    groups are ordered alphabetically by code.
    """

    def _row(code: str) -> str:
        # One overview line: code, display name and the authority's name.
        entry = COUNTRY_METADATA[code]
        return f" {code}: {entry['country_name']} — DPA: {entry['dpa']['name']}"

    dach_rows = [_row(code) for code in sorted(DACH_COUNTRIES)]
    entry_point_rows = [
        _row(code)
        for code in sorted(COUNTRY_METADATA)
        if code != "EU" and code not in DACH_COUNTRIES
    ]

    return "\n".join(
        [
            "Laenderabdeckung EU/EWR:",
            "",
            "VOLLSTAENDIG (DACH):",
            *dach_rows,
            "",
            "EINSTIEGSPUNKTE:",
            *entry_point_rows,
        ]
    )
|
||||
@@ -766,6 +766,326 @@ DSFA_SOURCES = [
|
||||
"document_type": "standard",
|
||||
"language": "de"
|
||||
},
|
||||
|
||||
# === EDPB Ergaenzende Leitlinien ===
|
||||
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
|
||||
{
|
||||
"source_code": "EDPB_GUIDELINES_2_2019",
|
||||
"name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)",
|
||||
"full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO",
|
||||
"organization": "European Data Protection Board",
|
||||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
|
||||
"license_code": "EDPB-LICENSE",
|
||||
"attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board",
|
||||
"document_type": "guideline",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "EDPB_GUIDELINES_3_2019",
|
||||
"name": "EDPB Leitlinien 3/2019 Videoueberwachung",
|
||||
"full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung",
|
||||
"organization": "European Data Protection Board",
|
||||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
|
||||
"license_code": "EDPB-LICENSE",
|
||||
"attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board",
|
||||
"document_type": "guideline",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "EDPB_GUIDELINES_5_2020",
|
||||
"name": "EDPB Leitlinien 5/2020 Einwilligung",
|
||||
"full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679",
|
||||
"organization": "European Data Protection Board",
|
||||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
|
||||
"license_code": "EDPB-LICENSE",
|
||||
"attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board",
|
||||
"document_type": "guideline",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "EDPB_GUIDELINES_7_2020",
|
||||
"name": "EDPB Leitlinien 7/2020 Controller/Processor",
|
||||
"full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter",
|
||||
"organization": "European Data Protection Board",
|
||||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
|
||||
"license_code": "EDPB-LICENSE",
|
||||
"attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board",
|
||||
"document_type": "guideline",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "EDPB_GUIDELINES_1_2022",
|
||||
"name": "EDPB Leitlinien 1/2022 Bussgelder",
|
||||
"full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
|
||||
"organization": "European Data Protection Board",
|
||||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en",
|
||||
"license_code": "EDPB-LICENSE",
|
||||
"attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board",
|
||||
"document_type": "guideline",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "SCC_FULL_TEXT",
|
||||
"name": "Standard Contractual Clauses Volltext",
|
||||
"full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)",
|
||||
"organization": "Europaeische Kommission",
|
||||
"source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)",
|
||||
"document_type": "regulation",
|
||||
"language": "de",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
|
||||
# === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) ===
|
||||
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
|
||||
# These sources are kept here for reference but will be skipped during ingestion.
|
||||
# Ingestion should target bp_legal_corpus for these source codes.
|
||||
{
|
||||
"source_code": "BDSG_FULL",
|
||||
"name": "BDSG Volltext (Deutschland)",
|
||||
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile",
|
||||
"organization": "Bundesrepublik Deutschland",
|
||||
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)",
|
||||
"document_type": "legislation",
|
||||
"language": "de",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "AT_DSG",
|
||||
"name": "DSG Oesterreich",
|
||||
"full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)",
|
||||
"organization": "Republik Oesterreich",
|
||||
"source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)",
|
||||
"document_type": "legislation",
|
||||
"language": "de",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "CH_DSG",
|
||||
"name": "DSG Schweiz (revDSG 2023)",
|
||||
"full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023",
|
||||
"organization": "Schweizerische Eidgenossenschaft",
|
||||
"source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)",
|
||||
"document_type": "legislation",
|
||||
"language": "de",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "LI_DSG",
|
||||
"name": "DSG Liechtenstein",
|
||||
"full_name": "Datenschutzgesetz (DSG) Liechtenstein",
|
||||
"organization": "Fuerstentum Liechtenstein",
|
||||
"source_url": "https://www.gesetze.li/konso/2018.272",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)",
|
||||
"document_type": "legislation",
|
||||
"language": "de",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "FR_CNIL_GUIDE",
|
||||
"name": "CNIL Guide RGPD",
|
||||
"full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes",
|
||||
"organization": "CNIL (France)",
|
||||
"source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes",
|
||||
"document_type": "guideline",
|
||||
"language": "fr",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "ES_LOPDGDD",
|
||||
"name": "LOPDGDD Spanien",
|
||||
"full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales",
|
||||
"organization": "Reino de Espana",
|
||||
"source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)",
|
||||
"document_type": "legislation",
|
||||
"language": "es",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "IT_CODICE_PRIVACY",
|
||||
"name": "Codice Privacy Italien",
|
||||
"full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)",
|
||||
"organization": "Repubblica Italiana",
|
||||
"source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali",
|
||||
"document_type": "legislation",
|
||||
"language": "it",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "NL_UAVG",
|
||||
"name": "UAVG Niederlande",
|
||||
"full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)",
|
||||
"organization": "Koninkrijk der Nederlanden",
|
||||
"source_url": "https://wetten.overheid.nl/BWBR0040940/",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)",
|
||||
"document_type": "legislation",
|
||||
"language": "nl",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "BE_DPA_LAW",
|
||||
"name": "Datenschutzgesetz Belgien",
|
||||
"full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel",
|
||||
"organization": "Royaume de Belgique",
|
||||
"source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)",
|
||||
"document_type": "legislation",
|
||||
"language": "fr",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "LU_DPA_LAW",
|
||||
"name": "Datenschutzgesetz Luxemburg",
|
||||
"full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees",
|
||||
"organization": "Grand-Duche de Luxembourg",
|
||||
"source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)",
|
||||
"document_type": "legislation",
|
||||
"language": "fr",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "IE_DPA_2018",
|
||||
"name": "Data Protection Act 2018 Ireland",
|
||||
"full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland",
|
||||
"organization": "Government of Ireland",
|
||||
"source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html",
|
||||
"license_code": "OGL-3.0",
|
||||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland",
|
||||
"document_type": "legislation",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "UK_DPA_2018",
|
||||
"name": "Data Protection Act 2018 UK",
|
||||
"full_name": "Data Protection Act 2018 (c. 12) - United Kingdom",
|
||||
"organization": "Government of the United Kingdom",
|
||||
"source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted",
|
||||
"license_code": "OGL-3.0",
|
||||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK",
|
||||
"document_type": "legislation",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "UK_GDPR",
|
||||
"name": "UK GDPR (retained EU law)",
|
||||
"full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law",
|
||||
"organization": "Government of the United Kingdom",
|
||||
"source_url": "https://www.legislation.gov.uk/eur/2016/679/contents",
|
||||
"license_code": "OGL-3.0",
|
||||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk",
|
||||
"document_type": "legislation",
|
||||
"language": "en",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "NO_PERSONOPPLYSNINGSLOVEN",
|
||||
"name": "Personopplysningsloven Norwegen",
|
||||
"full_name": "Lov om behandling av personopplysninger (personopplysningsloven)",
|
||||
"organization": "Kongeriket Norge",
|
||||
"source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)",
|
||||
"document_type": "legislation",
|
||||
"language": "no",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "SE_DATASKYDDSLAG",
|
||||
"name": "Dataskyddslag Schweden",
|
||||
"full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning",
|
||||
"organization": "Konungariket Sverige",
|
||||
"source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)",
|
||||
"document_type": "legislation",
|
||||
"language": "sv",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "DK_DATABESKYTTELSESLOVEN",
|
||||
"name": "Databeskyttelsesloven Daenemark",
|
||||
"full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger",
|
||||
"organization": "Kongeriget Danmark",
|
||||
"source_url": "https://www.retsinformation.dk/eli/lta/2018/502",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)",
|
||||
"document_type": "legislation",
|
||||
"language": "da",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "FI_TIETOSUOJALAKI",
|
||||
"name": "Tietosuojalaki Finnland",
|
||||
"full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland",
|
||||
"organization": "Suomen tasavalta",
|
||||
"source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)",
|
||||
"document_type": "legislation",
|
||||
"language": "fi",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "PL_UODO",
|
||||
"name": "UODO Polen",
|
||||
"full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen",
|
||||
"organization": "Rzeczpospolita Polska",
|
||||
"source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)",
|
||||
"document_type": "legislation",
|
||||
"language": "pl",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "CZ_ZOU",
|
||||
"name": "Zakon o ochrane osobnich udaju Tschechien",
|
||||
"full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju",
|
||||
"organization": "Ceska republika",
|
||||
"source_url": "https://www.zakonyprolidi.cz/cs/2019-110",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)",
|
||||
"document_type": "legislation",
|
||||
"language": "cs",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
{
|
||||
"source_code": "HU_INFOTV",
|
||||
"name": "Informacios torvenye Ungarn",
|
||||
"full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)",
|
||||
"organization": "Magyarorszag",
|
||||
"source_url": "https://njt.hu/jogszabaly/2011-112-00-00",
|
||||
"license_code": "PUBLIC_DOMAIN",
|
||||
"attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)",
|
||||
"document_type": "legislation",
|
||||
"language": "hu",
|
||||
"migrated_to": "bp_legal_corpus"
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -1100,7 +1420,7 @@ class DSFAQdrantService:
|
||||
@property
def client(self) -> QdrantClient:
    """Return the Qdrant client, creating it on first access.

    The instance is stored in ``self._client`` and reused by later calls.
    """
    if self._client is None:
        # Construct exactly once. The earlier variant without
        # check_compatibility built a client that was immediately replaced.
        # NOTE(review): check_compatibility=False presumably disables the
        # client/server version check - confirm against qdrant-client docs.
        self._client = QdrantClient(url=self.url, check_compatibility=False)
    return self._client
|
||||
|
||||
async def ensure_collection(self) -> bool:
|
||||
@@ -1408,14 +1728,21 @@ async def init_dsfa_tables(pool: asyncpg.Pool):
|
||||
|
||||
|
||||
async def register_all_sources(pool: asyncpg.Pool):
    """Register all DSFA sources in the database (skips migrated sources).

    Every entry of DSFA_SOURCES that carries a truthy ``migrated_to`` value is
    reported and skipped; all others are registered through DSFACorpusStore.
    Progress and the final counts are printed to stdout.

    Args:
        pool: Connection pool handed to DSFACorpusStore.
    """
    store = DSFACorpusStore(pool)

    registered = 0
    skipped = 0
    for source in DSFA_SOURCES:
        if source.get("migrated_to"):
            print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}")
            skipped += 1
            continue
        source_id = await store.register_source(source)
        print(f"Registered source: {source['source_code']} -> {source_id}")
        registered += 1

    # Report real counts; len(DSFA_SOURCES) would also count skipped entries.
    print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)")
|
||||
|
||||
|
||||
async def get_ingestion_status(pool: asyncpg.Pool):
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
"""
|
||||
Legal Corpus Ingestion for UCCA RAG Integration.
|
||||
|
||||
Indexes all regulations from the Compliance Hub into Qdrant for
|
||||
semantic search during UCCA assessments and explanations.
|
||||
Includes EU regulations, DACH national laws, and EDPB guidelines.
|
||||
|
||||
Collections:
|
||||
- bp_legal_corpus: All regulation texts (GDPR, AI Act, CRA, BSI, etc.)
|
||||
@@ -84,7 +85,7 @@ class Regulation:
|
||||
requirement_count: int = 0
|
||||
|
||||
|
||||
# All regulations from Compliance Hub (EU + DACH national laws + guidelines)
|
||||
REGULATIONS: List[Regulation] = [
|
||||
Regulation(
|
||||
code="GDPR",
|
||||
@@ -323,6 +324,348 @@ REGULATIONS: List[Regulation] = [
|
||||
celex="32023R1114",
|
||||
requirement_count=149,
|
||||
),
|
||||
# =====================================================================
|
||||
# DACH National Laws — Deutschland (P1)
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="DE_DDG",
|
||||
name="Digitale-Dienste-Gesetz",
|
||||
full_name="Digitale-Dienste-Gesetz (DDG)",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/ddg/",
|
||||
description="Deutsches Umsetzungsgesetz zum DSA. Regelt Impressumspflicht (§5), Informationspflichten fuer digitale Dienste und Cookies.",
|
||||
requirement_count=30,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_BGB_AGB",
|
||||
name="BGB AGB-Recht",
|
||||
full_name="BGB §§305-310, 312-312k — AGB und Fernabsatz",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/bgb/",
|
||||
description="Deutsches AGB-Recht (§§305-310 BGB) und Fernabsatzrecht (§§312-312k BGB). Klauselverbote, Inhaltskontrolle, Widerrufsrecht, Button-Loesung.",
|
||||
local_path="DE_BGB_AGB.txt",
|
||||
requirement_count=40,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_EGBGB",
|
||||
name="EGBGB Art. 246-248",
|
||||
full_name="Einfuehrungsgesetz zum BGB — Informationspflichten",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/bgbeg/",
|
||||
description="Informationspflichten bei Verbrauchervertraegen (Art. 246), Fernabsatz (Art. 246a), E-Commerce (Art. 246c).",
|
||||
local_path="DE_EGBGB.txt",
|
||||
requirement_count=20,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_UWG",
|
||||
name="UWG Deutschland",
|
||||
full_name="Gesetz gegen den unlauteren Wettbewerb (UWG)",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/uwg_2004/",
|
||||
description="Unlauterer Wettbewerb: irrefuehrende Werbung, Spam-Verbot, Preisangaben, Online-Marketing-Regeln.",
|
||||
requirement_count=25,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_HGB_RET",
|
||||
name="HGB Aufbewahrung",
|
||||
full_name="HGB §§238-261, 257 — Handelsbuecher und Aufbewahrungsfristen",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/hgb/",
|
||||
description="Buchfuehrungspflicht, Aufbewahrungsfristen 6/10 Jahre, Anforderungen an elektronische Aufbewahrung.",
|
||||
local_path="DE_HGB_RET.txt",
|
||||
requirement_count=15,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_AO_RET",
|
||||
name="AO Aufbewahrung",
|
||||
full_name="Abgabenordnung §§140-148 — Steuerliche Aufbewahrungspflichten",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/ao_1977/",
|
||||
description="Steuerliche Buchfuehrungs- und Aufbewahrungspflichten. 6/10 Jahre Fristen, Datenzugriff durch Finanzbehoerden.",
|
||||
local_path="DE_AO_RET.txt",
|
||||
requirement_count=12,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_TKG",
|
||||
name="TKG 2021",
|
||||
full_name="Telekommunikationsgesetz 2021",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/tkg_2021/",
|
||||
description="Telekommunikationsregulierung: Kundenschutz, Datenschutz, Vertragslaufzeiten, Netzinfrastruktur.",
|
||||
requirement_count=45,
|
||||
),
|
||||
# =====================================================================
|
||||
# DACH National Laws — Oesterreich (P1)
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="AT_ECG",
|
||||
name="E-Commerce-Gesetz AT",
|
||||
full_name="E-Commerce-Gesetz (ECG) Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20001703",
|
||||
description="Oesterreichisches E-Commerce-Gesetz: Impressum/Offenlegungspflicht (§5), Informationspflichten, Haftung von Diensteanbietern.",
|
||||
language="de",
|
||||
requirement_count=30,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_TKG",
|
||||
name="TKG 2021 AT",
|
||||
full_name="Telekommunikationsgesetz 2021 Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20011678",
|
||||
description="Oesterreichisches TKG: Cookie-Bestimmungen (§165), Kommunikationsgeheimnis, Endgeraetezugriff.",
|
||||
language="de",
|
||||
requirement_count=40,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_KSCHG",
|
||||
name="KSchG Oesterreich",
|
||||
full_name="Konsumentenschutzgesetz (KSchG) Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10002462",
|
||||
description="Konsumentenschutz: AGB-Kontrolle (§6 Klauselverbote, §9 Verbandsklage), Ruecktrittsrecht, Informationspflichten.",
|
||||
language="de",
|
||||
requirement_count=35,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_FAGG",
|
||||
name="FAGG Oesterreich",
|
||||
full_name="Fern- und Auswaertsgeschaefte-Gesetz (FAGG) Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20008847",
|
||||
description="Fernabsatzrecht: Informationspflichten, Widerrufsrecht 14 Tage, Button-Loesung, Ausnahmen.",
|
||||
language="de",
|
||||
requirement_count=20,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_UGB_RET",
|
||||
name="UGB Aufbewahrung AT",
|
||||
full_name="UGB §§189-216, 212 — Rechnungslegung und Aufbewahrung Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001702",
|
||||
description="Oesterreichische Rechnungslegungspflicht und Aufbewahrungsfristen (7 Jahre). Buchfuehrung, Jahresabschluss.",
|
||||
local_path="AT_UGB_RET.txt",
|
||||
language="de",
|
||||
requirement_count=15,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_BAO_RET",
|
||||
name="BAO §132 AT",
|
||||
full_name="Bundesabgabenordnung §132 — Aufbewahrung Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10003940",
|
||||
description="Steuerliche Aufbewahrungspflicht 7 Jahre fuer Buecher, Aufzeichnungen und Belege. Grundstuecke 22 Jahre.",
|
||||
language="de",
|
||||
requirement_count=5,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_MEDIENG",
|
||||
name="MedienG §§24-25 AT",
|
||||
full_name="Mediengesetz §§24-25 Oesterreich — Impressum und Offenlegung",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10000719",
|
||||
description="Impressum/Offenlegungspflicht fuer periodische Medien und Websites in Oesterreich.",
|
||||
language="de",
|
||||
requirement_count=10,
|
||||
),
|
||||
# =====================================================================
|
||||
# DACH National Laws — Schweiz (P1)
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="CH_DSV",
|
||||
name="DSV Schweiz",
|
||||
full_name="Datenschutzverordnung (DSV) Schweiz — SR 235.11",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/2022/568/de",
|
||||
description="Ausfuehrungsverordnung zum revDSG: Meldepflichten, DSFA-Verfahren, Auslandtransfers, technische Massnahmen.",
|
||||
language="de",
|
||||
requirement_count=30,
|
||||
),
|
||||
Regulation(
|
||||
code="CH_OR_AGB",
|
||||
name="OR AGB/Aufbewahrung CH",
|
||||
full_name="Obligationenrecht — AGB-Kontrolle und Aufbewahrung Schweiz (SR 220)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/27/317_321_377/de",
|
||||
description="Art. 8 OR (AGB-Inhaltskontrolle), Art. 19/20 (Vertragsfreiheit), Art. 957-958f (Buchfuehrung, 10 Jahre Aufbewahrung).",
|
||||
local_path="CH_OR_AGB.txt",
|
||||
language="de",
|
||||
requirement_count=20,
|
||||
),
|
||||
Regulation(
|
||||
code="CH_UWG",
|
||||
name="UWG Schweiz",
|
||||
full_name="Bundesgesetz gegen den unlauteren Wettbewerb Schweiz (SR 241)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/1988/223_223_223/de",
|
||||
description="Lauterkeitsrecht: Impressumspflicht, irrefuehrende Werbung, aggressive Verkaufsmethoden, AGB-Transparenz.",
|
||||
language="de",
|
||||
requirement_count=20,
|
||||
),
|
||||
Regulation(
|
||||
code="CH_FMG",
|
||||
name="FMG Schweiz",
|
||||
full_name="Fernmeldegesetz Schweiz (SR 784.10)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/1997/2187_2187_2187/de",
|
||||
description="Telekommunikationsregulierung: Fernmeldegeheimnis, Cookies/Tracking (Art. 45c), Spam-Verbot, Datenschutz.",
|
||||
language="de",
|
||||
requirement_count=25,
|
||||
),
|
||||
# =====================================================================
|
||||
# Deutschland P2
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="DE_PANGV",
|
||||
name="PAngV",
|
||||
full_name="Preisangabenverordnung (PAngV 2022)",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/pangv_2022/",
|
||||
description="Preisangaben: Gesamtpreis, Grundpreis, Streichpreise (§11), Online-Preisauszeichnung.",
|
||||
requirement_count=15,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_DLINFOV",
|
||||
name="DL-InfoV",
|
||||
full_name="Dienstleistungs-Informationspflichten-Verordnung",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/dlinfov/",
|
||||
description="Informationspflichten fuer Dienstleister: Identitaet, Kontakt, Berufshaftpflicht, AGB-Zugang.",
|
||||
requirement_count=10,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_BETRVG",
|
||||
name="BetrVG §87",
|
||||
full_name="Betriebsverfassungsgesetz §87 Abs.1 Nr.6",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/betrvg/",
|
||||
description="Mitbestimmung bei technischer Ueberwachung: Betriebsrat-Beteiligung bei IT-Systemen, die Arbeitnehmerverhalten ueberwachen koennen.",
|
||||
requirement_count=5,
|
||||
),
|
||||
# =====================================================================
|
||||
# Oesterreich P2
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="AT_ABGB_AGB",
|
||||
name="ABGB AGB-Recht AT",
|
||||
full_name="ABGB §§861-879, 864a — AGB-Kontrolle Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001622",
|
||||
description="Geltungskontrolle (§864a), Sittenwidrigkeitskontrolle (§879 Abs.3), allgemeine Vertragsregeln.",
|
||||
local_path="AT_ABGB_AGB.txt",
|
||||
language="de",
|
||||
requirement_count=10,
|
||||
),
|
||||
Regulation(
|
||||
code="AT_UWG",
|
||||
name="UWG Oesterreich",
|
||||
full_name="Bundesgesetz gegen den unlauteren Wettbewerb Oesterreich",
|
||||
regulation_type="at_law",
|
||||
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10002665",
|
||||
description="Lauterkeitsrecht AT: irrefuehrende Geschaeftspraktiken, aggressive Praktiken, Preisauszeichnung.",
|
||||
language="de",
|
||||
requirement_count=15,
|
||||
),
|
||||
# =====================================================================
|
||||
# Schweiz P2
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="CH_GEBUV",
|
||||
name="GeBuV Schweiz",
|
||||
full_name="Geschaeftsbuecher-Verordnung Schweiz (SR 221.431)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/2002/468_468_468/de",
|
||||
description="Ausfuehrungsvorschriften zur Buchfuehrung: elektronische Aufbewahrung, Integritaet, Datentraeger.",
|
||||
language="de",
|
||||
requirement_count=10,
|
||||
),
|
||||
Regulation(
|
||||
code="CH_ZERTES",
|
||||
name="ZertES Schweiz",
|
||||
full_name="Bundesgesetz ueber die elektronische Signatur (SR 943.03)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/2016/752/de",
|
||||
description="Elektronische Signatur und Zertifizierung: Qualifizierte Signaturen, Zertifizierungsdiensteanbieter.",
|
||||
language="de",
|
||||
requirement_count=10,
|
||||
),
|
||||
# =====================================================================
|
||||
# Deutschland P3
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="DE_GESCHGEHG",
|
||||
name="GeschGehG",
|
||||
full_name="Gesetz zum Schutz von Geschaeftsgeheimnissen",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/geschgehg/",
|
||||
description="Schutz von Geschaeftsgeheimnissen: Definition, angemessene Geheimhaltungsmassnahmen, Reverse Engineering.",
|
||||
requirement_count=10,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_BSIG",
|
||||
name="BSI-Gesetz",
|
||||
full_name="Gesetz ueber das Bundesamt fuer Sicherheit in der Informationstechnik (BSIG)",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/bsig_2009/",
|
||||
description="BSI-Aufgaben, KRITIS-Meldepflichten, IT-Sicherheitsstandards, Zertifizierung.",
|
||||
requirement_count=20,
|
||||
),
|
||||
Regulation(
|
||||
code="DE_USTG_RET",
|
||||
name="UStG §14b",
|
||||
full_name="Umsatzsteuergesetz §14b — Aufbewahrung von Rechnungen",
|
||||
regulation_type="de_law",
|
||||
source_url="https://www.gesetze-im-internet.de/ustg_1980/",
|
||||
description="Aufbewahrungspflicht fuer Rechnungen: 10 Jahre, Grundstuecke 20 Jahre, elektronische Aufbewahrung.",
|
||||
local_path="DE_USTG_RET.txt",
|
||||
requirement_count=5,
|
||||
),
|
||||
# =====================================================================
|
||||
# Schweiz P3
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="CH_ZGB_PERS",
|
||||
name="ZGB Persoenlichkeitsschutz CH",
|
||||
full_name="Zivilgesetzbuch Art. 28-28l — Persoenlichkeitsschutz Schweiz (SR 210)",
|
||||
regulation_type="ch_law",
|
||||
source_url="https://www.fedlex.admin.ch/eli/cc/24/233_245_233/de",
|
||||
description="Persoenlichkeitsschutz: Recht am eigenen Bild, Schutz der Privatsphaere, Gegendarstellungsrecht.",
|
||||
language="de",
|
||||
requirement_count=8,
|
||||
),
|
||||
# =====================================================================
|
||||
# 3 fehlgeschlagene Quellen mit alternativen URLs nachholen
|
||||
# =====================================================================
|
||||
Regulation(
|
||||
code="LU_DPA_LAW",
|
||||
name="Datenschutzgesetz Luxemburg",
|
||||
full_name="Loi du 1er aout 2018 — Datenschutzgesetz Luxemburg",
|
||||
regulation_type="national_law",
|
||||
source_url="https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
|
||||
description="Luxemburgisches Datenschutzgesetz: Organisation der CNPD, nationale DSGVO-Ergaenzung.",
|
||||
language="fr",
|
||||
requirement_count=40,
|
||||
),
|
||||
Regulation(
|
||||
code="DK_DATABESKYTTELSESLOVEN",
|
||||
name="Databeskyttelsesloven DK",
|
||||
full_name="Databeskyttelsesloven — Datenschutzgesetz Daenemark",
|
||||
regulation_type="national_law",
|
||||
source_url="https://www.retsinformation.dk/eli/lta/2018/502",
|
||||
description="Daenisches Datenschutzgesetz als ergaenzende Bestimmungen zur DSGVO. Reguliert durch Datatilsynet.",
|
||||
language="da",
|
||||
requirement_count=30,
|
||||
),
|
||||
Regulation(
|
||||
code="EDPB_GUIDELINES_1_2022",
|
||||
name="EDPB GL Bussgelder",
|
||||
full_name="EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
|
||||
regulation_type="eu_guideline",
|
||||
source_url="https://www.edpb.europa.eu/system/files/2023-05/edpb_guidelines_042022_calculationofadministrativefines_en.pdf",
|
||||
description="EDPB-Leitlinien zur Berechnung von Verwaltungsbussgeldern unter der DSGVO.",
|
||||
language="en",
|
||||
requirement_count=15,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -887,7 +1230,7 @@ async def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Legal Corpus Ingestion for UCCA")
|
||||
parser.add_argument("--ingest-all", action="store_true", help="Ingest all 19 regulations")
|
||||
parser.add_argument("--ingest-all", action="store_true", help="Ingest all regulations")
|
||||
parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations by code")
|
||||
parser.add_argument("--status", action="store_true", help="Show collection status")
|
||||
parser.add_argument("--search", type=str, help="Test search query")
|
||||
@@ -902,7 +1245,7 @@ async def main():
|
||||
print(json.dumps(status, indent=2))
|
||||
|
||||
elif args.ingest_all:
|
||||
print("Ingesting all 19 regulations...")
|
||||
print(f"Ingesting all {len(REGULATIONS)} regulations...")
|
||||
results = await ingestion.ingest_all()
|
||||
print("\nResults:")
|
||||
for code, count in results.items():
|
||||
|
||||
307
klausur-service/backend/migrate_rag_chunks.py
Normal file
307
klausur-service/backend/migrate_rag_chunks.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
RAG Chunk Migration: bp_dsfa_corpus -> bp_legal_corpus
|
||||
|
||||
Verschiebt nationale Datenschutzgesetze und EU-Dokumente aus bp_dsfa_corpus
|
||||
nach bp_legal_corpus. Vektoren werden 1:1 uebernommen (kein Re-Embedding).
|
||||
|
||||
Usage:
|
||||
python migrate_rag_chunks.py # Dry run (default)
|
||||
python migrate_rag_chunks.py --execute # Actually migrate
|
||||
python migrate_rag_chunks.py --verify # Verify after migration
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import (
|
||||
PointStruct, Filter, FieldCondition, MatchAny, ScrollRequest
|
||||
)
|
||||
|
||||
# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
SOURCE_COLLECTION = "bp_dsfa_corpus"
TARGET_COLLECTION = "bp_legal_corpus"

# One row per source that moves from bp_dsfa_corpus to bp_legal_corpus:
#   (source_code, regulation_type in bp_legal_corpus, regulation_name in bp_legal_corpus)
# The three public lookups below are derived from this table so they cannot
# drift apart.
# NOTE(review): AT_DSG and CH_DSG are typed "national_law" here, while the
# legal corpus registry appears to use "at_law" / "ch_law" for Austrian and
# Swiss statutes (BDSG_FULL already uses "de_law") — confirm this is intended.
_MIGRATION_TABLE = (
    # National data protection laws
    ("AT_DSG", "national_law", "DSG Oesterreich"),
    ("BDSG_FULL", "de_law", "BDSG"),
    ("BE_DPA_LAW", "national_law", "Datenschutzgesetz Belgien"),
    ("CH_DSG", "national_law", "DSG Schweiz"),
    ("CZ_ZOU", "national_law", "Zakon Tschechien"),
    ("ES_LOPDGDD", "national_law", "LOPDGDD Spanien"),
    ("FI_TIETOSUOJALAKI", "national_law", "Tietosuojalaki Finnland"),
    ("FR_CNIL_GUIDE", "national_law", "CNIL Guide RGPD"),
    ("HU_INFOTV", "national_law", "Infotv. Ungarn"),
    ("IE_DPA_2018", "national_law", "DPA 2018 Ireland"),
    ("IT_CODICE_PRIVACY", "national_law", "Codice Privacy Italien"),
    ("LI_DSG", "national_law", "DSG Liechtenstein"),
    ("NL_UAVG", "national_law", "UAVG Niederlande"),
    ("NO_PERSONOPPLYSNINGSLOVEN", "national_law", "Personopplysningsloven"),
    ("PL_UODO", "national_law", "UODO Polen"),
    ("SE_DATASKYDDSLAG", "national_law", "Dataskyddslag Schweden"),
    ("UK_DPA_2018", "national_law", "DPA 2018 UK"),
    ("UK_GDPR", "national_law", "UK GDPR"),
    # EU documents
    ("SCC_FULL_TEXT", "eu_regulation", "Standardvertragsklauseln"),
    ("EDPB_GUIDELINES_2_2019", "eu_guideline", "EDPB GL 2/2019"),
    ("EDPB_GUIDELINES_3_2019", "eu_guideline", "EDPB GL 3/2019"),
    ("EDPB_GUIDELINES_5_2020", "eu_guideline", "EDPB GL 5/2020"),
    ("EDPB_GUIDELINES_7_2020", "eu_guideline", "EDPB GL 7/2020"),
)

# Source codes to migrate from bp_dsfa_corpus -> bp_legal_corpus
SOURCES_TO_MIGRATE = [code for code, _reg_type, _reg_name in _MIGRATION_TABLE]

# Mapping: source_code -> regulation_type for bp_legal_corpus
REGULATION_TYPE_MAP = {code: reg_type for code, reg_type, _reg_name in _MIGRATION_TABLE}

# Mapping: source_code -> regulation_name for bp_legal_corpus
REGULATION_NAME_MAP = {code: reg_name for code, _reg_type, reg_name in _MIGRATION_TABLE}
|
||||
|
||||
|
||||
def transform_payload(dsfa_payload: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a bp_dsfa_corpus point payload into the bp_legal_corpus layout.

    Args:
        dsfa_payload: Payload dict of a point read from the source collection.

    Returns:
        A new dict for the target collection. The chunk text moves from
        ``content`` to ``text``; ``source_code`` becomes ``regulation_code``;
        the short name and type come from ``REGULATION_NAME_MAP`` and
        ``REGULATION_TYPE_MAP`` (falling back to the payload's ``source_name``
        and to ``"national_law"``). ``indexed_at`` is the current UTC time and
        ``training_allowed`` is always ``False``.
    """
    field = dsfa_payload.get
    code = field("source_code", "")
    original_name = field("source_name", "")
    indexed_at = datetime.now(timezone.utc).isoformat()

    legal_payload: Dict[str, Any] = {
        "text": field("content", ""),
        "regulation_code": code,
        "regulation_name": REGULATION_NAME_MAP.get(code, original_name),
        "regulation_full_name": original_name,
        "regulation_type": REGULATION_TYPE_MAP.get(code, "national_law"),
        "source_url": field("source_url", ""),
    }

    # Fields copied under the same key, each with its default when absent.
    carried_over = (
        ("chunk_index", 0),
        ("chunk_position", 0),
        ("article", None),
        ("paragraph", None),
        ("language", "de"),
    )
    for key, fallback in carried_over:
        legal_payload[key] = field(key, fallback)

    legal_payload["indexed_at"] = indexed_at
    legal_payload["training_allowed"] = False
    return legal_payload
|
||||
|
||||
|
||||
def scroll_all_points(client: QdrantClient, collection: str, source_codes: List[str]) -> List:
    """Collect every point in *collection* whose ``source_code`` is listed.

    Pages through the collection with ``client.scroll`` in pages of 100,
    requesting both vectors and payloads, until the client reports no further
    offset.

    Args:
        client: Qdrant client used for the scroll requests.
        collection: Name of the collection to read.
        source_codes: ``source_code`` payload values to match (any of them).

    Returns:
        All matching points, accumulated in the order the pages arrived.
    """
    page_size = 100
    by_source_code = Filter(
        must=[FieldCondition(key="source_code", match=MatchAny(any=source_codes))]
    )

    collected = []
    cursor = None
    while True:
        page, cursor = client.scroll(
            collection_name=collection,
            scroll_filter=by_source_code,
            limit=page_size,
            offset=cursor,
            with_vectors=True,
            with_payload=True,
        )
        collected.extend(page)
        # A missing next-page offset marks the final page.
        if cursor is None:
            return collected
|
||||
|
||||
|
||||
def _print_collection_counts(client: QdrantClient, heading: str) -> None:
    """Print *heading* followed by the point counts of both collections."""
    source_info = client.get_collection(SOURCE_COLLECTION)
    target_info = client.get_collection(TARGET_COLLECTION)
    print(heading)
    print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
    print(f" {TARGET_COLLECTION}: {target_info.points_count} points")


def _find_foreign_id_collisions(client: QdrantClient, point_ids: List, batch_size: int = 100) -> List:
    """Return IDs already used in the target by points outside this migration.

    A target point counts as foreign when its ``regulation_code`` payload value
    is not one of ``SOURCES_TO_MIGRATE``. Points left behind by an earlier,
    interrupted run of this script are therefore not reported.
    """
    migrated_codes = set(SOURCES_TO_MIGRATE)
    collisions = []
    for start in range(0, len(point_ids), batch_size):
        existing = client.retrieve(
            collection_name=TARGET_COLLECTION,
            ids=point_ids[start:start + batch_size],
            with_payload=True,
            with_vectors=False,
        )
        collisions.extend(
            record.id
            for record in existing
            if (record.payload or {}).get("regulation_code") not in migrated_codes
        )
    return collisions


def migrate(execute: bool = False):
    """Move the chunks of ``SOURCES_TO_MIGRATE`` into the target collection.

    Points are read from ``SOURCE_COLLECTION`` with their vectors, their
    payloads are rewritten by ``transform_payload`` and they are upserted into
    ``TARGET_COLLECTION`` under their original point IDs. Afterwards the same
    IDs are deleted from the source collection.

    Because IDs are reused, an unrelated point that already holds one of those
    IDs in the target collection would be overwritten. A read-only pre-flight
    check looks for such points; it warns in dry-run mode and aborts before any
    write in execute mode.

    Args:
        execute: When False (default) only report what would be migrated.

    Raises:
        RuntimeError: In execute mode, when point IDs to be migrated already
            belong to other regulations in the target collection.
    """
    banner = "=" * 60
    print(banner)
    print(f"RAG Chunk Migration: {SOURCE_COLLECTION} -> {TARGET_COLLECTION}")
    print(f"Mode: {'EXECUTE' if execute else 'DRY RUN'}")
    print(banner)
    print()

    client = QdrantClient(url=QDRANT_URL)

    _print_collection_counts(client, "Before migration:")
    print()

    # Scroll all points to migrate
    print(f"Scrolling points for {len(SOURCES_TO_MIGRATE)} source codes...")
    points = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
    print(f" Found {len(points)} points to migrate")
    print()

    if not points:
        print("No points found to migrate. Exiting.")
        return

    # Group by source_code for reporting
    by_source: Dict[str, int] = {}
    for p in points:
        sc = p.payload.get("source_code", "UNKNOWN")
        by_source[sc] = by_source.get(sc, 0) + 1

    print("Points per source:")
    for sc in sorted(by_source):
        print(f" {sc}: {by_source[sc]} chunks")
    print()

    # Pre-flight: upserting under the original IDs must not clobber unrelated
    # chunks that already live in the target collection.
    point_ids = [p.id for p in points]
    collisions = _find_foreign_id_collisions(client, point_ids)
    if collisions:
        print(
            f" WARNING: {len(collisions)} point IDs already exist in {TARGET_COLLECTION} "
            f"for other regulations and would be overwritten (e.g. {collisions[0]})"
        )
        print()

    if not execute:
        print("DRY RUN complete. Use --execute to actually migrate.")
        return

    if collisions:
        raise RuntimeError(
            f"Aborting migration: {len(collisions)} point IDs collide with existing "
            f"points in {TARGET_COLLECTION}"
        )

    # Transform and upsert in batches
    batch_size = 50
    upserted = 0
    for i in range(0, len(points), batch_size):
        new_points = [
            PointStruct(id=p.id, vector=p.vector, payload=transform_payload(p.payload))
            for p in points[i:i + batch_size]
        ]
        # wait=True: each batch must be applied before the source copy is
        # deleted further down.
        client.upsert(
            collection_name=TARGET_COLLECTION,
            points=new_points,
            wait=True,
        )
        upserted += len(new_points)
        print(f" Upserted {upserted}/{len(points)} points...")

    print(f"\nUpsert complete: {upserted} points added to {TARGET_COLLECTION}")

    # Delete from source collection
    for i in range(0, len(point_ids), 100):
        batch_ids = point_ids[i:i + 100]
        client.delete(
            collection_name=SOURCE_COLLECTION,
            points_selector=batch_ids,
        )
        print(f" Deleted {min(i + 100, len(point_ids))}/{len(point_ids)} from {SOURCE_COLLECTION}...")

    print(f"\nDelete complete: {len(point_ids)} points removed from {SOURCE_COLLECTION}")

    # Final counts
    _print_collection_counts(client, "\nAfter migration:")
    print("\nMigration complete!")
|
||||
|
||||
|
||||
def verify() -> bool:
    """Check both collections after a migration and report the findings.

    Two conditions are checked: no point with a ``source_code`` from
    ``SOURCES_TO_MIGRATE`` is left in ``SOURCE_COLLECTION``, and every one of
    those codes has at least one point in ``TARGET_COLLECTION`` (matched on
    ``regulation_code``).

    Returns:
        True when both conditions hold, False otherwise, so callers can turn
        the outcome into an exit status.
    """
    print("Verifying migration...")
    client = QdrantClient(url=QDRANT_URL)

    source_info = client.get_collection(SOURCE_COLLECTION)
    target_info = client.get_collection(TARGET_COLLECTION)
    print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
    print(f" {TARGET_COLLECTION}: {target_info.points_count} points")

    ok = True

    # Check that migrated sources are gone from dsfa
    remaining = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
    if remaining:
        ok = False
        print(f"\n WARNING: {len(remaining)} points still in {SOURCE_COLLECTION}!")
        by_source: Dict[str, int] = {}
        for p in remaining:
            sc = p.payload.get("source_code", "UNKNOWN")
            by_source[sc] = by_source.get(sc, 0) + 1
        for sc, cnt in sorted(by_source.items()):
            print(f" {sc}: {cnt}")
    else:
        print(f"\n OK: No migrated sources remaining in {SOURCE_COLLECTION}")

    # Check that migrated sources exist in legal
    for code in SOURCES_TO_MIGRATE:
        # Existence probe only: one point is enough, and neither its payload
        # nor its vector is inspected.
        results, _ = client.scroll(
            collection_name=TARGET_COLLECTION,
            scroll_filter=Filter(
                must=[FieldCondition(key="regulation_code", match=MatchAny(any=[code]))]
            ),
            limit=1,
            with_payload=False,
            with_vectors=False,
        )
        if results:
            status = f"{len(results)}+ chunks"
        else:
            status = "MISSING"
            ok = False
        print(f" {TARGET_COLLECTION}/{code}: {status}")

    return ok


def main():
    """Parse the command line and run either the verification or the migration.

    ``--verify`` takes precedence over ``--execute``. A failed verification
    ends the process with exit status 1 so scripts and CI can react to it.
    """
    parser = argparse.ArgumentParser(description="Migrate RAG chunks between collections")
    parser.add_argument("--execute", action="store_true", help="Actually execute the migration (default: dry run)")
    parser.add_argument("--verify", action="store_true", help="Verify migration results")
    args = parser.parse_args()

    if args.verify:
        if not verify():
            sys.exit(1)
    else:
        migrate(execute=args.execute)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user