feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fall back to dense-only search if the Query API is unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:49:43 +01:00
parent c3a53fe5d2
commit c52dbdb8f1
24 changed files with 2620 additions and 139 deletions

View File

@@ -75,12 +75,12 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
# RULE 1: FREE USE — Laws, Public Domain
# source_type: "law" = binding legislation, "guideline" = authority guidance (soft law),
# "standard" = voluntary framework/best practice, "restricted" = protected norm
# EU Regulations
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO"},
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AI Act (KI-Verordnung)"},
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2"},
# EU Regulations — names MUST match canonical DB source names
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO (EU) 2016/679"},
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "KI-Verordnung (EU) 2024/1689"},
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2-Richtlinie (EU) 2022/2555"},
"eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Cyber Resilience Act (CRA)"},
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung"},
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung (EU) 2023/1230"},
"eu_2022_2065": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Services Act (DSA)"},
"eu_2022_1925": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Markets Act (DMA)"},
"eu_2022_868": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Governance Act (DGA)"},
@@ -88,52 +88,52 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"eu_2021_914": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Standardvertragsklauseln (SCC)"},
"eu_2002_58": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "ePrivacy-Richtlinie"},
"eu_2000_31": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "E-Commerce-Richtlinie"},
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Uebernahmeverordnung"},
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Übernahmeverordnung"},
"eucsa": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
"dataact": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Act"},
"dora": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
"ehds": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "European Health Data Space"},
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
"eu_2023_988": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung"},
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets"},
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung (EU) 2023/1542"},
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets (MiCA)"},
"psd2": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Zahlungsdiensterichtlinie 2"},
"dpf": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU-US Data Privacy Framework"},
"dsm": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSM-Urheberrechtsrichtlinie"},
"amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AML-Verordnung"},
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "Blue Guide 2022"},
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "EU Blue Guide 2022"},
# NIST (Public Domain — NOT laws, voluntary standards)
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53"},
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev.5"},
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63B"},
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
"nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST CSF 2.0"},
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 Zero Trust"},
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Cybersecurity Framework 2.0"},
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 (Zero Trust)"},
"nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST AI Risk Management Framework"},
"nist_privacy_1_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Privacy Framework 1.0"},
"nistir_8259a": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NISTIR 8259A IoT Security"},
"cisa_secure_by_design": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "standard", "name": "CISA Secure by Design"},
# German Laws
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG"},
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG 2018"},
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
"ttdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TTDSG"},
"tdddg_25": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TDDDG"},
"tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
"de_tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
"bgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BGB"},
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
"urhg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UrhG"},
"uwg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UWG"},
"tmg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TMG"},
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "GewO"},
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Gewerbeordnung (GewO)"},
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
"battdg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
# Austrian Laws
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT DSG"},
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
"at_abgb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB"},
"at_abgb_agb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB AGB-Recht"},
"at_bao": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT BAO"},
@@ -141,7 +141,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"at_ecg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT E-Commerce-Gesetz"},
"at_kschg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Konsumentenschutzgesetz"},
"at_medieng": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Mediengesetz"},
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT TKG"},
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Telekommunikationsgesetz Oesterreich"},
"at_ugb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB"},
"at_ugb_ret": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB Retention"},
"at_uwg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UWG"},
@@ -179,21 +179,21 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"wp260_transparency": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "WP29 Transparency"},
# RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA (voluntary standards)
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS",
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS 4.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS",
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS 2.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10",
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 2021",
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Top 10 2023",
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Security Top 10 (2023)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM",
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM 2.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_mobile_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Mobile Top 10",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD AI Principles",
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD KI-Empfehlung",
"attribution": "OECD"},
# RULE 3: RESTRICTED — Full reformulation required
@@ -626,6 +626,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 512}, # Limit response length for speed
"think": False, # Disable thinking for faster responses
}
@@ -1040,8 +1041,10 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 1
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = license_info.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
@@ -1105,8 +1108,10 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 2
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = license_info.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
@@ -1277,8 +1282,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
effective_paragraph = llm_paragraph or chunk.paragraph or ""
if lic["rule"] in (1, 2):
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = lic.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": lic.get("license", ""),