feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -75,12 +75,12 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
# RULE 1: FREE USE — Laws, Public Domain
|
||||
# source_type: "law" = binding legislation, "guideline" = authority guidance (soft law),
|
||||
# "standard" = voluntary framework/best practice, "restricted" = protected norm
|
||||
# EU Regulations
|
||||
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO"},
|
||||
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AI Act (KI-Verordnung)"},
|
||||
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2"},
|
||||
# EU Regulations — names MUST match canonical DB source names
|
||||
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO (EU) 2016/679"},
|
||||
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "KI-Verordnung (EU) 2024/1689"},
|
||||
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2-Richtlinie (EU) 2022/2555"},
|
||||
"eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Cyber Resilience Act (CRA)"},
|
||||
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung"},
|
||||
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung (EU) 2023/1230"},
|
||||
"eu_2022_2065": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Services Act (DSA)"},
|
||||
"eu_2022_1925": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Markets Act (DMA)"},
|
||||
"eu_2022_868": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Governance Act (DGA)"},
|
||||
@@ -88,52 +88,52 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"eu_2021_914": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Standardvertragsklauseln (SCC)"},
|
||||
"eu_2002_58": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "ePrivacy-Richtlinie"},
|
||||
"eu_2000_31": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "E-Commerce-Richtlinie"},
|
||||
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Uebernahmeverordnung"},
|
||||
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Übernahmeverordnung"},
|
||||
"eucsa": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
|
||||
"dataact": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Act"},
|
||||
"dora": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
|
||||
"ehds": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "European Health Data Space"},
|
||||
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
|
||||
"eu_2023_988": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
|
||||
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung"},
|
||||
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets"},
|
||||
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung (EU) 2023/1542"},
|
||||
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets (MiCA)"},
|
||||
"psd2": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Zahlungsdiensterichtlinie 2"},
|
||||
"dpf": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU-US Data Privacy Framework"},
|
||||
"dsm": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSM-Urheberrechtsrichtlinie"},
|
||||
"amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AML-Verordnung"},
|
||||
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "Blue Guide 2022"},
|
||||
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "EU Blue Guide 2022"},
|
||||
# NIST (Public Domain — NOT laws, voluntary standards)
|
||||
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53"},
|
||||
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev.5"},
|
||||
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63B"},
|
||||
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
|
||||
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
|
||||
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
|
||||
"nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
|
||||
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST CSF 2.0"},
|
||||
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
|
||||
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
|
||||
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 Zero Trust"},
|
||||
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Cybersecurity Framework 2.0"},
|
||||
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
|
||||
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
|
||||
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 (Zero Trust)"},
|
||||
"nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST AI Risk Management Framework"},
|
||||
"nist_privacy_1_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Privacy Framework 1.0"},
|
||||
"nistir_8259a": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NISTIR 8259A IoT Security"},
|
||||
"cisa_secure_by_design": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "standard", "name": "CISA Secure by Design"},
|
||||
# German Laws
|
||||
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG"},
|
||||
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG 2018"},
|
||||
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
|
||||
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
|
||||
"ttdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TTDSG"},
|
||||
"tdddg_25": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TDDDG"},
|
||||
"tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
|
||||
"de_tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
|
||||
"bgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BGB"},
|
||||
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
|
||||
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
|
||||
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
|
||||
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
|
||||
"urhg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UrhG"},
|
||||
"uwg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UWG"},
|
||||
"tmg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TMG"},
|
||||
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "GewO"},
|
||||
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
|
||||
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
|
||||
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Gewerbeordnung (GewO)"},
|
||||
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
|
||||
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
|
||||
"battdg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
|
||||
# Austrian Laws
|
||||
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT DSG"},
|
||||
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
|
||||
"at_abgb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB"},
|
||||
"at_abgb_agb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB AGB-Recht"},
|
||||
"at_bao": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT BAO"},
|
||||
@@ -141,7 +141,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"at_ecg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT E-Commerce-Gesetz"},
|
||||
"at_kschg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Konsumentenschutzgesetz"},
|
||||
"at_medieng": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Mediengesetz"},
|
||||
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT TKG"},
|
||||
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Telekommunikationsgesetz Oesterreich"},
|
||||
"at_ugb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB"},
|
||||
"at_ugb_ret": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB Retention"},
|
||||
"at_uwg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UWG"},
|
||||
@@ -179,21 +179,21 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"wp260_transparency": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "WP29 Transparency"},
|
||||
|
||||
# RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA (voluntary standards)
|
||||
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS",
|
||||
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS 4.0",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS",
|
||||
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS 2.0",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10",
|
||||
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 2021",
|
||||
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Top 10 2023",
|
||||
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Security Top 10 (2023)",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM",
|
||||
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM 2.0",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"owasp_mobile_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Mobile Top 10",
|
||||
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
|
||||
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD AI Principles",
|
||||
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD KI-Empfehlung",
|
||||
"attribution": "OECD"},
|
||||
|
||||
# RULE 3: RESTRICTED — Full reformulation required
|
||||
@@ -626,6 +626,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"model": OLLAMA_MODEL,
|
||||
"messages": messages,
|
||||
"stream": False,
|
||||
"format": "json",
|
||||
"options": {"num_predict": 512}, # Limit response length for speed
|
||||
"think": False, # Disable thinking for faster responses
|
||||
}
|
||||
@@ -1040,8 +1041,10 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
|
||||
effective_paragraph = llm_paragraph or chunk.paragraph or ""
|
||||
control.license_rule = 1
|
||||
control.source_original_text = chunk.text
|
||||
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
|
||||
canonical_source = license_info.get("name", chunk.regulation_name)
|
||||
control.source_citation = {
|
||||
"source": chunk.regulation_name,
|
||||
"source": canonical_source,
|
||||
"article": effective_article,
|
||||
"paragraph": effective_paragraph,
|
||||
"license": license_info.get("license", ""),
|
||||
@@ -1105,8 +1108,10 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
|
||||
effective_paragraph = llm_paragraph or chunk.paragraph or ""
|
||||
control.license_rule = 2
|
||||
control.source_original_text = chunk.text
|
||||
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
|
||||
canonical_source = license_info.get("name", chunk.regulation_name)
|
||||
control.source_citation = {
|
||||
"source": chunk.regulation_name,
|
||||
"source": canonical_source,
|
||||
"article": effective_article,
|
||||
"paragraph": effective_paragraph,
|
||||
"license": license_info.get("license", ""),
|
||||
@@ -1277,8 +1282,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
|
||||
effective_paragraph = llm_paragraph or chunk.paragraph or ""
|
||||
if lic["rule"] in (1, 2):
|
||||
control.source_original_text = chunk.text
|
||||
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
|
||||
canonical_source = lic.get("name", chunk.regulation_name)
|
||||
control.source_citation = {
|
||||
"source": chunk.regulation_name,
|
||||
"source": canonical_source,
|
||||
"article": effective_article,
|
||||
"paragraph": effective_paragraph,
|
||||
"license": lic.get("license", ""),
|
||||
|
||||
Reference in New Issue
Block a user