Files
breakpilot-compliance/ai-compliance-sdk/internal/ucca/legal_rag_client.go
Sharang Parnerkar c293d76e6b refactor(go/ucca): split policy_engine, legal_rag, ai_act, nis2, financial_policy, dsgvo_module
Split 6 oversized files (719–882 LOC each) into focused files under 500 LOC:
- policy_engine.go → types, loader, eval, gen (4 files)
- legal_rag.go     → types, client, http, context, scroll (5 files)
- ai_act_module.go → module, yaml, obligations (3 files)
- nis2_module.go   → module, yaml, obligations + shared obligation_yaml_types.go (3+1 files)
- financial_policy.go → types, engine (2 files)
- dsgvo_module.go  → module, yaml, obligations (3 files)

All in package ucca, zero exported symbol renames, go test ./internal/ucca/... passes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 09:48:41 +02:00

153 lines
6.4 KiB
Go

package ucca
import (
"bytes"
"context"
"fmt"
"net/http"
"os"
"strings"
"time"
)
// LegalRAGClient provides access to the compliance CE vector search via Qdrant + Ollama bge-m3.
//
// Instances are built by NewLegalRAGClient, which fills the fields from
// environment variables and fixed defaults.
type LegalRAGClient struct {
// qdrantURL is the Qdrant base URL. NewLegalRAGClient reads it from
// QDRANT_URL (default "http://localhost:6333") and strips trailing slashes.
qdrantURL string
// qdrantAPIKey is read from QDRANT_API_KEY and may be empty.
qdrantAPIKey string
// ollamaURL is the Ollama base URL, read from OLLAMA_URL
// (default "http://localhost:11434").
ollamaURL string
// embeddingModel is the embedding model name; NewLegalRAGClient sets it to "bge-m3".
embeddingModel string
// collection is the default Qdrant collection ("bp_compliance_ce") that
// Search uses and that SearchCollection falls back to for an empty name.
collection string
// httpClient is created by NewLegalRAGClient with a 60 second timeout.
// NOTE(review): presumably shared by the Qdrant and Ollama request code,
// which is not part of this file — confirm.
httpClient *http.Client
// textIndexEnsured starts as an empty map and is not touched by the methods
// in this file. NOTE(review): presumably records, per collection name,
// whether a full-text index has already been ensured — confirm against the
// hybrid search code.
textIndexEnsured map[string]bool
// hybridEnabled makes searchInternal try searchHybrid before searchDense.
// It is true unless RAG_HYBRID_SEARCH is set to "false".
hybridEnabled bool
}
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
//
// Configuration is read from the environment:
//   - QDRANT_URL: Qdrant base URL (default "http://localhost:6333").
//   - QDRANT_API_KEY: Qdrant API key; may be empty.
//   - OLLAMA_URL: Ollama base URL (default "http://localhost:11434").
//   - RAG_HYBRID_SEARCH: hybrid search stays enabled unless the value is
//     "false" (compared case-insensitively, surrounding whitespace ignored).
//
// Trailing slashes are stripped from both base URLs so that a value such as
// "http://host:11434/" is normalized the same way for both services.
// The embedding model ("bge-m3"), the default collection ("bp_compliance_ce")
// and the 60 second HTTP timeout are fixed.
func NewLegalRAGClient() *LegalRAGClient {
	qdrantURL := os.Getenv("QDRANT_URL")
	if qdrantURL == "" {
		qdrantURL = "http://localhost:6333"
	}
	qdrantURL = strings.TrimRight(qdrantURL, "/")

	qdrantAPIKey := os.Getenv("QDRANT_API_KEY")

	ollamaURL := os.Getenv("OLLAMA_URL")
	if ollamaURL == "" {
		ollamaURL = "http://localhost:11434"
	}
	// Normalize like qdrantURL; previously a trailing slash was kept here.
	ollamaURL = strings.TrimRight(ollamaURL, "/")

	// Opt-out flag: anything other than "false" (in any letter case) keeps
	// hybrid search on, including an unset variable.
	hybridSetting := strings.TrimSpace(os.Getenv("RAG_HYBRID_SEARCH"))
	hybridEnabled := !strings.EqualFold(hybridSetting, "false")

	return &LegalRAGClient{
		qdrantURL:        qdrantURL,
		qdrantAPIKey:     qdrantAPIKey,
		ollamaURL:        ollamaURL,
		embeddingModel:   "bge-m3",
		collection:       "bp_compliance_ce",
		textIndexEnsured: make(map[string]bool),
		hybridEnabled:    hybridEnabled,
		httpClient: &http.Client{
			Timeout: 60 * time.Second,
		},
	}
}
// SearchCollection runs a search against the named Qdrant collection and
// returns the matching passages.
//
// An empty collection name selects the client's default collection
// (bp_compliance_ce). The remaining arguments are passed through to
// searchInternal unchanged.
func (c *LegalRAGClient) SearchCollection(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
	target := collection
	if target == "" {
		target = c.collection
	}
	return c.searchInternal(ctx, target, query, regulationIDs, topK)
}
// Search looks up relevant passages in the client's default collection
// (the compliance CE corpus). All arguments are passed through to
// searchInternal unchanged.
func (c *LegalRAGClient) Search(ctx context.Context, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
	defaultCollection := c.collection
	return c.searchInternal(ctx, defaultCollection, query, regulationIDs, topK)
}
// searchInternal embeds the query and searches the given collection.
//
// With hybrid search enabled it first calls searchHybrid (Qdrant Query API
// with RRF fusion of dense + full-text). If that call fails, or yields a nil
// hit list, it falls back to the dense-only searchDense. Only an embedding
// failure or a searchDense failure is reported to the caller; a hybrid error
// is intentionally dropped in favor of the fallback.
//
// Each hit's payload is mapped onto a LegalSearchResult; the returned slice
// has one entry per hit, in hit order.
func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
	vector, err := c.generateEmbedding(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to generate embedding: %w", err)
	}

	var hits []qdrantSearchHit
	if c.hybridEnabled {
		// Best effort: on error hits stays nil and the dense search below runs.
		if found, hybridErr := c.searchHybrid(ctx, collection, vector, regulationIDs, topK); hybridErr == nil {
			hits = found
		}
	}

	// nil (not merely empty) is the signal that no hybrid result is available.
	if hits == nil {
		hits, err = c.searchDense(ctx, collection, vector, regulationIDs, topK)
		if err != nil {
			return nil, err
		}
	}

	results := make([]LegalSearchResult, 0, len(hits))
	for _, hit := range hits {
		payload := hit.Payload
		results = append(results, LegalSearchResult{
			Text:            getString(payload, "chunk_text"),
			RegulationCode:  getString(payload, "regulation_id"),
			RegulationName:  getString(payload, "regulation_name_de"),
			RegulationShort: getString(payload, "regulation_short"),
			Category:        getString(payload, "category"),
			Pages:           getIntSlice(payload, "pages"),
			SourceURL:       getString(payload, "source"),
			Score:           hit.Score,
		})
	}
	return results, nil
}
// FormatLegalContextForPrompt renders the legal context as a text section
// for inclusion in an LLM prompt.
//
// It returns "" when lc is nil or has no results. Otherwise the output starts
// with a German heading, followed by one numbered entry per result: the short
// regulation name and code, the page list when present, and the result text
// passed through truncateText with a limit of 300.
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
	if lc == nil {
		return ""
	}
	if len(lc.Results) == 0 {
		return ""
	}

	var out bytes.Buffer
	out.WriteString("\n\n**Relevante Rechtsgrundlagen:**\n\n")
	for idx, entry := range lc.Results {
		fmt.Fprintf(&out, "%d. **%s** (%s)", idx+1, entry.RegulationShort, entry.RegulationCode)
		if len(entry.Pages) > 0 {
			fmt.Fprintf(&out, " - Seiten %v", entry.Pages)
		}
		out.WriteString("\n")
		fmt.Fprintf(&out, " > %s\n\n", truncateText(entry.Text, 300))
	}
	return out.String()
}
// ListAvailableRegulations returns a fixed, hard-coded catalogue of the
// regulations and guidance documents in the corpus. The list is built fresh
// on every call and does not query Qdrant; entries in the "regulation"
// category come first, followed by the "guidance" category.
func (c *LegalRAGClient) ListAvailableRegulations() []CERegulationInfo {
	// The only two category values used by the catalogue.
	const (
		regulation = "regulation"
		guidance   = "guidance"
	)

	return []CERegulationInfo{
		{
			ID:       "eu_2023_1230",
			NameDE:   "EU-Maschinenverordnung 2023/1230",
			NameEN:   "EU Machinery Regulation 2023/1230",
			Short:    "Maschinenverordnung",
			Category: regulation,
		},
		{
			ID:       "eu_2024_1689",
			NameDE:   "EU KI-Verordnung (AI Act)",
			NameEN:   "EU AI Act 2024/1689",
			Short:    "AI Act",
			Category: regulation,
		},
		{
			ID:       "eu_2024_2847",
			NameDE:   "Cyber Resilience Act",
			NameEN:   "Cyber Resilience Act 2024/2847",
			Short:    "CRA",
			Category: regulation,
		},
		{
			ID:       "eu_2022_2555",
			NameDE:   "NIS-2-Richtlinie",
			NameEN:   "NIS2 Directive 2022/2555",
			Short:    "NIS2",
			Category: regulation,
		},
		{
			ID:       "eu_2016_679",
			NameDE:   "Datenschutz-Grundverordnung (DSGVO)",
			NameEN:   "General Data Protection Regulation (GDPR)",
			Short:    "DSGVO/GDPR",
			Category: regulation,
		},
		{
			ID:       "eu_blue_guide_2022",
			NameDE:   "EU Blue Guide 2022",
			NameEN:   "EU Blue Guide 2022",
			Short:    "Blue Guide",
			Category: guidance,
		},
		{
			ID:       "nist_sp_800_218",
			NameDE:   "NIST Secure Software Development Framework",
			NameEN:   "NIST SSDF SP 800-218",
			Short:    "NIST SSDF",
			Category: guidance,
		},
		{
			ID:       "nist_csf_2_0",
			NameDE:   "NIST Cybersecurity Framework 2.0",
			NameEN:   "NIST CSF 2.0",
			Short:    "NIST CSF",
			Category: guidance,
		},
		{
			ID:       "oecd_ai_principles",
			NameDE:   "OECD Empfehlung zu Kuenstlicher Intelligenz",
			NameEN:   "OECD Recommendation on AI",
			Short:    "OECD AI",
			Category: guidance,
		},
		{
			ID:       "enisa_supply_chain_good_practices",
			NameDE:   "ENISA Supply Chain Cybersecurity",
			NameEN:   "ENISA Good Practices for Supply Chain Cybersecurity",
			Short:    "ENISA Supply Chain",
			Category: guidance,
		},
		{
			ID:       "enisa_threat_landscape_supply_chain",
			NameDE:   "ENISA Threat Landscape Supply Chain",
			NameEN:   "ENISA Threat Landscape for Supply Chain Attacks",
			Short:    "ENISA Threat SC",
			Category: guidance,
		},
		{
			ID:       "enisa_ics_scada_dependencies",
			NameDE:   "ENISA ICS/SCADA Abhaengigkeiten",
			NameEN:   "ENISA ICS/SCADA Communication Dependencies",
			Short:    "ENISA ICS/SCADA",
			Category: guidance,
		},
		{
			ID:       "cisa_secure_by_design",
			NameDE:   "CISA Secure by Design",
			NameEN:   "CISA Secure by Design",
			Short:    "CISA SbD",
			Category: guidance,
		},
		{
			ID:       "enisa_cybersecurity_state_2024",
			NameDE:   "ENISA State of Cybersecurity 2024",
			NameEN:   "ENISA State of Cybersecurity in the Union 2024",
			Short:    "ENISA 2024",
			Category: guidance,
		},
	}
}