49147d9497
CI / detect-changes (pull_request) Successful in 16s
CI / branch-name (pull_request) Successful in 2s
CI / guardrail-integrity (pull_request) Successful in 5s
CI / secret-scan (pull_request) Successful in 6s
CI / dep-audit (pull_request) Failing after 1m1s
CI / sbom-scan (pull_request) Failing after 1m4s
CI / build-sha-integrity (pull_request) Successful in 14s
CI / validate-canonical-controls (pull_request) Successful in 13s
CI / test-go (pull_request) Successful in 1m2s
CI / loc-budget (pull_request) Successful in 24s
CI / go-lint (pull_request) Failing after 20s
CI / python-lint (pull_request) Failing after 23s
CI / nodejs-lint (pull_request) Failing after 1m10s
CI / nodejs-build (pull_request) Successful in 3m26s
CI / iace-gt-coverage (pull_request) Successful in 16s
CI / test-python-backend (pull_request) Successful in 27s
CI / test-python-document-crawler (pull_request) Successful in 13s
CI / test-python-dsms-gateway (pull_request) Successful in 9s
Re-orders /sdk/v1/rag/search results so binding law from the matching jurisdiction and domain ranks above guidance, foreign and off-domain law — without dropping anything (guidance stays as interpretation context). Internal-only: response schema is unchanged (json:"-" fields), so every consumer benefits without a contract change. - authority.go: classifyAuthority / queryDomain / chunkDomain / scopeClass / topic ontology. Tagged payload (authority_weight/source_class/jurisdiction) wins; deterministic fallback via category + name markers for the untagged corpus. - authority_rerank.go: rerankByAuthority. final = semantic + authority + jurisdiction + domain + scope + topic; the authority score is written back to Score so the multi-collection advisor merge preserves the order. - legal_rag_client: stratified retrieval — the binding-law pool AUGMENTS the semantic pool (mergeDedupHits), then re-rank. - legal_rag_http: searchBinding (source_class filter) + shared doPointsSearch. - table-driven tests for authority/domain/scope/topic + rerank acceptance + a stratified-binding integration test. go test -race green. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
214 lines
9.5 KiB
Go
214 lines
9.5 KiB
Go
package ucca
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// LegalRAGClient is the handle for the compliance CE vector search: passages are
// stored in Qdrant and queries are embedded through Ollama (bge-m3).
type LegalRAGClient struct {
	// Qdrant endpoint (base URL) and the API key read from the environment.
	qdrantURL, qdrantAPIKey string

	// Ollama endpoint (base URL) and the name of the embedding model to request.
	ollamaURL, embeddingModel string

	// collection is the default Qdrant collection used when a caller names none.
	collection string

	// httpClient is the HTTP client configured by the constructor.
	httpClient *http.Client

	// textIndexEnsured is keyed by string — presumably the collection name, recording
	// that a full-text index was already ensured. TODO confirm against the hybrid path.
	textIndexEnsured map[string]bool

	// hybridEnabled switches searchInternal between hybrid-first and dense-only search.
	hybridEnabled bool
}
|
|
|
|
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
|
|
func NewLegalRAGClient() *LegalRAGClient {
|
|
qdrantURL := os.Getenv("QDRANT_URL")
|
|
if qdrantURL == "" {
|
|
qdrantURL = "http://localhost:6333"
|
|
}
|
|
qdrantURL = strings.TrimRight(qdrantURL, "/")
|
|
|
|
qdrantAPIKey := os.Getenv("QDRANT_API_KEY")
|
|
|
|
ollamaURL := os.Getenv("OLLAMA_URL")
|
|
if ollamaURL == "" {
|
|
ollamaURL = "http://localhost:11434"
|
|
}
|
|
|
|
hybridEnabled := os.Getenv("RAG_HYBRID_SEARCH") != "false"
|
|
|
|
return &LegalRAGClient{
|
|
qdrantURL: qdrantURL,
|
|
qdrantAPIKey: qdrantAPIKey,
|
|
ollamaURL: ollamaURL,
|
|
embeddingModel: "bge-m3",
|
|
collection: "bp_compliance_ce",
|
|
textIndexEnsured: make(map[string]bool),
|
|
hybridEnabled: hybridEnabled,
|
|
httpClient: &http.Client{
|
|
Timeout: 60 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// SearchCollection queries a specific Qdrant collection for relevant passages.
|
|
// If collection is empty, it falls back to the default collection (bp_compliance_ce).
|
|
func (c *LegalRAGClient) SearchCollection(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
|
|
if collection == "" {
|
|
collection = c.collection
|
|
}
|
|
return c.searchInternal(ctx, collection, query, regulationIDs, topK)
|
|
}
|
|
|
|
// Search queries the compliance CE corpus for relevant passages.
|
|
func (c *LegalRAGClient) Search(ctx context.Context, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
|
|
return c.searchInternal(ctx, c.collection, query, regulationIDs, topK)
|
|
}
|
|
|
|
// searchInternal performs the actual search against a given collection.
|
|
// If hybrid search is enabled, it uses the Qdrant Query API with RRF fusion
|
|
// (dense + full-text). Falls back to dense-only /points/search on failure.
|
|
func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
|
|
embedding, err := c.generateEmbedding(ctx, query)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate embedding: %w", err)
|
|
}
|
|
|
|
var hits []qdrantSearchHit
|
|
|
|
if c.hybridEnabled {
|
|
hybridHits, err := c.searchHybrid(ctx, collection, embedding, regulationIDs, topK)
|
|
if err == nil {
|
|
hits = hybridHits
|
|
}
|
|
}
|
|
|
|
if hits == nil {
|
|
denseHits, err := c.searchDense(ctx, collection, embedding, regulationIDs, topK)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
hits = denseHits
|
|
}
|
|
|
|
// Stratified: den binding_law-Pool ERGAENZEN (nicht ersetzen), damit die Pflichtquelle
|
|
// immer Kandidat ist — Guidance bleibt als Auslegungskontext erhalten. Best-effort:
|
|
// Fehler beim Binding-Query degradieren still auf den semantischen Pool.
|
|
if bindingHits, bErr := c.searchBinding(ctx, collection, embedding, topK); bErr == nil {
|
|
hits = mergeDedupHits(hits, bindingHits)
|
|
}
|
|
|
|
results := make([]LegalSearchResult, len(hits))
|
|
for i, hit := range hits {
|
|
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
|
// (article_label/regulation_code/article/...); Fallback auf alte Feldnamen, solange der
|
|
// Korpus noch nicht re-ingestiert ist (regulation_id, section="§ 38").
|
|
regCode := getString(hit.Payload, "regulation_code")
|
|
if regCode == "" {
|
|
regCode = getString(hit.Payload, "regulation_id")
|
|
}
|
|
article := getString(hit.Payload, "article")
|
|
if article == "" {
|
|
article = getString(hit.Payload, "section")
|
|
}
|
|
results[i] = LegalSearchResult{
|
|
Text: getString(hit.Payload, "chunk_text"),
|
|
RegulationCode: regCode,
|
|
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
|
RegulationShort: getString(hit.Payload, "regulation_short"),
|
|
Category: getString(hit.Payload, "category"),
|
|
ArticleLabel: getString(hit.Payload, "article_label"),
|
|
Article: article,
|
|
Paragraph: getString(hit.Payload, "paragraph"),
|
|
Sub: getString(hit.Payload, "sub"),
|
|
IsRecital: getBool(hit.Payload, "is_recital"),
|
|
CitationStyle: getString(hit.Payload, "citation_style"),
|
|
Pages: getIntSlice(hit.Payload, "pages"),
|
|
SourceURL: getString(hit.Payload, "source"),
|
|
Score: hit.Score,
|
|
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
|
SourceClass: getString(hit.Payload, "source_class"),
|
|
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
|
}
|
|
}
|
|
|
|
// Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach
|
|
// oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only,
|
|
// Response-Schema unveraendert. Score traegt den Authority-Score, damit nachgelagerte
|
|
// Multi-Collection-Merges (Advisor) die Ordnung bewahren.
|
|
results = rerankByAuthority(query, results)
|
|
if topK > 0 && len(results) > topK {
|
|
results = results[:topK]
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// mergeDedupHits concatenates two hit lists, keeping the first occurrence of each point ID.
|
|
func mergeDedupHits(primary, extra []qdrantSearchHit) []qdrantSearchHit {
|
|
seen := make(map[string]bool, len(primary)+len(extra))
|
|
out := make([]qdrantSearchHit, 0, len(primary)+len(extra))
|
|
for _, list := range [][]qdrantSearchHit{primary, extra} {
|
|
for _, h := range list {
|
|
id := fmt.Sprint(h.ID)
|
|
if seen[id] {
|
|
continue
|
|
}
|
|
seen[id] = true
|
|
out = append(out, h)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt.
|
|
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
|
|
if lc == nil || len(lc.Results) == 0 {
|
|
return ""
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
buf.WriteString("\n\n**Relevante Rechtsgrundlagen:**\n\n")
|
|
|
|
for i, result := range lc.Results {
|
|
buf.WriteString(fmt.Sprintf("%d. **%s** (%s)", i+1, result.RegulationShort, result.RegulationCode))
|
|
if len(result.Pages) > 0 {
|
|
buf.WriteString(fmt.Sprintf(" - Seiten %v", result.Pages))
|
|
}
|
|
buf.WriteString("\n")
|
|
buf.WriteString(fmt.Sprintf(" > %s\n\n", truncateText(result.Text, 300)))
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// ListAvailableRegulations returns the list of regulations available in the corpus.
|
|
func (c *LegalRAGClient) ListAvailableRegulations() []CERegulationInfo {
|
|
return []CERegulationInfo{
|
|
{ID: "eu_2023_1230", NameDE: "EU-Maschinenverordnung 2023/1230", NameEN: "EU Machinery Regulation 2023/1230", Short: "Maschinenverordnung", Category: "regulation"},
|
|
{ID: "eu_2024_1689", NameDE: "EU KI-Verordnung (AI Act)", NameEN: "EU AI Act 2024/1689", Short: "AI Act", Category: "regulation"},
|
|
{ID: "eu_2024_2847", NameDE: "Cyber Resilience Act", NameEN: "Cyber Resilience Act 2024/2847", Short: "CRA", Category: "regulation"},
|
|
{ID: "eu_2022_2555", NameDE: "NIS-2-Richtlinie", NameEN: "NIS2 Directive 2022/2555", Short: "NIS2", Category: "regulation"},
|
|
{ID: "eu_2016_679", NameDE: "Datenschutz-Grundverordnung (DSGVO)", NameEN: "General Data Protection Regulation (GDPR)", Short: "DSGVO/GDPR", Category: "regulation"},
|
|
{ID: "eu_blue_guide_2022", NameDE: "EU Blue Guide 2022", NameEN: "EU Blue Guide 2022", Short: "Blue Guide", Category: "guidance"},
|
|
{ID: "nist_sp_800_218", NameDE: "NIST Secure Software Development Framework", NameEN: "NIST SSDF SP 800-218", Short: "NIST SSDF", Category: "guidance"},
|
|
{ID: "nist_csf_2_0", NameDE: "NIST Cybersecurity Framework 2.0", NameEN: "NIST CSF 2.0", Short: "NIST CSF", Category: "guidance"},
|
|
{ID: "oecd_ai_principles", NameDE: "OECD Empfehlung zu Kuenstlicher Intelligenz", NameEN: "OECD Recommendation on AI", Short: "OECD AI", Category: "guidance"},
|
|
{ID: "enisa_supply_chain_good_practices", NameDE: "ENISA Supply Chain Cybersecurity", NameEN: "ENISA Good Practices for Supply Chain Cybersecurity", Short: "ENISA Supply Chain", Category: "guidance"},
|
|
{ID: "enisa_threat_landscape_supply_chain", NameDE: "ENISA Threat Landscape Supply Chain", NameEN: "ENISA Threat Landscape for Supply Chain Attacks", Short: "ENISA Threat SC", Category: "guidance"},
|
|
{ID: "enisa_ics_scada_dependencies", NameDE: "ENISA ICS/SCADA Abhaengigkeiten", NameEN: "ENISA ICS/SCADA Communication Dependencies", Short: "ENISA ICS/SCADA", Category: "guidance"},
|
|
{ID: "cisa_secure_by_design", NameDE: "CISA Secure by Design", NameEN: "CISA Secure by Design", Short: "CISA SbD", Category: "guidance"},
|
|
{ID: "enisa_cybersecurity_state_2024", NameDE: "ENISA State of Cybersecurity 2024", NameEN: "ENISA State of Cybersecurity in the Union 2024", Short: "ENISA 2024", Category: "guidance"},
|
|
// BAuA — Technische Regeln (gemeinfrei, §5 UrhG)
|
|
{ID: "trbs", NameDE: "TRBS — Technische Regeln fuer Betriebssicherheit", NameEN: "TRBS — Technical Rules for Operational Safety", Short: "TRBS", Category: "trbs"},
|
|
{ID: "trgs", NameDE: "TRGS — Technische Regeln fuer Gefahrstoffe", NameEN: "TRGS — Technical Rules for Hazardous Substances", Short: "TRGS", Category: "trgs"},
|
|
{ID: "asr", NameDE: "ASR — Arbeitsstaettenregeln", NameEN: "ASR — Workplace Rules", Short: "ASR", Category: "asr"},
|
|
// OSHA
|
|
{ID: "osha_1910", NameDE: "OSHA 1910 Subpart O — Maschinenschutz", NameEN: "OSHA 1910 Subpart O — Machinery and Machine Guarding", Short: "OSHA 1910", Category: "osha"},
|
|
// EuGH
|
|
{ID: "eugh_c_588_21", NameDE: "EuGH C-588/21 P — Datenschutz-Urteil", NameEN: "ECJ C-588/21 P — Data Protection Judgment", Short: "EuGH C-588/21", Category: "eu_recht"},
|
|
}
|
|
}
|