From a1f425d43af744d1ea1d1ae6dab019982ad99449 Mon Sep 17 00:00:00 2001 From: Benjamin_Boenisch Date: Tue, 23 Jun 2026 09:30:52 +0000 Subject: [PATCH] feat(ai-sdk): authority-aware re-ranking for legal RAG (Phase 1) (#31) --- ai-compliance-sdk/internal/ucca/authority.go | 220 ++++++++++++++++++ .../internal/ucca/authority_rerank.go | 68 ++++++ .../internal/ucca/authority_rerank_test.go | 96 ++++++++ .../internal/ucca/authority_test.go | 125 ++++++++++ .../internal/ucca/legal_rag_client.go | 36 +++ .../internal/ucca/legal_rag_http.go | 21 ++ .../internal/ucca/legal_rag_scroll.go | 12 + .../internal/ucca/legal_rag_test.go | 58 ++++- .../internal/ucca/legal_rag_types.go | 7 + 9 files changed, 641 insertions(+), 2 deletions(-) create mode 100644 ai-compliance-sdk/internal/ucca/authority.go create mode 100644 ai-compliance-sdk/internal/ucca/authority_rerank.go create mode 100644 ai-compliance-sdk/internal/ucca/authority_rerank_test.go create mode 100644 ai-compliance-sdk/internal/ucca/authority_test.go diff --git a/ai-compliance-sdk/internal/ucca/authority.go b/ai-compliance-sdk/internal/ucca/authority.go new file mode 100644 index 00000000..53994f5a --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/authority.go @@ -0,0 +1,220 @@ +package ucca + +import ( + "regexp" + "strconv" + "strings" +) + +// authorityInfo is the normative classification of a search result, used internally +// for re-ranking only (Phase 1 changes ordering, not the response contract). 
type authorityInfo struct {
	// weight is the normative weight: 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown.
	weight int
	// sourceClass is one of binding_law | supervisory_guidance | foreign_law | unknown.
	sourceClass string
	// jurisdiction is one of DE | EU | CH.
	jurisdiction string
}

// guidanceMarkers are case-sensitive name fragments that identify supervisory guidance
// (authorities, standards bodies, guidance document types) in untagged results.
var guidanceMarkers = []string{
	"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
	"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
	"Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide",
}

// foreignMarkers are name fragments that identify Swiss (foreign) law.
var foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}

// deMarkers are name fragments that point to a German source.
var deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}

// normPattern recognises a statutory citation: "§" or "Art."/"Art" followed by a digit.
var normPattern = regexp.MustCompile(`(§|Art\.?)\s*\d`)

// bdsgParagraph captures the paragraph number that follows a "§" sign.
var bdsgParagraph = regexp.MustCompile(`§\s*(\d+)`)

// classifyAuthority derives weight/source-class/jurisdiction. Explicitly tagged payload
// values win; otherwise it falls back to the curated category + name markers, so the
// not-yet-re-ingested (untagged) corpus is still classified deterministically.
+func classifyAuthority(r LegalSearchResult) authorityInfo { + jur := r.Jurisdiction + if jur == "" { + jur = inferJurisdiction(r) + } + if r.SourceClass != "" { + w := r.AuthorityWeight + if w == 0 && r.SourceClass == "binding_law" { + w = 100 + } + return authorityInfo{weight: w, sourceClass: r.SourceClass, jurisdiction: jur} + } + if r.AuthorityWeight > 0 { + return authorityInfo{weight: r.AuthorityWeight, sourceClass: sourceClassFromWeight(r.AuthorityWeight), jurisdiction: jur} + } + hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + " " + r.RegulationCode + switch { + case containsAny(hay, foreignMarkers): + return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"} + case r.Category == "guidance" || containsAny(hay, guidanceMarkers): + return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur} + case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel): + return authorityInfo{weight: 100, sourceClass: "binding_law", jurisdiction: jur} + default: + return authorityInfo{weight: 50, sourceClass: "unknown", jurisdiction: jur} + } +} + +func sourceClassFromWeight(w int) string { + switch { + case w >= 100: + return "binding_law" + case w >= 70: + return "supervisory_guidance" + case w <= 0: + return "foreign_law" + default: + return "unknown" + } +} + +func inferJurisdiction(r LegalSearchResult) string { + hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + switch { + case containsAny(hay, foreignMarkers): + return "CH" + case strings.Contains(hay, "§") || containsAny(hay, deMarkers): + return "DE" + default: + return "EU" + } +} + +// --- Domain routing: separates same-authority but topically foreign norms --- + +type domainDef struct { + name string + regs []string // regulation markers found in a chunk + keywords []string // query keywords that signal this domain +} + +// Deterministic order (slice, not map) — important 
for stable classification + tests. +var domains = []domainDef{ + {"data_protection", + []string{"DSGVO", "GDPR", "BDSG", "EDPB", "DSK", "BfDI", "BayLfD", "DPF"}, + []string{"personenbezogen", "betroffene", "datenschutz", "datenschutzbeauftrag", "dsb", + "datenpanne", "auskunft", "loesch", "lösch", "einwilligung", "besondere kategorien", "auftragsverarbeiter"}}, + {"cyber", + []string{"CRA", "NIS2", "NIS-2", "ENISA", "DORA", "EUCC"}, + []string{"security update", "sicherheitsupdate", "sicherheitsaktualisierung", "schwachstelle", "sbom", + "cybersicherheit", "konformit", "hersteller", "importeur", "haendler", "händler", "ikt-", + "resilienz", "sicherheitsvorfall", "digitalen elementen"}}, + {"ai", + []string{"AI Act", "KI-VO", "KI-Verordnung"}, + []string{"ki-system", "ki-modell", "hochrisiko", "kuenstliche intelligenz", "künstliche intelligenz"}}, + {"product_safety", + []string{"Maschinenverordnung", "MaschinenVO", "GPSR", "RED", "MDR"}, + nil}, +} + +func queryDomain(query string) string { + ql := strings.ToLower(query) + for _, d := range domains { + for _, kw := range d.keywords { + if strings.Contains(ql, kw) { + return d.name + } + } + } + return "" +} + +func chunkDomain(r LegalSearchResult) string { + hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationCode + " " + r.RegulationName + for _, d := range domains { + if containsAny(hay, d.regs) { + return d.name + } + } + return "" +} + +// scopeClass flags special sub-regimes that must not win general questions — +// BDSG Teil 3 (§§ 45-84) implements the JI directive (law enforcement), not the general regime. 
+func scopeClass(r LegalSearchResult) string { + hay := r.ArticleLabel + " " + r.RegulationShort + if strings.Contains(hay, "BDSG") { + if m := bdsgParagraph.FindStringSubmatch(hay); m != nil { + if n, err := strconv.Atoi(m[1]); err == nil && n >= 45 && n <= 84 { + return "law_enforcement" + } + } + } + return "general" +} + +// --- Topic ontology: amplifier only (boost), never an override --- + +type topicDef struct { + keywords []string + norms []string // preferred canonical citation fragments +} + +var topics = []topicDef{ + {[]string{"datenschutzbeauftrag", "dsb", "benennung"}, []string{"Art. 37", "§ 38 BDSG"}}, + {[]string{"stellung des"}, []string{"Art. 38"}}, + {[]string{"aufgaben des"}, []string{"Art. 39"}}, + {[]string{"folgenabsch", "dsfa"}, []string{"Art. 35"}}, + {[]string{"besondere kategorien"}, []string{"Art. 9", "§ 22 BDSG"}}, + {[]string{"auskunft"}, []string{"Art. 15", "§ 34 BDSG"}}, + {[]string{"loesch", "lösch"}, []string{"Art. 17", "§ 35 BDSG"}}, + {[]string{"bussgeld", "geldbusse"}, []string{"Art. 83"}}, + {[]string{"security update", "sicherheitsupdate", "schwachstelle", "sbom", "cybersicherheitsanforderung"}, []string{"CRA Anhang I"}}, + {[]string{"meldepflicht", "sicherheitsvorfall"}, []string{"Art. 14 CRA"}}, +} + +// resultMatchesTopic reports whether the result is a preferred norm of a topic the query hits. +func resultMatchesTopic(query string, r LegalSearchResult) bool { + ql := strings.ToLower(query) + hay := r.ArticleLabel + " " + r.RegulationShort + for _, t := range topics { + if !containsAnyLower(ql, t.keywords) { + continue + } + for _, n := range t.norms { + if normMatches(hay, n) { + return true + } + } + } + return false +} + +// normMatches checks that norm appears in hay with a non-digit boundary, so "Art. 9" +// matches "Art. 9 DSGVO" but not "Art. 90". 
func normMatches(hay, norm string) bool {
	if norm == "" {
		return false // an empty citation fragment can never be a meaningful match
	}
	// Examine EVERY occurrence, not just the first: in "Art. 90 i.V.m. Art. 9 DSGVO" the
	// first hit of "Art. 9" is the prefix of "Art. 90" and must be skipped, but the second
	// hit is a genuine match.
	for start := 0; start < len(hay); {
		rel := strings.Index(hay[start:], norm)
		if rel < 0 {
			return false
		}
		end := start + rel + len(norm)
		if end == len(hay) || hay[end] < '0' || hay[end] > '9' {
			return true
		}
		start += rel + 1 // continue searching behind the rejected occurrence
	}
	return false
}

// queryIsForeign reports whether the query explicitly asks about a foreign legal system
// (Switzerland or Austria keywords), matched case-insensitively as substrings.
func queryIsForeign(query string) bool {
	return containsAnyLower(strings.ToLower(query),
		[]string{"schweiz", "revdsg", "fedlex", " ch ", "oesterreich", "österreich"})
}

// containsAny reports whether hay contains at least one marker (case-sensitive).
func containsAny(hay string, markers []string) bool {
	for _, m := range markers {
		if strings.Contains(hay, m) {
			return true
		}
	}
	return false
}

// containsAnyLower reports whether the already lower-cased haylower contains the
// lower-cased form of at least one marker.
func containsAnyLower(haylower string, markers []string) bool {
	for _, m := range markers {
		if strings.Contains(haylower, strings.ToLower(m)) {
			return true
		}
	}
	return false
}

/*
Patch file boundary (kept verbatim from the original diff):

diff --git a/ai-compliance-sdk/internal/ucca/authority_rerank.go b/ai-compliance-sdk/internal/ucca/authority_rerank.go
new file mode 100644
index 00000000..6c6232eb
--- /dev/null
+++ b/ai-compliance-sdk/internal/ucca/authority_rerank.go
@@ -0,0 +1,68 @@
+package ucca
+
+import "sort"
*/

// Re-ranking coefficients (validated in the offline golden harness; Phase A — conservative).
const (
	authorityCoef    = 0.40 // * weight/100
	jurisdictionGain = 0.05 // binding/guidance from DE or EU
	foreignPenalty   = 0.60 // foreign law on a DE/EU question (demoted, not removed)
	unknownPenalty   = 0.08
	domainMatchGain  = 0.15
	offDomainPenalty = 0.10 // off-domain binding (demoted, not removed)
	scopePenalty     = 0.25 // BDSG Teil 3 (law enforcement) on a general DP question
	topicGain        = 0.18 // amplifier only
)

// authorityScore computes the normative relevance of a result for a query. It augments the
// semantic score with authority/jurisdiction/domain/scope/topic signals. Exposed for tests.
+func authorityScore(query string, r LegalSearchResult, qDomain string, qForeign bool) float64 { + info := classifyAuthority(r) + score := r.Score + authorityCoef*float64(info.weight)/100.0 + + if info.jurisdiction == "CH" && !qForeign { + score -= foreignPenalty // Fremdrecht bei DE/EU-Frage: demoted, nicht geloescht + } else { + score += jurisdictionGain + } + if info.sourceClass == "unknown" { + score -= unknownPenalty + } + if qDomain != "" { + switch cd := chunkDomain(r); { + case cd == qDomain: + score += domainMatchGain + case cd != "": + score -= offDomainPenalty // off-domain binding: demoted, nicht geloescht + } + } + if qDomain == "data_protection" && scopeClass(r) == "law_enforcement" { + score -= scopePenalty + } + if resultMatchesTopic(query, r) { + score += topicGain // Verstaerker, kein Override + } + return score +} + +// rerankByAuthority re-orders results so binding law from the matching jurisdiction/domain +// ranks above guidance, foreign and off-domain law — WITHOUT dropping anything (guidance is +// kept as interpretation context). The computed score is written back to Score so downstream +// merges (e.g. the multi-collection advisor) preserve this order. Pure + deterministic. 
+func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchResult { + if len(results) < 2 { + return results + } + qDomain := queryDomain(query) + qForeign := queryIsForeign(query) + + out := make([]LegalSearchResult, len(results)) + copy(out, results) + for i := range out { + out[i].Score = authorityScore(query, out[i], qDomain, qForeign) + } + sort.SliceStable(out, func(a, b int) bool { + return out[a].Score > out[b].Score + }) + return out +} diff --git a/ai-compliance-sdk/internal/ucca/authority_rerank_test.go b/ai-compliance-sdk/internal/ucca/authority_rerank_test.go new file mode 100644 index 00000000..65e5f16c --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/authority_rerank_test.go @@ -0,0 +1,96 @@ +package ucca + +import "testing" + +func bindingRes(label, reg, jur string, score float64) LegalSearchResult { + return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "binding_law", AuthorityWeight: 100, Jurisdiction: jur, Score: score} +} + +func guidanceRes(label, reg string, score float64) LegalSearchResult { + return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "supervisory_guidance", AuthorityWeight: 70, Jurisdiction: "EU", Score: score} +} + +func foreignRes(label string, score float64) LegalSearchResult { + return LegalSearchResult{ArticleLabel: label, RegulationShort: "RevDSG", SourceClass: "foreign_law", AuthorityWeight: 0, Jurisdiction: "CH", Score: score} +} + +// Acceptance criteria (Phase 1) expressed as ordering tests. 
+func TestRerankByAuthority_Acceptance(t *testing.T) { + t.Run("guidance does not overtake semantically competitive binding", func(t *testing.T) { + out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{ + guidanceRes("ENISA Mapping", "ENISA", 0.72), + bindingRes("CRA Anhang I", "CRA", "EU", 0.66), + }) + if out[0].RegulationShort != "CRA" { + t.Fatalf("binding must rank first over competitive guidance, got %q", out[0].RegulationShort) + } + }) + + t.Run("foreign law demoted on DE/EU question but kept", func(t *testing.T) { + in := []LegalSearchResult{foreignRes("RevDSG Art 1", 0.85), bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62)} + out := rerankByAuthority("Welche Daten sind besonders geschuetzt?", in) + if out[0].RegulationShort != "DSGVO" { + t.Fatalf("binding EU must beat foreign on a DE/EU query, got %q", out[0].RegulationShort) + } + if len(out) != 2 { + t.Fatalf("foreign law must be kept, got len=%d", len(out)) + } + }) + + t.Run("off-domain binding demoted but not removed", func(t *testing.T) { + in := []LegalSearchResult{ + bindingRes("Art. 13 EU MDR", "MDR", "EU", 0.70), + bindingRes("Art. 13 CRA", "CRA", "EU", 0.60), + } + out := rerankByAuthority("Welche Pflichten hat der Hersteller von Produkten mit digitalen Elementen?", in) + if out[0].RegulationShort != "CRA" { + t.Fatalf("on-domain CRA must beat off-domain MDR, got %q", out[0].RegulationShort) + } + if len(out) != 2 { + t.Fatalf("off-domain MDR must be kept, got len=%d", len(out)) + } + }) + + t.Run("same-regime binding wins over guidance", func(t *testing.T) { + out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{ + bindingRes("Art. 
13 CRA", "CRA", "EU", 0.70), + guidanceRes("ENISA Mapping", "ENISA", 0.60), + }) + if out[0].RegulationShort != "CRA" { + t.Fatalf("binding must win, got %q", out[0].RegulationShort) + } + }) + + t.Run("BDSG Teil 3 demoted below DSGVO on general DP question", func(t *testing.T) { + in := []LegalSearchResult{ + bindingRes("§ 48 BDSG", "BDSG", "DE", 0.70), // Teil 3 (law enforcement) + bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62), + } + out := rerankByAuthority("Was sind besondere Kategorien personenbezogener Daten?", in) + if out[0].RegulationShort != "DSGVO" { + t.Fatalf("DSGVO must beat BDSG Teil 3 on a general DP question, got %q", out[0].RegulationShort) + } + }) + + t.Run("nothing is dropped and topic amplifies", func(t *testing.T) { + in := []LegalSearchResult{ + guidanceRes("ENISA", "ENISA", 0.72), + bindingRes("CRA Anhang I", "CRA", "EU", 0.66), + foreignRes("RevDSG", 0.5), + } + out := rerankByAuthority("Anforderungen an Security Updates?", in) + if len(out) != len(in) { + t.Fatalf("rerank must preserve all results, got %d want %d", len(out), len(in)) + } + if out[0].ArticleLabel != "CRA Anhang I" { + t.Fatalf("topic+authority must lift CRA Anhang I to top, got %q", out[0].ArticleLabel) + } + }) + + t.Run("single result returned unchanged", func(t *testing.T) { + in := []LegalSearchResult{bindingRes("Art. 
1 CRA", "CRA", "EU", 0.5)} + if out := rerankByAuthority("x", in); len(out) != 1 { + t.Fatalf("len=%d", len(out)) + } + }) +} diff --git a/ai-compliance-sdk/internal/ucca/authority_test.go b/ai-compliance-sdk/internal/ucca/authority_test.go new file mode 100644 index 00000000..d4109c65 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/authority_test.go @@ -0,0 +1,125 @@ +package ucca + +import "testing" + +func TestClassifyAuthority(t *testing.T) { + tests := []struct { + name string + result LegalSearchResult + wantW int + wantSC string + wantJur string + }{ + {"tagged binding EU", LegalSearchResult{AuthorityWeight: 100, SourceClass: "binding_law", Jurisdiction: "EU"}, 100, "binding_law", "EU"}, + {"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"}, + {"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"}, + {"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"}, + {"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 
13 CRA", Category: "regulation"}, 100, "binding_law", "EU"}, + {"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"}, + {"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"}, + {"untagged unknown", LegalSearchResult{RegulationShort: "", ArticleLabel: ""}, 50, "unknown", "EU"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := classifyAuthority(tt.result) + if got.weight != tt.wantW || got.sourceClass != tt.wantSC || got.jurisdiction != tt.wantJur { + t.Errorf("classifyAuthority() = {%d %s %s}, want {%d %s %s}", + got.weight, got.sourceClass, got.jurisdiction, tt.wantW, tt.wantSC, tt.wantJur) + } + }) + } +} + +func TestQueryDomain(t *testing.T) { + tests := []struct{ q, want string }{ + {"Welche Anforderungen an Security Updates?", "cyber"}, + {"Wer braucht einen Datenschutzbeauftragten?", "data_protection"}, + {"Was sind besondere Kategorien personenbezogener Daten?", "data_protection"}, + {"Welche Pflichten beim Hochrisiko-KI-System?", "ai"}, + {"Wie spaet ist es?", ""}, + } + for _, tt := range tests { + if got := queryDomain(tt.q); got != tt.want { + t.Errorf("queryDomain(%q) = %q, want %q", tt.q, got, tt.want) + } + } +} + +func TestChunkDomain(t *testing.T) { + tests := []struct { + name string + r LegalSearchResult + want string + }{ + {"CRA cyber", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA"}, "cyber"}, + {"DSGVO dp", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "data_protection"}, + {"AI Act ai", LegalSearchResult{RegulationShort: "AI Act", ArticleLabel: "Art. 10 AI Act"}, "ai"}, + {"MDR product", LegalSearchResult{RegulationShort: "MDR", ArticleLabel: "Art. 
13 EU MDR"}, "product_safety"}, + {"unknown", LegalSearchResult{RegulationShort: "XYZ"}, ""}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := chunkDomain(tt.r); got != tt.want { + t.Errorf("chunkDomain() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestScopeClass(t *testing.T) { + tests := []struct { + name string + r LegalSearchResult + want string + }{ + {"BDSG Teil 3 law enforcement", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 48 BDSG"}, "law_enforcement"}, + {"BDSG general part", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, "general"}, + {"DSGVO general", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "general"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := scopeClass(tt.r); got != tt.want { + t.Errorf("scopeClass() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestResultMatchesTopic(t *testing.T) { + tests := []struct { + name string + query string + r LegalSearchResult + want bool + }{ + {"besondere Kategorien -> Art 9 match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, true}, + {"besondere Kategorien -> Art 90 no match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 90 DSGVO"}, false}, + {"security updates -> CRA Anhang I", "Anforderungen an Security Updates?", LegalSearchResult{ArticleLabel: "CRA Anhang I"}, true}, + {"no topic keyword", "Wie spaet ist es?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := resultMatchesTopic(tt.query, tt.r); got != tt.want { + t.Errorf("resultMatchesTopic() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestNormMatches(t *testing.T) { + tests := []struct { + hay, norm string + want bool + }{ + {"Art. 9 DSGVO", "Art. 9", true}, + {"Art. 90 DSGVO", "Art. 
9", false}, + {"§ 38 BDSG", "§ 38 BDSG", true}, + {"§ 380 BDSG", "§ 38", false}, + {"Art. 14 CRA", "Art. 14 CRA", true}, + } + for _, tt := range tests { + if got := normMatches(tt.hay, tt.norm); got != tt.want { + t.Errorf("normMatches(%q,%q) = %v, want %v", tt.hay, tt.norm, got, tt.want) + } + } +} diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_client.go b/ai-compliance-sdk/internal/ucca/legal_rag_client.go index 9f27ee13..e60bd198 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_client.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_client.go @@ -93,6 +93,13 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, hits = denseHits } + // Stratified: den binding_law-Pool ERGAENZEN (nicht ersetzen), damit die Pflichtquelle + // immer Kandidat ist — Guidance bleibt als Auslegungskontext erhalten. Best-effort: + // Fehler beim Binding-Query degradieren still auf den semantischen Pool. + if bindingHits, bErr := c.searchBinding(ctx, collection, embedding, topK); bErr == nil { + hits = mergeDedupHits(hits, bindingHits) + } + results := make([]LegalSearchResult, len(hits)) for i, hit := range hits { // Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder @@ -121,12 +128,41 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, Pages: getIntSlice(hit.Payload, "pages"), SourceURL: getString(hit.Payload, "source"), Score: hit.Score, + AuthorityWeight: getInt(hit.Payload, "authority_weight"), + SourceClass: getString(hit.Payload, "source_class"), + Jurisdiction: getString(hit.Payload, "jurisdiction"), } } + // Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach + // oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only, + // Response-Schema unveraendert. Score traegt den Authority-Score, damit nachgelagerte + // Multi-Collection-Merges (Advisor) die Ordnung bewahren. 
+ results = rerankByAuthority(query, results) + if topK > 0 && len(results) > topK { + results = results[:topK] + } + return results, nil } +// mergeDedupHits concatenates two hit lists, keeping the first occurrence of each point ID. +func mergeDedupHits(primary, extra []qdrantSearchHit) []qdrantSearchHit { + seen := make(map[string]bool, len(primary)+len(extra)) + out := make([]qdrantSearchHit, 0, len(primary)+len(extra)) + for _, list := range [][]qdrantSearchHit{primary, extra} { + for _, h := range list { + id := fmt.Sprint(h.ID) + if seen[id] { + continue + } + seen[id] = true + out = append(out, h) + } + } + return out +} + // FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt. func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string { if lc == nil || len(lc.Results) == 0 { diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_http.go b/ai-compliance-sdk/internal/ucca/legal_rag_http.go index 1baf8286..5d68181e 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_http.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_http.go @@ -185,6 +185,27 @@ func (c *LegalRAGClient) searchDense(ctx context.Context, collection string, emb searchReq.Filter = &qdrantFilter{Should: conditions} } + return c.doPointsSearch(ctx, collection, searchReq) +} + +// searchBinding fetches the top binding_law hits (authority-stratified pool) so the +// obligation source is always a candidate even when guidance dominates semantically. +// It AUGMENTS the semantic pool — guidance is preserved as interpretation context. 
+func (c *LegalRAGClient) searchBinding(ctx context.Context, collection string, embedding []float64, topK int) ([]qdrantSearchHit, error) { + searchReq := qdrantSearchRequest{ + Vector: embedding, + Limit: topK, + WithPayload: true, + Filter: &qdrantFilter{Must: []qdrantCondition{ + {Key: "source_class", Match: qdrantMatch{Value: "binding_law"}}, + }}, + } + + return c.doPointsSearch(ctx, collection, searchReq) +} + +// doPointsSearch issues a POST /points/search and decodes the hits. +func (c *LegalRAGClient) doPointsSearch(ctx context.Context, collection string, searchReq qdrantSearchRequest) ([]qdrantSearchHit, error) { jsonBody, err := json.Marshal(searchReq) if err != nil { return nil, fmt.Errorf("failed to marshal search request: %w", err) diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_scroll.go b/ai-compliance-sdk/internal/ucca/legal_rag_scroll.go index 8058149b..73fd0538 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_scroll.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_scroll.go @@ -225,6 +225,18 @@ func getIntSlice(m map[string]interface{}, key string) []int { return result } +func getInt(m map[string]interface{}, key string) int { + if v, ok := m[key]; ok { + switch n := v.(type) { + case float64: + return int(n) + case int: + return n + } + } + return 0 +} + func contains(slice []string, item string) bool { for _, s := range slice { if s == item { diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_test.go b/ai-compliance-sdk/internal/ucca/legal_rag_test.go index e7c4fa34..35056238 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_test.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_test.go @@ -399,8 +399,9 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) { return } - // Fallback: should not reach dense search - t.Error("Unexpected dense search call when hybrid succeeded") + // /points/search is now the stratified binding-law augmentation query (it AUGMENTS + // the hybrid pool, it is not a dense fallback). 
Return empty so the hybrid hit + // remains the sole result for this test. json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{}}) })) defer qdrantMock.Close() @@ -446,6 +447,59 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) { } } +// TestSearch_StratifiedBindingRerank verifies that the binding-law pool augments the +// semantic pool and that authority re-ranking lifts binding law above higher-semantic guidance. +func TestSearch_StratifiedBindingRerank(t *testing.T) { + ollamaMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(ollamaEmbeddingResponse{Embedding: make([]float64, 1024)}) + })) + defer ollamaMock.Close() + + qdrantMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.Contains(r.URL.Path, "/index") { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"result":{"status":"completed"}}`)) + return + } + if strings.Contains(r.URL.Path, "/points/query") { + json.NewEncoder(w).Encode(qdrantQueryResponse{Result: []qdrantSearchHit{ + {ID: "g1", Score: 0.72, Payload: map[string]interface{}{ + "chunk_text": "ENISA guidance", "regulation_short": "ENISA", + "article_label": "ENISA CRA Mapping", "source_class": "supervisory_guidance", + "authority_weight": float64(70), "jurisdiction": "EU", + }}, + }}) + return + } + // /points/search = stratified binding-law pool (source_class=binding_law) + json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{ + {ID: "b1", Score: 0.66, Payload: map[string]interface{}{ + "chunk_text": "CRA Anhang I requirement", "regulation_short": "CRA", + "article_label": "CRA Anhang I", "source_class": "binding_law", + "authority_weight": float64(100), "jurisdiction": "EU", + }}, + }}) + })) + defer qdrantMock.Close() + + client := &LegalRAGClient{ + qdrantURL: qdrantMock.URL, ollamaURL: ollamaMock.URL, embeddingModel: "bge-m3", + collection: "bp_compliance_ce", textIndexEnsured: 
make(map[string]bool), + hybridEnabled: true, httpClient: http.DefaultClient, + } + + results, err := client.Search(context.Background(), "Was gilt hier?", nil, 5) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(results) != 2 { + t.Fatalf("expected 2 merged results (guidance + binding), got %d", len(results)) + } + if results[0].RegulationShort != "CRA" { + t.Errorf("binding CRA must rank first over higher-semantic guidance, got %q", results[0].RegulationShort) + } +} + func TestHybridSearch_FallbackToDense(t *testing.T) { var requestedPaths []string diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_types.go b/ai-compliance-sdk/internal/ucca/legal_rag_types.go index 5f9e0832..38fa4738 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_types.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_types.go @@ -20,6 +20,13 @@ type LegalSearchResult struct { Pages []int `json:"pages,omitempty"` SourceURL string `json:"source_url"` Score float64 `json:"score"` + + // Interne Felder fuer das Authority-Re-Ranking (Phase 1) — NICHT serialisiert + // (json:"-"), daher kein Contract-Change. Aus dem Qdrant-Payload befuellt und nur + // fuer die Sortierung in rerankByAuthority verwendet. + AuthorityWeight int `json:"-"` + SourceClass string `json:"-"` + Jurisdiction string `json:"-"` } // LegalContext represents aggregated legal context for an assessment.