0903e3a8d1
CI / detect-changes (push) Successful in 5s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 5s
CI / validate-canonical-controls (push) Successful in 4s
CI / loc-budget (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m0s
CI / test-go (push) Successful in 59s
CI / iace-gt-coverage (push) Successful in 17s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
The Authority Router re-embedded the query per collection (6x); on dev the embed endpoint (OVH) is remote, so that was 6 round-trips = 7-12s per /retrieve. Embed once, reuse via ctx across the concurrent per-collection searches. DetectIntent + ConceptNorms now fold ae/oe/ue/ss so ASCII (Pruefe) and umlaut (Prüfe) inputs both match.
134 lines
4.6 KiB
Go
134 lines
4.6 KiB
Go
package ucca
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// routerBaseCollections is the broad authority base the Authority Router fans out over. It mirrors
|
|
// the Advisor's historical multi-collection set; the KB-2026.1 slice is added separately when the
|
|
// query is in scope. Override via RAG_ROUTER_COLLECTIONS (comma-separated) per environment.
|
|
func (c *LegalRAGClient) routerBaseCollections() []string {
|
|
if v := strings.TrimSpace(os.Getenv("RAG_ROUTER_COLLECTIONS")); v != "" {
|
|
var out []string
|
|
for _, p := range strings.Split(v, ",") {
|
|
if s := strings.TrimSpace(p); s != "" {
|
|
out = append(out, s)
|
|
}
|
|
}
|
|
if len(out) > 0 {
|
|
return out
|
|
}
|
|
}
|
|
return []string{
|
|
"bp_compliance_gesetze",
|
|
"bp_compliance_ce",
|
|
"bp_compliance_datenschutz",
|
|
"bp_dsfa_corpus",
|
|
"bp_compliance_recht",
|
|
"bp_legal_templates",
|
|
}
|
|
}
|
|
|
|
// routerPerCollectionTopK is the base per-collection result budget that Retrieve passes to
// searchInternal for each collection it fans out over. For cross-regulation queries (two or more
// detected regulations) Retrieve multiplies it by the number of detected regulations.
const routerPerCollectionTopK = 3
|
|
|
|
// Retrieve is the Authority Router entry point: callers (the Advisor) pass ONLY a query and stay
|
|
// collection-agnostic. The router fans out over the broad authority base and ADDS the KB-2026.1
|
|
// slice when the query is in scope (inKBScope), then merges all hits, deduplicates, and returns the
|
|
// top-K by authority score. This moves the former Advisor-side collection fan-out into the retrieval
|
|
// layer (the "Retriever" tier of the quality pyramid), so the proven KB-2026.1 slice gain reaches
|
|
// the product path without the Advisor knowing about individual collections.
|
|
//
|
|
// The merged set is ordered by the per-collection authority score that rerankByAuthority already
|
|
// produced inside searchInternal — i.e. binding-vs-guidance ordering is preserved across the merge.
|
|
// Per-collection failures (e.g. a collection absent on an environment) degrade gracefully.
|
|
func (c *LegalRAGClient) Retrieve(ctx context.Context, query string, topK int) ([]LegalSearchResult, error) {
|
|
if topK <= 0 {
|
|
topK = 8
|
|
}
|
|
|
|
collections := c.routerBaseCollections()
|
|
if c.kbScopeRoutingEnabled && c.kbSliceCollection != "" && inKBScope(query) {
|
|
collections = append(collections, c.kbSliceCollection)
|
|
}
|
|
|
|
// Cross-regulation queries (>=2 explicitly named regulations) get a larger per-collection budget
|
|
// so each collection's multi-regulation search isn't truncated down to the keyword-dominant
|
|
// domain; the final per-regulation balancing then guarantees every named domain in the top-K.
|
|
regs := detectRegulations(query)
|
|
perColl := routerPerCollectionTopK
|
|
if len(regs) >= 2 {
|
|
perColl = routerPerCollectionTopK * len(regs)
|
|
}
|
|
|
|
// Warm the full-text indexes sequentially first so the concurrent fan-out below only READS the
|
|
// shared textIndexEnsured map (the writes happen here, serialized) — closes the cold-start map
|
|
// race deterministically. Best-effort: a missing collection just stays un-indexed (hybrid then
|
|
// falls back to dense, or the per-collection search degrades to nothing).
|
|
if c.hybridEnabled {
|
|
for _, coll := range collections {
|
|
_ = c.ensureTextIndex(ctx, coll)
|
|
}
|
|
}
|
|
|
|
// Embed the query ONCE and stash it in ctx so the concurrent per-collection searches
|
|
// below reuse it instead of each re-embedding (was N remote round-trips on dev/OVH).
|
|
ctx = c.withQueryEmbedding(ctx, query)
|
|
|
|
out := make([][]LegalSearchResult, len(collections))
|
|
var wg sync.WaitGroup
|
|
for i, coll := range collections {
|
|
wg.Add(1)
|
|
go func(i int, coll string) {
|
|
defer wg.Done()
|
|
if res, err := c.searchInternal(ctx, coll, query, nil, perColl); err == nil {
|
|
out[i] = res
|
|
}
|
|
}(i, coll)
|
|
}
|
|
wg.Wait()
|
|
|
|
merged := make([]LegalSearchResult, 0, len(collections)*perColl)
|
|
for _, r := range out {
|
|
merged = append(merged, r...)
|
|
}
|
|
merged = dedupResults(merged)
|
|
sort.SliceStable(merged, func(a, b int) bool { return merged[a].Score > merged[b].Score })
|
|
|
|
// Cross-regulation: guarantee every named domain is represented (0070-class fix) instead of
|
|
// letting a global score-sort starve the non-dominant domain.
|
|
if len(regs) >= 2 {
|
|
return balanceByRegulation(merged, regs, topK), nil
|
|
}
|
|
if len(merged) > topK {
|
|
merged = merged[:topK]
|
|
}
|
|
return merged, nil
|
|
}
|
|
|
|
// dedupResults removes duplicate passages that can appear when collections overlap, keeping the
|
|
// highest-scoring occurrence. Identity = regulation_code + article_label + a text prefix.
|
|
func dedupResults(in []LegalSearchResult) []LegalSearchResult {
|
|
pos := make(map[string]int, len(in))
|
|
out := make([]LegalSearchResult, 0, len(in))
|
|
for _, r := range in {
|
|
text := r.Text
|
|
if len(text) > 80 {
|
|
text = text[:80]
|
|
}
|
|
key := r.RegulationCode + "|" + r.ArticleLabel + "|" + text
|
|
if idx, ok := pos[key]; ok {
|
|
if r.Score > out[idx].Score {
|
|
out[idx] = r
|
|
}
|
|
continue
|
|
}
|
|
pos[key] = len(out)
|
|
out = append(out, r)
|
|
}
|
|
return out
|
|
}
|