package ucca import ( "context" "os" "sort" "strings" "sync" ) // routerBaseCollections is the broad authority base the Authority Router fans out over. It mirrors // the Advisor's historical multi-collection set; the KB-2026.1 slice is added separately when the // query is in scope. Override via RAG_ROUTER_COLLECTIONS (comma-separated) per environment. func (c *LegalRAGClient) routerBaseCollections() []string { if v := strings.TrimSpace(os.Getenv("RAG_ROUTER_COLLECTIONS")); v != "" { var out []string for _, p := range strings.Split(v, ",") { if s := strings.TrimSpace(p); s != "" { out = append(out, s) } } if len(out) > 0 { return out } } return []string{ "bp_compliance_gesetze", "bp_compliance_ce", "bp_compliance_datenschutz", "bp_dsfa_corpus", "bp_compliance_recht", "bp_legal_templates", } } const routerPerCollectionTopK = 3 // Retrieve is the Authority Router entry point: callers (the Advisor) pass ONLY a query and stay // collection-agnostic. The router fans out over the broad authority base and ADDS the KB-2026.1 // slice when the query is in scope (inKBScope), then merges all hits, deduplicates, and returns the // top-K by authority score. This moves the former Advisor-side collection fan-out into the retrieval // layer (the "Retriever" tier of the quality pyramid), so the proven KB-2026.1 slice gain reaches // the product path without the Advisor knowing about individual collections. // // The merged set is ordered by the per-collection authority score that rerankByAuthority already // produced inside searchInternal — i.e. binding-vs-guidance ordering is preserved across the merge. // Per-collection failures (e.g. a collection absent on an environment) degrade gracefully. func (c *LegalRAGClient) Retrieve(ctx context.Context, query string, topK int) ([]LegalSearchResult, error) { if topK <= 0 { topK = 8 } collections := c.routerBaseCollections() if c.kbScopeRoutingEnabled && c.kbSliceCollection != "" && inKBScope(query) { collections = append(collections, c.kbSliceCollection) } // Warm the full-text indexes sequentially first so the concurrent fan-out below only READS the // shared textIndexEnsured map (the writes happen here, serialized) — closes the cold-start map // race deterministically. Best-effort: a missing collection just stays un-indexed (hybrid then // falls back to dense, or the per-collection search degrades to nothing). if c.hybridEnabled { for _, coll := range collections { _ = c.ensureTextIndex(ctx, coll) } } out := make([][]LegalSearchResult, len(collections)) var wg sync.WaitGroup for i, coll := range collections { wg.Add(1) go func(i int, coll string) { defer wg.Done() if res, err := c.searchInternal(ctx, coll, query, nil, routerPerCollectionTopK); err == nil { out[i] = res } }(i, coll) } wg.Wait() merged := make([]LegalSearchResult, 0, len(collections)*routerPerCollectionTopK) for _, r := range out { merged = append(merged, r...) } merged = dedupResults(merged) sort.SliceStable(merged, func(a, b int) bool { return merged[a].Score > merged[b].Score }) if len(merged) > topK { merged = merged[:topK] } return merged, nil } // dedupResults removes duplicate passages that can appear when collections overlap, keeping the // highest-scoring occurrence. Identity = regulation_code + article_label + a text prefix. func dedupResults(in []LegalSearchResult) []LegalSearchResult { pos := make(map[string]int, len(in)) out := make([]LegalSearchResult, 0, len(in)) for _, r := range in { text := r.Text if len(text) > 80 { text = text[:80] } key := r.RegulationCode + "|" + r.ArticleLabel + "|" + text if idx, ok := pos[key]; ok { if r.Score > out[idx].Score { out[idx] = r } continue } pos[key] = len(out) out = append(out, r) } return out }