f2d445b891
CI / detect-changes (pull_request) Successful in 13s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 9s
CI / secret-scan (pull_request) Successful in 10s
CI / dep-audit (pull_request) Failing after 56s
CI / sbom-scan (pull_request) Failing after 59s
CI / build-sha-integrity (pull_request) Successful in 5s
CI / validate-canonical-controls (pull_request) Successful in 3s
CI / test-python-document-crawler (pull_request) Successful in 15s
CI / test-python-dsms-gateway (pull_request) Successful in 13s
CI / loc-budget (pull_request) Successful in 23s
CI / go-lint (pull_request) Failing after 51s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m8s
CI / nodejs-build (pull_request) Successful in 3m6s
CI / test-go (pull_request) Successful in 1m3s
CI / iace-gt-coverage (pull_request) Successful in 18s
CI / test-python-backend (pull_request) Successful in 28s
Der einzige offene Retrieval-Haertefall: eine Query mit >=2 genannten Regelwerken
("CRA und Maschinenverordnung") lieferte nur die keyword-dominante Domaene (CRA),
MaschVO fiel raus. Drei zusammenwirkende Ursachen, alle behoben:
1. CodeValues-Mismatch: MaschVO heisst je Collection anders (Slice MASCHVO ·
gesetze MVO · ce MACHINERY/MASCHINENVO), der Catalog hatte nur ["MASCHVO","MaschVO"]
→ Filter fand MaschVO nur in der Slice. Jetzt alle Varianten als CodeValues.
2. Per-Collection-Truncation: der Router gab perColl=3 → searchMultiRegulation holte
3+3=6, schnitt auf 3 → konnte eine Domaene je Collection verlieren. Multi-Reg-Queries
bekommen jetzt perColl = 3*len(regs).
3. Der Score-Merge im Router liess die nicht-dominante Domaene verhungern. Neue balanceByRegulation()
gruppiert den gemergten Pool per Regelwerk (exakter regulation_code-Match) und nimmt
round-robin ueber die genannten Domaenen → jede Domaene mit Treffern ist im Top-K.
Generisch ueber jede genannte Menge; Single-Domain-Pfad unveraendert.
Validierung: Go-Unit (balanceByRegulation: dominante CRA verdraengt MaschVO NICHT mehr);
0070-e2e gegen dev (Retrieve() → [CRA MVO CRA MVO CRA MVO CRA MASCHINENVO] = beide
Domaenen, vorher nur CRA); CB-100-Stichprobe REGR 0 (Gain-Profil unveraendert).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
130 lines
4.4 KiB
Go
130 lines
4.4 KiB
Go
package ucca
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// routerBaseCollections is the broad authority base the Authority Router fans out over. It mirrors
|
|
// the Advisor's historical multi-collection set; the KB-2026.1 slice is added separately when the
|
|
// query is in scope. Override via RAG_ROUTER_COLLECTIONS (comma-separated) per environment.
|
|
func (c *LegalRAGClient) routerBaseCollections() []string {
|
|
if v := strings.TrimSpace(os.Getenv("RAG_ROUTER_COLLECTIONS")); v != "" {
|
|
var out []string
|
|
for _, p := range strings.Split(v, ",") {
|
|
if s := strings.TrimSpace(p); s != "" {
|
|
out = append(out, s)
|
|
}
|
|
}
|
|
if len(out) > 0 {
|
|
return out
|
|
}
|
|
}
|
|
return []string{
|
|
"bp_compliance_gesetze",
|
|
"bp_compliance_ce",
|
|
"bp_compliance_datenschutz",
|
|
"bp_dsfa_corpus",
|
|
"bp_compliance_recht",
|
|
"bp_legal_templates",
|
|
}
|
|
}
|
|
|
|
// routerPerCollectionTopK is the base per-collection result budget that Retrieve passes to
// searchInternal for each collection. For queries in which two or more regulations are detected,
// Retrieve multiplies it by the number of detected regulations.
const routerPerCollectionTopK = 3
|
|
|
|
// Retrieve is the Authority Router entry point: callers (the Advisor) pass ONLY a query and stay
|
|
// collection-agnostic. The router fans out over the broad authority base and ADDS the KB-2026.1
|
|
// slice when the query is in scope (inKBScope), then merges all hits, deduplicates, and returns the
|
|
// top-K by authority score. This moves the former Advisor-side collection fan-out into the retrieval
|
|
// layer (the "Retriever" tier of the quality pyramid), so the proven KB-2026.1 slice gain reaches
|
|
// the product path without the Advisor knowing about individual collections.
|
|
//
|
|
// The merged set is ordered by the per-collection authority score that rerankByAuthority already
|
|
// produced inside searchInternal — i.e. binding-vs-guidance ordering is preserved across the merge.
|
|
// Per-collection failures (e.g. a collection absent on an environment) degrade gracefully.
|
|
func (c *LegalRAGClient) Retrieve(ctx context.Context, query string, topK int) ([]LegalSearchResult, error) {
|
|
if topK <= 0 {
|
|
topK = 8
|
|
}
|
|
|
|
collections := c.routerBaseCollections()
|
|
if c.kbScopeRoutingEnabled && c.kbSliceCollection != "" && inKBScope(query) {
|
|
collections = append(collections, c.kbSliceCollection)
|
|
}
|
|
|
|
// Cross-regulation queries (>=2 explicitly named regulations) get a larger per-collection budget
|
|
// so each collection's multi-regulation search isn't truncated down to the keyword-dominant
|
|
// domain; the final per-regulation balancing then guarantees every named domain in the top-K.
|
|
regs := detectRegulations(query)
|
|
perColl := routerPerCollectionTopK
|
|
if len(regs) >= 2 {
|
|
perColl = routerPerCollectionTopK * len(regs)
|
|
}
|
|
|
|
// Warm the full-text indexes sequentially first so the concurrent fan-out below only READS the
|
|
// shared textIndexEnsured map (the writes happen here, serialized) — closes the cold-start map
|
|
// race deterministically. Best-effort: a missing collection just stays un-indexed (hybrid then
|
|
// falls back to dense, or the per-collection search degrades to nothing).
|
|
if c.hybridEnabled {
|
|
for _, coll := range collections {
|
|
_ = c.ensureTextIndex(ctx, coll)
|
|
}
|
|
}
|
|
|
|
out := make([][]LegalSearchResult, len(collections))
|
|
var wg sync.WaitGroup
|
|
for i, coll := range collections {
|
|
wg.Add(1)
|
|
go func(i int, coll string) {
|
|
defer wg.Done()
|
|
if res, err := c.searchInternal(ctx, coll, query, nil, perColl); err == nil {
|
|
out[i] = res
|
|
}
|
|
}(i, coll)
|
|
}
|
|
wg.Wait()
|
|
|
|
merged := make([]LegalSearchResult, 0, len(collections)*perColl)
|
|
for _, r := range out {
|
|
merged = append(merged, r...)
|
|
}
|
|
merged = dedupResults(merged)
|
|
sort.SliceStable(merged, func(a, b int) bool { return merged[a].Score > merged[b].Score })
|
|
|
|
// Cross-regulation: guarantee every named domain is represented (0070-class fix) instead of
|
|
// letting a global score-sort starve the non-dominant domain.
|
|
if len(regs) >= 2 {
|
|
return balanceByRegulation(merged, regs, topK), nil
|
|
}
|
|
if len(merged) > topK {
|
|
merged = merged[:topK]
|
|
}
|
|
return merged, nil
|
|
}
|
|
|
|
// dedupResults removes duplicate passages that can appear when collections overlap, keeping the
|
|
// highest-scoring occurrence. Identity = regulation_code + article_label + a text prefix.
|
|
func dedupResults(in []LegalSearchResult) []LegalSearchResult {
|
|
pos := make(map[string]int, len(in))
|
|
out := make([]LegalSearchResult, 0, len(in))
|
|
for _, r := range in {
|
|
text := r.Text
|
|
if len(text) > 80 {
|
|
text = text[:80]
|
|
}
|
|
key := r.RegulationCode + "|" + r.ArticleLabel + "|" + text
|
|
if idx, ok := pos[key]; ok {
|
|
if r.Score > out[idx].Score {
|
|
out[idx] = r
|
|
}
|
|
continue
|
|
}
|
|
pos[key] = len(out)
|
|
out = append(out, r)
|
|
}
|
|
return out
|
|
}
|