fix(ucca): Cross-Reg 0070 — beide Regelwerk-Domaenen im Router-Top-K (Known Defects 0)
CI / detect-changes (pull_request) Successful in 13s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 9s
CI / secret-scan (pull_request) Successful in 10s
CI / dep-audit (pull_request) Failing after 56s
CI / sbom-scan (pull_request) Failing after 59s
CI / build-sha-integrity (pull_request) Successful in 5s
CI / validate-canonical-controls (pull_request) Successful in 3s
CI / test-python-document-crawler (pull_request) Successful in 15s
CI / test-python-dsms-gateway (pull_request) Successful in 13s
CI / loc-budget (pull_request) Successful in 23s
CI / go-lint (pull_request) Failing after 51s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m8s
CI / nodejs-build (pull_request) Successful in 3m6s
CI / test-go (pull_request) Successful in 1m3s
CI / iace-gt-coverage (pull_request) Successful in 18s
CI / test-python-backend (pull_request) Successful in 28s
CI / detect-changes (pull_request) Successful in 13s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 9s
CI / secret-scan (pull_request) Successful in 10s
CI / dep-audit (pull_request) Failing after 56s
CI / sbom-scan (pull_request) Failing after 59s
CI / build-sha-integrity (pull_request) Successful in 5s
CI / validate-canonical-controls (pull_request) Successful in 3s
CI / test-python-document-crawler (pull_request) Successful in 15s
CI / test-python-dsms-gateway (pull_request) Successful in 13s
CI / loc-budget (pull_request) Successful in 23s
CI / go-lint (pull_request) Failing after 51s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m8s
CI / nodejs-build (pull_request) Successful in 3m6s
CI / test-go (pull_request) Successful in 1m3s
CI / iace-gt-coverage (pull_request) Successful in 18s
CI / test-python-backend (pull_request) Successful in 28s
Der einzige offene Retrieval-Haertefall: eine Query mit >=2 genannten Regelwerken
("CRA und Maschinenverordnung") lieferte nur die keyword-dominante Domaene (CRA),
MaschVO fiel raus. Drei zusammenwirkende Ursachen, alle behoben:
1. CodeValues-Mismatch: MaschVO heisst je Collection anders (Slice MASCHVO ·
gesetze MVO · ce MACHINERY/MASCHINENVO), der Catalog hatte nur ["MASCHVO","MaschVO"]
→ Filter fand MaschVO nur in der Slice. Jetzt alle Varianten als CodeValues.
2. Per-Collection-Truncation: der Router gab perColl=3 → searchMultiRegulation holte
3+3=6, schnitt auf 3 → konnte eine Domaene je Collection verlieren. Multi-Reg-Queries
bekommen jetzt perColl = 3*len(regs).
3. Der Router-Score-Merge liess die nicht-dominante Domaene verhungern. Neue balanceByRegulation()
gruppiert den gemergten Pool per Regelwerk (exakter regulation_code-Match) und nimmt
round-robin ueber die genannten Domaenen → jede Domaene mit Treffern ist im Top-K.
Generisch ueber jede genannte Menge; Single-Domain-Pfad unveraendert.
Validierung: Go-Unit (balanceByRegulation: dominante CRA verdraengt MaschVO NICHT mehr);
0070-e2e gegen dev (Retrieve() → [CRA MVO CRA MVO CRA MVO CRA MASCHINENVO] = beide
Domaenen, vorher nur CRA); CB-100-Stichprobe REGR 0 (Gain-Profil unveraendert).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,9 @@ var regulationCatalog = []struct {
|
||||
CodeValues []string
|
||||
}{
|
||||
{"CRA", []string{"cra", "cyber resilience"}, []string{"CRA"}},
|
||||
{"MaschVO", []string{"maschinenverordnung", "maschvo", "machinery regulation"}, []string{"MASCHVO", "MaschVO"}},
|
||||
// MaschVO heisst je Collection anders: Slice MASCHVO · gesetze MVO · ce MACHINERY/MASCHINENVO.
|
||||
// Alle Varianten als CodeValues, sonst findet der per-Reg-Filter MaschVO nur in der Slice (0070).
|
||||
{"MaschVO", []string{"maschinenverordnung", "maschvo", "machinery regulation"}, []string{"MASCHVO", "MaschVO", "MVO", "MASCHINENVO", "MACHINERY"}},
|
||||
{"NIS2", []string{"nis2", "nis-2", "nis 2"}, []string{"NIS2"}},
|
||||
{"DORA", []string{"dora"}, []string{"DORA"}},
|
||||
{"Data Act", []string{"data act", "datengesetz"}, []string{"DATA ACT", "DataAct"}},
|
||||
@@ -53,6 +55,62 @@ func detectRegulations(query string) []detectedRegulation {
|
||||
|
||||
// hitID renders a search hit's ID in fmt's default format so it can be used as a string key.
func hitID(h qdrantSearchHit) string {
	return fmt.Sprint(h.ID)
}
|
||||
|
||||
// balanceByRegulation builds the final top-K so EVERY explicitly-named regulation with hits is
|
||||
// represented, instead of letting the keyword-dominant domain (e.g. CRA) crowd out the other
|
||||
// (e.g. MaschVO) in a cross-regulation query. The input pool must already be score-ordered;
|
||||
// results are grouped by exact regulation_code match against each regulation's CodeValues, then
|
||||
// taken round-robin across the named domains (highest-scored first within each), with any
|
||||
// remaining slots filled by the leftover pool in score order. Generic; no doc-specific logic.
|
||||
func balanceByRegulation(pool []LegalSearchResult, regs []detectedRegulation, topK int) []LegalSearchResult {
|
||||
if topK <= 0 {
|
||||
topK = 8
|
||||
}
|
||||
byReg := make([][]LegalSearchResult, len(regs))
|
||||
matched := make([]bool, len(pool))
|
||||
for ri, r := range regs {
|
||||
for pi := range pool {
|
||||
if matched[pi] {
|
||||
continue
|
||||
}
|
||||
code := strings.ToUpper(strings.TrimSpace(pool[pi].RegulationCode))
|
||||
for _, cv := range r.CodeValues {
|
||||
if code == strings.ToUpper(cv) {
|
||||
byReg[ri] = append(byReg[ri], pool[pi])
|
||||
matched[pi] = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
out := make([]LegalSearchResult, 0, topK)
|
||||
idx := make([]int, len(regs))
|
||||
for len(out) < topK {
|
||||
progressed := false
|
||||
for ri := range regs {
|
||||
if idx[ri] < len(byReg[ri]) {
|
||||
out = append(out, byReg[ri][idx[ri]])
|
||||
idx[ri]++
|
||||
progressed = true
|
||||
if len(out) >= topK {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !progressed {
|
||||
break
|
||||
}
|
||||
}
|
||||
for pi := range pool {
|
||||
if len(out) >= topK {
|
||||
break
|
||||
}
|
||||
if !matched[pi] {
|
||||
out = append(out, pool[pi])
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation
|
||||
// filter) and merges, so a cross-regulation query ("Wie greifen CRA und MaschVO ineinander?")
|
||||
// returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any
|
||||
|
||||
Reference in New Issue
Block a user