feat(ucca): Multi-Regulation-Retrieval für Cross-Regulation-Fragen
CI / detect-changes (pull_request) Successful in 10s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 8s
CI / secret-scan (pull_request) Successful in 9s
CI / dep-audit (pull_request) Failing after 56s
CI / sbom-scan (pull_request) Failing after 58s
CI / build-sha-integrity (pull_request) Successful in 9s
CI / validate-canonical-controls (pull_request) Successful in 7s
CI / loc-budget (pull_request) Successful in 24s
CI / go-lint (pull_request) Successful in 54s
CI / python-lint (pull_request) Failing after 16s
CI / nodejs-lint (pull_request) Failing after 1m9s
CI / nodejs-build (pull_request) Successful in 3m6s
CI / test-go (pull_request) Successful in 1m3s
CI / iace-gt-coverage (pull_request) Successful in 19s
CI / test-python-backend (pull_request) Successful in 26s
CI / test-python-document-crawler (pull_request) Successful in 15s
CI / test-python-dsms-gateway (pull_request) Successful in 12s

Nennt eine Query EXPLIZIT >=2 Regelwerke ("Wie greifen CRA und Maschinen-
verordnung ineinander?"), retrievt searchInternal pro Regelwerk separat
(regulation_code/regulation_id-Filter) und merged — damit BEIDE Domänen im
Prompt landen statt nur der keyword-dominanten. Generisch (Query->Regelwerke,
KEINE doc-spezifische Logik), gegated auf >=2 erkannte Regelwerke; sonst
unveränderter Single-Domain-Pfad.

Behebt GQ-0070: vorher CRA x8 / null MaschVO -> Modell halluzinierte
MaschVO=2019/2144 + falsche "CRA ausgenommen"-Konklusion. Nachher CRA + MaschVO
im Prompt -> korrekt "beide gleichzeitig anwendbar" + Art. 20(9)
Konformitätsvermutung, gegroundet.

Validierung (Build-Collection, echtes SearchCollection):
- Unit: detectRegulations-Scoping (>=2 -> multi, 1/0 -> single)
- 5 Cross-Reg-Fälle (0070 + DSGVO+TDDDG/CRA+NIS2/DORA+NIS2/AI Act+DSGVO):
  beide Regelwerke in Top-8
- CB-100 Freeze-Regression: NUR GQ-0070 + GQ-0095 geändert (beide echte
  Cross-Reg, beide verbessert), 98/100 byte-identisch
- 10 Hard Cases: 9 Single-Domain unverändert, 0070 behält CRA Rang 1

Filter erweitert auf regulation_id UND regulation_code (rückwärtskompatibel,
aktiviert die re-ingestierte Build-Collection).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-30 08:17:16 +02:00
parent e5e7b825af
commit 9760dca443
4 changed files with 265 additions and 49 deletions
@@ -0,0 +1,143 @@
package ucca
import (
"context"
"fmt"
"strings"
)
// multiRegMinPerRegulation is the minimum number of hits fetched per named regulation, so
// each domain is fairly represented even when topK/len(regs) would be tiny.
const multiRegMinPerRegulation = 3
// regulationCatalog maps a regulation to (a) the aliases that signal it is EXPLICITLY named
// in a query and (b) the regulation_code/regulation_id values used to filter the corpus.
// Deterministic + generic: a query naming >=2 regulations triggers per-regulation retrieval
// so a cross-regulation question returns every named domain — NOT a doc-specific rule.
var regulationCatalog = []struct {
Canonical string
Aliases []string
CodeValues []string
}{
{"CRA", []string{"cra", "cyber resilience"}, []string{"CRA"}},
{"MaschVO", []string{"maschinenverordnung", "maschvo", "machinery regulation"}, []string{"MASCHVO", "MaschVO"}},
{"NIS2", []string{"nis2", "nis-2", "nis 2"}, []string{"NIS2"}},
{"DORA", []string{"dora"}, []string{"DORA"}},
{"Data Act", []string{"data act", "datengesetz"}, []string{"DATA ACT", "DataAct"}},
{"AI Act", []string{"ai act", "ki-vo", "ki-verordnung", "ai-verordnung"}, []string{"AI ACT", "AIAct"}},
{"DSGVO", []string{"dsgvo", "gdpr"}, []string{"DSGVO"}},
{"TDDDG", []string{"tdddg"}, []string{"TDDDG"}},
{"BDSG", []string{"bdsg"}, []string{"BDSG"}},
}
type detectedRegulation struct {
Canonical string
CodeValues []string
}
// detectRegulations returns the DISTINCT regulations explicitly named in the query. >=2 of
// them is the trigger for multi-regulation retrieval. Pure + deterministic, no LLM.
func detectRegulations(query string) []detectedRegulation {
q := strings.ToLower(query)
var out []detectedRegulation
for _, r := range regulationCatalog {
for _, a := range r.Aliases {
if strings.Contains(q, a) {
out = append(out, detectedRegulation{Canonical: r.Canonical, CodeValues: r.CodeValues})
break
}
}
}
return out
}
func hitID(h qdrantSearchHit) string { return fmt.Sprintf("%v", h.ID) }
// searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation
// filter) and merges, so a cross-regulation query ("Wie greifen CRA und MaschVO ineinander?")
// returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any
// named pair (DSGVO+TDDDG, CRA+NIS2, DORA+NIS2, AI Act+DSGVO, ...). The merged pool is
// authority-reranked once. Pure pool-construction; topK contract preserved.
func (c *LegalRAGClient) searchMultiRegulation(ctx context.Context, collection, query string, regs []detectedRegulation, topK int) ([]LegalSearchResult, error) {
embedding, err := c.generateEmbedding(ctx, query)
if err != nil {
return nil, fmt.Errorf("failed to generate embedding: %w", err)
}
perReg := topK / len(regs)
if perReg < multiRegMinPerRegulation {
perReg = multiRegMinPerRegulation
}
var merged []qdrantSearchHit
seen := make(map[string]bool)
for _, r := range regs {
var hits []qdrantSearchHit
if c.hybridEnabled {
if h, hErr := c.searchHybrid(ctx, collection, embedding, r.CodeValues, perReg); hErr == nil {
hits = h
}
}
if hits == nil {
if h, dErr := c.searchDense(ctx, collection, embedding, r.CodeValues, perReg); dErr == nil {
hits = h
}
}
for _, h := range hits {
id := hitID(h)
if seen[id] {
continue
}
seen[id] = true
merged = append(merged, h)
}
}
if len(merged) == 0 {
return nil, fmt.Errorf("multi-regulation search returned no hits")
}
results := hitsToResults(merged)
results = rerankByAuthority(query, results)
if topK > 0 && len(results) > topK {
results = results[:topK]
}
return results, nil
}
// hitsToResults maps raw Qdrant hits to LegalSearchResult, preferring the normalized payload
// fields (regulation_code/article_label/...) with fallback to the legacy names (regulation_id,
// section) while the corpus is mid-re-ingestion. Shared by searchInternal + searchMultiRegulation.
func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult {
results := make([]LegalSearchResult, len(hits))
for i, hit := range hits {
regCode := getString(hit.Payload, "regulation_code")
if regCode == "" {
regCode = getString(hit.Payload, "regulation_id")
}
article := getString(hit.Payload, "article")
if article == "" {
article = getString(hit.Payload, "section")
}
results[i] = LegalSearchResult{
Text: getString(hit.Payload, "chunk_text"),
RegulationCode: regCode,
RegulationName: getString(hit.Payload, "regulation_name_de"),
RegulationShort: getString(hit.Payload, "regulation_short"),
Category: getString(hit.Payload, "category"),
ArticleLabel: getString(hit.Payload, "article_label"),
Article: article,
Paragraph: getString(hit.Payload, "paragraph"),
Sub: getString(hit.Payload, "sub"),
IsRecital: getBool(hit.Payload, "is_recital"),
CitationStyle: getString(hit.Payload, "citation_style"),
Pages: getIntSlice(hit.Payload, "pages"),
SourceURL: getString(hit.Payload, "source"),
Score: hit.Score,
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
SourceClass: getString(hit.Payload, "source_class"),
Jurisdiction: getString(hit.Payload, "jurisdiction"),
CitationUnit: getString(hit.Payload, "citation_unit"),
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
Superseded: getString(hit.Payload, "status") == "superseded",
}
}
return results
}