1d65d99d5f
CI / detect-changes (pull_request) Successful in 14s
CI / branch-name (pull_request) Successful in 2s
CI / guardrail-integrity (pull_request) Successful in 6s
CI / secret-scan (pull_request) Successful in 6s
CI / dep-audit (pull_request) Failing after 54s
CI / sbom-scan (pull_request) Failing after 58s
CI / build-sha-integrity (pull_request) Successful in 5s
CI / validate-canonical-controls (pull_request) Successful in 4s
CI / loc-budget (pull_request) Successful in 20s
CI / go-lint (pull_request) Successful in 43s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m10s
CI / nodejs-build (pull_request) Successful in 3m1s
CI / test-go (pull_request) Successful in 1m4s
CI / iace-gt-coverage (pull_request) Successful in 16s
CI / test-python-backend (pull_request) Successful in 27s
CI / test-python-document-crawler (pull_request) Successful in 12s
CI / test-python-dsms-gateway (pull_request) Successful in 13s
strings.EqualFold(code, cv) statt code==strings.ToUpper(cv) — behebt den einzigen gocritic-Befund auf der neuen Zeile (CI go-lint, new-from-merge-base). Verhalten unveraendert (case-insensitive exakter regulation_code-Match); Unit + 0070-e2e bleiben gruen. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
202 lines
7.3 KiB
Go
202 lines
7.3 KiB
Go
package ucca
|
|
|
|
import (
	"context"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
|
|
|
|
// multiRegMinPerRegulation is the floor for the number of hits requested per named
// regulation. It is applied whenever topK/len(regs) would fall below it, so that every
// named domain still contributes a meaningful number of candidates.
const multiRegMinPerRegulation = 3
|
|
|
|
// regulationCatalog maps a regulation to (a) the aliases that signal it is EXPLICITLY named
// in a query and (b) the regulation_code/regulation_id values used to filter the corpus.
// Deterministic + generic: a query naming >=2 regulations triggers per-regulation retrieval
// so a cross-regulation question returns every named domain — NOT a doc-specific rule.
//
// Aliases must be lowercase: detectRegulations lowercases the query before matching.
// Aliases are matched as whole words/phrases, so a compound spelling that glues an alias
// into a longer token has to be listed as its own alias.
var regulationCatalog = []struct {
	Canonical  string
	Aliases    []string
	CodeValues []string
}{
	{
		Canonical:  "CRA",
		Aliases:    []string{"cra", "cyber resilience"},
		CodeValues: []string{"CRA"},
	},
	// MaschVO is coded differently per collection: slice MASCHVO · gesetze MVO ·
	// ce MACHINERY/MASCHINENVO. All variants are listed as CodeValues; otherwise the
	// per-regulation filter would find MaschVO only in the slice (0070).
	{
		Canonical:  "MaschVO",
		Aliases:    []string{"maschinenverordnung", "maschvo", "machinery regulation"},
		CodeValues: []string{"MASCHVO", "MaschVO", "MVO", "MASCHINENVO", "MACHINERY"},
	},
	{
		Canonical:  "NIS2",
		Aliases:    []string{"nis2", "nis-2", "nis 2"},
		CodeValues: []string{"NIS2"},
	},
	{
		Canonical:  "DORA",
		Aliases:    []string{"dora"},
		CodeValues: []string{"DORA"},
	},
	{
		Canonical:  "Data Act",
		Aliases:    []string{"data act", "datengesetz"},
		CodeValues: []string{"DATA ACT", "DataAct"},
	},
	{
		Canonical:  "AI Act",
		Aliases:    []string{"ai act", "ki-vo", "ki-verordnung", "ai-verordnung"},
		CodeValues: []string{"AI ACT", "AIAct"},
	},
	{
		Canonical:  "DSGVO",
		Aliases:    []string{"dsgvo", "gdpr"},
		CodeValues: []string{"DSGVO"},
	},
	{
		Canonical:  "TDDDG",
		Aliases:    []string{"tdddg"},
		CodeValues: []string{"TDDDG"},
	},
	{
		Canonical:  "BDSG",
		Aliases:    []string{"bdsg"},
		CodeValues: []string{"BDSG"},
	},
}

// detectedRegulation is one catalog entry that was found in a query: its canonical name
// plus the code values used to filter the corpus for it.
type detectedRegulation struct {
	Canonical  string
	CodeValues []string
}

// isAliasWordRune reports whether r counts as part of a word for alias matching.
// Letters and digits do; punctuation, whitespace and symbols are boundaries.
func isAliasWordRune(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}

// containsRegulationAlias reports whether alias occurs in text as a standalone word or
// phrase, i.e. not directly preceded or followed by a letter or digit. Every occurrence is
// inspected, so an embedded hit ("aircraft") does not hide a later standalone one ("CRA").
// An empty alias never matches.
func containsRegulationAlias(text, alias string) bool {
	if alias == "" {
		return false
	}
	for from := 0; from < len(text); {
		rel := strings.Index(text[from:], alias)
		if rel < 0 {
			return false
		}
		start := from + rel
		end := start + len(alias)
		// At the very start/end of text the decoders yield utf8.RuneError, which is
		// neither a letter nor a digit and therefore counts as a boundary.
		prev, _ := utf8.DecodeLastRuneInString(text[:start])
		next, _ := utf8.DecodeRuneInString(text[end:])
		if !isAliasWordRune(prev) && !isAliasWordRune(next) {
			return true
		}
		// Resume one rune past this occurrence's start to find overlapping candidates.
		_, width := utf8.DecodeRuneInString(text[start:])
		from = start + width
	}
	return false
}

// detectRegulations returns the DISTINCT regulations explicitly named in the query, in
// catalog order. >=2 of them is the trigger for multi-regulation retrieval. Pure +
// deterministic, no LLM.
//
// Matching is case-insensitive and word-bounded: a short alias such as "cra" or "dora" must
// not fire inside unrelated words ("aircraft", "Pandora"), which would otherwise trigger
// multi-regulation retrieval for queries that name only one regulation. The result is nil
// when no regulation is named.
func detectRegulations(query string) []detectedRegulation {
	q := strings.ToLower(query)
	var out []detectedRegulation
	for _, entry := range regulationCatalog {
		for _, alias := range entry.Aliases {
			if containsRegulationAlias(q, alias) {
				out = append(out, detectedRegulation{
					Canonical:  entry.Canonical,
					CodeValues: entry.CodeValues,
				})
				// One alias is enough; stop so each regulation appears at most once.
				break
			}
		}
	}
	return out
}
|
|
|
|
// hitID renders a hit's ID in its default fmt representation so it can serve as a string
// key regardless of the ID's underlying type.
func hitID(h qdrantSearchHit) string {
	return fmt.Sprint(h.ID)
}
|
|
|
|
// balanceByRegulation builds the final top-K so EVERY explicitly-named regulation with hits is
|
|
// represented, instead of letting the keyword-dominant domain (e.g. CRA) crowd out the other
|
|
// (e.g. MaschVO) in a cross-regulation query. The input pool must already be score-ordered;
|
|
// results are grouped by exact regulation_code match against each regulation's CodeValues, then
|
|
// taken round-robin across the named domains (highest-scored first within each), with any
|
|
// remaining slots filled by the leftover pool in score order. Generic; no doc-specific logic.
|
|
func balanceByRegulation(pool []LegalSearchResult, regs []detectedRegulation, topK int) []LegalSearchResult {
|
|
if topK <= 0 {
|
|
topK = 8
|
|
}
|
|
byReg := make([][]LegalSearchResult, len(regs))
|
|
matched := make([]bool, len(pool))
|
|
for ri, r := range regs {
|
|
for pi := range pool {
|
|
if matched[pi] {
|
|
continue
|
|
}
|
|
code := strings.TrimSpace(pool[pi].RegulationCode)
|
|
for _, cv := range r.CodeValues {
|
|
if strings.EqualFold(code, cv) {
|
|
byReg[ri] = append(byReg[ri], pool[pi])
|
|
matched[pi] = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
out := make([]LegalSearchResult, 0, topK)
|
|
idx := make([]int, len(regs))
|
|
for len(out) < topK {
|
|
progressed := false
|
|
for ri := range regs {
|
|
if idx[ri] < len(byReg[ri]) {
|
|
out = append(out, byReg[ri][idx[ri]])
|
|
idx[ri]++
|
|
progressed = true
|
|
if len(out) >= topK {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if !progressed {
|
|
break
|
|
}
|
|
}
|
|
for pi := range pool {
|
|
if len(out) >= topK {
|
|
break
|
|
}
|
|
if !matched[pi] {
|
|
out = append(out, pool[pi])
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation
|
|
// filter) and merges, so a cross-regulation query ("Wie greifen CRA und MaschVO ineinander?")
|
|
// returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any
|
|
// named pair (DSGVO+TDDDG, CRA+NIS2, DORA+NIS2, AI Act+DSGVO, ...). The merged pool is
|
|
// authority-reranked once. Pure pool-construction; topK contract preserved.
|
|
func (c *LegalRAGClient) searchMultiRegulation(ctx context.Context, collection, query string, regs []detectedRegulation, topK int) ([]LegalSearchResult, error) {
|
|
embedding, err := c.generateEmbedding(ctx, query)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate embedding: %w", err)
|
|
}
|
|
perReg := topK / len(regs)
|
|
if perReg < multiRegMinPerRegulation {
|
|
perReg = multiRegMinPerRegulation
|
|
}
|
|
var merged []qdrantSearchHit
|
|
seen := make(map[string]bool)
|
|
for _, r := range regs {
|
|
var hits []qdrantSearchHit
|
|
if c.hybridEnabled {
|
|
if h, hErr := c.searchHybrid(ctx, collection, embedding, r.CodeValues, perReg); hErr == nil {
|
|
hits = h
|
|
}
|
|
}
|
|
if hits == nil {
|
|
if h, dErr := c.searchDense(ctx, collection, embedding, r.CodeValues, perReg); dErr == nil {
|
|
hits = h
|
|
}
|
|
}
|
|
for _, h := range hits {
|
|
id := hitID(h)
|
|
if seen[id] {
|
|
continue
|
|
}
|
|
seen[id] = true
|
|
merged = append(merged, h)
|
|
}
|
|
}
|
|
if len(merged) == 0 {
|
|
return nil, fmt.Errorf("multi-regulation search returned no hits")
|
|
}
|
|
results := hitsToResults(merged)
|
|
results = rerankByAuthority(query, results)
|
|
if topK > 0 && len(results) > topK {
|
|
results = results[:topK]
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
// hitsToResults maps raw Qdrant hits to LegalSearchResult, preferring the normalized payload
|
|
// fields (regulation_code/article_label/...) with fallback to the legacy names (regulation_id,
|
|
// section) while the corpus is mid-re-ingestion. Shared by searchInternal + searchMultiRegulation.
|
|
func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult {
|
|
results := make([]LegalSearchResult, len(hits))
|
|
for i, hit := range hits {
|
|
regCode := getString(hit.Payload, "regulation_code")
|
|
if regCode == "" {
|
|
regCode = getString(hit.Payload, "regulation_id")
|
|
}
|
|
article := getString(hit.Payload, "article")
|
|
if article == "" {
|
|
article = getString(hit.Payload, "section")
|
|
}
|
|
results[i] = LegalSearchResult{
|
|
Text: getString(hit.Payload, "chunk_text"),
|
|
RegulationCode: regCode,
|
|
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
|
RegulationShort: getString(hit.Payload, "regulation_short"),
|
|
Category: getString(hit.Payload, "category"),
|
|
ArticleLabel: getString(hit.Payload, "article_label"),
|
|
Article: article,
|
|
Paragraph: getString(hit.Payload, "paragraph"),
|
|
Sub: getString(hit.Payload, "sub"),
|
|
IsRecital: getBool(hit.Payload, "is_recital"),
|
|
CitationStyle: getString(hit.Payload, "citation_style"),
|
|
Pages: getIntSlice(hit.Payload, "pages"),
|
|
SourceURL: getString(hit.Payload, "source"),
|
|
Score: hit.Score,
|
|
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
|
SourceClass: getString(hit.Payload, "source_class"),
|
|
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
|
CitationUnit: getString(hit.Payload, "citation_unit"),
|
|
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
|
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
|
Superseded: getString(hit.Payload, "status") == "superseded",
|
|
}
|
|
}
|
|
return results
|
|
}
|