feat(ucca): Multi-Regulation-Retrieval (Cross-Regulation-Fragen) #43
@@ -78,6 +78,19 @@ func (c *LegalRAGClient) Search(ctx context.Context, query string, regulationIDs
|
|||||||
// If hybrid search is enabled, it uses the Qdrant Query API with RRF fusion
|
// If hybrid search is enabled, it uses the Qdrant Query API with RRF fusion
|
||||||
// (dense + full-text). Falls back to dense-only /points/search on failure.
|
// (dense + full-text). Falls back to dense-only /points/search on failure.
|
||||||
func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
|
func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string, query string, regulationIDs []string, topK int) ([]LegalSearchResult, error) {
|
||||||
|
// Multi-Regulation-Retrieval: nennt die Query EXPLIZIT >=2 Regelwerke (z.B. "CRA und
|
||||||
|
// Maschinenverordnung"), wird pro Regelwerk separat retrieved + gemergt, damit BEIDE
|
||||||
|
// Domaenen im Prompt landen statt nur der keyword-dominanten. Generisch (Query->Regelwerke,
|
||||||
|
// keine doc-spezifische Logik); nur wenn der Caller nicht ohnehin schon auf Regulierungen
|
||||||
|
// filtert. Best-effort: leeres/fehlerhaftes Multi-Ergebnis faellt auf die Standardsuche zurueck.
|
||||||
|
if len(regulationIDs) == 0 {
|
||||||
|
if regs := detectRegulations(query); len(regs) >= 2 {
|
||||||
|
if mr, mErr := c.searchMultiRegulation(ctx, collection, query, regs, topK); mErr == nil && len(mr) > 0 {
|
||||||
|
return mr, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
embedding, err := c.generateEmbedding(ctx, query)
|
embedding, err := c.generateEmbedding(ctx, query)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to generate embedding: %w", err)
|
return nil, fmt.Errorf("failed to generate embedding: %w", err)
|
||||||
@@ -123,43 +136,7 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
hits = c.expandViaGraph(ctx, collection, hits)
|
hits = c.expandViaGraph(ctx, collection, hits)
|
||||||
}
|
}
|
||||||
|
|
||||||
results := make([]LegalSearchResult, len(hits))
|
results := hitsToResults(hits)
|
||||||
for i, hit := range hits {
|
|
||||||
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
|
||||||
// (article_label/regulation_code/article/...); Fallback auf alte Feldnamen, solange der
|
|
||||||
// Korpus noch nicht re-ingestiert ist (regulation_id, section="§ 38").
|
|
||||||
regCode := getString(hit.Payload, "regulation_code")
|
|
||||||
if regCode == "" {
|
|
||||||
regCode = getString(hit.Payload, "regulation_id")
|
|
||||||
}
|
|
||||||
article := getString(hit.Payload, "article")
|
|
||||||
if article == "" {
|
|
||||||
article = getString(hit.Payload, "section")
|
|
||||||
}
|
|
||||||
results[i] = LegalSearchResult{
|
|
||||||
Text: getString(hit.Payload, "chunk_text"),
|
|
||||||
RegulationCode: regCode,
|
|
||||||
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
|
||||||
RegulationShort: getString(hit.Payload, "regulation_short"),
|
|
||||||
Category: getString(hit.Payload, "category"),
|
|
||||||
ArticleLabel: getString(hit.Payload, "article_label"),
|
|
||||||
Article: article,
|
|
||||||
Paragraph: getString(hit.Payload, "paragraph"),
|
|
||||||
Sub: getString(hit.Payload, "sub"),
|
|
||||||
IsRecital: getBool(hit.Payload, "is_recital"),
|
|
||||||
CitationStyle: getString(hit.Payload, "citation_style"),
|
|
||||||
Pages: getIntSlice(hit.Payload, "pages"),
|
|
||||||
SourceURL: getString(hit.Payload, "source"),
|
|
||||||
Score: hit.Score,
|
|
||||||
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
|
||||||
SourceClass: getString(hit.Payload, "source_class"),
|
|
||||||
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
|
||||||
CitationUnit: getString(hit.Payload, "citation_unit"),
|
|
||||||
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
|
||||||
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
|
||||||
Superseded: getString(hit.Payload, "status") == "superseded",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach
|
// Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach
|
||||||
// oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only,
|
// oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only,
|
||||||
|
|||||||
@@ -122,12 +122,14 @@ func (c *LegalRAGClient) searchHybrid(ctx context.Context, collection string, em
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(regulationIDs) > 0 {
|
if len(regulationIDs) > 0 {
|
||||||
conditions := make([]qdrantCondition, len(regulationIDs))
|
// Match BOTH the legacy field (regulation_id) and the normalized field
|
||||||
for i, regID := range regulationIDs {
|
// (regulation_code) so per-regulation filtering works on the re-ingested corpus too.
|
||||||
conditions[i] = qdrantCondition{
|
conditions := make([]qdrantCondition, 0, len(regulationIDs)*2)
|
||||||
Key: "regulation_id",
|
for _, regID := range regulationIDs {
|
||||||
Match: qdrantMatch{Value: regID},
|
conditions = append(conditions,
|
||||||
}
|
qdrantCondition{Key: "regulation_id", Match: qdrantMatch{Value: regID}},
|
||||||
|
qdrantCondition{Key: "regulation_code", Match: qdrantMatch{Value: regID}},
|
||||||
|
)
|
||||||
}
|
}
|
||||||
queryReq.Filter = &qdrantFilter{Should: conditions}
|
queryReq.Filter = &qdrantFilter{Should: conditions}
|
||||||
}
|
}
|
||||||
@@ -175,12 +177,14 @@ func (c *LegalRAGClient) searchDense(ctx context.Context, collection string, emb
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(regulationIDs) > 0 {
|
if len(regulationIDs) > 0 {
|
||||||
conditions := make([]qdrantCondition, len(regulationIDs))
|
// Match BOTH the legacy field (regulation_id) and the normalized field
|
||||||
for i, regID := range regulationIDs {
|
// (regulation_code) so per-regulation filtering works on the re-ingested corpus too.
|
||||||
conditions[i] = qdrantCondition{
|
conditions := make([]qdrantCondition, 0, len(regulationIDs)*2)
|
||||||
Key: "regulation_id",
|
for _, regID := range regulationIDs {
|
||||||
Match: qdrantMatch{Value: regID},
|
conditions = append(conditions,
|
||||||
}
|
qdrantCondition{Key: "regulation_id", Match: qdrantMatch{Value: regID}},
|
||||||
|
qdrantCondition{Key: "regulation_code", Match: qdrantMatch{Value: regID}},
|
||||||
|
)
|
||||||
}
|
}
|
||||||
searchReq.Filter = &qdrantFilter{Should: conditions}
|
searchReq.Filter = &qdrantFilter{Should: conditions}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,143 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
	"context"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
|
||||||
|
|
||||||
|
// multiRegMinPerRegulation is the lower bound for the number of hits requested per named
|
||||||
|
// regulation: searchMultiRegulation uses it whenever topK/len(regs) would be smaller.
|
||||||
|
const multiRegMinPerRegulation = 3
|
||||||
|
|
||||||
|
// regulationCatalog lists the regulations that can trigger multi-regulation retrieval.
// For every entry it holds
//   - Canonical:  a display name for the regulation,
//   - Aliases:    lower-case spellings that count as an EXPLICIT mention in a query,
//   - CodeValues: the regulation_code/regulation_id values used to filter the corpus.
//
// Aliases are matched as whole tokens (see detectRegulations), so inflected forms that
// should count as a mention must be listed explicitly (e.g. the genitive "datengesetzes").
var regulationCatalog = []struct {
	Canonical  string
	Aliases    []string
	CodeValues []string
}{
	{Canonical: "CRA", Aliases: []string{"cra", "cyber resilience"}, CodeValues: []string{"CRA"}},
	{Canonical: "MaschVO", Aliases: []string{"maschinenverordnung", "maschvo", "machinery regulation"}, CodeValues: []string{"MASCHVO", "MaschVO"}},
	{Canonical: "NIS2", Aliases: []string{"nis2", "nis-2", "nis 2"}, CodeValues: []string{"NIS2"}},
	{Canonical: "DORA", Aliases: []string{"dora"}, CodeValues: []string{"DORA"}},
	{Canonical: "Data Act", Aliases: []string{"data act", "datengesetz", "datengesetzes"}, CodeValues: []string{"DATA ACT", "DataAct"}},
	{Canonical: "AI Act", Aliases: []string{"ai act", "ki-vo", "ki-verordnung", "ai-verordnung"}, CodeValues: []string{"AI ACT", "AIAct"}},
	{Canonical: "DSGVO", Aliases: []string{"dsgvo", "gdpr"}, CodeValues: []string{"DSGVO"}},
	{Canonical: "TDDDG", Aliases: []string{"tdddg"}, CodeValues: []string{"TDDDG"}},
	{Canonical: "BDSG", Aliases: []string{"bdsg"}, CodeValues: []string{"BDSG"}},
}

// detectedRegulation is one catalog entry that was found in a query: its canonical name
// plus the corpus filter values that belong to it.
type detectedRegulation struct {
	Canonical  string
	CodeValues []string
}

// detectRegulations returns the DISTINCT regulations explicitly named in query, in catalog
// order. Two or more results are the trigger for multi-regulation retrieval. The function
// is pure and deterministic (no LLM, no I/O).
//
// Matching is case-insensitive and token-based: an alias only counts when it is not
// directly preceded or followed by a letter or digit. A plain substring test would treat
// "Web-Crawler" as CRA, "Pandora" as DORA or "Ergebnis 2" as NIS2 and thereby switch on
// multi-regulation retrieval for regulations the user never named. Punctuation, hyphens
// and quotes count as boundaries, so "CRA-Bewertung" still matches.
//
// The returned CodeValues slices alias the catalog; callers must not modify them.
func detectRegulations(query string) []detectedRegulation {
	q := strings.ToLower(query)
	var out []detectedRegulation
	for _, r := range regulationCatalog {
		for _, a := range r.Aliases {
			if containsRegulationAlias(q, a) {
				out = append(out, detectedRegulation{Canonical: r.Canonical, CodeValues: r.CodeValues})
				break // one hit per regulation is enough; keeps the result distinct
			}
		}
	}
	return out
}

// containsRegulationAlias reports whether alias occurs in s as a whole token, i.e. with no
// letter or digit immediately before or after the occurrence. An empty alias never matches.
func containsRegulationAlias(s, alias string) bool {
	if alias == "" {
		return false
	}
	for from := 0; from <= len(s)-len(alias); {
		i := strings.Index(s[from:], alias)
		if i < 0 {
			return false
		}
		start := from + i
		end := start + len(alias)
		// At the string edges the decoders return utf8.RuneError, which is neither a
		// letter nor a digit and therefore counts as a boundary.
		before, _ := utf8.DecodeLastRuneInString(s[:start])
		after, _ := utf8.DecodeRuneInString(s[end:])
		if !isRegulationAliasWordRune(before) && !isRegulationAliasWordRune(after) {
			return true
		}
		// Embedded occurrence: continue searching one rune further to the right.
		_, size := utf8.DecodeRuneInString(s[start:])
		from = start + size
	}
	return false
}

// isRegulationAliasWordRune reports whether r can be part of a word for alias matching.
func isRegulationAliasWordRune(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}
|
||||||
|
|
||||||
|
// hitID renders the ID of a Qdrant hit in its default fmt representation. The resulting
// string serves as the de-duplication key when hits of several searches are merged.
func hitID(h qdrantSearchHit) string {
	return fmt.Sprint(h.ID)
}
|
||||||
|
|
||||||
|
// searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation
|
||||||
|
// filter) and merges, so a cross-regulation query ("Wie greifen CRA und MaschVO ineinander?")
|
||||||
|
// returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any
|
||||||
|
// named pair (DSGVO+TDDDG, CRA+NIS2, DORA+NIS2, AI Act+DSGVO, ...). The merged pool is
|
||||||
|
// authority-reranked once. Pure pool-construction; topK contract preserved.
|
||||||
|
func (c *LegalRAGClient) searchMultiRegulation(ctx context.Context, collection, query string, regs []detectedRegulation, topK int) ([]LegalSearchResult, error) {
|
||||||
|
embedding, err := c.generateEmbedding(ctx, query)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to generate embedding: %w", err)
|
||||||
|
}
|
||||||
|
perReg := topK / len(regs)
|
||||||
|
if perReg < multiRegMinPerRegulation {
|
||||||
|
perReg = multiRegMinPerRegulation
|
||||||
|
}
|
||||||
|
var merged []qdrantSearchHit
|
||||||
|
seen := make(map[string]bool)
|
||||||
|
for _, r := range regs {
|
||||||
|
var hits []qdrantSearchHit
|
||||||
|
if c.hybridEnabled {
|
||||||
|
if h, hErr := c.searchHybrid(ctx, collection, embedding, r.CodeValues, perReg); hErr == nil {
|
||||||
|
hits = h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if hits == nil {
|
||||||
|
if h, dErr := c.searchDense(ctx, collection, embedding, r.CodeValues, perReg); dErr == nil {
|
||||||
|
hits = h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, h := range hits {
|
||||||
|
id := hitID(h)
|
||||||
|
if seen[id] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[id] = true
|
||||||
|
merged = append(merged, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(merged) == 0 {
|
||||||
|
return nil, fmt.Errorf("multi-regulation search returned no hits")
|
||||||
|
}
|
||||||
|
results := hitsToResults(merged)
|
||||||
|
results = rerankByAuthority(query, results)
|
||||||
|
if topK > 0 && len(results) > topK {
|
||||||
|
results = results[:topK]
|
||||||
|
}
|
||||||
|
return results, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// hitsToResults maps raw Qdrant hits to LegalSearchResult, preferring the normalized payload
|
||||||
|
// fields (regulation_code/article_label/...) with fallback to the legacy names (regulation_id,
|
||||||
|
// section) while the corpus is mid-re-ingestion. Shared by searchInternal + searchMultiRegulation.
|
||||||
|
func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult {
|
||||||
|
results := make([]LegalSearchResult, len(hits))
|
||||||
|
for i, hit := range hits {
|
||||||
|
regCode := getString(hit.Payload, "regulation_code")
|
||||||
|
if regCode == "" {
|
||||||
|
regCode = getString(hit.Payload, "regulation_id")
|
||||||
|
}
|
||||||
|
article := getString(hit.Payload, "article")
|
||||||
|
if article == "" {
|
||||||
|
article = getString(hit.Payload, "section")
|
||||||
|
}
|
||||||
|
results[i] = LegalSearchResult{
|
||||||
|
Text: getString(hit.Payload, "chunk_text"),
|
||||||
|
RegulationCode: regCode,
|
||||||
|
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
||||||
|
RegulationShort: getString(hit.Payload, "regulation_short"),
|
||||||
|
Category: getString(hit.Payload, "category"),
|
||||||
|
ArticleLabel: getString(hit.Payload, "article_label"),
|
||||||
|
Article: article,
|
||||||
|
Paragraph: getString(hit.Payload, "paragraph"),
|
||||||
|
Sub: getString(hit.Payload, "sub"),
|
||||||
|
IsRecital: getBool(hit.Payload, "is_recital"),
|
||||||
|
CitationStyle: getString(hit.Payload, "citation_style"),
|
||||||
|
Pages: getIntSlice(hit.Payload, "pages"),
|
||||||
|
SourceURL: getString(hit.Payload, "source"),
|
||||||
|
Score: hit.Score,
|
||||||
|
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
||||||
|
SourceClass: getString(hit.Payload, "source_class"),
|
||||||
|
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
||||||
|
CitationUnit: getString(hit.Payload, "citation_unit"),
|
||||||
|
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
||||||
|
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
||||||
|
Superseded: getString(hit.Payload, "status") == "superseded",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results
|
||||||
|
}
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestDetectRegulations is a pure unit test of the multi-regulation TRIGGER (no Qdrant):
|
||||||
|
// only an explicit naming of >=2 regulations enables multi-regulation retrieval. A single
|
||||||
|
// named regulation, or a topical question that doesn't name one, stays single-domain.
|
||||||
|
func TestDetectRegulations(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
q string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{"Welche neun Kriterien nennt WP248 fuer ein voraussichtlich hohes Risiko?", 0},
|
||||||
|
{"Welche Anforderungen gelten fuer wesentliche Veraenderungen einer Maschine?", 0}, // "Maschine" != MaschVO
|
||||||
|
{"Benoetigt eine SPS ohne Netzwerkanschluss eine CRA-Bewertung?", 1}, // 1 -> single
|
||||||
|
{"Wie greifen CRA und Maschinenverordnung bei einer vernetzten Maschine ineinander?", 2},
|
||||||
|
{"Wie greifen DSGVO und TDDDG bei der Nutzung von Cookies ineinander?", 2},
|
||||||
|
{"Wie verhalten sich DORA und NIS2 fuer ein Finanzunternehmen?", 2},
|
||||||
|
{"Wie greifen AI Act und DSGVO bei einem KI-System ineinander?", 2},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
if got := len(detectRegulations(c.q)); got != c.want {
|
||||||
|
t.Errorf("detectRegulations(%q) = %d, want %d", c.q, got, c.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestMultiRegE2E (RUN_E2E=1) verifies against the build collection that an explicit
|
||||||
|
// cross-regulation query returns BOTH named domains in the top-K — the core acceptance
|
||||||
|
// gate for multi-regulation retrieval.
|
||||||
|
func TestMultiRegE2E(t *testing.T) {
|
||||||
|
if os.Getenv("RUN_E2E") != "1" {
|
||||||
|
t.Skip("set RUN_E2E=1 + QDRANT_URL/OLLAMA_URL")
|
||||||
|
}
|
||||||
|
c := NewLegalRAGClient()
|
||||||
|
coll := os.Getenv("E2E_COLLECTION")
|
||||||
|
if coll == "" {
|
||||||
|
coll = "bp_compliance_kb_2026_1_build"
|
||||||
|
}
|
||||||
|
cases := []struct {
|
||||||
|
id string
|
||||||
|
q string
|
||||||
|
want []string
|
||||||
|
}{
|
||||||
|
{"GQ-0070 CRA+MaschVO", "Wie greifen CRA und Maschinenverordnung bei einer vernetzten Maschine ineinander?", []string{"CRA", "MASCH"}},
|
||||||
|
{"DSGVO+TDDDG", "Wie greifen DSGVO und TDDDG bei der Nutzung von Cookies und Tracking-Technologien ineinander?", []string{"DSGVO", "TDDDG"}},
|
||||||
|
{"CRA+NIS2", "Wie verhalten sich CRA und NIS2 bei einem vernetzten Produkt eines wichtigen Unternehmens zueinander?", []string{"CRA", "NIS2"}},
|
||||||
|
{"DORA+NIS2", "Wie greifen DORA und NIS2 bei einem Finanzunternehmen ineinander?", []string{"DORA", "NIS2"}},
|
||||||
|
{"AI Act+DSGVO", "Wie greifen AI Act und DSGVO bei einem KI-System ineinander, das personenbezogene Daten verarbeitet?", []string{"AI ACT", "DSGVO"}},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
res, err := c.SearchCollection(context.Background(), coll, tc.q, nil, 8)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("%s: %v", tc.id, err)
|
||||||
|
}
|
||||||
|
present := map[string]bool{}
|
||||||
|
for _, r := range res {
|
||||||
|
present[strings.ToUpper(r.RegulationCode)] = true
|
||||||
|
}
|
||||||
|
ok := true
|
||||||
|
for _, w := range tc.want {
|
||||||
|
found := false
|
||||||
|
for cd := range present {
|
||||||
|
if strings.Contains(cd, w) {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
ok = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
codes := make([]string, 0, len(present))
|
||||||
|
for cd := range present {
|
||||||
|
codes = append(codes, cd)
|
||||||
|
}
|
||||||
|
status := "OK"
|
||||||
|
if !ok {
|
||||||
|
status = "FAIL"
|
||||||
|
}
|
||||||
|
fmt.Printf("%-22s want=%v present=%v %s\n", tc.id, tc.want, codes, status)
|
||||||
|
if !ok {
|
||||||
|
t.Errorf("%s: not all named regulations in top-8 (want %v, got %v)", tc.id, tc.want, codes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user