package ucca

import (
	"context"
	"fmt"
	"strings"
)

// multiRegMinPerRegulation is the minimum number of hits fetched per named regulation, so
// each domain is fairly represented even when topK/len(regs) would be tiny.
const multiRegMinPerRegulation = 3

// regulationCatalog maps a regulation to (a) the aliases that signal it is EXPLICITLY named
// in a query and (b) the regulation_code/regulation_id values used to filter the corpus.
// Deterministic + generic: a query naming >=2 regulations triggers per-regulation retrieval
// so a cross-regulation question returns every named domain — NOT a doc-specific rule.
var regulationCatalog = []struct {
	// Canonical is the name copied into detectedRegulation.Canonical.
	Canonical string
	// Aliases are the query spellings that name the regulation. They are compared against
	// the lowercased query, so they are written in lowercase.
	Aliases []string
	// CodeValues are the corpus code spellings of the regulation. They are passed as the
	// filter to the per-regulation searches and compared case-insensitively with a
	// result's RegulationCode when balancing.
	CodeValues []string
}{
	{"CRA", []string{"cra", "cyber resilience"}, []string{"CRA"}},
	// MaschVO is named differently in each collection: slice MASCHVO · gesetze MVO · ce MACHINERY/MASCHINENVO.
	// All variants are listed as CodeValues; otherwise the per-regulation filter would find MaschVO only in the slice (0070).
	{"MaschVO", []string{"maschinenverordnung", "maschvo", "machinery regulation"}, []string{"MASCHVO", "MaschVO", "MVO", "MASCHINENVO", "MACHINERY"}},
	{"NIS2", []string{"nis2", "nis-2", "nis 2"}, []string{"NIS2"}},
	{"DORA", []string{"dora"}, []string{"DORA"}},
	{"Data Act", []string{"data act", "datengesetz"}, []string{"DATA ACT", "DataAct"}},
	{"AI Act", []string{"ai act", "ki-vo", "ki-verordnung", "ai-verordnung"}, []string{"AI ACT", "AIAct"}},
	{"DSGVO", []string{"dsgvo", "gdpr"}, []string{"DSGVO"}},
	{"TDDDG", []string{"tdddg"}, []string{"TDDDG"}},
	{"BDSG", []string{"bdsg"}, []string{"BDSG"}},
}

// detectedRegulation is one catalog entry that was recognised in a query: its canonical
// name plus the code values used to filter and group results for it.
type detectedRegulation struct {
	Canonical  string
	CodeValues []string
}

// detectRegulations returns the DISTINCT regulations explicitly named in the query. >=2 of
// them is the trigger for multi-regulation retrieval. Pure + deterministic, no LLM.
func detectRegulations(query string) []detectedRegulation { q := strings.ToLower(query) var out []detectedRegulation for _, r := range regulationCatalog { for _, a := range r.Aliases { if strings.Contains(q, a) { out = append(out, detectedRegulation{Canonical: r.Canonical, CodeValues: r.CodeValues}) break } } } return out } func hitID(h qdrantSearchHit) string { return fmt.Sprintf("%v", h.ID) } // balanceByRegulation builds the final top-K so EVERY explicitly-named regulation with hits is // represented, instead of letting the keyword-dominant domain (e.g. CRA) crowd out the other // (e.g. MaschVO) in a cross-regulation query. The input pool must already be score-ordered; // results are grouped by exact regulation_code match against each regulation's CodeValues, then // taken round-robin across the named domains (highest-scored first within each), with any // remaining slots filled by the leftover pool in score order. Generic; no doc-specific logic. func balanceByRegulation(pool []LegalSearchResult, regs []detectedRegulation, topK int) []LegalSearchResult { if topK <= 0 { topK = 8 } byReg := make([][]LegalSearchResult, len(regs)) matched := make([]bool, len(pool)) for ri, r := range regs { for pi := range pool { if matched[pi] { continue } code := strings.TrimSpace(pool[pi].RegulationCode) for _, cv := range r.CodeValues { if strings.EqualFold(code, cv) { byReg[ri] = append(byReg[ri], pool[pi]) matched[pi] = true break } } } } out := make([]LegalSearchResult, 0, topK) idx := make([]int, len(regs)) for len(out) < topK { progressed := false for ri := range regs { if idx[ri] < len(byReg[ri]) { out = append(out, byReg[ri][idx[ri]]) idx[ri]++ progressed = true if len(out) >= topK { break } } } if !progressed { break } } for pi := range pool { if len(out) >= topK { break } if !matched[pi] { out = append(out, pool[pi]) } } return out } // searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation // filter) and merges, so a 
cross-regulation query ("Wie greifen CRA und MaschVO ineinander?") // returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any // named pair (DSGVO+TDDDG, CRA+NIS2, DORA+NIS2, AI Act+DSGVO, ...). The merged pool is // authority-reranked once. Pure pool-construction; topK contract preserved. func (c *LegalRAGClient) searchMultiRegulation(ctx context.Context, collection, query string, regs []detectedRegulation, topK int) ([]LegalSearchResult, error) { embedding, err := c.generateEmbedding(ctx, query) if err != nil { return nil, fmt.Errorf("failed to generate embedding: %w", err) } perReg := topK / len(regs) if perReg < multiRegMinPerRegulation { perReg = multiRegMinPerRegulation } var merged []qdrantSearchHit seen := make(map[string]bool) for _, r := range regs { var hits []qdrantSearchHit if c.hybridEnabled { if h, hErr := c.searchHybrid(ctx, collection, embedding, r.CodeValues, perReg); hErr == nil { hits = h } } if hits == nil { if h, dErr := c.searchDense(ctx, collection, embedding, r.CodeValues, perReg); dErr == nil { hits = h } } for _, h := range hits { id := hitID(h) if seen[id] { continue } seen[id] = true merged = append(merged, h) } } if len(merged) == 0 { return nil, fmt.Errorf("multi-regulation search returned no hits") } results := hitsToResults(merged) results = rerankByAuthority(query, results) if topK > 0 && len(results) > topK { results = results[:topK] } return results, nil } // hitsToResults maps raw Qdrant hits to LegalSearchResult, preferring the normalized payload // fields (regulation_code/article_label/...) with fallback to the legacy names (regulation_id, // section) while the corpus is mid-re-ingestion. Shared by searchInternal + searchMultiRegulation. 
func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult { results := make([]LegalSearchResult, len(hits)) for i, hit := range hits { regCode := getString(hit.Payload, "regulation_code") if regCode == "" { regCode = getString(hit.Payload, "regulation_id") } article := getString(hit.Payload, "article") if article == "" { article = getString(hit.Payload, "section") } results[i] = LegalSearchResult{ Text: getString(hit.Payload, "chunk_text"), RegulationCode: regCode, RegulationName: getString(hit.Payload, "regulation_name_de"), RegulationShort: getString(hit.Payload, "regulation_short"), Category: getString(hit.Payload, "category"), ArticleLabel: getString(hit.Payload, "article_label"), Article: article, Paragraph: getString(hit.Payload, "paragraph"), Sub: getString(hit.Payload, "sub"), IsRecital: getBool(hit.Payload, "is_recital"), CitationStyle: getString(hit.Payload, "citation_style"), Pages: getIntSlice(hit.Payload, "pages"), SourceURL: getString(hit.Payload, "source"), Score: hit.Score, AuthorityWeight: getInt(hit.Payload, "authority_weight"), SourceClass: getString(hit.Payload, "source_class"), Jurisdiction: getString(hit.Payload, "jurisdiction"), CitationUnit: getString(hit.Payload, "citation_unit"), ReferencesOut: getStringSlice(hit.Payload, "references_out"), ReferencesIn: getStringSlice(hit.Payload, "references_in"), Superseded: getString(hit.Payload, "status") == "superseded", } } return results }