package ucca import ( "context" "fmt" "strings" ) // multiRegMinPerRegulation is the minimum number of hits fetched per named regulation, so // each domain is fairly represented even when topK/len(regs) would be tiny. const multiRegMinPerRegulation = 3 // regulationCatalog maps a regulation to (a) the aliases that signal it is EXPLICITLY named // in a query and (b) the regulation_code/regulation_id values used to filter the corpus. // Deterministic + generic: a query naming >=2 regulations triggers per-regulation retrieval // so a cross-regulation question returns every named domain — NOT a doc-specific rule. var regulationCatalog = []struct { Canonical string Aliases []string CodeValues []string }{ {"CRA", []string{"cra", "cyber resilience"}, []string{"CRA"}}, {"MaschVO", []string{"maschinenverordnung", "maschvo", "machinery regulation"}, []string{"MASCHVO", "MaschVO"}}, {"NIS2", []string{"nis2", "nis-2", "nis 2"}, []string{"NIS2"}}, {"DORA", []string{"dora"}, []string{"DORA"}}, {"Data Act", []string{"data act", "datengesetz"}, []string{"DATA ACT", "DataAct"}}, {"AI Act", []string{"ai act", "ki-vo", "ki-verordnung", "ai-verordnung"}, []string{"AI ACT", "AIAct"}}, {"DSGVO", []string{"dsgvo", "gdpr"}, []string{"DSGVO"}}, {"TDDDG", []string{"tdddg"}, []string{"TDDDG"}}, {"BDSG", []string{"bdsg"}, []string{"BDSG"}}, } type detectedRegulation struct { Canonical string CodeValues []string } // detectRegulations returns the DISTINCT regulations explicitly named in the query. >=2 of // them is the trigger for multi-regulation retrieval. Pure + deterministic, no LLM. func detectRegulations(query string) []detectedRegulation { q := strings.ToLower(query) var out []detectedRegulation for _, r := range regulationCatalog { for _, a := range r.Aliases { if strings.Contains(q, a) { out = append(out, detectedRegulation{Canonical: r.Canonical, CodeValues: r.CodeValues}) break } } } return out } func hitID(h qdrantSearchHit) string { return fmt.Sprintf("%v", h.ID) } // searchMultiRegulation retrieves each explicitly-named regulation SEPARATELY (per-regulation // filter) and merges, so a cross-regulation query ("Wie greifen CRA und MaschVO ineinander?") // returns BOTH domains in the prompt instead of only the keyword-dominant one. Generic over any // named pair (DSGVO+TDDDG, CRA+NIS2, DORA+NIS2, AI Act+DSGVO, ...). The merged pool is // authority-reranked once. Pure pool-construction; topK contract preserved. func (c *LegalRAGClient) searchMultiRegulation(ctx context.Context, collection, query string, regs []detectedRegulation, topK int) ([]LegalSearchResult, error) { embedding, err := c.generateEmbedding(ctx, query) if err != nil { return nil, fmt.Errorf("failed to generate embedding: %w", err) } perReg := topK / len(regs) if perReg < multiRegMinPerRegulation { perReg = multiRegMinPerRegulation } var merged []qdrantSearchHit seen := make(map[string]bool) for _, r := range regs { var hits []qdrantSearchHit if c.hybridEnabled { if h, hErr := c.searchHybrid(ctx, collection, embedding, r.CodeValues, perReg); hErr == nil { hits = h } } if hits == nil { if h, dErr := c.searchDense(ctx, collection, embedding, r.CodeValues, perReg); dErr == nil { hits = h } } for _, h := range hits { id := hitID(h) if seen[id] { continue } seen[id] = true merged = append(merged, h) } } if len(merged) == 0 { return nil, fmt.Errorf("multi-regulation search returned no hits") } results := hitsToResults(merged) results = rerankByAuthority(query, results) if topK > 0 && len(results) > topK { results = results[:topK] } return results, nil } // hitsToResults maps raw Qdrant hits to LegalSearchResult, preferring the normalized payload // fields (regulation_code/article_label/...) with fallback to the legacy names (regulation_id, // section) while the corpus is mid-re-ingestion. Shared by searchInternal + searchMultiRegulation. func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult { results := make([]LegalSearchResult, len(hits)) for i, hit := range hits { regCode := getString(hit.Payload, "regulation_code") if regCode == "" { regCode = getString(hit.Payload, "regulation_id") } article := getString(hit.Payload, "article") if article == "" { article = getString(hit.Payload, "section") } results[i] = LegalSearchResult{ Text: getString(hit.Payload, "chunk_text"), RegulationCode: regCode, RegulationName: getString(hit.Payload, "regulation_name_de"), RegulationShort: getString(hit.Payload, "regulation_short"), Category: getString(hit.Payload, "category"), ArticleLabel: getString(hit.Payload, "article_label"), Article: article, Paragraph: getString(hit.Payload, "paragraph"), Sub: getString(hit.Payload, "sub"), IsRecital: getBool(hit.Payload, "is_recital"), CitationStyle: getString(hit.Payload, "citation_style"), Pages: getIntSlice(hit.Payload, "pages"), SourceURL: getString(hit.Payload, "source"), Score: hit.Score, AuthorityWeight: getInt(hit.Payload, "authority_weight"), SourceClass: getString(hit.Payload, "source_class"), Jurisdiction: getString(hit.Payload, "jurisdiction"), CitationUnit: getString(hit.Payload, "citation_unit"), ReferencesOut: getStringSlice(hit.Payload, "references_out"), ReferencesIn: getStringSlice(hit.Payload, "references_in"), Superseded: getString(hit.Payload, "status") == "superseded", } } return results }