package ucca

import (
	"sort"
	"strings"
)

// Clarity is the READ-ONLY, INSTRUMENTED clarity-gate signal emitted alongside a
// retrieve response. It does NOT change retrieval or advisor behaviour yet — the
// advisor still answers normally. Once ~30-50 real questions are collected the
// thresholds get finalised and the gate is activated in the advisor flow.
//
// Ambiguity has two independent sources (empirically measured, 12-question set):
//   - retrieval scatter: hits spread across many knowledge spaces (low
//     concentration / high domain_count) — the retriever itself can't localise.
//   - conceptual generality: a general term the corpus OVER-localises (e.g. "PDCA"
//     concentrates on datenschutz but is cross-domain) — only an LLM knows this.
//
// The middle band is where the LLM-intent classifier must decide.
//
// G1 (explicit scope): when the query NAMES a regulation ("... nach TRGS", "CRA
// ...", "MaschinenVO ..."), that explicit context beats the embedding scatter —
// the gate scopes to the named regulation's knowledge space regardless of
// concentration. This is regulation detection, NOT a broad-term list.
type Clarity struct {
	// Mode is the gate decision: "answer" or "clarify".
	Mode string `json:"mode"`

	// Reason names the rule that produced Mode. One of: low_concentration,
	// many_domains, high_confidence_scope, middle_band_llm_needed,
	// explicit_scope, no_domain_signal.
	Reason string `json:"reason"`

	// Concentration is the fraction of tagged hits that fall into the knowledge
	// space with the most hits. It is computed from the retrieval results only,
	// so it keeps describing the retrieval-dominant space even when an explicit
	// scope overrides DominantContext. It stays 0 when no hit could be tagged.
	Concentration float64 `json:"concentration"`

	// DomainCount is the number of distinct knowledge spaces among the tagged hits.
	DomainCount int `json:"domain_count"`

	// DominantContext is a knowledge-space id. An explicit scope wins if the
	// query names a regulation; otherwise it is the space with the most hits.
	DominantContext string `json:"dominant_context"`

	// CandidateContexts are the corpus-grounded chips, i.e. only spaces actually
	// present in the hits, ordered by hit count (descending) and then by id.
	// It is an empty, non-nil slice when no hit could be tagged.
	CandidateContexts []ClarityContext `json:"candidate_contexts"`
}

// ClarityContext is one corpus-grounded context chip.
type ClarityContext struct {
	// ID is the knowledge-space id.
	ID string `json:"id"`
	// Label is the display label looked up for ID.
	Label string `json:"label"`
	// Hits is the number of tagged results that fell into this space.
	Hits int `json:"hits"`
}

// Tiered thresholds — INSTRUMENTED DEFAULTS, calibrate on 30-50 real questions.
const (
	clarityMaxConcentration = 0.45 // <= this => clarify (retrieval scatter)
	clarityMinDomains       = 4    // >= this => clarify (broad spread)
	clarityAnswerConc       = 0.75 // >= this => answer (confident scope)
)

// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and maps
// it to a knowledge space. Regulation detection (authority), not a broad-term list:
// only fires when the user names a concrete regulation. It returns "" if none is
// named.
//
// Matching is case-insensitive. Keywords written with surrounding spaces (for
// example " cra ") must appear as a standalone token, so "cradle" does not match;
// all other keywords match as plain substrings. Standalone tokens are recognised
// next to punctuation, quotes and hyphens as well ("CRA?", "(MDR)",
// "DORA-Verordnung"), not only between spaces. When several regulations are
// named, the first matching case in the switch below wins.
func QueryKnowledgeSpace(query string) string {
	lower := strings.ToLower(query)

	// raw keeps the query untouched (apart from case) so that keywords which
	// themselves contain punctuation, such as "nis-2" or "ki-vo", still match.
	raw := " " + lower + " "

	// tokens replaces every separator with a space so that the space-padded
	// acronyms match when the user writes "CRA?", "(MDR)" or "DORA-Verordnung".
	// ASCII letters and digits are kept; other ASCII characters and a small set
	// of typographic quotes/dashes are separators. Remaining non-ASCII runes
	// (umlauts, ß, ...) are kept because they are part of words.
	const typographicSeparators = "\u201e\u201c\u201d\u201a\u2018\u2019\u00ab\u00bb\u2013\u2014\u2026\u00a0"
	tokens := " " + strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			return r
		case r < 0x80, strings.ContainsRune(typographicSeparators, r):
			return ' '
		default:
			return r
		}
	}, lower) + " "

	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(raw, s) || strings.Contains(tokens, s) {
				return true
			}
		}
		return false
	}

	// NOTE(review): unpadded keywords are substring matches, so e.g. "kritis"
	// also fires on "kritisch" — confirm whether that is intended.
	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
		return "cyber"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
		return "produktsicherheit"
	case has(" mdr ", "medizinprodukt", "medical device"):
		return "produktsicherheit"
	default:
		return ""
	}
}

// ClassifyClarity computes the read-only clarity signal. Deterministic tiers on the
// knowledge-space concentration, PLUS the G1 explicit-scope override: if the query
// names a regulation, that scope wins over the embedding scatter.
func ClassifyClarity(query string, results []LegalSearchResult) Clarity { counts := map[string]int{} total := 0 for _, r := range results { if s := KnowledgeSpaceOf(r.RegulationCode); s != "" { counts[s]++ total++ } } cl := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}} if total == 0 { cl.Mode, cl.Reason = "clarify", "no_domain_signal" if ks := QueryKnowledgeSpace(query); ks != "" { cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", ks } return cl } type kc struct { id string n int } ks := make([]kc, 0, len(counts)) for id, n := range counts { ks = append(ks, kc{id, n}) } sort.Slice(ks, func(i, j int) bool { if ks[i].n != ks[j].n { return ks[i].n > ks[j].n } return ks[i].id < ks[j].id }) cl.DominantContext = ks[0].id cl.Concentration = float64(ks[0].n) / float64(total) cl.DomainCount = len(counts) for _, k := range ks { cl.CandidateContexts = append(cl.CandidateContexts, ClarityContext{ ID: k.id, Label: KnowledgeSpaceLabel[k.id], Hits: k.n, }) } switch { case cl.Concentration <= clarityMaxConcentration: cl.Mode, cl.Reason = "clarify", "low_concentration" case cl.DomainCount >= clarityMinDomains: cl.Mode, cl.Reason = "clarify", "many_domains" case cl.Concentration >= clarityAnswerConc: cl.Mode, cl.Reason = "answer", "high_confidence_scope" default: cl.Mode, cl.Reason = "answer", "middle_band_llm_needed" } // G1: an explicitly named regulation beats the embedding scatter. if q := QueryKnowledgeSpace(query); q != "" { cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", q } return cl }