136 lines
5.4 KiB
Go
136 lines
5.4 KiB
Go
package ucca
|
|
|
|
import (
	"sort"
	"strings"
	"unicode"
)
|
|
|
|
// Clarity is the READ-ONLY, INSTRUMENTED clarity-gate signal emitted alongside a
|
|
// retrieve response. It does NOT change retrieval or advisor behaviour yet — the
|
|
// advisor still answers normally. Once ~30-50 real questions are collected the
|
|
// thresholds get finalised and the gate is activated in the advisor flow.
|
|
//
|
|
// Ambiguity has two independent sources (empirically measured, 12-question set):
|
|
// - retrieval scatter: hits spread across many knowledge spaces (low
|
|
// concentration / high domain_count) — the retriever itself can't localise.
|
|
// - conceptual generality: a general term the corpus OVER-localises (e.g. "PDCA"
|
|
// concentrates on datenschutz but is cross-domain) — only an LLM knows this.
|
|
// The middle band is where the LLM-intent classifier must decide.
|
|
//
|
|
// G1 (explicit scope): when the query NAMES a regulation ("... nach TRGS", "CRA
|
|
// ...", "MaschinenVO ..."), that explicit context beats the embedding scatter —
|
|
// the gate scopes to the named regulation's knowledge space regardless of
|
|
// concentration. This is regulation detection, NOT a broad-term list.
|
|
type Clarity struct {
|
|
Mode string `json:"mode"` // "answer" | "clarify"
|
|
Reason string `json:"reason"` // low_concentration | many_domains | high_confidence_scope | middle_band_llm_needed | explicit_scope | no_domain_signal
|
|
Concentration float64 `json:"concentration"` // fraction of tagged hits in the dominant knowledge space
|
|
DomainCount int `json:"domain_count"` // distinct knowledge spaces in the hits
|
|
DominantContext string `json:"dominant_context"` // knowledge-space id (explicit scope wins if the query names a regulation)
|
|
CandidateContexts []ClarityContext `json:"candidate_contexts"` // corpus-grounded chips (spaces actually present)
|
|
}
|
|
|
|
// ClarityContext describes a single corpus-grounded context chip.
type ClarityContext struct {
	// ID is the identifier of the context.
	ID string `json:"id"`
	// Label is the display label that belongs to ID.
	Label string `json:"label"`
	// Hits is the hit count recorded for this context.
	Hits int `json:"hits"`
}
|
|
|
|
// Thresholds of the tiered clarity decision. These are instrumented defaults
// only and still have to be calibrated on 30-50 real questions.
const (
	// clarityMaxConcentration: a concentration at or below this value means
	// "clarify" (retrieval scatter).
	clarityMaxConcentration = 0.45

	// clarityMinDomains: this many distinct domains or more means "clarify"
	// (broad spread).
	clarityMinDomains = 4

	// clarityAnswerConc: a concentration at or above this value means "answer"
	// (confident scope).
	clarityAnswerConc = 0.75
)
|
|
|
|
// QueryKnowledgeSpace detects an explicit regulation mention in the query and
// maps it to a knowledge-space id. This is regulation detection (authority), not
// a broad-term list: it only fires when the user names a concrete regelwerk.
//
// Matching is case-insensitive. Before matching, the query is split on every
// rune that is neither a letter, a digit nor a hyphen and re-joined with single
// spaces, so acronyms that must match as whole words (" cra ", " mdr ", ...)
// are also found next to punctuation, quotes, brackets, tabs or newlines, e.g.
// "Was fordert der CRA?" or "(MDR)". Hyphens are kept because several keywords
// contain one ("nis-2", "ki-vo").
//
// The cases are checked in order and the first match wins. The result is ""
// when no regulation is named.
func QueryKnowledgeSpace(query string) string {
	// Normalise to " word word ... " so that the space-delimited keywords
	// below behave like whole-word matches regardless of surrounding
	// punctuation or whitespace.
	words := strings.FieldsFunc(strings.ToLower(query), func(r rune) bool {
		return r != '-' && !unicode.IsLetter(r) && !unicode.IsDigit(r)
	})
	q := " " + strings.Join(words, " ") + " "

	// has reports whether the normalised query contains any of subs.
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}

	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
		return "cyber"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
		return "produktsicherheit"
	case has(" mdr ", "medizinprodukt", "medical device"):
		// Medical devices share the product-safety space.
		return "produktsicherheit"
	default:
		return ""
	}
}
|
|
|
|
// ClassifyClarity computes the read-only clarity signal. Deterministic tiers on the
|
|
// knowledge-space concentration, PLUS the G1 explicit-scope override: if the query
|
|
// names a regulation, that scope wins over the embedding scatter.
|
|
func ClassifyClarity(query string, results []LegalSearchResult) Clarity {
|
|
counts := map[string]int{}
|
|
total := 0
|
|
for _, r := range results {
|
|
if s := KnowledgeSpaceOf(r.RegulationCode); s != "" {
|
|
counts[s]++
|
|
total++
|
|
}
|
|
}
|
|
cl := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}}
|
|
if total == 0 {
|
|
cl.Mode, cl.Reason = "clarify", "no_domain_signal"
|
|
if ks := QueryKnowledgeSpace(query); ks != "" {
|
|
cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", ks
|
|
}
|
|
return cl
|
|
}
|
|
type kc struct {
|
|
id string
|
|
n int
|
|
}
|
|
ks := make([]kc, 0, len(counts))
|
|
for id, n := range counts {
|
|
ks = append(ks, kc{id, n})
|
|
}
|
|
sort.Slice(ks, func(i, j int) bool {
|
|
if ks[i].n != ks[j].n {
|
|
return ks[i].n > ks[j].n
|
|
}
|
|
return ks[i].id < ks[j].id
|
|
})
|
|
cl.DominantContext = ks[0].id
|
|
cl.Concentration = float64(ks[0].n) / float64(total)
|
|
cl.DomainCount = len(counts)
|
|
for _, k := range ks {
|
|
cl.CandidateContexts = append(cl.CandidateContexts, ClarityContext{
|
|
ID: k.id, Label: KnowledgeSpaceLabel[k.id], Hits: k.n,
|
|
})
|
|
}
|
|
switch {
|
|
case cl.Concentration <= clarityMaxConcentration:
|
|
cl.Mode, cl.Reason = "clarify", "low_concentration"
|
|
case cl.DomainCount >= clarityMinDomains:
|
|
cl.Mode, cl.Reason = "clarify", "many_domains"
|
|
case cl.Concentration >= clarityAnswerConc:
|
|
cl.Mode, cl.Reason = "answer", "high_confidence_scope"
|
|
default:
|
|
cl.Mode, cl.Reason = "answer", "middle_band_llm_needed"
|
|
}
|
|
// G1: an explicitly named regulation beats the embedding scatter.
|
|
if q := QueryKnowledgeSpace(query); q != "" {
|
|
cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", q
|
|
}
|
|
return cl
|
|
}
|