// breakpilot-compliance/ai-compliance-sdk/internal/ucca/clarity.go

package ucca

import (
	"sort"
	"strings"
	"unicode"
)

// Clarity is the clarity-gate signal that accompanies a retrieve response. It is
// READ-ONLY and INSTRUMENTED: for now it changes neither retrieval nor advisor
// behaviour, and the advisor keeps answering as usual. After roughly 30-50 real
// questions have been collected, the thresholds are to be finalised and the gate
// switched on in the advisor flow.
//
// Two independent sources of ambiguity were measured empirically (12-question set):
//   - Retrieval scatter: the hits are spread over many knowledge spaces (low
//     concentration / high domain_count), i.e. the retriever cannot localise.
//   - Conceptual generality: a general term that the corpus OVER-localises (e.g.
//     "PDCA" concentrates on datenschutz although it is cross-domain). Only an LLM
//     can know this, so the middle band is left to the LLM-intent classifier.
//
// G1 (explicit scope): if the query NAMES a regulation ("... nach TRGS", "CRA
// ...", "MaschinenVO ..."), that explicit context outranks the embedding scatter
// and the gate scopes to the named regulation's knowledge space, whatever the
// concentration is. This is regulation detection, NOT a broad-term list.
type Clarity struct {
	// Mode is the gate decision: "answer" or "clarify".
	Mode string `json:"mode"`
	// Reason names the rule behind Mode. One of: low_concentration, many_domains,
	// high_confidence_scope, middle_band_llm_needed, explicit_scope,
	// no_domain_signal.
	Reason string `json:"reason"`
	// Concentration is the fraction of tagged hits that fall into the knowledge
	// space with the most hits.
	Concentration float64 `json:"concentration"`
	// DomainCount is the number of distinct knowledge spaces among the hits.
	DomainCount int `json:"domain_count"`
	// DominantContext is a knowledge-space id; an explicit scope wins when the
	// query names a regulation.
	DominantContext string `json:"dominant_context"`
	// CandidateContexts are the corpus-grounded chips, i.e. the knowledge spaces
	// actually present in the hits.
	CandidateContexts []ClarityContext `json:"candidate_contexts"`
}

// ClarityContext describes a single corpus-grounded context chip: one knowledge
// space together with the number of hits that landed in it.
type ClarityContext struct {
	// ID is the knowledge-space id.
	ID string `json:"id"`
	// Label is the display label for the knowledge space.
	Label string `json:"label"`
	// Hits is the number of hits attributed to this knowledge space.
	Hits int `json:"hits"`
}

// Tier thresholds for the clarity gate. These are INSTRUMENTED DEFAULTS and still
// have to be calibrated on 30-50 real questions.
const (
	// clarityMaxConcentration: a concentration at or below this value means
	// "clarify" (retrieval scatter).
	clarityMaxConcentration = 0.45
	// clarityMinDomains: this many distinct knowledge spaces or more means
	// "clarify" (broad spread).
	clarityMinDomains = 4
	// clarityAnswerConc: a concentration at or above this value means "answer"
	// (confident scope).
	clarityAnswerConc = 0.75
)

// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and maps
// it to a knowledge space. It returns "" when no known regulation is named.
//
// Matching is case-insensitive. Each needle is tried against two views of the
// query, both padded with a leading and trailing space:
//   - the raw lower-cased text, so needles containing punctuation ("nis-2",
//     "ki-vo") keep working;
//   - a normalised copy in which every rune that is neither a letter nor a digit
//     is replaced by a space, so that needles wrapped in spaces (short acronyms
//     such as " cra " or " mdr " that must stand alone as a word) are also found
//     next to punctuation, quotes, hyphens, tabs or line breaks ("CRA?", "(MDR)",
//     "DSFA-Pflicht", "AI-Act").
//
// Needles without surrounding spaces match anywhere, including inside longer
// words. The cases are evaluated top to bottom, so when a query names regulations
// from several knowledge spaces the first matching case wins.
func QueryKnowledgeSpace(query string) string {
	lower := strings.ToLower(query)
	raw := " " + lower + " "
	words := " " + strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return r
		}
		return ' '
	}, lower) + " "
	has := func(needles ...string) bool {
		for _, n := range needles {
			if strings.Contains(raw, n) || strings.Contains(words, n) {
				return true
			}
		}
		return false
	}
	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	// NOTE(review): "kritis" is a plain substring needle and therefore also
	// matches the ordinary word "kritisch" — confirm that this is intended.
	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
		return "cyber"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
		return "produktsicherheit"
	case has(" mdr ", "medizinprodukt", "medical device"):
		return "produktsicherheit"
	default:
		return ""
	}
}

// ClassifyClarity derives the read-only clarity signal for a query and its
// retrieval hits.
//
// Hits are grouped by knowledge space; results whose regulation code maps to no
// knowledge space are skipped. The share of the largest group (concentration) and
// the number of distinct groups (domain count) then select a tier, checked in
// this order:
//   - concentration <= clarityMaxConcentration: clarify, low_concentration
//   - domain count >= clarityMinDomains: clarify, many_domains
//   - concentration >= clarityAnswerConc: answer, high_confidence_scope
//   - anything else: answer, middle_band_llm_needed
//
// G1 explicit-scope override: if QueryKnowledgeSpace finds a regulation named in
// the query, the result is always answer / explicit_scope with that space as
// DominantContext, regardless of the tiers. With no tagged hit and no named
// regulation the result is clarify / no_domain_signal.
//
// CandidateContexts is ordered by hit count (descending), ties broken by id.
func ClassifyClarity(query string, results []LegalSearchResult) Clarity {
	named := QueryKnowledgeSpace(query)

	// Tally the hits per knowledge space.
	hitsBySpace := make(map[string]int)
	tagged := 0
	for i := range results {
		space := KnowledgeSpaceOf(results[i].RegulationCode)
		if space == "" {
			continue
		}
		hitsBySpace[space]++
		tagged++
	}

	out := Clarity{CandidateContexts: []ClarityContext{}}
	if tagged == 0 {
		// Nothing to measure: only an explicitly named regulation can give a scope.
		if named == "" {
			out.Mode, out.Reason = "clarify", "no_domain_signal"
			return out
		}
		out.Mode, out.Reason, out.DominantContext = "answer", "explicit_scope", named
		return out
	}

	// Build the chips and rank them; the id tie-break keeps the order stable
	// despite random map iteration.
	chips := make([]ClarityContext, 0, len(hitsBySpace))
	for space, hits := range hitsBySpace {
		chips = append(chips, ClarityContext{
			ID:    space,
			Label: KnowledgeSpaceLabel[space],
			Hits:  hits,
		})
	}
	sort.Slice(chips, func(a, b int) bool {
		if chips[a].Hits == chips[b].Hits {
			return chips[a].ID < chips[b].ID
		}
		return chips[a].Hits > chips[b].Hits
	})

	out.CandidateContexts = chips
	out.DomainCount = len(chips)
	out.DominantContext = chips[0].ID
	out.Concentration = float64(chips[0].Hits) / float64(tagged)

	switch {
	case named != "":
		// G1: an explicitly named regulation beats the embedding scatter.
		out.Mode, out.Reason, out.DominantContext = "answer", "explicit_scope", named
	case out.Concentration <= clarityMaxConcentration:
		out.Mode, out.Reason = "clarify", "low_concentration"
	case out.DomainCount >= clarityMinDomains:
		out.Mode, out.Reason = "clarify", "many_domains"
	case out.Concentration >= clarityAnswerConc:
		out.Mode, out.Reason = "answer", "high_confidence_scope"
	default:
		out.Mode, out.Reason = "answer", "middle_band_llm_needed"
	}
	return out
}