feat(ai-sdk): Advisor Reasoning Stack — Clarity+G1+Concept-Injector+Context-Scope+Term-Resolution+E4-Curation+Intent-Signal

2026-07-01 15:27:23 +02:00
parent a606000a20
commit e901447096
12 changed files with 902 additions and 10 deletions
@@ -0,0 +1,135 @@
+package ucca
+
+import (
+	"sort"
+	"strings"
+)
+
+// Clarity is the read-only, instrumented clarity-gate signal that is emitted
+// alongside a retrieve response. It does not change retrieval or advisor
+// behaviour yet: the advisor still answers normally. Once ~30-50 real questions
+// have been collected, the thresholds get finalised and the gate is activated in
+// the advisor flow.
+//
+// Ambiguity has two independent sources (empirically measured, 12-question set):
+//   - retrieval scatter: hits spread across many knowledge spaces (low
+//     concentration / high domain_count), so the retriever itself cannot
+//     localise the question.
+//   - conceptual generality: a general term the corpus over-localises (e.g.
+//     "PDCA" concentrates on datenschutz but is cross-domain). Only an LLM knows
+//     this; the middle band is where the LLM-intent classifier must decide.
+//
+// G1 (explicit scope): when the query names a regulation ("... nach TRGS",
+// "CRA ...", "MaschinenVO ..."), that explicit context beats the embedding
+// scatter and the gate scopes to the named regulation's knowledge space
+// regardless of concentration. This is regulation detection, not a broad-term
+// list.
+type Clarity struct {
+	// Mode is the gate verdict: "answer" or "clarify".
+	Mode string `json:"mode"`
+	// Reason names the rule that produced Mode. One of: low_concentration,
+	// many_domains, high_confidence_scope, middle_band_llm_needed,
+	// explicit_scope, no_domain_signal.
+	Reason string `json:"reason"`
+	// Concentration is the fraction of tagged hits that fall into the dominant
+	// knowledge space.
+	Concentration float64 `json:"concentration"`
+	// DomainCount is the number of distinct knowledge spaces among the hits.
+	DomainCount int `json:"domain_count"`
+	// DominantContext is a knowledge-space id. The explicit scope wins if the
+	// query names a regulation.
+	DominantContext string `json:"dominant_context"`
+	// CandidateContexts are corpus-grounded chips: only spaces that are actually
+	// present in the hits.
+	CandidateContexts []ClarityContext `json:"candidate_contexts"`
+}
+
+// ClarityContext describes a single corpus-grounded context chip.
+type ClarityContext struct {
+	// ID is the knowledge-space id.
+	ID string `json:"id"`
+	// Label is the display label for the knowledge space.
+	Label string `json:"label"`
+	// Hits is the number of hits counted for this knowledge space.
+	Hits int `json:"hits"`
+}
+
+// Tiered clarity thresholds. These are INSTRUMENTED DEFAULTS and are meant to be
+// calibrated on 30-50 real questions.
+const (
+	// clarityMaxConcentration: a concentration at or below this value means
+	// clarify (retrieval scatter).
+	clarityMaxConcentration = 0.45
+	// clarityMinDomains: this many distinct knowledge spaces or more means
+	// clarify (broad spread).
+	clarityMinDomains = 4
+	// clarityAnswerConc: a concentration at or above this value means answer
+	// (confident scope).
+	clarityAnswerConc = 0.75
+)
+
+// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and maps
+// it to a knowledge space. Regulation detection (authority), not a broad-term list:
+// only fires when the user names a concrete regelwerk. "" if none named.
+func QueryKnowledgeSpace(query string) string {
+	q := " " + strings.ToLower(query) + " "
+	has := func(subs ...string) bool {
+		for _, s := range subs {
+			if strings.Contains(q, s) {
+				return true
+			}
+		}
+		return false
+	}
+	switch {
+	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
+		return "arbeitsschutz"
+	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
+		return "datenschutz"
+	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
+		return "cyber"
+	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
+		return "ki"
+	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
+		return "produktsicherheit"
+	case has(" mdr ", "medizinprodukt", "medical device"):
+		return "produktsicherheit"
+	default:
+		return ""
+	}
+}
+
+// ClassifyClarity computes the read-only clarity signal. Deterministic tiers on the
+// knowledge-space concentration, PLUS the G1 explicit-scope override: if the query
+// names a regulation, that scope wins over the embedding scatter.
+func ClassifyClarity(query string, results []LegalSearchResult) Clarity {
+	counts := map[string]int{}
+	total := 0
+	for _, r := range results {
+		if s := KnowledgeSpaceOf(r.RegulationCode); s != "" {
+			counts[s]++
+			total++
+		}
+	}
+	cl := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}}
+	if total == 0 {
+		cl.Mode, cl.Reason = "clarify", "no_domain_signal"
+		if ks := QueryKnowledgeSpace(query); ks != "" {
+			cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", ks
+		}
+		return cl
+	}
+	type kc struct {
+		id string
+		n  int
+	}
+	ks := make([]kc, 0, len(counts))
+	for id, n := range counts {
+		ks = append(ks, kc{id, n})
+	}
+	sort.Slice(ks, func(i, j int) bool {
+		if ks[i].n != ks[j].n {
+			return ks[i].n > ks[j].n
+		}
+		return ks[i].id < ks[j].id
+	})
+	cl.DominantContext = ks[0].id
+	cl.Concentration = float64(ks[0].n) / float64(total)
+	cl.DomainCount = len(counts)
+	for _, k := range ks {
+		cl.CandidateContexts = append(cl.CandidateContexts, ClarityContext{
+			ID: k.id, Label: KnowledgeSpaceLabel[k.id], Hits: k.n,
+		})
+	}
+	switch {
+	case cl.Concentration <= clarityMaxConcentration:
+		cl.Mode, cl.Reason = "clarify", "low_concentration"
+	case cl.DomainCount >= clarityMinDomains:
+		cl.Mode, cl.Reason = "clarify", "many_domains"
+	case cl.Concentration >= clarityAnswerConc:
+		cl.Mode, cl.Reason = "answer", "high_confidence_scope"
+	default:
+		cl.Mode, cl.Reason = "answer", "middle_band_llm_needed"
+	}
+	// G1: an explicitly named regulation beats the embedding scatter.
+	if q := QueryKnowledgeSpace(query); q != "" {
+		cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", q
+	}
+	return cl
+}