feat(ai-sdk): Advisor Reasoning Stack — Clarity+G1+Concept-Injector+Context-Scope+Term-Resolution+E4-Curation+Intent-Signal

This commit is contained in:
Claude
2026-07-01 15:27:23 +02:00
parent a606000a20
commit e901447096
12 changed files with 902 additions and 10 deletions
@@ -1,6 +1,9 @@
package handlers
import (
"encoding/json"
"fmt"
"log"
"net/http"
"strconv"
@@ -87,6 +90,7 @@ func (h *RAGHandlers) Search(c *gin.Context) {
// RetrieveRequest is the JSON request body bound by the Retrieve handler.
type RetrieveRequest struct {
	// Query is the user's question; the binding tag makes it mandatory.
	Query string `json:"query" binding:"required"`
	// TopK is the desired number of results. Retrieve assigns a default of 8
	// (NOTE(review): the condition guarding that default is outside this view).
	TopK int `json:"top_k,omitempty"`
	// Context is an optional knowledge-space id; when non-empty, Retrieve filters
	// the results to that knowledge space via ucca.FilterByKnowledgeSpace.
	Context string `json:"context,omitempty"`
}
// Retrieve is the Authority Router endpoint. The Advisor calls this with ONLY a query and stays
@@ -105,6 +109,13 @@ func (h *RAGHandlers) Retrieve(c *gin.Context) {
req.TopK = 8
}
// E2 Term Resolution: expand unambiguous abbreviations (TOM/VVT/AVV/DSB/DSFA) into the
// query so retrieval finds them; ambiguous ones (DSE/DPA) are surfaced to the FE — NOT
// auto-mapped (chat context E1 wins, else the FE asks).
intent := ucca.DetectIntent(req.Query)
termRes := ucca.ResolveAbbreviations(req.Query)
req.Query = termRes.Expanded
results, err := h.ragClient.Retrieve(c.Request.Context(), req.Query, req.TopK)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "RAG retrieve failed: " + err.Error()})
@@ -114,7 +125,42 @@ func (h *RAGHandlers) Retrieve(c *gin.Context) {
// Evidence-type layer: surface the authoritative typed evidence (footnotes/tables/figures)
// from the KB knowledge space SEPARATELY instead of losing it in the broad text merge.
// results[] remains the text context for the LLM plus the list of sources.
// Context scoping (E5): the user explicitly chose a knowledge space (chip), so scope
// the evidence HARD to it (wider re-retrieve + domain filter) — no off-domain regelwerke
// (MDR/UStG/eIDAS) after a context decision.
if req.Context != "" {
if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 {
results = ucca.FilterByKnowledgeSpace(wide, req.Context, req.TopK)
} else {
results = ucca.FilterByKnowledgeSpace(results, req.Context, req.TopK)
}
}
// G1 scope-gating: a named regulation scopes the evidence to its knowledge space.
// Re-retrieve wider and lead with the named regulation's domain so the L2 answer +
// [n] citations are built on scoped evidence, not the embedding-majority domain.
if scope := ucca.QueryKnowledgeSpace(req.Query); scope != "" {
if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 {
results = ucca.ScopeResults(wide, scope, req.TopK)
} else {
results = ucca.ScopeResults(results, scope, req.TopK)
}
}
ev := h.ragClient.RetrieveEvidence(c.Request.Context(), req.Query)
// Concept->Norm recall injector: if the query names a legal concept, fetch its
// load-bearing norms (Datenschutzerklärung -> Art. 12/13/14 DSGVO, ...) and inject
// them into the evidence set so they surface (embedding similarity misses them).
if norms := ucca.ConceptNorms(req.Query); len(norms) > 0 {
top := 0.9
if len(results) > 0 {
top = results[0].Score
}
injected := h.ragClient.FetchByNormIDs(c.Request.Context(), norms, top-0.001)
results = ucca.InjectConceptNorms(results, injected, req.TopK)
}
clarity := ucca.ClassifyClarity(req.Query, results)
traceClarity(req.Query, clarity, results)
c.JSON(http.StatusOK, gin.H{
"query": req.Query,
@@ -123,7 +169,11 @@ func (h *RAGHandlers) Retrieve(c *gin.Context) {
"assessment": ucca.Assess(results),
"footnotes": footnotesFromEvidence(ev[ucca.EvidenceFootnote]),
"tables": tablesFromEvidence(ev[ucca.EvidenceTable]),
"figures": figuresFromEvidence(ev[ucca.EvidenceFigure]),
"evidence": evidenceFromResults(results),
"visual_evidence": visualEvidenceFromEvidence(ev[ucca.EvidenceFigure]),
"clarity": clarity,
"term_resolution": termRes.Ambiguous,
"interaction_intent": intent,
})
}
@@ -163,23 +213,67 @@ func tablesFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
return out
}
// figuresFromEvidence maps FIGURE evidence (C8). Empty until C8 populates figure units; image_url/
// caption/vision_summary get added here when C8 lands — same path, no router change.
func figuresFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
// visualEvidenceFromEvidence maps FIGURE evidence to the Visual Evidence contract shape
// (C8). visual_type/image_ref/vision_summary populate once C8 lands; the shape is stable now.
func visualEvidenceFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
out := make([]gin.H, 0, len(rs))
for _, r := range rs {
out = append(out, gin.H{
"figure_id": r.CitationUnit,
"caption": r.ArticleLabel,
"regulation_code": r.RegulationCode,
"regulation_short": r.RegulationShort,
"regulation_name": r.RegulationName,
"section": r.RefCitationUnit,
"visual_id": r.CitationUnit,
"visual_type": "figure",
"caption": r.ArticleLabel,
"document": evidenceDocName(r),
"context": ucca.KnowledgeSpaceOf(r.RegulationCode),
"regulation_code": r.RegulationCode,
"section": r.RefCitationUnit,
"image_ref": "",
"vision_summary": "",
})
}
return out
}
// evidenceFromResults converts retrieval hits into the Evidence contract entries
// rendered by the Advisor Evidence Workspace. One entry is produced per hit, in
// input order. The evidence_id is the hit's citation unit, falling back to its
// article label when the citation unit is empty; the snippet is the hit text cut
// to 280 runes by evidenceSnippet.
func evidenceFromResults(rs []ucca.LegalSearchResult) []gin.H {
	entries := make([]gin.H, len(rs))
	for i := range rs {
		hit := rs[i]

		evidenceID := hit.CitationUnit
		if evidenceID == "" {
			evidenceID = hit.ArticleLabel
		}

		entries[i] = gin.H{
			"evidence_id":     evidenceID,
			"document":        evidenceDocName(hit),
			"section":         hit.ArticleLabel,
			"paragraph":       hit.Paragraph,
			"snippet":         evidenceSnippet(hit.Text, 280),
			"url":             hit.SourceURL,
			"regulation_code": hit.RegulationCode,
			"context":         ucca.KnowledgeSpaceOf(hit.RegulationCode),
		}
	}
	return entries
}
// evidenceDocName picks the source name shown to the user: the regulation's
// short code when one is set, otherwise its full name.
func evidenceDocName(r ucca.LegalSearchResult) string {
	if r.RegulationShort == "" {
		return r.RegulationName
	}
	return r.RegulationShort
}
// evidenceSnippet returns s unchanged when it holds at most n runes; otherwise it
// returns the first n runes of s followed by an ellipsis ("…").
//
// The limit counts runes, not bytes, so multi-byte characters are never split.
// A negative n is treated as 0 instead of panicking on an out-of-range slice.
// The prefix is cut by byte offset while ranging over s, which avoids copying the
// whole text into a []rune just to keep a short excerpt.
func evidenceSnippet(s string, n int) string {
	if n < 0 {
		n = 0
	}
	seen := 0
	for i := range s {
		// i is the byte offset of rune number `seen`; reaching the limit here
		// means at least one more rune follows, so the text must be truncated.
		if seen == n {
			return s[:i] + "…"
		}
		seen++
	}
	return s
}
// ListRegulations returns the list of available regulations in the corpus.
// GET /sdk/v1/rag/regulations
func (h *RAGHandlers) ListRegulations(c *gin.Context) {
@@ -334,3 +428,29 @@ func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) {
},
})
}
// traceClarity emits one structured CLARITY_TRACE log line per retrieve for the
// macmini test session, so qualitative user ratings can be correlated with the
// gate decision.
//
// The line carries the query, the clarity verdict (mode, reason, concentration,
// dominant context), the candidate chips as "id:hits" strings and the short
// regulation names of the first three results. If the payload cannot be encoded,
// a CLARITY_TRACE_ERROR line is logged instead of an empty trace.
func traceClarity(query string, cl ucca.Clarity, results []ucca.LegalSearchResult) {
	// Only the leading three hits are traced.
	top := make([]string, 0, 3)
	for i, r := range results {
		if i >= 3 {
			break
		}
		top = append(top, r.RegulationShort)
	}

	chips := make([]string, 0, len(cl.CandidateContexts))
	for _, c := range cl.CandidateContexts {
		chips = append(chips, fmt.Sprintf("%s:%d", c.ID, c.Hits))
	}

	b, err := json.Marshal(map[string]interface{}{
		"query":         query,
		"mode":          cl.Mode,
		"reason":        cl.Reason,
		"concentration": cl.Concentration,
		"dominant":      cl.DominantContext,
		"chips":         chips,
		"top_evidence":  top,
	})
	if err != nil {
		// Tracing is best-effort: report the failure under a distinct prefix so
		// consumers of CLARITY_TRACE lines never see a non-JSON payload.
		log.Printf("CLARITY_TRACE_ERROR marshal: %v", err)
		return
	}
	log.Printf("CLARITY_TRACE %s", string(b))
}
@@ -0,0 +1,33 @@
package ucca
import (
"strings"
"testing"
)
// TestResolveAbbreviations covers the four term-resolution outcomes: an unambiguous
// abbreviation is expanded, an ambiguous one is only flagged, a query without any
// abbreviation is untouched, and a substring hit inside a longer word is ignored.
func TestResolveAbbreviations(t *testing.T) {
	// Unambiguous abbreviation: expanded and not reported as ambiguous.
	tom := ResolveAbbreviations("Was ist eine TOM?")
	if expanded := tom.Expanded; !strings.Contains(expanded, "technische und organisatorische") {
		t.Errorf("TOM must be expanded, got %q", expanded)
	}
	if len(tom.Ambiguous) != 0 {
		t.Errorf("TOM must not be ambiguous, got %v", tom.Ambiguous)
	}

	// Ambiguous DSE: reported, but the query itself stays as typed (chat context
	// must win, otherwise the FE asks).
	const dseQuery = "welche Infos in eine DSE?"
	dse := ResolveAbbreviations(dseQuery)
	if dse.Expanded != dseQuery {
		t.Errorf("DSE must NOT be auto-mapped, got %q", dse.Expanded)
	}
	flagged := len(dse.Ambiguous) == 1 &&
		dse.Ambiguous[0].Abbreviation == "DSE" &&
		len(dse.Ambiguous[0].Candidates) == 2
	if !flagged {
		t.Errorf("DSE must be flagged ambiguous with 2 candidates, got %v", dse.Ambiguous)
	}

	// No abbreviation at all: the query passes through unchanged.
	const plain = "Wie ist das Wetter?"
	if got := ResolveAbbreviations(plain).Expanded; got != plain {
		t.Errorf("query without abbreviation must be unchanged")
	}

	// "Atom" contains "tom" but is not the word TOM, so nothing may be expanded.
	atom := ResolveAbbreviations("Was ist ein Atom?")
	if strings.Contains(atom.Expanded, "organisatorische") {
		t.Errorf("substring 'tom' in 'Atom' must not trigger expansion")
	}
}
@@ -0,0 +1,65 @@
package ucca
import (
"strings"
"unicode"
)
// TermResolution is the E2 (Term Resolution) signal in the Advisor Reasoning Stack.
// Expanded drives retrieval internally (unambiguous abbreviations are spelled out so
// the embedding/concept layer finds them). Ambiguous is surfaced to the FE, which
// resolves it via chat context (E1) or asks the user ("Meinst du X oder Y?"). The
// lexicon NEVER auto-maps an ambiguous abbreviation (e.g. DSE) — real-life discipline.
type TermResolution struct {
	// Expanded is the original query, followed by the spelled-out form of every
	// unambiguous abbreviation found in it. It is excluded from JSON output.
	Expanded string `json:"-"`
	// Ambiguous lists, in query order, the abbreviations with more than one candidate.
	Ambiguous []TermAmbiguity `json:"ambiguous,omitempty"`
}

// TermAmbiguity flags one abbreviation the SDK could not resolve deterministically.
type TermAmbiguity struct {
	// Abbreviation is the upper-cased abbreviation as found in the lexicon.
	Abbreviation string `json:"abbreviation"`
	// Candidates are the possible meanings; the slice is owned by the caller.
	Candidates []string `json:"candidates"`
}

// abbreviationLexicon maps a (lowercased) abbreviation to its canonical term(s).
// >1 candidate = ambiguous → flagged, not expanded. Start small (User-Spec).
var abbreviationLexicon = map[string][]string{
	"dse":  {"Datenschutzerklärung", "Datenschutz-Folgenabschätzung"}, // ambiguous — context wins, else ask
	"dsfa": {"Datenschutz-Folgenabschätzung"},
	"tom":  {"technische und organisatorische Maßnahmen"},
	"vvt":  {"Verzeichnis von Verarbeitungstätigkeiten"},
	"avv":  {"Auftragsverarbeitungsvertrag"},
	"dsb":  {"Datenschutzbeauftragter"},
	"dpa":  {"Data Processing Agreement", "Datenschutzaufsichtsbehörde"}, // ambiguous
}

// ResolveAbbreviations expands unambiguous abbreviations into the query and flags
// ambiguous ones. Deterministic: iterates query tokens in order (no map-order
// dependence). Whole-word match (case-insensitive) so "TOM" hits but "atom" does not.
//
// Each abbreviation is handled once per query, even if it occurs several times.
// Expansions are appended to the original query separated by single spaces; when
// nothing is expanded, Expanded equals the input. The candidate lists handed out
// in Ambiguous are copies, so callers can modify them without corrupting the
// package-level lexicon for later calls.
func ResolveAbbreviations(query string) TermResolution {
	tr := TermResolution{Expanded: query}

	// Tokenise on anything that is neither a letter nor a number.
	words := strings.FieldsFunc(query, func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	})

	seen := map[string]bool{}
	var expansions []string
	for _, w := range words {
		lw := strings.ToLower(w)
		cands, ok := abbreviationLexicon[lw]
		if !ok || seen[lw] {
			continue
		}
		seen[lw] = true

		if len(cands) == 1 {
			expansions = append(expansions, cands[0])
			continue
		}
		tr.Ambiguous = append(tr.Ambiguous, TermAmbiguity{
			Abbreviation: strings.ToUpper(lw),
			// Copy: never expose the lexicon's backing array to callers.
			Candidates: append([]string(nil), cands...),
		})
	}

	if len(expansions) > 0 {
		tr.Expanded = query + " " + strings.Join(expansions, " ")
	}
	return tr
}
+135
View File
@@ -0,0 +1,135 @@
package ucca
import (
"sort"
"strings"
)
// Clarity is the READ-ONLY, INSTRUMENTED clarity-gate signal emitted alongside a
// retrieve response. It does NOT change retrieval or advisor behaviour yet — the
// advisor still answers normally. Once ~30-50 real questions are collected the
// thresholds get finalised and the gate is activated in the advisor flow.
//
// Ambiguity has two independent sources (empirically measured, 12-question set):
//   - retrieval scatter: hits spread across many knowledge spaces (low
//     concentration / high domain_count) — the retriever itself can't localise.
//   - conceptual generality: a general term the corpus OVER-localises (e.g. "PDCA"
//     concentrates on datenschutz but is cross-domain) — only an LLM knows this.
//
// The middle band is where the LLM-intent classifier must decide.
//
// G1 (explicit scope): when the query NAMES a regulation ("... nach TRGS", "CRA
// ...", "MaschinenVO ..."), that explicit context beats the embedding scatter —
// the gate scopes to the named regulation's knowledge space regardless of
// concentration. This is regulation detection, NOT a broad-term list.
//
// Note that ClassifyClarity computes Concentration and DomainCount from the
// retrieved hits only; an explicit-scope override replaces Mode, Reason and
// DominantContext but leaves those two numbers as measured.
type Clarity struct {
	Mode string `json:"mode"` // "answer" | "clarify"
	Reason string `json:"reason"` // low_concentration | many_domains | high_confidence_scope | middle_band_llm_needed | explicit_scope | no_domain_signal
	Concentration float64 `json:"concentration"` // fraction of tagged hits in the dominant knowledge space
	DomainCount int `json:"domain_count"` // distinct knowledge spaces in the hits
	DominantContext string `json:"dominant_context"` // knowledge-space id (explicit scope wins if the query names a regulation)
	CandidateContexts []ClarityContext `json:"candidate_contexts"` // corpus-grounded chips (spaces actually present)
}
// ClarityContext is one corpus-grounded context chip: a knowledge space that is
// actually present among the retrieved hits.
type ClarityContext struct {
	// ID is the knowledge-space id as returned by KnowledgeSpaceOf.
	ID string `json:"id"`
	// Label is the user-facing chip text looked up in KnowledgeSpaceLabel.
	Label string `json:"label"`
	// Hits is the number of retrieved results that fall into this knowledge space.
	Hits int `json:"hits"`
}
// Tiered thresholds — INSTRUMENTED DEFAULTS, calibrate on 30-50 real questions.
// ClassifyClarity checks them in the order listed here; the first matching tier
// wins, and anything that matches none of them lands in the "middle band".
const (
	clarityMaxConcentration = 0.45 // <= this => clarify (retrieval scatter)
	clarityMinDomains = 4 // >= this => clarify (broad spread)
	clarityAnswerConc = 0.75 // >= this => answer (confident scope)
)
// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and maps
// it to a knowledge space. Regulation detection (authority), not a broad-term list:
// only fires when the user names a concrete regelwerk. "" if none named.
//
// Matching is case-insensitive substring search over a normalised query. Patterns
// written with surrounding spaces (" cra ", " mdr ", ...) are whole-word matches.
// ASCII punctuation and whitespace other than '-' are replaced by spaces first, so
// an abbreviation followed by "?", "," or ")" is still recognised. Hyphens are
// kept because patterns such as "nis-2" and "ki-vo" contain them.
//
// Cases are evaluated top to bottom and the first hit wins. DORA maps to "finanz",
// the knowledge space its regulation code is assigned to, so that scoping by the
// returned id actually selects DORA's own documents.
func QueryKnowledgeSpace(query string) string {
	normalised := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r >= 0x80:
			// Letters, digits, hyphens and all non-ASCII runes (umlauts) pass through.
			return r
		default:
			return ' '
		}
	}, strings.ToLower(query))
	// Pad so whole-word patterns also match at the very start and end.
	q := " " + normalised + " "

	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}

	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	case has(" cra ", "cyber resilience", "nis2", "nis-2", "enisa", "bsig", "kritis"):
		return "cyber"
	case has(" dora "):
		return "finanz"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
		return "produktsicherheit"
	case has(" mdr ", "medizinprodukt", "medical device"):
		return "produktsicherheit"
	default:
		return ""
	}
}
// ClassifyClarity derives the read-only clarity signal for one retrieve.
//
// It counts how the results distribute over knowledge spaces (results whose
// regulation code maps to no space are ignored), ranks the spaces by hit count
// (ties broken alphabetically by id) and applies the tiered thresholds to the
// share of the leading space. With no tagged hit at all the verdict is
// "clarify"/"no_domain_signal". In every case the G1 override comes last: a
// regulation named in the query forces "answer"/"explicit_scope" with that
// regulation's knowledge space as the dominant context.
func ClassifyClarity(query string, results []LegalSearchResult) Clarity {
	explicit := QueryKnowledgeSpace(query)

	hitsBySpace := map[string]int{}
	tagged := 0
	for i := range results {
		space := KnowledgeSpaceOf(results[i].RegulationCode)
		if space == "" {
			continue
		}
		hitsBySpace[space]++
		tagged++
	}

	verdict := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}}

	if tagged == 0 {
		verdict.Mode, verdict.Reason = "clarify", "no_domain_signal"
	} else {
		// Rank spaces: most hits first, then by id for a stable order.
		spaces := make([]string, 0, len(hitsBySpace))
		for space := range hitsBySpace {
			spaces = append(spaces, space)
		}
		sort.Slice(spaces, func(a, b int) bool {
			na, nb := hitsBySpace[spaces[a]], hitsBySpace[spaces[b]]
			if na == nb {
				return spaces[a] < spaces[b]
			}
			return na > nb
		})

		leader := spaces[0]
		verdict.DominantContext = leader
		verdict.Concentration = float64(hitsBySpace[leader]) / float64(tagged)
		verdict.DomainCount = len(hitsBySpace)
		for _, space := range spaces {
			chip := ClarityContext{ID: space, Label: KnowledgeSpaceLabel[space], Hits: hitsBySpace[space]}
			verdict.CandidateContexts = append(verdict.CandidateContexts, chip)
		}

		// Tiers, first match wins.
		if verdict.Concentration <= clarityMaxConcentration {
			verdict.Mode, verdict.Reason = "clarify", "low_concentration"
		} else if verdict.DomainCount >= clarityMinDomains {
			verdict.Mode, verdict.Reason = "clarify", "many_domains"
		} else if verdict.Concentration >= clarityAnswerConc {
			verdict.Mode, verdict.Reason = "answer", "high_confidence_scope"
		} else {
			verdict.Mode, verdict.Reason = "answer", "middle_band_llm_needed"
		}
	}

	// G1: an explicitly named regulation beats the embedding scatter.
	if explicit != "" {
		verdict.Mode, verdict.Reason, verdict.DominantContext = "answer", "explicit_scope", explicit
	}
	return verdict
}
@@ -0,0 +1,64 @@
package ucca
import "testing"
// TestKnowledgeSpaceOf pins the knowledge space assigned to a sample of
// regulation codes, including the empty code.
func TestKnowledgeSpaceOf(t *testing.T) {
	samples := []struct {
		code  string
		space string
	}{
		{"DSGVO", "datenschutz"},
		{"BDSG", "datenschutz"},
		{"DSK SDM B51 ZUGRIFFE", "datenschutz"},
		{"EDPS DIGITAL ETHICS", "datenschutz"},
		{"TRGS 900", "arbeitsschutz"},
		{"OSHA 1910 SUBPART O", "arbeitsschutz"},
		{"HGB", "wirtschaftsrecht"},
		{"BGB", "wirtschaftsrecht"},
		{"MASCHINENVO", "produktsicherheit"},
		{"MVO", "produktsicherheit"},
		{"CRA", "cyber"},
		{"NIST SP800 53R5", "cyber"},
		{"AI ACT", "ki"},
		{"KI-VO", "ki"},
		{"DORA", "finanz"},
		{"ARG", "arbeitsrecht"},
		{"", ""},
	}
	for _, s := range samples {
		got := KnowledgeSpaceOf(s.code)
		if got == s.space {
			continue
		}
		t.Errorf("KnowledgeSpaceOf(%q)=%q want %q", s.code, got, s.space)
	}
}
// TestClassifyClarity checks the two basic tiers: hits scattered over many
// knowledge spaces must yield "clarify", hits concentrated in datenschutz must
// yield "answer" with datenschutz as the dominant context.
func TestClassifyClarity(t *testing.T) {
	hits := func(codes ...string) []LegalSearchResult {
		rs := make([]LegalSearchResult, 0, len(codes))
		for _, code := range codes {
			rs = append(rs, LegalSearchResult{RegulationCode: code})
		}
		return rs
	}

	scattered := hits("CRA", "MASCHINENVO", "EU MDR", "KI-VO", "TRBS 1111", "OWASP TOP10")
	if got := ClassifyClarity("Welche Risiken gibt es?", scattered); got.Mode != "clarify" {
		t.Errorf("scattered: mode=%q reason=%q want clarify", got.Mode, got.Reason)
	}

	concentrated := hits("DSGVO", "BDSG", "DSK SDM", "EDPB WP243", "TDDDG")
	got := ClassifyClarity("Was ist eine DSFA?", concentrated)
	if !(got.Mode == "answer" && got.DominantContext == "datenschutz") {
		t.Errorf("concentrated: mode=%q dominant=%q want answer/datenschutz", got.Mode, got.DominantContext)
	}
}
// TestClassifyClarity_ExplicitScope checks the G1 override: a query naming TRGS
// must resolve to arbeitsschutz although most hits are datenschutz, while a query
// naming no regulation must not report explicit_scope.
func TestClassifyClarity_ExplicitScope(t *testing.T) {
	codes := []string{
		"DSK SDM METHODE", "DSK SDM V31", "DSK SDM B41 PLANEN",
		"DSGVO", "DSK SDM", "TRGS 900", "TRGS 554",
	}
	scattered := make([]LegalSearchResult, len(codes))
	for i, code := range codes {
		scattered[i].RegulationCode = code
	}

	// Named regulation: explicit scope wins over the retrieval majority.
	named := ClassifyClarity("Schwellwertanalyse nach TRGS", scattered)
	ok := named.Mode == "answer" && named.Reason == "explicit_scope" && named.DominantContext == "arbeitsschutz"
	if !ok {
		t.Errorf("explicit TRGS: mode=%q reason=%q dominant=%q want answer/explicit_scope/arbeitsschutz", named.Mode, named.Reason, named.DominantContext)
	}

	// No regulation named: the tiered logic decides.
	unnamed := ClassifyClarity("Welche Risiken gibt es?", scattered)
	if unnamed.Reason == "explicit_scope" {
		t.Errorf("no reg named should not be explicit_scope, got %q", unnamed.Reason)
	}
}
@@ -0,0 +1,88 @@
package ucca
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
)
// FetchByNormIDs is the fetch side of the Concept->Norm recall injector. It asks
// the KB slice collection's scroll endpoint for points whose norm_id payload
// matches any of normIDs and returns the first point seen per norm_id, each
// converted to a LegalSearchResult carrying the caller-supplied score (no
// similarity search is involved).
//
// It degrades gracefully: nil is returned when no KB slice collection is
// configured, when normIDs is empty, on any request/transport/decoding error and
// on a non-200 response.
//
// NOTE(review): the request limit is 3 points per requested norm in total, not
// per norm — if one norm has many units the page may not contain every requested
// norm. Confirm against the collection's data.
func (c *LegalRAGClient) FetchByNormIDs(ctx context.Context, normIDs []string, score float64) []LegalSearchResult {
	if len(normIDs) == 0 || c.kbSliceCollection == "" {
		return nil
	}

	// One "norm_id == X" condition per requested norm, OR-combined via "should".
	conditions := make([]map[string]interface{}, len(normIDs))
	for i, id := range normIDs {
		conditions[i] = map[string]interface{}{
			"key":   "norm_id",
			"match": map[string]interface{}{"value": id},
		}
	}
	payload, err := json.Marshal(map[string]interface{}{
		"limit":        len(normIDs) * 3,
		"with_payload": true,
		"with_vectors": false,
		"filter":       map[string]interface{}{"should": conditions},
	})
	if err != nil {
		return nil
	}

	endpoint := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, c.kbSliceCollection)
	httpReq, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewReader(payload))
	if err != nil {
		return nil
	}
	httpReq.Header.Set("Content-Type", "application/json")
	if key := c.qdrantAPIKey; key != "" {
		httpReq.Header.Set("api-key", key)
	}

	httpResp, err := c.httpClient.Do(httpReq)
	if err != nil {
		return nil
	}
	defer func() { _ = httpResp.Body.Close() }()
	if httpResp.StatusCode != http.StatusOK {
		return nil
	}

	var page qdrantScrollResponse
	if err := json.NewDecoder(httpResp.Body).Decode(&page); err != nil {
		return nil
	}

	// Keep only the first point per norm_id; points without a norm_id are skipped.
	taken := map[string]bool{}
	units := make([]LegalSearchResult, 0, len(normIDs))
	for _, point := range page.Result.Points {
		id := getString(point.Payload, "norm_id")
		if id == "" || taken[id] {
			continue
		}
		taken[id] = true
		units = append(units, scrollPointToResult(point.Payload, score))
	}
	return units
}
// scrollPointToResult builds a LegalSearchResult from the payload of one scroll
// point. The regulation code is read from "regulation_code" and, when that is
// empty, from "regulation_id". The score is not part of the payload; it is the
// value passed in by the caller (concept rank).
func scrollPointToResult(payload map[string]interface{}, score float64) LegalSearchResult {
	field := func(key string) string { return getString(payload, key) }

	code := field("regulation_code")
	if code == "" {
		code = field("regulation_id")
	}

	var unit LegalSearchResult
	unit.Text = field("chunk_text")
	unit.RegulationCode = code
	unit.RegulationName = field("regulation_name_de")
	unit.RegulationShort = field("regulation_short")
	unit.Category = field("category")
	unit.Article = field("article")
	unit.ArticleLabel = field("article_label")
	unit.Paragraph = field("paragraph")
	unit.SourceURL = field("source_url")
	unit.CitationUnit = field("citation_unit")
	unit.Score = score
	return unit
}
@@ -0,0 +1,97 @@
package ucca
import (
"sort"
"strings"
)
// Legal Concept Ontology — the domain-knowledge bridge for the Concept->Norm recall
// injector. The words users type ("Datenschutzerklärung", "Cookie Banner") are
// rarely identical to the article titles that actually govern them (Art. 12/13/14
// DSGVO, § 25 TDDDG). Embedding similarity misses this leap, so these bridges are
// curated: concept keyword -> load-bearing norm_ids. This is NOT a fallback to
// hardcoding — it is domain knowledge that surfaces the normatively load-bearing
// units within the (already correctly retrieved) documents.
//
// conceptNorm is one such bridge.
type conceptNorm struct {
	// keywords are lowercase trigger phrases; any one of them found as a substring
	// of the lowercased query activates the entry.
	keywords []string
	// normIDs are the norm identifiers contributed when the entry is active.
	normIDs []string
}
// legalConceptOntology is the curated concept -> norm table scanned by
// ConceptNorms. Entries are evaluated in the order listed, which also determines
// the order of the returned norm ids. Keywords are written in lowercase because
// the query is lowercased before matching; umlaut phrases are listed in both
// their native and transliterated (ae/oe/ue/ss) spelling.
var legalConceptOntology = []conceptNorm{
	// Privacy notice / transparency duties.
	{[]string{"datenschutzerklärung", "datenschutzerklaerung", "privacy policy", "datenschutzhinweise", "datenschutzinformation"},
		[]string{"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14"}},
	// Cookies and tracking.
	{[]string{"cookie banner", "cookie-banner", "cookies", "cookie", "tracking"},
		[]string{"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7"}},
	// Data protection impact assessment.
	{[]string{"dsfa", "folgenabschätzung", "folgenabschaetzung", "datenschutz-folgenabschätzung"},
		[]string{"EU-DSGVO-Art35", "EU-DSGVO-Art36"}},
	// Data subject rights.
	{[]string{"auskunft", "auskunftsrecht", "auskunftsersuchen"},
		[]string{"EU-DSGVO-Art15"}},
	{[]string{"löschung", "loeschung", "vergessenwerden", "recht auf vergessen"},
		[]string{"EU-DSGVO-Art17"}},
	{[]string{"datenübertragbarkeit", "datenuebertragbarkeit", "portabilität", "portabilitaet"},
		[]string{"EU-DSGVO-Art20"}},
	{[]string{"widerspruch", "widerspruchsrecht"},
		[]string{"EU-DSGVO-Art21"}},
	// Personal data breaches.
	{[]string{"datenpanne", "datenschutzverletzung", "data breach", "verletzung des schutzes"},
		[]string{"EU-DSGVO-Art33", "EU-DSGVO-Art34"}},
	// E4-Quick-Curation (2026-07-01): resolved abbreviations (E2) pull their core norms.
	// The keywords below match the expansions that ResolveAbbreviations appends to the query.
	{[]string{"technische und organisatorische maßnahmen", "technische und organisatorische massnahmen"},
		[]string{"EU-DSGVO-Art32", "EU-DSGVO-Art25", "EU-DSGVO-Art5"}},
	{[]string{"verzeichnis von verarbeitungstätigkeiten", "verzeichnis von verarbeitungstaetigkeiten", "verarbeitungsverzeichnis"},
		[]string{"EU-DSGVO-Art30"}},
	{[]string{"auftragsverarbeitungsvertrag", "auftragsverarbeitung", "auftragsverarbeiter"},
		[]string{"EU-DSGVO-Art28"}},
	// Stem only, so inflected forms such as "datenschutzbeauftragter" match as well.
	{[]string{"datenschutzbeauftragt"},
		[]string{"EU-DSGVO-Art37", "EU-DSGVO-Art38", "EU-DSGVO-Art39"}},
}
// ConceptNorms collects the norm ids of every ontology entry that the query
// mentions. An entry counts as mentioned when any of its keywords occurs as a
// substring of the lowercased query. Ids are returned in ontology order without
// duplicates; the result is an empty (non-nil) slice when nothing matches.
func ConceptNorms(query string) []string {
	lowered := strings.ToLower(query)
	added := map[string]bool{}
	norms := []string{}

	for _, concept := range legalConceptOntology {
		mentioned := false
		for _, keyword := range concept.keywords {
			if strings.Contains(lowered, keyword) {
				mentioned = true
				break
			}
		}
		if !mentioned {
			continue
		}
		for _, id := range concept.normIDs {
			if added[id] {
				continue
			}
			added[id] = true
			norms = append(norms, id)
		}
	}
	return norms
}
// InjectConceptNorms adds the concept-injected norm units to the retrieved
// results so that the load-bearing norms become visible in the evidence set.
//
// With nothing to inject, results is returned as is. Otherwise injected units
// are appended unless their citation unit is empty or already present (in the
// results or among earlier injected units). The merged list is then stably
// sorted by descending score and cut to topK when topK is positive. Callers give
// the injected units a score just below the top hit so they rank high without
// displacing it.
func InjectConceptNorms(results, injected []LegalSearchResult, topK int) []LegalSearchResult {
	if len(injected) == 0 {
		return results
	}

	known := map[string]bool{}
	merged := make([]LegalSearchResult, 0, len(results)+len(injected))
	for _, hit := range results {
		merged = append(merged, hit)
		if hit.CitationUnit != "" {
			known[hit.CitationUnit] = true
		}
	}

	for _, candidate := range injected {
		unit := candidate.CitationUnit
		if unit == "" || known[unit] {
			continue
		}
		known[unit] = true
		merged = append(merged, candidate)
	}

	sort.SliceStable(merged, func(a, b int) bool {
		return merged[a].Score > merged[b].Score
	})

	if topK > 0 && len(merged) > topK {
		return merged[:topK]
	}
	return merged
}
@@ -0,0 +1,48 @@
package ucca
import "testing"
// TestConceptNorms checks that a query naming both a privacy notice and a cookie
// banner yields the norms of both concepts, and that an unrelated query yields none.
func TestConceptNorms(t *testing.T) {
	const query = "Was muss ich beachten wenn ich meine Datenschutzerklärung schreibe für meine Website mit Cookie Banner?"
	got := ConceptNorms(query)

	// Start with everything expected and strike off what was returned.
	missing := map[string]bool{}
	for _, id := range []string{
		"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14",
		"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7",
	} {
		missing[id] = true
	}
	for _, id := range got {
		delete(missing, id)
	}
	if len(missing) != 0 {
		t.Errorf("ConceptNorms missing %v; got %v", missing, got)
	}

	if none := ConceptNorms("Wie ist das Wetter heute?"); len(none) != 0 {
		t.Errorf("no concept named should yield no norms")
	}
}
// TestInjectConceptNorms checks that injection keeps the top document hit first,
// does not duplicate a unit that was already retrieved, and makes the newly
// injected unit visible.
func TestInjectConceptNorms(t *testing.T) {
	retrieved := []LegalSearchResult{
		{CitationUnit: "DSK OH Telemedien", Score: 0.98},
		{CitationUnit: "Art. 25 DSGVO", Score: 0.95},
	}
	extra := []LegalSearchResult{
		{CitationUnit: "Art. 13 DSGVO", Score: 0.979},
		{CitationUnit: "Art. 25 DSGVO", Score: 0.979}, // already present -> must not double
	}

	merged := InjectConceptNorms(retrieved, extra, 10)

	if first := merged[0].CitationUnit; first != "DSK OH Telemedien" {
		t.Errorf("top document hit must stay #1 (not dominated), got %s", first)
	}
	if n := len(merged); n != 3 {
		t.Errorf("expected 3 (Art.25 not duplicated), got %d", n)
	}

	visible := false
	for i := range merged {
		visible = visible || merged[i].CitationUnit == "Art. 13 DSGVO"
	}
	if !visible {
		t.Errorf("Art. 13 DSGVO must be injected + visible")
	}
}
@@ -0,0 +1,26 @@
package ucca
import "testing"
// TestFilterByKnowledgeSpace checks that scoping to datenschutz keeps only
// datenschutz hits, and that scoping to a space without any hit falls back to
// the unfiltered input.
func TestFilterByKnowledgeSpace(t *testing.T) {
	mixed := []LegalSearchResult{
		{CitationUnit: "Art. 13 DSGVO", RegulationCode: "DSGVO"},
		{CitationUnit: "EU Mdr", RegulationCode: "EU MDR"},
		{CitationUnit: "UStG § 14", RegulationCode: "USTG"},
		{CitationUnit: "DSK OH Telemedien", RegulationCode: "DSK OH TELEMEDIEN"},
		{CitationUnit: "eIDAS", RegulationCode: "EIDAS"},
	}

	scoped := FilterByKnowledgeSpace(mixed, "datenschutz", 10)
	for i := range scoped {
		hit := scoped[i]
		if KnowledgeSpaceOf(hit.RegulationCode) == "datenschutz" {
			continue
		}
		t.Errorf("off-domain leaked into scoped result: %s (%s)", hit.CitationUnit, hit.RegulationCode)
	}
	// Expected survivors: Art. 13 DSGVO + DSK OH Telemedien.
	if n := len(scoped); n != 2 {
		t.Errorf("expected 2 datenschutz hits, got %d", n)
	}

	// A domain with no hits falls back to the input (never strand the answer).
	fallback := FilterByKnowledgeSpace(mixed, "maschinen", 10)
	if len(fallback) != len(mixed) {
		t.Errorf("no-hit domain should fall back to full input")
	}
}
+46
View File
@@ -0,0 +1,46 @@
package ucca
import "strings"
// DetectIntent classifies the INTERACTION INTENT of a query (Advisor Reasoning
// Stack E3). The same norms answer very differently depending on the TASK the user
// wants: "Was ist X?" (definition) vs "Wie schreibe ich X?" (anleitung) vs "Prüfe X"
// (review). The SDK detects the intent deterministically and emits it; the FE picks
// the answer FORM, so the LLM gets a precise assignment ("write an Anleitung over
// this evidence") instead of guessing the format. Returns "" (neutral) when no
// clear task is signalled. First tier of ~20-30 intent types.
//
// Matching is case-insensitive substring search; cases are checked top to bottom
// and the first hit wins (review, checkliste, vergleich, anleitung, risikoanalyse,
// definition). Before matching, ASCII punctuation and whitespace other than '-'
// are replaced by spaces, so the whole-word patterns " vs " and " versus " also
// fire for "vs.", "(vs)" and similar. Hyphens stay because "schritt-für-schritt"
// is itself a pattern.
func DetectIntent(query string) string {
	normalised := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r >= 0x80:
			// Letters, digits, hyphens and all non-ASCII runes (umlauts) pass through.
			return r
		default:
			return ' '
		}
	}, strings.ToLower(query))
	// Pad so whole-word patterns also match at the very start and end.
	q := " " + normalised + " "

	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}

	switch {
	case has("prüfe", "prüf mein", "überprüfe", "überprüf", "review", "checke mein",
		"ist mein", "ist meine", "ist unser", "ist unsere", "konform", "stimmt mein",
		"bewerte mein", "analysiere mein"):
		return "review"
	case has("checkliste", "was muss ich alles", "was gehört alles", "was gehört in",
		"welche punkte muss", "was brauche ich alles"):
		return "checkliste"
	case has("vergleich", "unterschied", "worin unterscheid", " vs ", " versus ",
		"gegenüber", "im gegensatz"):
		return "vergleich"
	case has("wie schreibe", "wie erstelle", "wie erstell", "wie mache", "wie baue",
		"wie setze ich", "wie gehe ich vor", "wie formuliere", "wie richte ich",
		"anleitung", "schritt für schritt", "schritt-für-schritt", "erstelle mir",
		"erstell mir", "generiere", "was muss ich beachten", "worauf muss ich achten"):
		return "anleitung"
	case has("welche risiken", "welche gefahren", "risikoanalyse", "welche bedrohungen"):
		return "risikoanalyse"
	case has("was ist", "was bedeutet", "was versteht man", "was sind", "definition",
		"erkläre mir", "erklär mir", "was heißt", "was genau ist"):
		return "definition"
	default:
		return ""
	}
}
@@ -0,0 +1,22 @@
package ucca
import "testing"
// TestDetectIntent pins the intent detected for one representative query per
// intent type, plus a query that signals no task at all.
func TestDetectIntent(t *testing.T) {
	samples := []struct {
		query  string
		intent string
	}{
		{"Was ist eine Datenschutzerklärung?", "definition"},
		{"Wie schreibe ich eine Datenschutzerklärung?", "anleitung"},
		{"Was muss ich beachten wenn ich eine DSE schreibe?", "anleitung"},
		{"Prüfe meine Datenschutzerklärung.", "review"},
		{"Ist meine Datenschutzerklärung konform?", "review"},
		{"Vergleiche DSGVO und BDSG.", "vergleich"},
		{"Welche Risiken gibt es?", "risikoanalyse"},
		{"Erstelle mir eine Checkliste für die DSFA.", "checkliste"},
		{"Wie ist das Wetter?", ""},
	}
	for _, s := range samples {
		got := DetectIntent(s.query)
		if got == s.intent {
			continue
		}
		t.Errorf("DetectIntent(%q)=%q want %q", s.query, got, s.intent)
	}
}
@@ -0,0 +1,148 @@
package ucca
import "strings"
// KnowledgeSpace is the CHIP-level knowledge domain used by the clarity gate's
// concentration signal + the user-facing context chips. It is deliberately RICHER
// than the 4 authority domains in authority.go (data_protection/cyber/ai/
// product_safety), which drive the EU-primary/subsidiarity rerank. The clarity
// gate must reflect the FULL corpus breadth (arbeitsschutz, arbeitsrecht,
// wirtschaftsrecht, finanz, ...) so a broad query surfaces as broad. Kept separate
// + additive so the tuned authority rerank stays untouched. Corpus-grounded from
// the 463 real regulation codes (0.3% fall through to "sonstiges").
// knowledgeSpaceExact matches short/ambiguous codes by EXACT string (substring
// would misfire on 2-3 char codes like "OR"/"AO"/"BGB"). Keys are upper-case
// regulation codes, values are knowledge-space ids. The entries are grouped by
// knowledge space, one group per comment below.
var knowledgeSpaceExact = map[string]string{
	// wirtschaftsrecht
	"HGB": "wirtschaftsrecht", "BGB": "wirtschaftsrecht", "AO": "wirtschaftsrecht", "OR": "wirtschaftsrecht",
	"ABGB": "wirtschaftsrecht", "UGB": "wirtschaftsrecht", "IFRS": "wirtschaftsrecht", "BAO": "wirtschaftsrecht",
	"GMBHG": "wirtschaftsrecht", "AKTG": "wirtschaftsrecht", "INSO": "wirtschaftsrecht", "USTG": "wirtschaftsrecht",
	"GOBD": "wirtschaftsrecht", "EGBGB": "wirtschaftsrecht", "GEWO": "wirtschaftsrecht", "URHG": "wirtschaftsrecht",
	// datenschutz
	"DPF": "datenschutz", "TKG": "datenschutz", "TMG": "datenschutz", "DDG": "datenschutz", "DSG": "datenschutz",
	"DSV": "datenschutz", "DSM": "datenschutz", "SCC": "datenschutz", "EPRIVACY": "datenschutz",
	"SCHREMS II": "datenschutz", "CH_REVDSG": "datenschutz", "PLANET49": "datenschutz", "GOOGLE FONTS": "datenschutz",
	// digitale_dienste
	"DSA": "digitale_dienste", "DMA": "digitale_dienste", "DGA": "digitale_dienste", "EHDS": "digitale_dienste",
	"EIDAS": "digitale_dienste", "EIDAS 2.0": "digitale_dienste", "DATA ACT": "digitale_dienste",
	"DATAACT": "digitale_dienste", "DIGITAL CONTENT": "digitale_dienste",
	// produktsicherheit
	"MVO": "produktsicherheit", "MACHINERY": "produktsicherheit", "MASCHVO": "produktsicherheit",
	"MASCHINENVO": "produktsicherheit", "GPSR": "produktsicherheit", "PID": "produktsicherheit",
	"EAA": "produktsicherheit", "BFSG": "produktsicherheit", "ELEKTROG": "produktsicherheit",
	"VERPACKG": "produktsicherheit", "BATTVO": "produktsicherheit", "BATTDG": "produktsicherheit", "EU MDR": "produktsicherheit",
	// finanz
	"DORA": "finanz", "PSD2": "finanz", "MICA": "finanz", "AMLR": "finanz", "VAIT": "finanz", "BAIT": "finanz", "GWG": "finanz",
	// verbraucherschutz
	"UWG": "verbraucherschutz", "UCPD": "verbraucherschutz", "VSBG": "verbraucherschutz", "PANGV": "verbraucherschutz",
	"DL-INFOV": "verbraucherschutz", "OMNIBUS": "verbraucherschutz", "UWG AT": "verbraucherschutz",
	"PRODHAFTG": "verbraucherschutz", "PRODUKTHAFTUNGS-RL": "verbraucherschutz",
	// arbeitsrecht
	"ARG": "arbeitsrecht",
}
// KnowledgeSpaceLabel translates a knowledge-space id into the text shown on
// the user-facing context chip. The catch-all id "sonstiges" has a label too.
var KnowledgeSpaceLabel = map[string]string{
	"datenschutz": "Datenschutz", "cyber": "Cybersecurity",
	"ki": "KI", "produktsicherheit": "Produktsicherheit",
	"arbeitsschutz": "Arbeitsschutz", "arbeitsrecht": "Arbeitsrecht",
	"wirtschaftsrecht": "Wirtschaftsrecht", "finanz": "Finanzregulierung",
	"digitale_dienste": "Digitale Dienste", "verbraucherschutz": "Verbraucherschutz",
	"lieferkette": "Lieferkette/Nachhaltigkeit", "hinweisgeber": "Hinweisgeberschutz",
	"sonstiges": "Sonstiges",
}
// KnowledgeSpaceOf maps a regulation_code to its knowledge-space id.
//
// The code is trimmed and upper-cased first. Empty and "NONE" codes are
// untagged and yield "" (not a knowledge space). Otherwise the exact table
// knowledgeSpaceExact is consulted, followed by an ordered list of
// prefix/substring rules. The FIRST matching rule wins, so the case order below
// is significant (e.g. "NIST AI RMF" is claimed by "ki" before the "NIST" cyber
// rule is reached). Codes matching no rule yield "sonstiges".
//
// Robust to code variants (MVO/MASCHVO/MASCHINENVO -> produktsicherheit;
// DSK SDM / SDM B51 -> datenschutz).
func KnowledgeSpaceOf(code string) string {
	c := strings.ToUpper(strings.TrimSpace(code))
	if c == "" || c == "NONE" {
		return ""
	}
	if d, ok := knowledgeSpaceExact[c]; ok {
		return d
	}

	// has reports whether the normalized code contains any of subs.
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(c, s) {
				return true
			}
		}
		return false
	}
	// pre reports whether the normalized code starts with any of subs.
	pre := func(subs ...string) bool {
		for _, s := range subs {
			if strings.HasPrefix(c, s) {
				return true
			}
		}
		return false
	}

	switch {
	case pre("TRGS", "TRBS", "ASR", "OSHA") || has("ARBSCHG", "GEFAHRSTOFF"):
		return "arbeitsschutz"
	case has("AI ACT", "KI-VO", "KI VERORDNUNG", "GPAI", "AI RMF", "HLEG AI", "GENAI", "OECD AI", "AI PRINCIPLES", "OH KI", "KI BEHOERDEN", "KI SICHERHEIT", "POS KI"):
		return "ki"
	// "SDM" covers codes such as "SDM B51" that do not carry the "DSK" prefix;
	// without it they fell through to "sonstiges" despite the documented mapping.
	case pre("DSGVO", "BDSG", "TDDDG", "DSK", "SDM", "EDPB", "WP24", "WP25", "WP26", "DSFA", "BFDI", "BAYLDA", "BAYLFB", "EDPS") || has("DATENSCHUTZ", "LOESCHKONZEPT", "LOESCHUNG", "VVT", "TELEMEDIEN", "EU US DPF", "BESCHAEFTIGTENDATEN"):
		return "datenschutz"
	case has("CRA", "NIS2", "NISG", "BSIG", "BSI-TR", "BSI_KRITIS", "KRITIS", "ENISA", "NIST", "OWASP", "EUCSA", "EUCC", "CISA", "CYCLONEDX", "SPDX", "SLSA", "OPENTELEMETRY", "CVSS", "SECURE BY DESIGN"):
		return "cyber"
	case has("MACHINERY", "MASCH", "BLUE GUIDE", "FDA HFE"):
		return "produktsicherheit"
	// Any code containing "TAXONOMY" (including "EU TAXONOMY") is decided here.
	case has("LKSG", "CSDDD", "CSRD", "TAXONOMY"):
		return "lieferkette"
	case has("HINSCHG", "GESCHGEHG"):
		return "hinweisgeber"
	case pre("BAG ", "BAG_") || has("ARBVG", "AZG", "ARBZG", "BETRVG", "KSCHG", "MUSCHG", "AGG", "MILOG", "TZBFG", "NACHWG", "BURLG", "611A", "PAY TRANSPARENCY", "ANGG", "MUTTERSCHUTZ"):
		return "arbeitsrecht"
	// NOTE(review): the bare code "DIGITAL CONTENT" is an exact-table entry
	// (digitale_dienste), so only longer codes containing it reach this rule —
	// confirm the differing domains are intended.
	case has("ECOMMERCE", "ECG", "MEDIENG", "VERBRAUCHERRECHTE", "DIGITAL CONTENT"):
		return "verbraucherschutz"
	// Court decisions. A former has("EU TAXONOMY") alternative was removed: it
	// could never fire because the "TAXONOMY" rule above always matches first.
	case pre("EUGH", "BVERFG", "BVGE", "BGH", "OGH"):
		return "wirtschaftsrecht"
	default:
		return "sonstiges"
	}
}
// ScopeResults implements G1 scope-gating. When the query names a regulation,
// the hits belonging to that regulation's knowledge space are moved to the
// FRONT of the result set; all other hits follow as backfill so the set can
// still reach topK. Relative order inside each of the two partitions is
// preserved. The answer and its [n] citations are built on this order, which is
// why scoped hits must lead.
//
// An empty scope returns results untouched (no reordering, no cap). A topK of
// zero or less disables the cap. The input slice is never modified.
func ScopeResults(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
	if scope == "" {
		return results
	}

	// Classify every hit once, then emit the in-scope hits followed by the rest.
	inScope := make([]bool, len(results))
	for i := range results {
		inScope[i] = KnowledgeSpaceOf(results[i].RegulationCode) == scope
	}

	ordered := make([]LegalSearchResult, 0, len(results))
	for _, wantScoped := range [2]bool{true, false} {
		for i := range results {
			if inScope[i] == wantScoped {
				ordered = append(ordered, results[i])
			}
		}
	}

	if topK > 0 && topK < len(ordered) {
		return ordered[:topK]
	}
	return ordered
}
// FilterByKnowledgeSpace returns ONLY the results in the given knowledge space —
// a HARD scope with no off-domain backfill. Used by E5 context scoping: when the
// user explicitly chose a domain chip, off-domain regelwerke (MDR/UStG/eIDAS)
// must not reappear in the evidence.
//
// Behaviour:
//   - scope == "": results is returned unchanged (no filtering, no cap).
//   - at least one hit in scope: only those hits, in input order, capped to topK.
//   - no hit in scope: falls back to the input (never strand the answer), which
//     is ALSO capped to topK so a widened retrieve cannot leak past the
//     requested result count.
//
// A topK of zero or less disables the cap. The input slice is never modified.
func FilterByKnowledgeSpace(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
	if scope == "" {
		return results
	}
	out := make([]LegalSearchResult, 0, len(results))
	for _, r := range results {
		if KnowledgeSpaceOf(r.RegulationCode) == scope {
			out = append(out, r)
		}
	}
	if len(out) == 0 {
		// Nothing in the chosen domain: fall back to the unfiltered hits, but
		// still honour topK below.
		out = results
	}
	if topK > 0 && len(out) > topK {
		// Full slice expression: also cap the capacity, so that on the fallback
		// path a caller's append cannot overwrite the tail of the input slice.
		out = out[:topK:topK]
	}
	return out
}