merge(ai-sdk): Advisor Reasoning Stack → main (Clarity+G1+Concept+Scope+Term+Intent)
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
@@ -87,6 +90,7 @@ func (h *RAGHandlers) Search(c *gin.Context) {
|
||||
// RetrieveRequest is the JSON request body bound by the Retrieve endpoint.
type RetrieveRequest struct {
	// Query is the user's question; binding rejects requests without it.
	Query string `json:"query" binding:"required"`
	// TopK is the optional number of results to return.
	TopK int `json:"top_k,omitempty"`
	// Context is an optional knowledge-space id; when non-empty, Retrieve
	// filters the results to that knowledge space.
	Context string `json:"context,omitempty"`
}
|
||||
|
||||
// Retrieve is the Authority Router endpoint. The Advisor calls this with ONLY a query and stays
|
||||
@@ -105,20 +109,171 @@ func (h *RAGHandlers) Retrieve(c *gin.Context) {
|
||||
req.TopK = 8
|
||||
}
|
||||
|
||||
// E2 Term Resolution: expand unambiguous abbreviations (TOM/VVT/AVV/DSB/DSFA) into the
|
||||
// query so retrieval finds them; ambiguous ones (DSE/DPA) are surfaced to the FE — NOT
|
||||
// auto-mapped (chat context E1 wins, else the FE asks).
|
||||
intent := ucca.DetectIntent(req.Query)
|
||||
termRes := ucca.ResolveAbbreviations(req.Query)
|
||||
req.Query = termRes.Expanded
|
||||
|
||||
results, err := h.ragClient.Retrieve(c.Request.Context(), req.Query, req.TopK)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "RAG retrieve failed: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Evidence-Type-Schicht: die autoritative typisierte Evidence (Fußnoten/Tabellen/Figuren) aus
|
||||
// dem KB-Wissensraum SEPARAT surfacen, statt sie im Breit-Basis-Text-Merge zu verlieren.
|
||||
// results[] bleibt der Text-Kontext fürs LLM + die Quellen-Liste.
|
||||
// Context scoping (E5): the user explicitly chose a knowledge space (chip), so scope
|
||||
// the evidence HARD to it (wider re-retrieve + domain filter) — no off-domain regelwerke
|
||||
// (MDR/UStG/eIDAS) after a context decision.
|
||||
if req.Context != "" {
|
||||
if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 {
|
||||
results = ucca.FilterByKnowledgeSpace(wide, req.Context, req.TopK)
|
||||
} else {
|
||||
results = ucca.FilterByKnowledgeSpace(results, req.Context, req.TopK)
|
||||
}
|
||||
}
|
||||
|
||||
// G1 scope-gating: a named regulation scopes the evidence to its knowledge space.
|
||||
// Re-retrieve wider and lead with the named regulation's domain so the L2 answer +
|
||||
// [n] citations are built on scoped evidence, not the embedding-majority domain.
|
||||
if scope := ucca.QueryKnowledgeSpace(req.Query); scope != "" {
|
||||
if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 {
|
||||
results = ucca.ScopeResults(wide, scope, req.TopK)
|
||||
} else {
|
||||
results = ucca.ScopeResults(results, scope, req.TopK)
|
||||
}
|
||||
}
|
||||
|
||||
ev := h.ragClient.RetrieveEvidence(c.Request.Context(), req.Query)
|
||||
// Concept->Norm recall injector: if the query names a legal concept, fetch its
|
||||
// load-bearing norms (Datenschutzerklärung -> Art. 12/13/14 DSGVO, ...) and inject
|
||||
// them into the evidence set so they surface (embedding similarity misses them).
|
||||
if norms := ucca.ConceptNorms(req.Query); len(norms) > 0 {
|
||||
top := 0.9
|
||||
if len(results) > 0 {
|
||||
top = results[0].Score
|
||||
}
|
||||
injected := h.ragClient.FetchByNormIDs(c.Request.Context(), norms, top-0.001)
|
||||
results = ucca.InjectConceptNorms(results, injected, req.TopK)
|
||||
}
|
||||
clarity := ucca.ClassifyClarity(req.Query, results)
|
||||
traceClarity(req.Query, clarity, results)
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"query": req.Query,
|
||||
"results": results,
|
||||
"count": len(results),
|
||||
"assessment": ucca.Assess(results),
|
||||
"footnotes": footnotesFromEvidence(ev[ucca.EvidenceFootnote]),
|
||||
"tables": tablesFromEvidence(ev[ucca.EvidenceTable]),
|
||||
"evidence": evidenceFromResults(results),
|
||||
"visual_evidence": visualEvidenceFromEvidence(ev[ucca.EvidenceFigure]),
|
||||
"clarity": clarity,
|
||||
"term_resolution": termRes.Ambiguous,
|
||||
"interaction_intent": intent,
|
||||
})
|
||||
}
|
||||
|
||||
// footnotesFromEvidence maps FOOTNOTE evidence to the Evidence-Workspace RawFootnote shape.
|
||||
func footnotesFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
|
||||
out := make([]gin.H, 0, len(rs))
|
||||
for _, r := range rs {
|
||||
out = append(out, gin.H{
|
||||
"id": r.CitationUnit,
|
||||
"ref": r.CitationUnit,
|
||||
"number": r.FootnoteLabel,
|
||||
"regulation_code": r.RegulationCode,
|
||||
"regulation_short": r.RegulationShort,
|
||||
"regulation_name": r.RegulationName,
|
||||
"section": r.RefCitationUnit,
|
||||
"text": r.FootnoteVerbatim,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// tablesFromEvidence maps TABLE evidence (C6/C9). Key is present so the same Evidence-Type path
|
||||
// carries tables the moment the UI adds a table section.
|
||||
func tablesFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
|
||||
out := make([]gin.H, 0, len(rs))
|
||||
for _, r := range rs {
|
||||
out = append(out, gin.H{
|
||||
"id": r.CitationUnit,
|
||||
"caption": r.ArticleLabel,
|
||||
"regulation_code": r.RegulationCode,
|
||||
"regulation_short": r.RegulationShort,
|
||||
"regulation_name": r.RegulationName,
|
||||
"section": r.RefCitationUnit,
|
||||
"text": r.Text,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// visualEvidenceFromEvidence maps FIGURE evidence to the Visual Evidence contract shape
|
||||
// (C8). visual_type/image_ref/vision_summary populate once C8 lands; the shape is stable now.
|
||||
func visualEvidenceFromEvidence(rs []ucca.LegalSearchResult) []gin.H {
|
||||
out := make([]gin.H, 0, len(rs))
|
||||
for _, r := range rs {
|
||||
out = append(out, gin.H{
|
||||
"visual_id": r.CitationUnit,
|
||||
"visual_type": "figure",
|
||||
"caption": r.ArticleLabel,
|
||||
"document": evidenceDocName(r),
|
||||
"context": ucca.KnowledgeSpaceOf(r.RegulationCode),
|
||||
"regulation_code": r.RegulationCode,
|
||||
"section": r.RefCitationUnit,
|
||||
"image_ref": "",
|
||||
"vision_summary": "",
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// evidenceFromResults maps retrieval hits to the Evidence contract shape the Advisor
|
||||
// Evidence Workspace renders (citations[] reference evidence_id). Populated at retrieve
|
||||
// time; citations[] (the [n]<->evidence coupling) come from the answer-generation step.
|
||||
func evidenceFromResults(rs []ucca.LegalSearchResult) []gin.H {
|
||||
out := make([]gin.H, 0, len(rs))
|
||||
for _, r := range rs {
|
||||
id := r.CitationUnit
|
||||
if id == "" {
|
||||
id = r.ArticleLabel
|
||||
}
|
||||
out = append(out, gin.H{
|
||||
"evidence_id": id,
|
||||
"document": evidenceDocName(r),
|
||||
"section": r.ArticleLabel,
|
||||
"paragraph": r.Paragraph,
|
||||
"snippet": evidenceSnippet(r.Text, 280),
|
||||
"url": r.SourceURL,
|
||||
"regulation_code": r.RegulationCode,
|
||||
"context": ucca.KnowledgeSpaceOf(r.RegulationCode),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// evidenceDocName is the human-facing source name (short code, else full name).
|
||||
func evidenceDocName(r ucca.LegalSearchResult) string {
|
||||
if r.RegulationShort != "" {
|
||||
return r.RegulationShort
|
||||
}
|
||||
return r.RegulationName
|
||||
}
|
||||
|
||||
// evidenceSnippet returns s unchanged when it holds at most n runes. Otherwise
// it returns the first n runes followed by an ellipsis ("…"), so a truncated
// snippet is n+1 runes long. Truncation is rune-based and never splits a
// multi-byte character. A negative n is treated as 0 instead of panicking on
// the slice expression.
func evidenceSnippet(s string, n int) string {
	if n < 0 {
		n = 0
	}
	// Fast path: the byte length is an upper bound for the rune count, so a
	// string of at most n bytes cannot need truncation (and no conversion).
	if len(s) <= n {
		return s
	}
	rs := []rune(s)
	if len(rs) <= n {
		return s
	}
	return string(rs[:n]) + "…"
}
|
||||
|
||||
// ListRegulations returns the list of available regulations in the corpus.
|
||||
// GET /sdk/v1/rag/regulations
|
||||
func (h *RAGHandlers) ListRegulations(c *gin.Context) {
|
||||
@@ -273,3 +428,29 @@ func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) {
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// traceClarity emits a structured CLARITY_TRACE log line per retrieve for the macmini
|
||||
// test session, so qualitative user ratings can be correlated with the gate decision.
|
||||
func traceClarity(query string, cl ucca.Clarity, results []ucca.LegalSearchResult) {
|
||||
top := make([]string, 0, 3)
|
||||
for i, r := range results {
|
||||
if i >= 3 {
|
||||
break
|
||||
}
|
||||
top = append(top, r.RegulationShort)
|
||||
}
|
||||
chips := make([]string, 0, len(cl.CandidateContexts))
|
||||
for _, c := range cl.CandidateContexts {
|
||||
chips = append(chips, fmt.Sprintf("%s:%d", c.ID, c.Hits))
|
||||
}
|
||||
b, _ := json.Marshal(map[string]interface{}{
|
||||
"query": query,
|
||||
"mode": cl.Mode,
|
||||
"reason": cl.Reason,
|
||||
"concentration": cl.Concentration,
|
||||
"dominant": cl.DominantContext,
|
||||
"chips": chips,
|
||||
"top_evidence": top,
|
||||
})
|
||||
log.Printf("CLARITY_TRACE %s", string(b))
|
||||
}
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestResolveAbbreviations(t *testing.T) {
|
||||
// unambiguous -> expanded, not flagged
|
||||
tr := ResolveAbbreviations("Was ist eine TOM?")
|
||||
if !strings.Contains(tr.Expanded, "technische und organisatorische") {
|
||||
t.Errorf("TOM must be expanded, got %q", tr.Expanded)
|
||||
}
|
||||
if len(tr.Ambiguous) != 0 {
|
||||
t.Errorf("TOM must not be ambiguous, got %v", tr.Ambiguous)
|
||||
}
|
||||
// ambiguous DSE -> flagged, NOT auto-expanded (chat context must win, else FE asks)
|
||||
tr2 := ResolveAbbreviations("welche Infos in eine DSE?")
|
||||
if tr2.Expanded != "welche Infos in eine DSE?" {
|
||||
t.Errorf("DSE must NOT be auto-mapped, got %q", tr2.Expanded)
|
||||
}
|
||||
if len(tr2.Ambiguous) != 1 || tr2.Ambiguous[0].Abbreviation != "DSE" || len(tr2.Ambiguous[0].Candidates) != 2 {
|
||||
t.Errorf("DSE must be flagged ambiguous with 2 candidates, got %v", tr2.Ambiguous)
|
||||
}
|
||||
// no abbreviation -> unchanged
|
||||
if ResolveAbbreviations("Wie ist das Wetter?").Expanded != "Wie ist das Wetter?" {
|
||||
t.Errorf("query without abbreviation must be unchanged")
|
||||
}
|
||||
// substring must NOT match ("atom" contains "tom" but is not the word TOM)
|
||||
if strings.Contains(ResolveAbbreviations("Was ist ein Atom?").Expanded, "organisatorische") {
|
||||
t.Errorf("substring 'tom' in 'Atom' must not trigger expansion")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// TermResolution is the E2 (Term Resolution) signal in the Advisor Reasoning Stack.
|
||||
// Expanded drives retrieval internally (unambiguous abbreviations are spelled out so
|
||||
// the embedding/concept layer finds them). Ambiguous is surfaced to the FE, which
|
||||
// resolves it via chat context (E1) or asks the user ("Meinst du X oder Y?"). The
|
||||
// lexicon NEVER auto-maps an ambiguous abbreviation (e.g. DSE) — real-life discipline.
|
||||
type TermResolution struct {
|
||||
Expanded string `json:"-"`
|
||||
Ambiguous []TermAmbiguity `json:"ambiguous,omitempty"`
|
||||
}
|
||||
|
||||
// TermAmbiguity reports a single abbreviation that the SDK could not resolve
// deterministically, together with the terms it may stand for.
type TermAmbiguity struct {
	// Abbreviation is the abbreviation as reported to the FE.
	Abbreviation string `json:"abbreviation"`
	// Candidates are the possible canonical terms for the abbreviation.
	Candidates []string `json:"candidates"`
}
|
||||
|
||||
// abbreviationLexicon maps a lowercased abbreviation to its canonical term(s).
// An entry with exactly one candidate is unambiguous and gets expanded; an
// entry with more than one candidate is ambiguous and is only flagged, never
// expanded. Deliberately small to start with (User-Spec).
var abbreviationLexicon = map[string][]string{
	// Unambiguous abbreviations.
	"dsfa": {"Datenschutz-Folgenabschätzung"},
	"tom":  {"technische und organisatorische Maßnahmen"},
	"vvt":  {"Verzeichnis von Verarbeitungstätigkeiten"},
	"avv":  {"Auftragsverarbeitungsvertrag"},
	"dsb":  {"Datenschutzbeauftragter"},

	// Ambiguous abbreviations: context wins, else the user is asked.
	"dse": {"Datenschutzerklärung", "Datenschutz-Folgenabschätzung"},
	"dpa": {"Data Processing Agreement", "Datenschutzaufsichtsbehörde"},
}
|
||||
|
||||
// ResolveAbbreviations expands unambiguous abbreviations into the query and flags
|
||||
// ambiguous ones. Deterministic: iterates query tokens in order (no map-order
|
||||
// dependence). Whole-word match (case-insensitive) so "TOM" hits but "atom" does not.
|
||||
func ResolveAbbreviations(query string) TermResolution {
|
||||
tr := TermResolution{Expanded: query}
|
||||
words := strings.FieldsFunc(query, func(r rune) bool {
|
||||
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
|
||||
})
|
||||
seen := map[string]bool{}
|
||||
var expansions []string
|
||||
for _, w := range words {
|
||||
lw := strings.ToLower(w)
|
||||
cands, ok := abbreviationLexicon[lw]
|
||||
if !ok || seen[lw] {
|
||||
continue
|
||||
}
|
||||
seen[lw] = true
|
||||
if len(cands) == 1 {
|
||||
expansions = append(expansions, cands[0])
|
||||
} else {
|
||||
tr.Ambiguous = append(tr.Ambiguous, TermAmbiguity{
|
||||
Abbreviation: strings.ToUpper(lw), Candidates: cands,
|
||||
})
|
||||
}
|
||||
}
|
||||
if len(expansions) > 0 {
|
||||
tr.Expanded = query + " " + strings.Join(expansions, " ")
|
||||
}
|
||||
return tr
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Clarity is the READ-ONLY, INSTRUMENTED clarity-gate signal emitted alongside a
|
||||
// retrieve response. It does NOT change retrieval or advisor behaviour yet — the
|
||||
// advisor still answers normally. Once ~30-50 real questions are collected the
|
||||
// thresholds get finalised and the gate is activated in the advisor flow.
|
||||
//
|
||||
// Ambiguity has two independent sources (empirically measured, 12-question set):
|
||||
// - retrieval scatter: hits spread across many knowledge spaces (low
|
||||
// concentration / high domain_count) — the retriever itself can't localise.
|
||||
// - conceptual generality: a general term the corpus OVER-localises (e.g. "PDCA"
|
||||
// concentrates on datenschutz but is cross-domain) — only an LLM knows this.
|
||||
// The middle band is where the LLM-intent classifier must decide.
|
||||
//
|
||||
// G1 (explicit scope): when the query NAMES a regulation ("... nach TRGS", "CRA
|
||||
// ...", "MaschinenVO ..."), that explicit context beats the embedding scatter —
|
||||
// the gate scopes to the named regulation's knowledge space regardless of
|
||||
// concentration. This is regulation detection, NOT a broad-term list.
|
||||
type Clarity struct {
|
||||
Mode string `json:"mode"` // "answer" | "clarify"
|
||||
Reason string `json:"reason"` // low_concentration | many_domains | high_confidence_scope | middle_band_llm_needed | explicit_scope | no_domain_signal
|
||||
Concentration float64 `json:"concentration"` // fraction of tagged hits in the dominant knowledge space
|
||||
DomainCount int `json:"domain_count"` // distinct knowledge spaces in the hits
|
||||
DominantContext string `json:"dominant_context"` // knowledge-space id (explicit scope wins if the query names a regulation)
|
||||
CandidateContexts []ClarityContext `json:"candidate_contexts"` // corpus-grounded chips (spaces actually present)
|
||||
}
|
||||
|
||||
// ClarityContext describes a single corpus-grounded context chip.
type ClarityContext struct {
	// ID is the knowledge-space id.
	ID string `json:"id"`
	// Label is the display label of the knowledge space.
	Label string `json:"label"`
	// Hits is the number of results that fell into this knowledge space.
	Hits int `json:"hits"`
}
|
||||
|
||||
// Tiered clarity thresholds. These are INSTRUMENTED DEFAULTS and are meant to
// be calibrated on 30-50 real questions.
const (
	// clarityMaxConcentration: a concentration <= this value => clarify
	// (retrieval scatter).
	clarityMaxConcentration = 0.45
	// clarityMinDomains: a domain count >= this value => clarify (broad
	// spread).
	clarityMinDomains = 4
	// clarityAnswerConc: a concentration >= this value => answer (confident
	// scope).
	clarityAnswerConc = 0.75
)
|
||||
|
||||
// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and
// maps it to a knowledge-space id. This is regulation detection (authority),
// not a broad-term list: it only fires when the user names a concrete
// regelwerk. It returns "" when none is named.
//
// Matching is a case-insensitive substring search. Short abbreviations that
// would otherwise hit inside longer words (" cra ", " mdr ", " asr ", ...) are
// written with surrounding spaces so they only match as whole words. To make
// those padded keys work in real questions, common punctuation is turned into
// spaces first, so "Was fordert der CRA?" or "(MDR)" is recognised. Hyphens
// are kept because keys such as "nis-2" and "ki-vo" contain them.
//
// The cases are checked in order; the first matching group wins.
//
// NOTE(review): a query naming DORA is scoped to "cyber" here, while the
// KnowledgeSpaceOf unit test expects the DORA regulation code to map to
// "finanz" — confirm that this asymmetry is intended.
func QueryKnowledgeSpace(query string) string {
	// Punctuation that commonly touches an abbreviation in a question. None of
	// these characters occurs in a keyword, so normalising them can only add
	// matches, never remove one.
	const punctuation = "?!.,;:()[]{}\"'/\\\t\n\r„“”«»"
	normalized := strings.Map(func(r rune) rune {
		if strings.ContainsRune(punctuation, r) {
			return ' '
		}
		return r
	}, strings.ToLower(query))

	// Pad both ends so space-delimited keys also match at the query edges.
	q := " " + normalized + " "
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}

	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
		return "cyber"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit"):
		return "produktsicherheit"
	case has(" mdr ", "medizinprodukt", "medical device"):
		return "produktsicherheit"
	default:
		return ""
	}
}
|
||||
|
||||
// ClassifyClarity computes the read-only clarity signal. Deterministic tiers on the
|
||||
// knowledge-space concentration, PLUS the G1 explicit-scope override: if the query
|
||||
// names a regulation, that scope wins over the embedding scatter.
|
||||
func ClassifyClarity(query string, results []LegalSearchResult) Clarity {
|
||||
counts := map[string]int{}
|
||||
total := 0
|
||||
for _, r := range results {
|
||||
if s := KnowledgeSpaceOf(r.RegulationCode); s != "" {
|
||||
counts[s]++
|
||||
total++
|
||||
}
|
||||
}
|
||||
cl := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}}
|
||||
if total == 0 {
|
||||
cl.Mode, cl.Reason = "clarify", "no_domain_signal"
|
||||
if ks := QueryKnowledgeSpace(query); ks != "" {
|
||||
cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", ks
|
||||
}
|
||||
return cl
|
||||
}
|
||||
type kc struct {
|
||||
id string
|
||||
n int
|
||||
}
|
||||
ks := make([]kc, 0, len(counts))
|
||||
for id, n := range counts {
|
||||
ks = append(ks, kc{id, n})
|
||||
}
|
||||
sort.Slice(ks, func(i, j int) bool {
|
||||
if ks[i].n != ks[j].n {
|
||||
return ks[i].n > ks[j].n
|
||||
}
|
||||
return ks[i].id < ks[j].id
|
||||
})
|
||||
cl.DominantContext = ks[0].id
|
||||
cl.Concentration = float64(ks[0].n) / float64(total)
|
||||
cl.DomainCount = len(counts)
|
||||
for _, k := range ks {
|
||||
cl.CandidateContexts = append(cl.CandidateContexts, ClarityContext{
|
||||
ID: k.id, Label: KnowledgeSpaceLabel[k.id], Hits: k.n,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case cl.Concentration <= clarityMaxConcentration:
|
||||
cl.Mode, cl.Reason = "clarify", "low_concentration"
|
||||
case cl.DomainCount >= clarityMinDomains:
|
||||
cl.Mode, cl.Reason = "clarify", "many_domains"
|
||||
case cl.Concentration >= clarityAnswerConc:
|
||||
cl.Mode, cl.Reason = "answer", "high_confidence_scope"
|
||||
default:
|
||||
cl.Mode, cl.Reason = "answer", "middle_band_llm_needed"
|
||||
}
|
||||
// G1: an explicitly named regulation beats the embedding scatter.
|
||||
if q := QueryKnowledgeSpace(query); q != "" {
|
||||
cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", q
|
||||
}
|
||||
return cl
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestKnowledgeSpaceOf(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"DSGVO": "datenschutz",
|
||||
"BDSG": "datenschutz",
|
||||
"DSK SDM B51 ZUGRIFFE": "datenschutz",
|
||||
"EDPS DIGITAL ETHICS": "datenschutz",
|
||||
"TRGS 900": "arbeitsschutz",
|
||||
"OSHA 1910 SUBPART O": "arbeitsschutz",
|
||||
"HGB": "wirtschaftsrecht",
|
||||
"BGB": "wirtschaftsrecht",
|
||||
"MASCHINENVO": "produktsicherheit",
|
||||
"MVO": "produktsicherheit",
|
||||
"CRA": "cyber",
|
||||
"NIST SP800 53R5": "cyber",
|
||||
"AI ACT": "ki",
|
||||
"KI-VO": "ki",
|
||||
"DORA": "finanz",
|
||||
"ARG": "arbeitsrecht",
|
||||
"": "",
|
||||
}
|
||||
for code, want := range cases {
|
||||
if got := KnowledgeSpaceOf(code); got != want {
|
||||
t.Errorf("KnowledgeSpaceOf(%q)=%q want %q", code, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyClarity(t *testing.T) {
|
||||
scattered := []LegalSearchResult{
|
||||
{RegulationCode: "CRA"}, {RegulationCode: "MASCHINENVO"}, {RegulationCode: "EU MDR"},
|
||||
{RegulationCode: "KI-VO"}, {RegulationCode: "TRBS 1111"}, {RegulationCode: "OWASP TOP10"},
|
||||
}
|
||||
if c := ClassifyClarity("Welche Risiken gibt es?", scattered); c.Mode != "clarify" {
|
||||
t.Errorf("scattered: mode=%q reason=%q want clarify", c.Mode, c.Reason)
|
||||
}
|
||||
concentrated := []LegalSearchResult{
|
||||
{RegulationCode: "DSGVO"}, {RegulationCode: "BDSG"}, {RegulationCode: "DSK SDM"},
|
||||
{RegulationCode: "EDPB WP243"}, {RegulationCode: "TDDDG"},
|
||||
}
|
||||
c := ClassifyClarity("Was ist eine DSFA?", concentrated)
|
||||
if c.Mode != "answer" || c.DominantContext != "datenschutz" {
|
||||
t.Errorf("concentrated: mode=%q dominant=%q want answer/datenschutz", c.Mode, c.DominantContext)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyClarity_ExplicitScope(t *testing.T) {
|
||||
// G1: query names TRGS -> arbeitsschutz wins even though retrieval scatters to datenschutz.
|
||||
scattered := []LegalSearchResult{
|
||||
{RegulationCode: "DSK SDM METHODE"}, {RegulationCode: "DSK SDM V31"}, {RegulationCode: "DSK SDM B41 PLANEN"},
|
||||
{RegulationCode: "DSGVO"}, {RegulationCode: "DSK SDM"}, {RegulationCode: "TRGS 900"}, {RegulationCode: "TRGS 554"},
|
||||
}
|
||||
c := ClassifyClarity("Schwellwertanalyse nach TRGS", scattered)
|
||||
if c.Mode != "answer" || c.Reason != "explicit_scope" || c.DominantContext != "arbeitsschutz" {
|
||||
t.Errorf("explicit TRGS: mode=%q reason=%q dominant=%q want answer/explicit_scope/arbeitsschutz", c.Mode, c.Reason, c.DominantContext)
|
||||
}
|
||||
// no regulation named -> falls through to tiered logic
|
||||
if c := ClassifyClarity("Welche Risiken gibt es?", scattered); c.Reason == "explicit_scope" {
|
||||
t.Errorf("no reg named should not be explicit_scope, got %q", c.Reason)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// FetchByNormIDs loads one representative unit per norm_id from the KB slice
|
||||
// collection — the fetch side of the Concept->Norm recall injector. Returns
|
||||
// LegalSearchResult with the caller-provided concept-relevance score (there is no
|
||||
// similarity query; the injector places them by that score). Returns nil on any
|
||||
// error or when no KB slice is configured (graceful degradation).
|
||||
func (c *LegalRAGClient) FetchByNormIDs(ctx context.Context, normIDs []string, score float64) []LegalSearchResult {
|
||||
if c.kbSliceCollection == "" || len(normIDs) == 0 {
|
||||
return nil
|
||||
}
|
||||
should := make([]map[string]interface{}, 0, len(normIDs))
|
||||
for _, nid := range normIDs {
|
||||
should = append(should, map[string]interface{}{"key": "norm_id", "match": map[string]interface{}{"value": nid}})
|
||||
}
|
||||
reqBody := map[string]interface{}{
|
||||
"limit": len(normIDs) * 3,
|
||||
"with_payload": true,
|
||||
"with_vectors": false,
|
||||
"filter": map[string]interface{}{"should": should},
|
||||
}
|
||||
jsonBody, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, c.kbSliceCollection)
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
if c.qdrantAPIKey != "" {
|
||||
req.Header.Set("api-key", c.qdrantAPIKey)
|
||||
}
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil
|
||||
}
|
||||
var scrollResp qdrantScrollResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
out := make([]LegalSearchResult, 0, len(normIDs))
|
||||
for _, pt := range scrollResp.Result.Points {
|
||||
nid := getString(pt.Payload, "norm_id")
|
||||
if nid == "" || seen[nid] {
|
||||
continue
|
||||
}
|
||||
seen[nid] = true
|
||||
out = append(out, scrollPointToResult(pt.Payload, score))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// scrollPointToResult maps a scroll-point payload to a LegalSearchResult. Mirrors
|
||||
// hitsToResults' payload keys; the score is assigned by the caller (concept rank).
|
||||
func scrollPointToResult(payload map[string]interface{}, score float64) LegalSearchResult {
|
||||
regCode := getString(payload, "regulation_code")
|
||||
if regCode == "" {
|
||||
regCode = getString(payload, "regulation_id")
|
||||
}
|
||||
return LegalSearchResult{
|
||||
Text: getString(payload, "chunk_text"),
|
||||
RegulationCode: regCode,
|
||||
RegulationName: getString(payload, "regulation_name_de"),
|
||||
RegulationShort: getString(payload, "regulation_short"),
|
||||
Category: getString(payload, "category"),
|
||||
Article: getString(payload, "article"),
|
||||
ArticleLabel: getString(payload, "article_label"),
|
||||
Paragraph: getString(payload, "paragraph"),
|
||||
SourceURL: getString(payload, "source_url"),
|
||||
CitationUnit: getString(payload, "citation_unit"),
|
||||
Score: score,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Legal Concept Ontology: the domain-knowledge bridge used by the
// Concept->Norm recall injector.
//
// The words users type ("Datenschutzerklärung", "Cookie Banner") are rarely
// identical to the titles of the articles that actually govern them (Art.
// 12/13/14 DSGVO, § 25 TDDDG). Embedding similarity misses this leap, so the
// bridges are curated: concept keyword -> load-bearing norm_ids. This is NOT a
// fallback to hardcoding; it is domain knowledge that surfaces the normatively
// load-bearing units within the (already correctly retrieved) documents.
//
// conceptNorm is one such bridge.
type conceptNorm struct {
	// keywords are the lowercase phrases that name the concept in a query.
	keywords []string
	// normIDs are the norm ids injected when one of the keywords is found.
	normIDs []string
}
|
||||
|
||||
var legalConceptOntology = []conceptNorm{
|
||||
{[]string{"datenschutzerklärung", "datenschutzerklaerung", "privacy policy", "datenschutzhinweise", "datenschutzinformation"},
|
||||
[]string{"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14"}},
|
||||
{[]string{"cookie banner", "cookie-banner", "cookies", "cookie", "tracking"},
|
||||
[]string{"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7"}},
|
||||
{[]string{"dsfa", "folgenabschätzung", "folgenabschaetzung", "datenschutz-folgenabschätzung"},
|
||||
[]string{"EU-DSGVO-Art35", "EU-DSGVO-Art36"}},
|
||||
{[]string{"auskunft", "auskunftsrecht", "auskunftsersuchen"},
|
||||
[]string{"EU-DSGVO-Art15"}},
|
||||
{[]string{"löschung", "loeschung", "vergessenwerden", "recht auf vergessen"},
|
||||
[]string{"EU-DSGVO-Art17"}},
|
||||
{[]string{"datenübertragbarkeit", "datenuebertragbarkeit", "portabilität", "portabilitaet"},
|
||||
[]string{"EU-DSGVO-Art20"}},
|
||||
{[]string{"widerspruch", "widerspruchsrecht"},
|
||||
[]string{"EU-DSGVO-Art21"}},
|
||||
{[]string{"datenpanne", "datenschutzverletzung", "data breach", "verletzung des schutzes"},
|
||||
[]string{"EU-DSGVO-Art33", "EU-DSGVO-Art34"}},
|
||||
// E4-Quick-Curation (2026-07-01): resolved abbreviations (E2) pull their core norms.
|
||||
{[]string{"technische und organisatorische maßnahmen", "technische und organisatorische massnahmen"},
|
||||
[]string{"EU-DSGVO-Art32", "EU-DSGVO-Art25", "EU-DSGVO-Art5"}},
|
||||
{[]string{"verzeichnis von verarbeitungstätigkeiten", "verzeichnis von verarbeitungstaetigkeiten", "verarbeitungsverzeichnis"},
|
||||
[]string{"EU-DSGVO-Art30"}},
|
||||
{[]string{"auftragsverarbeitungsvertrag", "auftragsverarbeitung", "auftragsverarbeiter"},
|
||||
[]string{"EU-DSGVO-Art28"}},
|
||||
{[]string{"datenschutzbeauftragt"},
|
||||
[]string{"EU-DSGVO-Art37", "EU-DSGVO-Art38", "EU-DSGVO-Art39"}},
|
||||
}
|
||||
|
||||
// ConceptNorms returns the load-bearing norm_ids for the concepts named in the
|
||||
// query (dedup, order-preserving). Empty if no concept is named.
|
||||
func ConceptNorms(query string) []string {
|
||||
q := strings.ToLower(query)
|
||||
seen := map[string]bool{}
|
||||
out := []string{}
|
||||
for _, cn := range legalConceptOntology {
|
||||
for _, kw := range cn.keywords {
|
||||
if strings.Contains(q, kw) {
|
||||
for _, nid := range cn.normIDs {
|
||||
if !seen[nid] {
|
||||
seen[nid] = true
|
||||
out = append(out, nid)
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// InjectConceptNorms merges concept-injected norm units into the results so the
|
||||
// load-bearing norms are VISIBLE in the evidence set. Dedups by citation_unit
|
||||
// (skips norms already retrieved), then re-sorts by score — the injected units
|
||||
// carry a just-below-top score so they surface high WITHOUT displacing the top
|
||||
// document hit (inject, don't blindly dominate). Caps at topK.
|
||||
func InjectConceptNorms(results, injected []LegalSearchResult, topK int) []LegalSearchResult {
|
||||
if len(injected) == 0 {
|
||||
return results
|
||||
}
|
||||
present := map[string]bool{}
|
||||
for _, r := range results {
|
||||
if r.CitationUnit != "" {
|
||||
present[r.CitationUnit] = true
|
||||
}
|
||||
}
|
||||
merged := append([]LegalSearchResult{}, results...)
|
||||
for _, in := range injected {
|
||||
if in.CitationUnit != "" && !present[in.CitationUnit] {
|
||||
merged = append(merged, in)
|
||||
present[in.CitationUnit] = true
|
||||
}
|
||||
}
|
||||
sort.SliceStable(merged, func(i, j int) bool { return merged[i].Score > merged[j].Score })
|
||||
if topK > 0 && len(merged) > topK {
|
||||
merged = merged[:topK]
|
||||
}
|
||||
return merged
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestConceptNorms(t *testing.T) {
|
||||
q := "Was muss ich beachten wenn ich meine Datenschutzerklärung schreibe für meine Website mit Cookie Banner?"
|
||||
got := ConceptNorms(q)
|
||||
want := map[string]bool{
|
||||
"EU-DSGVO-Art12": true, "EU-DSGVO-Art13": true, "EU-DSGVO-Art14": true,
|
||||
"DE-TDDDG-§25": true, "EU-DSGVO-Art6": true, "EU-DSGVO-Art7": true,
|
||||
}
|
||||
for _, nid := range got {
|
||||
delete(want, nid)
|
||||
}
|
||||
if len(want) > 0 {
|
||||
t.Errorf("ConceptNorms missing %v; got %v", want, got)
|
||||
}
|
||||
if len(ConceptNorms("Wie ist das Wetter heute?")) != 0 {
|
||||
t.Errorf("no concept named should yield no norms")
|
||||
}
|
||||
}
|
||||
|
||||
func TestInjectConceptNorms(t *testing.T) {
|
||||
results := []LegalSearchResult{
|
||||
{CitationUnit: "DSK OH Telemedien", Score: 0.98},
|
||||
{CitationUnit: "Art. 25 DSGVO", Score: 0.95},
|
||||
}
|
||||
injected := []LegalSearchResult{
|
||||
{CitationUnit: "Art. 13 DSGVO", Score: 0.979},
|
||||
{CitationUnit: "Art. 25 DSGVO", Score: 0.979}, // already present -> must not double
|
||||
}
|
||||
out := InjectConceptNorms(results, injected, 10)
|
||||
if out[0].CitationUnit != "DSK OH Telemedien" {
|
||||
t.Errorf("top document hit must stay #1 (not dominated), got %s", out[0].CitationUnit)
|
||||
}
|
||||
if len(out) != 3 {
|
||||
t.Errorf("expected 3 (Art.25 not duplicated), got %d", len(out))
|
||||
}
|
||||
found := false
|
||||
for _, r := range out {
|
||||
if r.CitationUnit == "Art. 13 DSGVO" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Art. 13 DSGVO must be injected + visible")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package ucca
|
||||
|
||||
import "context"
|
||||
|
||||
// EvidenceType names WHAT KIND of evidence a retrieved unit is, independent of the
// collection it was retrieved from. Footnotes, tables and figures are evidence
// types, not collections. The Authority Router surfaces non-text evidence from the
// authoritative knowledge space (the KB slice) separately from the merged text
// top-K, so fine-grained evidence is not outranked by broad-base text.
//
// Resulting layering: Intent -> Knowledge Space -> EvidenceType -> Collection ->
// Merge -> Authority. Per the original author's notes, footnotes are populated
// today, tables are already present from C6/C9, and figures arrive with C8; every
// new evidence type travels the same path without rebuilding the router.
type EvidenceType string

// The evidence types known to the router.
const (
	EvidenceText     = EvidenceType("text")
	EvidenceFootnote = EvidenceType("footnote")
	EvidenceTable    = EvidenceType("table")
	EvidenceFigure   = EvidenceType("figure")
)
|
||||
|
||||
// classifyEvidence derives the EvidenceType from a result's payload markers. Precedence
|
||||
// footnote > figure > table > text (a unit carries at most one is_* marker in practice).
|
||||
func classifyEvidence(r LegalSearchResult) EvidenceType {
|
||||
switch {
|
||||
case r.IsFootnote:
|
||||
return EvidenceFootnote
|
||||
case r.IsFigure:
|
||||
return EvidenceFigure
|
||||
case r.IsTable:
|
||||
return EvidenceTable
|
||||
default:
|
||||
return EvidenceText
|
||||
}
|
||||
}
|
||||
|
||||
const (
	// evidenceRetrievalTopK is the hit budget for the authoritative-KB evidence
	// pass. It is deliberately a targeted budget for the authoritative slice of the
	// recognized knowledge space, NOT a blanket top-K increase of the merged result
	// set — per the original author, the successes came from BETTER-targeted
	// evidence, not from MORE evidence.
	evidenceRetrievalTopK = 20

	// maxEvidencePerType is the maximum number of results surfaced per evidence type.
	maxEvidencePerType = 6
)
|
||||
|
||||
// RetrieveEvidence returns the authoritative typed evidence (footnotes/tables/figures) for an
|
||||
// in-scope query, pulled from the KB slice and grouped by EvidenceType. This is the "Evidence Type"
|
||||
// router layer (Option A): when the query is in the KB knowledge space, the authoritative evidence
|
||||
// within that space is surfaced separately so it isn't lost in the broad-base text merge. Returns an
|
||||
// empty map when out of scope or KB routing is disabled. Text evidence is NOT returned here — it
|
||||
// flows through the normal Retrieve() merge (the LLM context + the sources list).
|
||||
func (c *LegalRAGClient) RetrieveEvidence(ctx context.Context, query string) map[EvidenceType][]LegalSearchResult {
|
||||
ev := map[EvidenceType][]LegalSearchResult{}
|
||||
if !c.kbScopeRoutingEnabled || c.kbSliceCollection == "" || !inKBScope(query) {
|
||||
return ev
|
||||
}
|
||||
hits, err := c.searchInternal(ctx, c.kbSliceCollection, query, nil, evidenceRetrievalTopK)
|
||||
if err != nil {
|
||||
return ev
|
||||
}
|
||||
for _, h := range hits {
|
||||
t := classifyEvidence(h)
|
||||
if t == EvidenceText || len(ev[t]) >= maxEvidencePerType {
|
||||
continue
|
||||
}
|
||||
ev[t] = append(ev[t], h)
|
||||
}
|
||||
return ev
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestFilterByKnowledgeSpace(t *testing.T) {
|
||||
results := []LegalSearchResult{
|
||||
{CitationUnit: "Art. 13 DSGVO", RegulationCode: "DSGVO"},
|
||||
{CitationUnit: "EU Mdr", RegulationCode: "EU MDR"},
|
||||
{CitationUnit: "UStG § 14", RegulationCode: "USTG"},
|
||||
{CitationUnit: "DSK OH Telemedien", RegulationCode: "DSK OH TELEMEDIEN"},
|
||||
{CitationUnit: "eIDAS", RegulationCode: "EIDAS"},
|
||||
}
|
||||
out := FilterByKnowledgeSpace(results, "datenschutz", 10)
|
||||
for _, r := range out {
|
||||
if KnowledgeSpaceOf(r.RegulationCode) != "datenschutz" {
|
||||
t.Errorf("off-domain leaked into scoped result: %s (%s)", r.CitationUnit, r.RegulationCode)
|
||||
}
|
||||
}
|
||||
if len(out) != 2 { // Art. 13 DSGVO + DSK OH Telemedien
|
||||
t.Errorf("expected 2 datenschutz hits, got %d", len(out))
|
||||
}
|
||||
// domain with no hits -> fall back to input (never strand the answer)
|
||||
if len(FilterByKnowledgeSpace(results, "maschinen", 10)) != len(results) {
|
||||
t.Errorf("no-hit domain should fall back to full input")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package ucca
|
||||
|
||||
import "strings"
|
||||
|
||||
// DetectIntent classifies the INTERACTION INTENT of a query (Advisor Reasoning
// Stack E3). The same norms are answered very differently depending on the task:
// "Was ist X?" (definition) vs "Wie schreibe ich X?" (anleitung) vs "Prüfe X"
// (review). Detection is deterministic: the query is lower-cased and searched for
// trigger phrases, so the caller can pick the answer form instead of leaving the
// format to the LLM.
//
// Returns one of "review", "checkliste", "vergleich", "anleitung",
// "risikoanalyse", "definition", or "" (neutral) when no trigger matches. Rules are
// evaluated in that order and the first match wins, so a query containing triggers
// of several intents resolves to the earliest one.
func DetectIntent(query string) string {
	// Padding lets the space-delimited triggers (" vs ", " versus ") match at the
	// very start or end of the query as well.
	haystack := " " + strings.ToLower(query) + " "

	rules := []struct {
		intent   string
		triggers []string
	}{
		{"review", []string{
			"prüfe", "prüf mein", "überprüfe", "überprüf", "review", "checke mein",
			"ist mein", "ist meine", "ist unser", "ist unsere", "konform", "stimmt mein",
			"bewerte mein", "analysiere mein",
		}},
		{"checkliste", []string{
			"checkliste", "was muss ich alles", "was gehört alles", "was gehört in",
			"welche punkte muss", "was brauche ich alles",
		}},
		{"vergleich", []string{
			"vergleich", "unterschied", "worin unterscheid", " vs ", " versus ",
			"gegenüber", "im gegensatz",
		}},
		{"anleitung", []string{
			"wie schreibe", "wie erstelle", "wie erstell", "wie mache", "wie baue",
			"wie setze ich", "wie gehe ich vor", "wie formuliere", "wie richte ich",
			"anleitung", "schritt für schritt", "schritt-für-schritt", "erstelle mir",
			"erstell mir", "generiere", "was muss ich beachten", "worauf muss ich achten",
		}},
		{"risikoanalyse", []string{
			"welche risiken", "welche gefahren", "risikoanalyse", "welche bedrohungen",
		}},
		{"definition", []string{
			"was ist", "was bedeutet", "was versteht man", "was sind", "definition",
			"erkläre mir", "erklär mir", "was heißt", "was genau ist",
		}},
	}

	for _, rule := range rules {
		for _, trigger := range rule.triggers {
			if strings.Contains(haystack, trigger) {
				return rule.intent
			}
		}
	}
	return ""
}
|
||||
@@ -0,0 +1,22 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDetectIntent(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"Was ist eine Datenschutzerklärung?": "definition",
|
||||
"Wie schreibe ich eine Datenschutzerklärung?": "anleitung",
|
||||
"Was muss ich beachten wenn ich eine DSE schreibe?": "anleitung",
|
||||
"Prüfe meine Datenschutzerklärung.": "review",
|
||||
"Ist meine Datenschutzerklärung konform?": "review",
|
||||
"Vergleiche DSGVO und BDSG.": "vergleich",
|
||||
"Welche Risiken gibt es?": "risikoanalyse",
|
||||
"Erstelle mir eine Checkliste für die DSFA.": "checkliste",
|
||||
"Wie ist das Wetter?": "",
|
||||
}
|
||||
for q, want := range cases {
|
||||
if got := DetectIntent(q); got != want {
|
||||
t.Errorf("DetectIntent(%q)=%q want %q", q, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,148 @@
|
||||
package ucca
|
||||
|
||||
import "strings"
|
||||
|
||||
// KnowledgeSpace is the CHIP-level knowledge domain used by the clarity gate's
|
||||
// concentration signal + the user-facing context chips. It is deliberately RICHER
|
||||
// than the 4 authority domains in authority.go (data_protection/cyber/ai/
|
||||
// product_safety), which drive the EU-primary/subsidiarity rerank. The clarity
|
||||
// gate must reflect the FULL corpus breadth (arbeitsschutz, arbeitsrecht,
|
||||
// wirtschaftsrecht, finanz, ...) so a broad query surfaces as broad. Kept separate
|
||||
// + additive so the tuned authority rerank stays untouched. Corpus-grounded from
|
||||
// the 463 real regulation codes (0.3% fall through to "sonstiges").
|
||||
|
||||
// knowledgeSpaceExact assigns a knowledge space to regulation codes that must be
// matched by EXACT (upper-cased, trimmed) string. Substring matching would misfire
// on 2-3 character codes such as "OR", "AO" or "BGB".
var knowledgeSpaceExact = map[string]string{
	// wirtschaftsrecht
	"HGB":   "wirtschaftsrecht",
	"BGB":   "wirtschaftsrecht",
	"AO":    "wirtschaftsrecht",
	"OR":    "wirtschaftsrecht",
	"ABGB":  "wirtschaftsrecht",
	"UGB":   "wirtschaftsrecht",
	"IFRS":  "wirtschaftsrecht",
	"BAO":   "wirtschaftsrecht",
	"GMBHG": "wirtschaftsrecht",
	"AKTG":  "wirtschaftsrecht",
	"INSO":  "wirtschaftsrecht",
	"USTG":  "wirtschaftsrecht",
	"GOBD":  "wirtschaftsrecht",
	"EGBGB": "wirtschaftsrecht",
	"GEWO":  "wirtschaftsrecht",
	"URHG":  "wirtschaftsrecht",

	// datenschutz
	"DPF":          "datenschutz",
	"TKG":          "datenschutz",
	"TMG":          "datenschutz",
	"DDG":          "datenschutz",
	"DSG":          "datenschutz",
	"DSV":          "datenschutz",
	"DSM":          "datenschutz",
	"SCC":          "datenschutz",
	"EPRIVACY":     "datenschutz",
	"SCHREMS II":   "datenschutz",
	"CH_REVDSG":    "datenschutz",
	"PLANET49":     "datenschutz",
	"GOOGLE FONTS": "datenschutz",

	// digitale_dienste
	"DSA":             "digitale_dienste",
	"DMA":             "digitale_dienste",
	"DGA":             "digitale_dienste",
	"EHDS":            "digitale_dienste",
	"EIDAS":           "digitale_dienste",
	"EIDAS 2.0":       "digitale_dienste",
	"DATA ACT":        "digitale_dienste",
	"DATAACT":         "digitale_dienste",
	"DIGITAL CONTENT": "digitale_dienste",

	// produktsicherheit
	"MVO":         "produktsicherheit",
	"MACHINERY":   "produktsicherheit",
	"MASCHVO":     "produktsicherheit",
	"MASCHINENVO": "produktsicherheit",
	"GPSR":        "produktsicherheit",
	"PID":         "produktsicherheit",
	"EAA":         "produktsicherheit",
	"BFSG":        "produktsicherheit",
	"ELEKTROG":    "produktsicherheit",
	"VERPACKG":    "produktsicherheit",
	"BATTVO":      "produktsicherheit",
	"BATTDG":      "produktsicherheit",
	"EU MDR":      "produktsicherheit",

	// finanz
	"DORA": "finanz",
	"PSD2": "finanz",
	"MICA": "finanz",
	"AMLR": "finanz",
	"VAIT": "finanz",
	"BAIT": "finanz",
	"GWG":  "finanz",

	// verbraucherschutz
	"UWG":                "verbraucherschutz",
	"UCPD":               "verbraucherschutz",
	"VSBG":               "verbraucherschutz",
	"PANGV":              "verbraucherschutz",
	"DL-INFOV":           "verbraucherschutz",
	"OMNIBUS":            "verbraucherschutz",
	"UWG AT":             "verbraucherschutz",
	"PRODHAFTG":          "verbraucherschutz",
	"PRODUKTHAFTUNGS-RL": "verbraucherschutz",

	// arbeitsrecht
	"ARG": "arbeitsrecht",
}
|
||||
|
||||
// KnowledgeSpaceLabel holds the user-facing chip label for each knowledge-space id.
var KnowledgeSpaceLabel = map[string]string{
	"datenschutz":       "Datenschutz",
	"cyber":             "Cybersecurity",
	"ki":                "KI",
	"produktsicherheit": "Produktsicherheit",
	"arbeitsschutz":     "Arbeitsschutz",
	"arbeitsrecht":      "Arbeitsrecht",
	"wirtschaftsrecht":  "Wirtschaftsrecht",
	"finanz":            "Finanzregulierung",
	"digitale_dienste":  "Digitale Dienste",
	"verbraucherschutz": "Verbraucherschutz",
	"lieferkette":       "Lieferkette/Nachhaltigkeit",
	"hinweisgeber":      "Hinweisgeberschutz",
	"sonstiges":         "Sonstiges",
}
|
||||
|
||||
// KnowledgeSpaceOf maps a regulation_code to a knowledge space. Robust to code
|
||||
// variants (MVO/MASCHVO/MASCHINENVO -> produktsicherheit; DSK SDM / SDM B51 ->
|
||||
// datenschutz). Returns "" for empty/untagged codes (not a knowledge space).
|
||||
func KnowledgeSpaceOf(code string) string {
|
||||
c := strings.ToUpper(strings.TrimSpace(code))
|
||||
if c == "" || c == "NONE" {
|
||||
return ""
|
||||
}
|
||||
if d, ok := knowledgeSpaceExact[c]; ok {
|
||||
return d
|
||||
}
|
||||
has := func(subs ...string) bool {
|
||||
for _, s := range subs {
|
||||
if strings.Contains(c, s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
pre := func(subs ...string) bool {
|
||||
for _, s := range subs {
|
||||
if strings.HasPrefix(c, s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
switch {
|
||||
case pre("TRGS", "TRBS", "ASR", "OSHA") || has("ARBSCHG", "GEFAHRSTOFF"):
|
||||
return "arbeitsschutz"
|
||||
case has("AI ACT", "KI-VO", "KI VERORDNUNG", "GPAI", "AI RMF", "HLEG AI", "GENAI", "OECD AI", "AI PRINCIPLES", "OH KI", "KI BEHOERDEN", "KI SICHERHEIT", "POS KI"):
|
||||
return "ki"
|
||||
case pre("DSGVO", "BDSG", "TDDDG", "DSK", "EDPB", "WP24", "WP25", "WP26", "DSFA", "BFDI", "BAYLDA", "BAYLFB", "EDPS") || has("DATENSCHUTZ", "LOESCHKONZEPT", "LOESCHUNG", "VVT", "TELEMEDIEN", "EU US DPF", "BESCHAEFTIGTENDATEN"):
|
||||
return "datenschutz"
|
||||
case has("CRA", "NIS2", "NISG", "BSIG", "BSI-TR", "BSI_KRITIS", "KRITIS", "ENISA", "NIST", "OWASP", "EUCSA", "EUCC", "CISA", "CYCLONEDX", "SPDX", "SLSA", "OPENTELEMETRY", "CVSS", "SECURE BY DESIGN"):
|
||||
return "cyber"
|
||||
case has("MACHINERY", "MASCH", "BLUE GUIDE", "FDA HFE"):
|
||||
return "produktsicherheit"
|
||||
case has("LKSG", "CSDDD", "CSRD", "TAXONOMY"):
|
||||
return "lieferkette"
|
||||
case has("HINSCHG", "GESCHGEHG"):
|
||||
return "hinweisgeber"
|
||||
case pre("BAG ", "BAG_") || has("ARBVG", "AZG", "ARBZG", "BETRVG", "KSCHG", "MUSCHG", "AGG", "MILOG", "TZBFG", "NACHWG", "BURLG", "611A", "PAY TRANSPARENCY", "ANGG", "MUTTERSCHUTZ"):
|
||||
return "arbeitsrecht"
|
||||
case has("ECOMMERCE", "ECG", "MEDIENG", "VERBRAUCHERRECHTE", "DIGITAL CONTENT"):
|
||||
return "verbraucherschutz"
|
||||
case pre("EUGH", "BVERFG", "BVGE", "BGH", "OGH") || has("EU TAXONOMY"):
|
||||
return "wirtschaftsrecht"
|
||||
default:
|
||||
return "sonstiges"
|
||||
}
|
||||
}
|
||||
|
||||
// ScopeResults implements G1 scope-gating: when the query names a regulation, its
|
||||
// knowledge space's hits LEAD the result set (the L2 answer + [n] citations are
|
||||
// built on this order, so scoped answers cite the named regulation instead of the
|
||||
// embedding-majority domain). Non-scoped hits backfill to keep topK. Stable within
|
||||
// each partition. Returns results unchanged when scope is "".
|
||||
func ScopeResults(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
|
||||
if scope == "" {
|
||||
return results
|
||||
}
|
||||
scoped := make([]LegalSearchResult, 0, len(results))
|
||||
rest := make([]LegalSearchResult, 0, len(results))
|
||||
for _, r := range results {
|
||||
if KnowledgeSpaceOf(r.RegulationCode) == scope {
|
||||
scoped = append(scoped, r)
|
||||
} else {
|
||||
rest = append(rest, r)
|
||||
}
|
||||
}
|
||||
out := append(scoped, rest...)
|
||||
if topK > 0 && len(out) > topK {
|
||||
out = out[:topK]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// FilterByKnowledgeSpace returns ONLY the results in the given knowledge space —
|
||||
// a HARD scope with no off-domain backfill. Used by E5 context scoping: when the
|
||||
// user explicitly chose a domain chip, off-domain regelwerke (MDR/UStG/eIDAS) must
|
||||
// not reappear in the evidence. Falls back to the input when the domain has no hits
|
||||
// (never strand the answer). Caps topK.
|
||||
func FilterByKnowledgeSpace(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
|
||||
if scope == "" {
|
||||
return results
|
||||
}
|
||||
out := make([]LegalSearchResult, 0, len(results))
|
||||
for _, r := range results {
|
||||
if KnowledgeSpaceOf(r.RegulationCode) == scope {
|
||||
out = append(out, r)
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return results
|
||||
}
|
||||
if topK > 0 && len(out) > topK {
|
||||
out = out[:topK]
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -37,6 +37,17 @@ type LegalSearchResult struct {
|
||||
// Supersede-Status (status="superseded", use_for_primary=false) — Alt-Quelle,
|
||||
// die fuer Default-Fragen demoted wird (nicht versteckt; fuer Historie auffindbar).
|
||||
Superseded bool `json:"-"`
|
||||
|
||||
// Evidence-Type-Marker — intern (json:"-", kein Pro-Result-Contract-Change), aus dem
|
||||
// Qdrant-Payload befuellt. classifyEvidence() leitet daraus den EvidenceType ab; der
|
||||
// Router surfacet nicht-Text-Evidence (Fußnote/Tabelle/Figur) getrennt vom Text-Merge,
|
||||
// damit feingranulare Evidence nicht von Breit-Basis-Text ueberrankt wird.
|
||||
IsFootnote bool `json:"-"`
|
||||
FootnoteLabel string `json:"-"`
|
||||
FootnoteVerbatim string `json:"-"`
|
||||
RefCitationUnit string `json:"-"`
|
||||
IsTable bool `json:"-"` // C6/C9: is_table (liniiert + borderless)
|
||||
IsFigure bool `json:"-"` // C8: is_figure (noch nicht befuellt bis C8)
|
||||
}
|
||||
|
||||
// LegalAssessment is the auditable explanation layer over a ranked result set:
|
||||
|
||||
@@ -195,6 +195,13 @@ func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult {
|
||||
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
||||
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
||||
Superseded: getString(hit.Payload, "status") == "superseded",
|
||||
|
||||
IsFootnote: getBool(hit.Payload, "is_footnote"),
|
||||
FootnoteLabel: getString(hit.Payload, "footnote_label"),
|
||||
FootnoteVerbatim: getString(hit.Payload, "footnote_verbatim"),
|
||||
RefCitationUnit: getString(hit.Payload, "ref_citation_unit"),
|
||||
IsTable: getBool(hit.Payload, "is_table"),
|
||||
IsFigure: getBool(hit.Payload, "is_figure"),
|
||||
}
|
||||
}
|
||||
return results
|
||||
|
||||
Reference in New Issue
Block a user