diff --git a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go index 1842c43b..d48484b4 100644 --- a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go +++ b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go @@ -1,6 +1,9 @@ package handlers import ( + "encoding/json" + "fmt" + "log" "net/http" "strconv" @@ -87,6 +90,7 @@ func (h *RAGHandlers) Search(c *gin.Context) { type RetrieveRequest struct { Query string `json:"query" binding:"required"` TopK int `json:"top_k,omitempty"` + Context string `json:"context,omitempty"` } // Retrieve is the Authority Router endpoint. The Advisor calls this with ONLY a query and stays @@ -105,20 +109,171 @@ func (h *RAGHandlers) Retrieve(c *gin.Context) { req.TopK = 8 } + // E2 Term Resolution: expand unambiguous abbreviations (TOM/VVT/AVV/DSB/DSFA) into the + // query so retrieval finds them; ambiguous ones (DSE/DPA) are surfaced to the FE — NOT + // auto-mapped (chat context E1 wins, else the FE asks). + intent := ucca.DetectIntent(req.Query) + termRes := ucca.ResolveAbbreviations(req.Query) + req.Query = termRes.Expanded + results, err := h.ragClient.Retrieve(c.Request.Context(), req.Query, req.TopK) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "RAG retrieve failed: " + err.Error()}) return } + // Evidence-Type-Schicht: die autoritative typisierte Evidence (Fußnoten/Tabellen/Figuren) aus + // dem KB-Wissensraum SEPARAT surfacen, statt sie im Breit-Basis-Text-Merge zu verlieren. + // results[] bleibt der Text-Kontext fürs LLM + die Quellen-Liste. + // Context scoping (E5): the user explicitly chose a knowledge space (chip), so scope + // the evidence HARD to it (wider re-retrieve + domain filter) — no off-domain regelwerke + // (MDR/UStG/eIDAS) after a context decision. 
+ if req.Context != "" { + if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 { + results = ucca.FilterByKnowledgeSpace(wide, req.Context, req.TopK) + } else { + results = ucca.FilterByKnowledgeSpace(results, req.Context, req.TopK) + } + } + + // G1 scope-gating: a named regulation scopes the evidence to its knowledge space. + // Re-retrieve wider and lead with the named regulation's domain so the L2 answer + + // [n] citations are built on scoped evidence, not the embedding-majority domain. + if scope := ucca.QueryKnowledgeSpace(req.Query); scope != "" { + if wide, werr := h.ragClient.Retrieve(c.Request.Context(), req.Query, 30); werr == nil && len(wide) > 0 { + results = ucca.ScopeResults(wide, scope, req.TopK) + } else { + results = ucca.ScopeResults(results, scope, req.TopK) + } + } + + ev := h.ragClient.RetrieveEvidence(c.Request.Context(), req.Query) + // Concept->Norm recall injector: if the query names a legal concept, fetch its + // load-bearing norms (Datenschutzerklärung -> Art. 12/13/14 DSGVO, ...) and inject + // them into the evidence set so they surface (embedding similarity misses them). 
+ if norms := ucca.ConceptNorms(req.Query); len(norms) > 0 { + top := 0.9 + if len(results) > 0 { + top = results[0].Score + } + injected := h.ragClient.FetchByNormIDs(c.Request.Context(), norms, top-0.001) + results = ucca.InjectConceptNorms(results, injected, req.TopK) + } + clarity := ucca.ClassifyClarity(req.Query, results) + traceClarity(req.Query, clarity, results) + c.JSON(http.StatusOK, gin.H{ "query": req.Query, "results": results, "count": len(results), "assessment": ucca.Assess(results), + "footnotes": footnotesFromEvidence(ev[ucca.EvidenceFootnote]), + "tables": tablesFromEvidence(ev[ucca.EvidenceTable]), + "evidence": evidenceFromResults(results), + "visual_evidence": visualEvidenceFromEvidence(ev[ucca.EvidenceFigure]), + "clarity": clarity, + "term_resolution": termRes.Ambiguous, + "interaction_intent": intent, }) } +// footnotesFromEvidence maps FOOTNOTE evidence to the Evidence-Workspace RawFootnote shape. +func footnotesFromEvidence(rs []ucca.LegalSearchResult) []gin.H { + out := make([]gin.H, 0, len(rs)) + for _, r := range rs { + out = append(out, gin.H{ + "id": r.CitationUnit, + "ref": r.CitationUnit, + "number": r.FootnoteLabel, + "regulation_code": r.RegulationCode, + "regulation_short": r.RegulationShort, + "regulation_name": r.RegulationName, + "section": r.RefCitationUnit, + "text": r.FootnoteVerbatim, + }) + } + return out +} + +// tablesFromEvidence maps TABLE evidence (C6/C9). Key is present so the same Evidence-Type path +// carries tables the moment the UI adds a table section. 
+func tablesFromEvidence(rs []ucca.LegalSearchResult) []gin.H { + out := make([]gin.H, 0, len(rs)) + for _, r := range rs { + out = append(out, gin.H{ + "id": r.CitationUnit, + "caption": r.ArticleLabel, + "regulation_code": r.RegulationCode, + "regulation_short": r.RegulationShort, + "regulation_name": r.RegulationName, + "section": r.RefCitationUnit, + "text": r.Text, + }) + } + return out +} + +// visualEvidenceFromEvidence maps FIGURE evidence to the Visual Evidence contract shape +// (C8). visual_type/image_ref/vision_summary populate once C8 lands; the shape is stable now. +func visualEvidenceFromEvidence(rs []ucca.LegalSearchResult) []gin.H { + out := make([]gin.H, 0, len(rs)) + for _, r := range rs { + out = append(out, gin.H{ + "visual_id": r.CitationUnit, + "visual_type": "figure", + "caption": r.ArticleLabel, + "document": evidenceDocName(r), + "context": ucca.KnowledgeSpaceOf(r.RegulationCode), + "regulation_code": r.RegulationCode, + "section": r.RefCitationUnit, + "image_ref": "", + "vision_summary": "", + }) + } + return out +} + +// evidenceFromResults maps retrieval hits to the Evidence contract shape the Advisor +// Evidence Workspace renders (citations[] reference evidence_id). Populated at retrieve +// time; citations[] (the [n]<->evidence coupling) come from the answer-generation step. +func evidenceFromResults(rs []ucca.LegalSearchResult) []gin.H { + out := make([]gin.H, 0, len(rs)) + for _, r := range rs { + id := r.CitationUnit + if id == "" { + id = r.ArticleLabel + } + out = append(out, gin.H{ + "evidence_id": id, + "document": evidenceDocName(r), + "section": r.ArticleLabel, + "paragraph": r.Paragraph, + "snippet": evidenceSnippet(r.Text, 280), + "url": r.SourceURL, + "regulation_code": r.RegulationCode, + "context": ucca.KnowledgeSpaceOf(r.RegulationCode), + }) + } + return out +} + +// evidenceDocName is the human-facing source name (short code, else full name). 
+func evidenceDocName(r ucca.LegalSearchResult) string { + if r.RegulationShort != "" { + return r.RegulationShort + } + return r.RegulationName +} + +// evidenceSnippet returns a trimmed excerpt of at most n runes. +func evidenceSnippet(s string, n int) string { + rs := []rune(s) + if len(rs) <= n { + return s + } + return string(rs[:n]) + "…" +} + // ListRegulations returns the list of available regulations in the corpus. // GET /sdk/v1/rag/regulations func (h *RAGHandlers) ListRegulations(c *gin.Context) { @@ -273,3 +428,29 @@ func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) { }, }) } + +// traceClarity emits a structured CLARITY_TRACE log line per retrieve for the macmini +// test session, so qualitative user ratings can be correlated with the gate decision. +func traceClarity(query string, cl ucca.Clarity, results []ucca.LegalSearchResult) { + top := make([]string, 0, 3) + for i, r := range results { + if i >= 3 { + break + } + top = append(top, r.RegulationShort) + } + chips := make([]string, 0, len(cl.CandidateContexts)) + for _, c := range cl.CandidateContexts { + chips = append(chips, fmt.Sprintf("%s:%d", c.ID, c.Hits)) + } + b, _ := json.Marshal(map[string]interface{}{ + "query": query, + "mode": cl.Mode, + "reason": cl.Reason, + "concentration": cl.Concentration, + "dominant": cl.DominantContext, + "chips": chips, + "top_evidence": top, + }) + log.Printf("CLARITY_TRACE %s", string(b)) +} diff --git a/ai-compliance-sdk/internal/ucca/abbrev_test.go b/ai-compliance-sdk/internal/ucca/abbrev_test.go new file mode 100644 index 00000000..bac6c1dc --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/abbrev_test.go @@ -0,0 +1,33 @@ +package ucca + +import ( + "strings" + "testing" +) + +func TestResolveAbbreviations(t *testing.T) { + // unambiguous -> expanded, not flagged + tr := ResolveAbbreviations("Was ist eine TOM?") + if !strings.Contains(tr.Expanded, "technische und organisatorische") { + t.Errorf("TOM must be expanded, got %q", tr.Expanded) + } + 
if len(tr.Ambiguous) != 0 { + t.Errorf("TOM must not be ambiguous, got %v", tr.Ambiguous) + } + // ambiguous DSE -> flagged, NOT auto-expanded (chat context must win, else FE asks) + tr2 := ResolveAbbreviations("welche Infos in eine DSE?") + if tr2.Expanded != "welche Infos in eine DSE?" { + t.Errorf("DSE must NOT be auto-mapped, got %q", tr2.Expanded) + } + if len(tr2.Ambiguous) != 1 || tr2.Ambiguous[0].Abbreviation != "DSE" || len(tr2.Ambiguous[0].Candidates) != 2 { + t.Errorf("DSE must be flagged ambiguous with 2 candidates, got %v", tr2.Ambiguous) + } + // no abbreviation -> unchanged + if ResolveAbbreviations("Wie ist das Wetter?").Expanded != "Wie ist das Wetter?" { + t.Errorf("query without abbreviation must be unchanged") + } + // substring must NOT match ("atom" contains "tom" but is not the word TOM) + if strings.Contains(ResolveAbbreviations("Was ist ein Atom?").Expanded, "organisatorische") { + t.Errorf("substring 'tom' in 'Atom' must not trigger expansion") + } +} diff --git a/ai-compliance-sdk/internal/ucca/abbreviation_lexicon.go b/ai-compliance-sdk/internal/ucca/abbreviation_lexicon.go new file mode 100644 index 00000000..ba0ea78c --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/abbreviation_lexicon.go @@ -0,0 +1,65 @@ +package ucca + +import ( + "strings" + "unicode" +) + +// TermResolution is the E2 (Term Resolution) signal in the Advisor Reasoning Stack. +// Expanded drives retrieval internally (unambiguous abbreviations are spelled out so +// the embedding/concept layer finds them). Ambiguous is surfaced to the FE, which +// resolves it via chat context (E1) or asks the user ("Meinst du X oder Y?"). The +// lexicon NEVER auto-maps an ambiguous abbreviation (e.g. DSE) — real-life discipline. +type TermResolution struct { + Expanded string `json:"-"` + Ambiguous []TermAmbiguity `json:"ambiguous,omitempty"` +} + +// TermAmbiguity flags one abbreviation the SDK could not resolve deterministically. 
type TermAmbiguity struct {
	// Abbreviation is the abbreviation as reported to the caller
	// (ResolveAbbreviations upper-cases it before storing it here).
	Abbreviation string `json:"abbreviation"`
	// Candidates are the possible canonical terms taken from abbreviationLexicon.
	Candidates []string `json:"candidates"`
}

// abbreviationLexicon maps a lowercased abbreviation to its canonical term(s).
// Exactly one candidate means the abbreviation is unambiguous and gets expanded;
// more than one candidate means it is ambiguous and is only flagged, never expanded.
// The list is intentionally small to start with (user spec).
var abbreviationLexicon = map[string][]string{
	"dse":  {"Datenschutzerklärung", "Datenschutz-Folgenabschätzung"}, // ambiguous — context wins, else ask
	"dsfa": {"Datenschutz-Folgenabschätzung"},
	"tom":  {"technische und organisatorische Maßnahmen"},
	"vvt":  {"Verzeichnis von Verarbeitungstätigkeiten"},
	"avv":  {"Auftragsverarbeitungsvertrag"},
	"dsb":  {"Datenschutzbeauftragter"},
	"dpa":  {"Data Processing Agreement", "Datenschutzaufsichtsbehörde"}, // ambiguous
}

// ResolveAbbreviations expands unambiguous abbreviations into the query and flags
// ambiguous ones. Deterministic: iterates query tokens in order (no map-order
// dependence). Whole-word match (case-insensitive) so "TOM" hits but "atom" does not.
+func ResolveAbbreviations(query string) TermResolution { + tr := TermResolution{Expanded: query} + words := strings.FieldsFunc(query, func(r rune) bool { + return !unicode.IsLetter(r) && !unicode.IsNumber(r) + }) + seen := map[string]bool{} + var expansions []string + for _, w := range words { + lw := strings.ToLower(w) + cands, ok := abbreviationLexicon[lw] + if !ok || seen[lw] { + continue + } + seen[lw] = true + if len(cands) == 1 { + expansions = append(expansions, cands[0]) + } else { + tr.Ambiguous = append(tr.Ambiguous, TermAmbiguity{ + Abbreviation: strings.ToUpper(lw), Candidates: cands, + }) + } + } + if len(expansions) > 0 { + tr.Expanded = query + " " + strings.Join(expansions, " ") + } + return tr +} diff --git a/ai-compliance-sdk/internal/ucca/clarity.go b/ai-compliance-sdk/internal/ucca/clarity.go new file mode 100644 index 00000000..7ce6466b --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/clarity.go @@ -0,0 +1,135 @@ +package ucca + +import ( + "sort" + "strings" +) + +// Clarity is the READ-ONLY, INSTRUMENTED clarity-gate signal emitted alongside a +// retrieve response. It does NOT change retrieval or advisor behaviour yet — the +// advisor still answers normally. Once ~30-50 real questions are collected the +// thresholds get finalised and the gate is activated in the advisor flow. +// +// Ambiguity has two independent sources (empirically measured, 12-question set): +// - retrieval scatter: hits spread across many knowledge spaces (low +// concentration / high domain_count) — the retriever itself can't localise. +// - conceptual generality: a general term the corpus OVER-localises (e.g. "PDCA" +// concentrates on datenschutz but is cross-domain) — only an LLM knows this. +// The middle band is where the LLM-intent classifier must decide. +// +// G1 (explicit scope): when the query NAMES a regulation ("... 
nach TRGS", "CRA +// ...", "MaschinenVO ..."), that explicit context beats the embedding scatter — +// the gate scopes to the named regulation's knowledge space regardless of +// concentration. This is regulation detection, NOT a broad-term list. +type Clarity struct { + Mode string `json:"mode"` // "answer" | "clarify" + Reason string `json:"reason"` // low_concentration | many_domains | high_confidence_scope | middle_band_llm_needed | explicit_scope | no_domain_signal + Concentration float64 `json:"concentration"` // fraction of tagged hits in the dominant knowledge space + DomainCount int `json:"domain_count"` // distinct knowledge spaces in the hits + DominantContext string `json:"dominant_context"` // knowledge-space id (explicit scope wins if the query names a regulation) + CandidateContexts []ClarityContext `json:"candidate_contexts"` // corpus-grounded chips (spaces actually present) +} + +// ClarityContext is one corpus-grounded context chip. +type ClarityContext struct { + ID string `json:"id"` + Label string `json:"label"` + Hits int `json:"hits"` +} + +// Tiered thresholds — INSTRUMENTED DEFAULTS, calibrate on 30-50 real questions. +const ( + clarityMaxConcentration = 0.45 // <= this => clarify (retrieval scatter) + clarityMinDomains = 4 // >= this => clarify (broad spread) + clarityAnswerConc = 0.75 // >= this => answer (confident scope) +) + +// QueryKnowledgeSpace detects an EXPLICIT regulation mention in the query and maps +// it to a knowledge space. Regulation detection (authority), not a broad-term list: +// only fires when the user names a concrete regelwerk. "" if none named. 
func QueryKnowledgeSpace(query string) string {
	// Normalise before matching: lowercase, turn punctuation into spaces, collapse
	// all whitespace runs to a single space and pad both ends. The short acronyms
	// below are matched as space-delimited words (" cra ", " mdr ", ...) so they
	// cannot fire inside longer words; without this step "... der CRA?", "(DSFA)"
	// or "MDR," were missed because the acronym is followed by punctuation
	// instead of a space. Hyphens are kept on purpose: "nis-2", "ki-vo" etc. are
	// matched literally.
	const separators = ".,;:!?()[]{}<>\"'/\\„“”«»"
	cleaned := strings.Map(func(r rune) rune {
		if strings.ContainsRune(separators, r) {
			return ' '
		}
		return r
	}, strings.ToLower(query))
	q := " " + strings.Join(strings.Fields(cleaned), " ") + " "

	// has reports whether the normalised query contains any of the given markers.
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}

	// Case order is significant: the first matching knowledge space wins.
	switch {
	case has("trgs", "trbs", " asr ", "gefahrstoff", "arbeitsplatzgrenzwert", "arbeitsschutz"):
		return "arbeitsschutz"
	case has("dsgvo", "gdpr", "bdsg", "tdddg", "ttdsg", " dsk ", "edpb", "datenschutz", " dsfa "):
		return "datenschutz"
	// NOTE(review): DORA is scoped to "cyber" here, while the KnowledgeSpaceOf test
	// table in this change maps the regulation code "DORA" to "finanz" — confirm
	// which knowledge space is intended.
	case has(" cra ", "cyber resilience", "nis2", "nis-2", " dora ", "enisa", "bsig", "kritis"):
		return "cyber"
	case has("ai act", "ki-vo", "ki-verordnung", "ki-system"):
		return "ki"
	case has("maschinenverordnung", "maschinenvo", "maschvo", "maschinenrichtlinie", " gpsr ", "produktsicherheit",
		" mdr ", "medizinprodukt", "medical device"):
		return "produktsicherheit"
	}
	return ""
}

// ClassifyClarity computes the read-only clarity signal. Deterministic tiers on the
// knowledge-space concentration, PLUS the G1 explicit-scope override: if the query
// names a regulation, that scope wins over the embedding scatter.
+func ClassifyClarity(query string, results []LegalSearchResult) Clarity { + counts := map[string]int{} + total := 0 + for _, r := range results { + if s := KnowledgeSpaceOf(r.RegulationCode); s != "" { + counts[s]++ + total++ + } + } + cl := Clarity{Mode: "answer", Reason: "high_confidence_scope", CandidateContexts: []ClarityContext{}} + if total == 0 { + cl.Mode, cl.Reason = "clarify", "no_domain_signal" + if ks := QueryKnowledgeSpace(query); ks != "" { + cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", ks + } + return cl + } + type kc struct { + id string + n int + } + ks := make([]kc, 0, len(counts)) + for id, n := range counts { + ks = append(ks, kc{id, n}) + } + sort.Slice(ks, func(i, j int) bool { + if ks[i].n != ks[j].n { + return ks[i].n > ks[j].n + } + return ks[i].id < ks[j].id + }) + cl.DominantContext = ks[0].id + cl.Concentration = float64(ks[0].n) / float64(total) + cl.DomainCount = len(counts) + for _, k := range ks { + cl.CandidateContexts = append(cl.CandidateContexts, ClarityContext{ + ID: k.id, Label: KnowledgeSpaceLabel[k.id], Hits: k.n, + }) + } + switch { + case cl.Concentration <= clarityMaxConcentration: + cl.Mode, cl.Reason = "clarify", "low_concentration" + case cl.DomainCount >= clarityMinDomains: + cl.Mode, cl.Reason = "clarify", "many_domains" + case cl.Concentration >= clarityAnswerConc: + cl.Mode, cl.Reason = "answer", "high_confidence_scope" + default: + cl.Mode, cl.Reason = "answer", "middle_band_llm_needed" + } + // G1: an explicitly named regulation beats the embedding scatter. 
+ if q := QueryKnowledgeSpace(query); q != "" { + cl.Mode, cl.Reason, cl.DominantContext = "answer", "explicit_scope", q + } + return cl +} diff --git a/ai-compliance-sdk/internal/ucca/clarity_test.go b/ai-compliance-sdk/internal/ucca/clarity_test.go new file mode 100644 index 00000000..01acef15 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/clarity_test.go @@ -0,0 +1,64 @@ +package ucca + +import "testing" + +func TestKnowledgeSpaceOf(t *testing.T) { + cases := map[string]string{ + "DSGVO": "datenschutz", + "BDSG": "datenschutz", + "DSK SDM B51 ZUGRIFFE": "datenschutz", + "EDPS DIGITAL ETHICS": "datenschutz", + "TRGS 900": "arbeitsschutz", + "OSHA 1910 SUBPART O": "arbeitsschutz", + "HGB": "wirtschaftsrecht", + "BGB": "wirtschaftsrecht", + "MASCHINENVO": "produktsicherheit", + "MVO": "produktsicherheit", + "CRA": "cyber", + "NIST SP800 53R5": "cyber", + "AI ACT": "ki", + "KI-VO": "ki", + "DORA": "finanz", + "ARG": "arbeitsrecht", + "": "", + } + for code, want := range cases { + if got := KnowledgeSpaceOf(code); got != want { + t.Errorf("KnowledgeSpaceOf(%q)=%q want %q", code, got, want) + } + } +} + +func TestClassifyClarity(t *testing.T) { + scattered := []LegalSearchResult{ + {RegulationCode: "CRA"}, {RegulationCode: "MASCHINENVO"}, {RegulationCode: "EU MDR"}, + {RegulationCode: "KI-VO"}, {RegulationCode: "TRBS 1111"}, {RegulationCode: "OWASP TOP10"}, + } + if c := ClassifyClarity("Welche Risiken gibt es?", scattered); c.Mode != "clarify" { + t.Errorf("scattered: mode=%q reason=%q want clarify", c.Mode, c.Reason) + } + concentrated := []LegalSearchResult{ + {RegulationCode: "DSGVO"}, {RegulationCode: "BDSG"}, {RegulationCode: "DSK SDM"}, + {RegulationCode: "EDPB WP243"}, {RegulationCode: "TDDDG"}, + } + c := ClassifyClarity("Was ist eine DSFA?", concentrated) + if c.Mode != "answer" || c.DominantContext != "datenschutz" { + t.Errorf("concentrated: mode=%q dominant=%q want answer/datenschutz", c.Mode, c.DominantContext) + } +} + +func 
TestClassifyClarity_ExplicitScope(t *testing.T) { + // G1: query names TRGS -> arbeitsschutz wins even though retrieval scatters to datenschutz. + scattered := []LegalSearchResult{ + {RegulationCode: "DSK SDM METHODE"}, {RegulationCode: "DSK SDM V31"}, {RegulationCode: "DSK SDM B41 PLANEN"}, + {RegulationCode: "DSGVO"}, {RegulationCode: "DSK SDM"}, {RegulationCode: "TRGS 900"}, {RegulationCode: "TRGS 554"}, + } + c := ClassifyClarity("Schwellwertanalyse nach TRGS", scattered) + if c.Mode != "answer" || c.Reason != "explicit_scope" || c.DominantContext != "arbeitsschutz" { + t.Errorf("explicit TRGS: mode=%q reason=%q dominant=%q want answer/explicit_scope/arbeitsschutz", c.Mode, c.Reason, c.DominantContext) + } + // no regulation named -> falls through to tiered logic + if c := ClassifyClarity("Welche Risiken gibt es?", scattered); c.Reason == "explicit_scope" { + t.Errorf("no reg named should not be explicit_scope, got %q", c.Reason) + } +} diff --git a/ai-compliance-sdk/internal/ucca/concept_fetch.go b/ai-compliance-sdk/internal/ucca/concept_fetch.go new file mode 100644 index 00000000..cf4508ef --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/concept_fetch.go @@ -0,0 +1,88 @@ +package ucca + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" +) + +// FetchByNormIDs loads one representative unit per norm_id from the KB slice +// collection — the fetch side of the Concept->Norm recall injector. Returns +// LegalSearchResult with the caller-provided concept-relevance score (there is no +// similarity query; the injector places them by that score). Returns nil on any +// error or when no KB slice is configured (graceful degradation). 
+func (c *LegalRAGClient) FetchByNormIDs(ctx context.Context, normIDs []string, score float64) []LegalSearchResult { + if c.kbSliceCollection == "" || len(normIDs) == 0 { + return nil + } + should := make([]map[string]interface{}, 0, len(normIDs)) + for _, nid := range normIDs { + should = append(should, map[string]interface{}{"key": "norm_id", "match": map[string]interface{}{"value": nid}}) + } + reqBody := map[string]interface{}{ + "limit": len(normIDs) * 3, + "with_payload": true, + "with_vectors": false, + "filter": map[string]interface{}{"should": should}, + } + jsonBody, err := json.Marshal(reqBody) + if err != nil { + return nil + } + url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, c.kbSliceCollection) + req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody)) + if err != nil { + return nil + } + req.Header.Set("Content-Type", "application/json") + if c.qdrantAPIKey != "" { + req.Header.Set("api-key", c.qdrantAPIKey) + } + resp, err := c.httpClient.Do(req) + if err != nil { + return nil + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return nil + } + var scrollResp qdrantScrollResponse + if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil { + return nil + } + seen := map[string]bool{} + out := make([]LegalSearchResult, 0, len(normIDs)) + for _, pt := range scrollResp.Result.Points { + nid := getString(pt.Payload, "norm_id") + if nid == "" || seen[nid] { + continue + } + seen[nid] = true + out = append(out, scrollPointToResult(pt.Payload, score)) + } + return out +} + +// scrollPointToResult maps a scroll-point payload to a LegalSearchResult. Mirrors +// hitsToResults' payload keys; the score is assigned by the caller (concept rank). 
+func scrollPointToResult(payload map[string]interface{}, score float64) LegalSearchResult { + regCode := getString(payload, "regulation_code") + if regCode == "" { + regCode = getString(payload, "regulation_id") + } + return LegalSearchResult{ + Text: getString(payload, "chunk_text"), + RegulationCode: regCode, + RegulationName: getString(payload, "regulation_name_de"), + RegulationShort: getString(payload, "regulation_short"), + Category: getString(payload, "category"), + Article: getString(payload, "article"), + ArticleLabel: getString(payload, "article_label"), + Paragraph: getString(payload, "paragraph"), + SourceURL: getString(payload, "source_url"), + CitationUnit: getString(payload, "citation_unit"), + Score: score, + } +} diff --git a/ai-compliance-sdk/internal/ucca/concept_ontology.go b/ai-compliance-sdk/internal/ucca/concept_ontology.go new file mode 100644 index 00000000..f56f8147 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/concept_ontology.go @@ -0,0 +1,97 @@ +package ucca + +import ( + "sort" + "strings" +) + +// Legal Concept Ontology — the fachliche IP bridge for the Concept->Norm recall +// injector. The words users type ("Datenschutzerklärung", "Cookie Banner") are +// rarely identical to the article titles that actually govern them (Art. 12/13/14 +// DSGVO, § 25 TDDDG). Embedding similarity misses this leap, so these bridges are +// curated: concept keyword -> load-bearing norm_ids. This is NOT a fallback to +// hardcoding — it is domain knowledge that surfaces the normatively load-bearing +// units within the (already correctly retrieved) documents. 
type conceptNorm struct {
	// keywords are lowercase phrases; ConceptNorms matches them as substrings of
	// the lowercased query, so a stem such as "datenschutzbeauftragt" also covers
	// its inflected forms.
	keywords []string
	// normIDs are the norm identifiers contributed when any keyword matches.
	normIDs []string
}

// legalConceptOntology is the curated concept -> norm table. Each entry lists the
// keyword variants for one concept (including ae/oe/ue/ss transliterations) and the
// norm_ids that entry contributes. Entry order determines the order of the norm_ids
// returned by ConceptNorms.
var legalConceptOntology = []conceptNorm{
	{[]string{"datenschutzerklärung", "datenschutzerklaerung", "privacy policy", "datenschutzhinweise", "datenschutzinformation"},
		[]string{"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14"}},
	{[]string{"cookie banner", "cookie-banner", "cookies", "cookie", "tracking"},
		[]string{"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7"}},
	{[]string{"dsfa", "folgenabschätzung", "folgenabschaetzung", "datenschutz-folgenabschätzung"},
		[]string{"EU-DSGVO-Art35", "EU-DSGVO-Art36"}},
	{[]string{"auskunft", "auskunftsrecht", "auskunftsersuchen"},
		[]string{"EU-DSGVO-Art15"}},
	{[]string{"löschung", "loeschung", "vergessenwerden", "recht auf vergessen"},
		[]string{"EU-DSGVO-Art17"}},
	{[]string{"datenübertragbarkeit", "datenuebertragbarkeit", "portabilität", "portabilitaet"},
		[]string{"EU-DSGVO-Art20"}},
	{[]string{"widerspruch", "widerspruchsrecht"},
		[]string{"EU-DSGVO-Art21"}},
	{[]string{"datenpanne", "datenschutzverletzung", "data breach", "verletzung des schutzes"},
		[]string{"EU-DSGVO-Art33", "EU-DSGVO-Art34"}},
	// E4-Quick-Curation (2026-07-01): resolved abbreviations (E2) pull their core norms.
	{[]string{"technische und organisatorische maßnahmen", "technische und organisatorische massnahmen"},
		[]string{"EU-DSGVO-Art32", "EU-DSGVO-Art25", "EU-DSGVO-Art5"}},
	{[]string{"verzeichnis von verarbeitungstätigkeiten", "verzeichnis von verarbeitungstaetigkeiten", "verarbeitungsverzeichnis"},
		[]string{"EU-DSGVO-Art30"}},
	{[]string{"auftragsverarbeitungsvertrag", "auftragsverarbeitung", "auftragsverarbeiter"},
		[]string{"EU-DSGVO-Art28"}},
	{[]string{"datenschutzbeauftragt"},
		[]string{"EU-DSGVO-Art37", "EU-DSGVO-Art38", "EU-DSGVO-Art39"}},
}

// ConceptNorms returns the load-bearing norm_ids for the concepts named in the
// query (dedup, order-preserving). Empty if no concept is named.
+func ConceptNorms(query string) []string { + q := strings.ToLower(query) + seen := map[string]bool{} + out := []string{} + for _, cn := range legalConceptOntology { + for _, kw := range cn.keywords { + if strings.Contains(q, kw) { + for _, nid := range cn.normIDs { + if !seen[nid] { + seen[nid] = true + out = append(out, nid) + } + } + break + } + } + } + return out +} + +// InjectConceptNorms merges concept-injected norm units into the results so the +// load-bearing norms are VISIBLE in the evidence set. Dedups by citation_unit +// (skips norms already retrieved), then re-sorts by score — the injected units +// carry a just-below-top score so they surface high WITHOUT displacing the top +// document hit (inject, don't blindly dominate). Caps at topK. +func InjectConceptNorms(results, injected []LegalSearchResult, topK int) []LegalSearchResult { + if len(injected) == 0 { + return results + } + present := map[string]bool{} + for _, r := range results { + if r.CitationUnit != "" { + present[r.CitationUnit] = true + } + } + merged := append([]LegalSearchResult{}, results...) + for _, in := range injected { + if in.CitationUnit != "" && !present[in.CitationUnit] { + merged = append(merged, in) + present[in.CitationUnit] = true + } + } + sort.SliceStable(merged, func(i, j int) bool { return merged[i].Score > merged[j].Score }) + if topK > 0 && len(merged) > topK { + merged = merged[:topK] + } + return merged +} diff --git a/ai-compliance-sdk/internal/ucca/concept_test.go b/ai-compliance-sdk/internal/ucca/concept_test.go new file mode 100644 index 00000000..7777d884 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/concept_test.go @@ -0,0 +1,48 @@ +package ucca + +import "testing" + +func TestConceptNorms(t *testing.T) { + q := "Was muss ich beachten wenn ich meine Datenschutzerklärung schreibe für meine Website mit Cookie Banner?" 
+ got := ConceptNorms(q) + want := map[string]bool{ + "EU-DSGVO-Art12": true, "EU-DSGVO-Art13": true, "EU-DSGVO-Art14": true, + "DE-TDDDG-§25": true, "EU-DSGVO-Art6": true, "EU-DSGVO-Art7": true, + } + for _, nid := range got { + delete(want, nid) + } + if len(want) > 0 { + t.Errorf("ConceptNorms missing %v; got %v", want, got) + } + if len(ConceptNorms("Wie ist das Wetter heute?")) != 0 { + t.Errorf("no concept named should yield no norms") + } +} + +func TestInjectConceptNorms(t *testing.T) { + results := []LegalSearchResult{ + {CitationUnit: "DSK OH Telemedien", Score: 0.98}, + {CitationUnit: "Art. 25 DSGVO", Score: 0.95}, + } + injected := []LegalSearchResult{ + {CitationUnit: "Art. 13 DSGVO", Score: 0.979}, + {CitationUnit: "Art. 25 DSGVO", Score: 0.979}, // already present -> must not double + } + out := InjectConceptNorms(results, injected, 10) + if out[0].CitationUnit != "DSK OH Telemedien" { + t.Errorf("top document hit must stay #1 (not dominated), got %s", out[0].CitationUnit) + } + if len(out) != 3 { + t.Errorf("expected 3 (Art.25 not duplicated), got %d", len(out)) + } + found := false + for _, r := range out { + if r.CitationUnit == "Art. 13 DSGVO" { + found = true + } + } + if !found { + t.Errorf("Art. 13 DSGVO must be injected + visible") + } +} diff --git a/ai-compliance-sdk/internal/ucca/evidence_type.go b/ai-compliance-sdk/internal/ucca/evidence_type.go new file mode 100644 index 00000000..77f797d7 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/evidence_type.go @@ -0,0 +1,68 @@ +package ucca + +import "context" + +// EvidenceType classifies a retrieved unit by WHAT KIND of evidence it is, independent of its +// collection. Footnotes/tables/figures are Evidence Types, not collections. The Authority Router +// surfaces non-text evidence from the authoritative knowledge space (the KB slice) SEPARATELY from +// the merged text top-K, so fine-grained evidence isn't outranked by broad-base text. 
+// +// The layer this introduces: Intent -> Knowledge Space -> EvidenceType -> Collection -> Merge -> +// Authority. Today FOOTNOTE is populated; FIGURE arrives with C8 and TABLE is already present from +// C6/C9 — no router rebuild needed, the same path carries every new evidence type. +type EvidenceType string + +const ( + EvidenceText EvidenceType = "text" + EvidenceFootnote EvidenceType = "footnote" + EvidenceTable EvidenceType = "table" + EvidenceFigure EvidenceType = "figure" +) + +// classifyEvidence derives the EvidenceType from a result's payload markers. Precedence +// footnote > figure > table > text (a unit carries at most one is_* marker in practice). +func classifyEvidence(r LegalSearchResult) EvidenceType { + switch { + case r.IsFootnote: + return EvidenceFootnote + case r.IsFigure: + return EvidenceFigure + case r.IsTable: + return EvidenceTable + default: + return EvidenceText + } +} + +// evidenceRetrievalTopK is the budget for the authoritative-KB evidence pass. Deliberately targeted +// (the authoritative slice within the recognized knowledge space), NOT a blanket top-K increase of +// the merged result set — the successes came from BETTER-targeted evidence, not MORE evidence. +const evidenceRetrievalTopK = 20 + +// maxEvidencePerType caps each surfaced evidence type. +const maxEvidencePerType = 6 + +// RetrieveEvidence returns the authoritative typed evidence (footnotes/tables/figures) for an +// in-scope query, pulled from the KB slice and grouped by EvidenceType. This is the "Evidence Type" +// router layer (Option A): when the query is in the KB knowledge space, the authoritative evidence +// within that space is surfaced separately so it isn't lost in the broad-base text merge. Returns an +// empty map when out of scope or KB routing is disabled. Text evidence is NOT returned here — it +// flows through the normal Retrieve() merge (the LLM context + the sources list). 
+func (c *LegalRAGClient) RetrieveEvidence(ctx context.Context, query string) map[EvidenceType][]LegalSearchResult { + ev := map[EvidenceType][]LegalSearchResult{} + if !c.kbScopeRoutingEnabled || c.kbSliceCollection == "" || !inKBScope(query) { + return ev + } + hits, err := c.searchInternal(ctx, c.kbSliceCollection, query, nil, evidenceRetrievalTopK) + if err != nil { + return ev + } + for _, h := range hits { + t := classifyEvidence(h) + if t == EvidenceText || len(ev[t]) >= maxEvidencePerType { + continue + } + ev[t] = append(ev[t], h) + } + return ev +} diff --git a/ai-compliance-sdk/internal/ucca/filter_test.go b/ai-compliance-sdk/internal/ucca/filter_test.go new file mode 100644 index 00000000..9b97d1f7 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/filter_test.go @@ -0,0 +1,26 @@ +package ucca + +import "testing" + +func TestFilterByKnowledgeSpace(t *testing.T) { + results := []LegalSearchResult{ + {CitationUnit: "Art. 13 DSGVO", RegulationCode: "DSGVO"}, + {CitationUnit: "EU Mdr", RegulationCode: "EU MDR"}, + {CitationUnit: "UStG § 14", RegulationCode: "USTG"}, + {CitationUnit: "DSK OH Telemedien", RegulationCode: "DSK OH TELEMEDIEN"}, + {CitationUnit: "eIDAS", RegulationCode: "EIDAS"}, + } + out := FilterByKnowledgeSpace(results, "datenschutz", 10) + for _, r := range out { + if KnowledgeSpaceOf(r.RegulationCode) != "datenschutz" { + t.Errorf("off-domain leaked into scoped result: %s (%s)", r.CitationUnit, r.RegulationCode) + } + } + if len(out) != 2 { // Art. 
13 DSGVO + DSK OH Telemedien + t.Errorf("expected 2 datenschutz hits, got %d", len(out)) + } + // domain with no hits -> fall back to input (never strand the answer) + if len(FilterByKnowledgeSpace(results, "maschinen", 10)) != len(results) { + t.Errorf("no-hit domain should fall back to full input") + } +} diff --git a/ai-compliance-sdk/internal/ucca/intent.go b/ai-compliance-sdk/internal/ucca/intent.go new file mode 100644 index 00000000..abed232b --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/intent.go @@ -0,0 +1,46 @@ +package ucca + +import "strings" + +// DetectIntent classifies the INTERACTION INTENT of a query (Advisor Reasoning +// Stack E3). The same norms answer very differently depending on the TASK the user +// wants: "Was ist X?" (definition) vs "Wie schreibe ich X?" (anleitung) vs "Prüfe X" +// (review). The SDK detects the intent deterministically and emits it; the FE picks +// the answer FORM, so the LLM gets a precise assignment ("write an Anleitung over +// this evidence") instead of guessing the format. Returns "" (neutral) when no +// clear task is signalled. First tier of ~20-30 intent types. 
// DetectIntent classifies the interaction intent of a query by keyword
// matching on the lower-cased text. It returns "review", "checkliste",
// "vergleich", "anleitung", "risikoanalyse" or "definition", or "" when no
// keyword matches. The groups are checked in that order; the first match wins.
//
// "konform" only signals a review when it ends a word ("Sind wir
// DSGVO-konform?"). Inside a longer word ("Konformitätserklärung",
// "DSGVO-konformes Löschkonzept") it names or describes a thing, so it must
// not override the task the user actually asked for.
func DetectIntent(query string) string {
	// Pad with spaces so space-delimited patterns such as " vs " also match at
	// the very start or end of the query.
	q := " " + strings.ToLower(query) + " "
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(q, s) {
				return true
			}
		}
		return false
	}
	switch {
	case has("prüfe", "prüf mein", "überprüfe", "überprüf", "review", "checke mein",
		"ist mein", "ist meine", "ist unser", "ist unsere", "stimmt mein",
		"bewerte mein", "analysiere mein") || hasWordEnd(q, "konform"):
		return "review"
	case has("checkliste", "was muss ich alles", "was gehört alles", "was gehört in",
		"welche punkte muss", "was brauche ich alles"):
		return "checkliste"
	case has("vergleich", "unterschied", "worin unterscheid", " vs ", " versus ",
		"gegenüber", "im gegensatz"):
		return "vergleich"
	case has("wie schreibe", "wie erstelle", "wie erstell", "wie mache", "wie baue",
		"wie setze ich", "wie gehe ich vor", "wie formuliere", "wie richte ich",
		"anleitung", "schritt für schritt", "schritt-für-schritt", "erstelle mir",
		"erstell mir", "generiere", "was muss ich beachten", "worauf muss ich achten"):
		return "anleitung"
	case has("welche risiken", "welche gefahren", "risikoanalyse", "welche bedrohungen"):
		return "risikoanalyse"
	case has("was ist", "was bedeutet", "was versteht man", "was sind", "definition",
		"erkläre mir", "erklär mir", "was heißt", "was genau ist"):
		return "definition"
	default:
		return ""
	}
}

// hasWordEnd reports whether stem occurs in the lower-cased string s directly
// followed by a non-letter (or the end of s), i.e. as the end of a word.
func hasWordEnd(s, stem string) bool {
	for {
		i := strings.Index(s, stem)
		if i < 0 {
			return false
		}
		s = s[i+len(stem):]
		if s == "" {
			return true
		}
		// a-z and any non-ASCII lead byte (ä, ö, ü, ß, ...) continue the word.
		if b := s[0]; b < 0x80 && (b < 'a' || b > 'z') {
			return true
		}
	}
}
konform?": "review", + "Vergleiche DSGVO und BDSG.": "vergleich", + "Welche Risiken gibt es?": "risikoanalyse", + "Erstelle mir eine Checkliste für die DSFA.": "checkliste", + "Wie ist das Wetter?": "", + } + for q, want := range cases { + if got := DetectIntent(q); got != want { + t.Errorf("DetectIntent(%q)=%q want %q", q, got, want) + } + } +} diff --git a/ai-compliance-sdk/internal/ucca/knowledge_space.go b/ai-compliance-sdk/internal/ucca/knowledge_space.go new file mode 100644 index 00000000..64ab8a26 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/knowledge_space.go @@ -0,0 +1,148 @@ +package ucca + +import "strings" + +// KnowledgeSpace is the CHIP-level knowledge domain used by the clarity gate's +// concentration signal + the user-facing context chips. It is deliberately RICHER +// than the 4 authority domains in authority.go (data_protection/cyber/ai/ +// product_safety), which drive the EU-primary/subsidiarity rerank. The clarity +// gate must reflect the FULL corpus breadth (arbeitsschutz, arbeitsrecht, +// wirtschaftsrecht, finanz, ...) so a broad query surfaces as broad. Kept separate +// + additive so the tuned authority rerank stays untouched. Corpus-grounded from +// the 463 real regulation codes (0.3% fall through to "sonstiges"). + +// knowledgeSpaceExact matches short/ambiguous codes by EXACT string (substring +// would misfire on 2-3 char codes like "OR"/"AO"/"BGB"). 
// knowledgeSpaceExact maps regulation codes to their knowledge space by exact
// string match. KnowledgeSpaceOf looks a code up here, after trimming and
// upper-casing it, before falling back to prefix/substring rules, so keys are
// written in upper case.
var knowledgeSpaceExact = map[string]string{
	// wirtschaftsrecht
	"HGB": "wirtschaftsrecht", "BGB": "wirtschaftsrecht", "AO": "wirtschaftsrecht", "OR": "wirtschaftsrecht",
	"ABGB": "wirtschaftsrecht", "UGB": "wirtschaftsrecht", "IFRS": "wirtschaftsrecht", "BAO": "wirtschaftsrecht",
	"GMBHG": "wirtschaftsrecht", "AKTG": "wirtschaftsrecht", "INSO": "wirtschaftsrecht", "USTG": "wirtschaftsrecht",
	"GOBD": "wirtschaftsrecht", "EGBGB": "wirtschaftsrecht", "GEWO": "wirtschaftsrecht", "URHG": "wirtschaftsrecht",
	// datenschutz
	"DPF": "datenschutz", "TKG": "datenschutz", "TMG": "datenschutz", "DDG": "datenschutz", "DSG": "datenschutz",
	"DSV": "datenschutz", "DSM": "datenschutz", "SCC": "datenschutz", "EPRIVACY": "datenschutz",
	"SCHREMS II": "datenschutz", "CH_REVDSG": "datenschutz", "PLANET49": "datenschutz", "GOOGLE FONTS": "datenschutz",
	// digitale_dienste
	"DSA": "digitale_dienste", "DMA": "digitale_dienste", "DGA": "digitale_dienste", "EHDS": "digitale_dienste",
	"EIDAS": "digitale_dienste", "EIDAS 2.0": "digitale_dienste", "DATA ACT": "digitale_dienste",
	"DATAACT": "digitale_dienste", "DIGITAL CONTENT": "digitale_dienste",
	// produktsicherheit
	"MVO": "produktsicherheit", "MACHINERY": "produktsicherheit", "MASCHVO": "produktsicherheit",
	"MASCHINENVO": "produktsicherheit", "GPSR": "produktsicherheit", "PID": "produktsicherheit",
	"EAA": "produktsicherheit", "BFSG": "produktsicherheit", "ELEKTROG": "produktsicherheit",
	"VERPACKG": "produktsicherheit", "BATTVO": "produktsicherheit", "BATTDG": "produktsicherheit", "EU MDR": "produktsicherheit",
	// finanz
	"DORA": "finanz", "PSD2": "finanz", "MICA": "finanz", "AMLR": "finanz", "VAIT": "finanz", "BAIT": "finanz", "GWG": "finanz",
	// verbraucherschutz
	"UWG": "verbraucherschutz", "UCPD": "verbraucherschutz", "VSBG": "verbraucherschutz", "PANGV": "verbraucherschutz",
	"DL-INFOV": "verbraucherschutz", "OMNIBUS": "verbraucherschutz", "UWG AT": "verbraucherschutz",
	"PRODHAFTG": "verbraucherschutz", "PRODUKTHAFTUNGS-RL": "verbraucherschutz",
	// arbeitsrecht
	"ARG": "arbeitsrecht",
}
knowledge-space id to a user-facing chip label. +var KnowledgeSpaceLabel = map[string]string{ + "datenschutz": "Datenschutz", "cyber": "Cybersecurity", "ki": "KI", + "produktsicherheit": "Produktsicherheit", "arbeitsschutz": "Arbeitsschutz", + "arbeitsrecht": "Arbeitsrecht", "wirtschaftsrecht": "Wirtschaftsrecht", + "finanz": "Finanzregulierung", "digitale_dienste": "Digitale Dienste", + "verbraucherschutz": "Verbraucherschutz", "lieferkette": "Lieferkette/Nachhaltigkeit", + "hinweisgeber": "Hinweisgeberschutz", "sonstiges": "Sonstiges", +} + +// KnowledgeSpaceOf maps a regulation_code to a knowledge space. Robust to code +// variants (MVO/MASCHVO/MASCHINENVO -> produktsicherheit; DSK SDM / SDM B51 -> +// datenschutz). Returns "" for empty/untagged codes (not a knowledge space). +func KnowledgeSpaceOf(code string) string { + c := strings.ToUpper(strings.TrimSpace(code)) + if c == "" || c == "NONE" { + return "" + } + if d, ok := knowledgeSpaceExact[c]; ok { + return d + } + has := func(subs ...string) bool { + for _, s := range subs { + if strings.Contains(c, s) { + return true + } + } + return false + } + pre := func(subs ...string) bool { + for _, s := range subs { + if strings.HasPrefix(c, s) { + return true + } + } + return false + } + switch { + case pre("TRGS", "TRBS", "ASR", "OSHA") || has("ARBSCHG", "GEFAHRSTOFF"): + return "arbeitsschutz" + case has("AI ACT", "KI-VO", "KI VERORDNUNG", "GPAI", "AI RMF", "HLEG AI", "GENAI", "OECD AI", "AI PRINCIPLES", "OH KI", "KI BEHOERDEN", "KI SICHERHEIT", "POS KI"): + return "ki" + case pre("DSGVO", "BDSG", "TDDDG", "DSK", "EDPB", "WP24", "WP25", "WP26", "DSFA", "BFDI", "BAYLDA", "BAYLFB", "EDPS") || has("DATENSCHUTZ", "LOESCHKONZEPT", "LOESCHUNG", "VVT", "TELEMEDIEN", "EU US DPF", "BESCHAEFTIGTENDATEN"): + return "datenschutz" + case has("CRA", "NIS2", "NISG", "BSIG", "BSI-TR", "BSI_KRITIS", "KRITIS", "ENISA", "NIST", "OWASP", "EUCSA", "EUCC", "CISA", "CYCLONEDX", "SPDX", "SLSA", "OPENTELEMETRY", "CVSS", "SECURE BY 
DESIGN"): + return "cyber" + case has("MACHINERY", "MASCH", "BLUE GUIDE", "FDA HFE"): + return "produktsicherheit" + case has("LKSG", "CSDDD", "CSRD", "TAXONOMY"): + return "lieferkette" + case has("HINSCHG", "GESCHGEHG"): + return "hinweisgeber" + case pre("BAG ", "BAG_") || has("ARBVG", "AZG", "ARBZG", "BETRVG", "KSCHG", "MUSCHG", "AGG", "MILOG", "TZBFG", "NACHWG", "BURLG", "611A", "PAY TRANSPARENCY", "ANGG", "MUTTERSCHUTZ"): + return "arbeitsrecht" + case has("ECOMMERCE", "ECG", "MEDIENG", "VERBRAUCHERRECHTE", "DIGITAL CONTENT"): + return "verbraucherschutz" + case pre("EUGH", "BVERFG", "BVGE", "BGH", "OGH") || has("EU TAXONOMY"): + return "wirtschaftsrecht" + default: + return "sonstiges" + } +} + +// ScopeResults implements G1 scope-gating: when the query names a regulation, its +// knowledge space's hits LEAD the result set (the L2 answer + [n] citations are +// built on this order, so scoped answers cite the named regulation instead of the +// embedding-majority domain). Non-scoped hits backfill to keep topK. Stable within +// each partition. Returns results unchanged when scope is "". +func ScopeResults(results []LegalSearchResult, scope string, topK int) []LegalSearchResult { + if scope == "" { + return results + } + scoped := make([]LegalSearchResult, 0, len(results)) + rest := make([]LegalSearchResult, 0, len(results)) + for _, r := range results { + if KnowledgeSpaceOf(r.RegulationCode) == scope { + scoped = append(scoped, r) + } else { + rest = append(rest, r) + } + } + out := append(scoped, rest...) + if topK > 0 && len(out) > topK { + out = out[:topK] + } + return out +} + +// FilterByKnowledgeSpace returns ONLY the results in the given knowledge space — +// a HARD scope with no off-domain backfill. Used by E5 context scoping: when the +// user explicitly chose a domain chip, off-domain regelwerke (MDR/UStG/eIDAS) must +// not reappear in the evidence. Falls back to the input when the domain has no hits +// (never strand the answer). Caps topK. 
+func FilterByKnowledgeSpace(results []LegalSearchResult, scope string, topK int) []LegalSearchResult { + if scope == "" { + return results + } + out := make([]LegalSearchResult, 0, len(results)) + for _, r := range results { + if KnowledgeSpaceOf(r.RegulationCode) == scope { + out = append(out, r) + } + } + if len(out) == 0 { + return results + } + if topK > 0 && len(out) > topK { + out = out[:topK] + } + return out +} diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_types.go b/ai-compliance-sdk/internal/ucca/legal_rag_types.go index a47327d2..8a4a18fb 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_types.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_types.go @@ -37,6 +37,17 @@ type LegalSearchResult struct { // Supersede-Status (status="superseded", use_for_primary=false) — Alt-Quelle, // die fuer Default-Fragen demoted wird (nicht versteckt; fuer Historie auffindbar). Superseded bool `json:"-"` + + // Evidence-Type-Marker — intern (json:"-", kein Pro-Result-Contract-Change), aus dem + // Qdrant-Payload befuellt. classifyEvidence() leitet daraus den EvidenceType ab; der + // Router surfacet nicht-Text-Evidence (Fußnote/Tabelle/Figur) getrennt vom Text-Merge, + // damit feingranulare Evidence nicht von Breit-Basis-Text ueberrankt wird. 
+ IsFootnote bool `json:"-"` + FootnoteLabel string `json:"-"` + FootnoteVerbatim string `json:"-"` + RefCitationUnit string `json:"-"` + IsTable bool `json:"-"` // C6/C9: is_table (liniiert + borderless) + IsFigure bool `json:"-"` // C8: is_figure (noch nicht befuellt bis C8) } // LegalAssessment is the auditable explanation layer over a ranked result set: diff --git a/ai-compliance-sdk/internal/ucca/multi_regulation.go b/ai-compliance-sdk/internal/ucca/multi_regulation.go index 890a5e2c..0d5ae068 100644 --- a/ai-compliance-sdk/internal/ucca/multi_regulation.go +++ b/ai-compliance-sdk/internal/ucca/multi_regulation.go @@ -195,6 +195,13 @@ func hitsToResults(hits []qdrantSearchHit) []LegalSearchResult { ReferencesOut: getStringSlice(hit.Payload, "references_out"), ReferencesIn: getStringSlice(hit.Payload, "references_in"), Superseded: getString(hit.Payload, "status") == "superseded", + + IsFootnote: getBool(hit.Payload, "is_footnote"), + FootnoteLabel: getString(hit.Payload, "footnote_label"), + FootnoteVerbatim: getString(hit.Payload, "footnote_verbatim"), + RefCitationUnit: getString(hit.Payload, "ref_citation_unit"), + IsTable: getBool(hit.Payload, "is_table"), + IsFigure: getBool(hit.Payload, "is_figure"), } } return results