feat(ai-sdk): citation-graph assessment + opt-in graph expansion (Phase 2)
CI / detect-changes (pull_request) Successful in 14s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 16s
CI / secret-scan (pull_request) Successful in 18s
CI / dep-audit (pull_request) Failing after 1m2s
CI / sbom-scan (pull_request) Failing after 1m10s
CI / build-sha-integrity (pull_request) Successful in 13s
CI / validate-canonical-controls (pull_request) Successful in 14s
CI / loc-budget (pull_request) Successful in 23s
CI / go-lint (pull_request) Successful in 50s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m8s
CI / nodejs-build (pull_request) Successful in 3m7s
CI / test-go (pull_request) Successful in 1m6s
CI / iace-gt-coverage (pull_request) Successful in 26s
CI / test-python-backend (pull_request) Successful in 33s
CI / test-python-document-crawler (pull_request) Successful in 21s
CI / test-python-dsms-gateway (pull_request) Successful in 21s
CI / detect-changes (pull_request) Successful in 14s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 16s
CI / secret-scan (pull_request) Successful in 18s
CI / dep-audit (pull_request) Failing after 1m2s
CI / sbom-scan (pull_request) Failing after 1m10s
CI / build-sha-integrity (pull_request) Successful in 13s
CI / validate-canonical-controls (pull_request) Successful in 14s
CI / loc-budget (pull_request) Successful in 23s
CI / go-lint (pull_request) Successful in 50s
CI / python-lint (pull_request) Failing after 18s
CI / nodejs-lint (pull_request) Failing after 1m8s
CI / nodejs-build (pull_request) Successful in 3m7s
CI / test-go (pull_request) Successful in 1m6s
CI / iace-gt-coverage (pull_request) Successful in 26s
CI / test-python-backend (pull_request) Successful in 33s
CI / test-python-document-crawler (pull_request) Successful in 21s
CI / test-python-dsms-gateway (pull_request) Successful in 21s
Add an `assessment` object to the legal RAG search response: primary norm, connected norms (from the citation graph references_out/in of the primary), cross_regime, human_review_flag, a norm-level winner_margin and a short reasoning string. The margin is computed over DISTINCT norms, so a long article split into several chunks no longer fabricates uncertainty. The per-result schema stays frozen — graph fields are internal (json:"-"). Also wire optional citation-graph expansion (RAG_GRAPH_EXPANSION=true, default off): top hits pull their referenced norms into the candidate pool via the precise edge (e.g. Art. 13 CRA -> Anhang I). Measured to add no rank gain over the existing binding-law augmentation, with +1 Qdrant call per search and reverse-edge fan-out risk, so it ships off-by-default as a recall safety net. The graph EXPLAINS retrieval (assessment), it does not expand it by default. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -78,6 +78,7 @@ func (h *RAGHandlers) Search(c *gin.Context) {
|
|||||||
"query": req.Query,
|
"query": req.Query,
|
||||||
"results": results,
|
"results": results,
|
||||||
"count": len(results),
|
"count": len(results),
|
||||||
|
"assessment": ucca.Assess(results),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,134 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tuning knobs used by Assess.
const (
	// assessConnectedCap caps the connected norms surfaced in the assessment.
	assessConnectedCap = 12
	// assessCrossRegimeTopN is the window (top distinct norms) over which
	// "cross regime" is judged.
	assessCrossRegimeTopN = 5
	// assessReviewMargin is the winner gap below which human review is
	// recommended.
	assessReviewMargin = 0.05
)
|
||||||
|
|
||||||
|
// Assess builds the auditable explanation layer over a ranked result set:
|
||||||
|
// primary norm, the norms it connects to (citation graph), cross-regime, a
|
||||||
|
// human-review flag, the winner margin and a short reasoning string. Pure →
|
||||||
|
// unit-testable. It EXPLAINS the ranking, it does not change it. Returns nil for
|
||||||
|
// an empty result set.
|
||||||
|
func Assess(results []LegalSearchResult) *LegalAssessment {
|
||||||
|
if len(results) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Norm-level view: collapse multiple chunks of the same article/annex so the
|
||||||
|
// margin and cross-regime are judged between DISTINCT norms, not near-identical
|
||||||
|
// chunks of one norm (which would make every winner margin ~0).
|
||||||
|
norms := distinctNorms(results)
|
||||||
|
p := norms[0]
|
||||||
|
|
||||||
|
primary := primaryLabel(p)
|
||||||
|
connected := dedupStrings(p.ReferencesOut, p.ReferencesIn, p.CitationUnit)
|
||||||
|
if len(connected) > assessConnectedCap {
|
||||||
|
connected = connected[:assessConnectedCap]
|
||||||
|
}
|
||||||
|
|
||||||
|
window := norms
|
||||||
|
if len(window) > assessCrossRegimeTopN {
|
||||||
|
window = window[:assessCrossRegimeTopN]
|
||||||
|
}
|
||||||
|
regimes := make(map[string]bool)
|
||||||
|
for _, r := range window {
|
||||||
|
if r.RegulationShort != "" {
|
||||||
|
regimes[r.RegulationShort] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
crossRegime := len(regimes) > 1
|
||||||
|
|
||||||
|
margin := 0.0
|
||||||
|
if len(norms) > 1 {
|
||||||
|
margin = norms[0].Score - norms[1].Score
|
||||||
|
}
|
||||||
|
|
||||||
|
primaryBinding := p.SourceClass == "binding_law"
|
||||||
|
humanReview := margin < assessReviewMargin || crossRegime || !primaryBinding
|
||||||
|
|
||||||
|
return &LegalAssessment{
|
||||||
|
PrimaryNorm: primary,
|
||||||
|
PrimaryRegulation: p.RegulationShort,
|
||||||
|
ConnectedNorms: connected,
|
||||||
|
CrossRegime: crossRegime,
|
||||||
|
HumanReviewFlag: humanReview,
|
||||||
|
WinnerMargin: margin,
|
||||||
|
ScoreReasoning: assessReasoning(p, margin, crossRegime, primaryBinding),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func primaryLabel(p LegalSearchResult) string {
|
||||||
|
if p.CitationUnit != "" {
|
||||||
|
return p.CitationUnit
|
||||||
|
}
|
||||||
|
if p.ArticleLabel != "" {
|
||||||
|
return p.ArticleLabel
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(p.RegulationShort + " " + p.Article)
|
||||||
|
}
|
||||||
|
|
||||||
|
// assessReasoning renders a short, human-readable justification (German).
|
||||||
|
func assessReasoning(p LegalSearchResult, margin float64, crossRegime, primaryBinding bool) string {
|
||||||
|
label := primaryLabel(p)
|
||||||
|
parts := make([]string, 0, 4)
|
||||||
|
if primaryBinding {
|
||||||
|
parts = append(parts, fmt.Sprintf("Primärtreffer %s: bindendes Recht (Autorität %d).", label, p.AuthorityWeight))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("Primärtreffer %s ist keine bindende Norm (Leitlinie/Standard) — Quelle prüfen.", label))
|
||||||
|
}
|
||||||
|
if margin > 0 {
|
||||||
|
parts = append(parts, fmt.Sprintf("Vorsprung %.2f vor #2.", margin))
|
||||||
|
}
|
||||||
|
if margin < assessReviewMargin {
|
||||||
|
parts = append(parts, "Knapper Vorsprung — Alternativtreffer prüfen.")
|
||||||
|
}
|
||||||
|
if crossRegime {
|
||||||
|
parts = append(parts, "Mehrere Regime betroffen — Querbezug prüfen.")
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// distinctNorms collapses results that share a citation (multiple chunks of the
|
||||||
|
// same article/annex) to the first — i.e. highest-ranked — occurrence. Results
|
||||||
|
// without any citation identity are each kept, since they cannot be matched.
|
||||||
|
func distinctNorms(results []LegalSearchResult) []LegalSearchResult {
|
||||||
|
seen := make(map[string]bool, len(results))
|
||||||
|
out := make([]LegalSearchResult, 0, len(results))
|
||||||
|
for _, r := range results {
|
||||||
|
key := r.CitationUnit
|
||||||
|
if key == "" {
|
||||||
|
key = r.ArticleLabel
|
||||||
|
}
|
||||||
|
if key != "" {
|
||||||
|
if seen[key] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = true
|
||||||
|
}
|
||||||
|
out = append(out, r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// dedupStrings walks out and then in, and returns every value that is
// non-empty, differs from exclude, and has not been seen before. Insertion
// order is preserved and the returned slice is never nil.
func dedupStrings(out, in []string, exclude string) []string {
	merged := make([]string, 0, len(out)+len(in))
	// Pre-seeding the set with exclude makes it look "already seen".
	taken := map[string]struct{}{exclude: {}}
	collect := func(values []string) {
		for _, v := range values {
			if v == "" {
				continue
			}
			if _, dup := taken[v]; dup {
				continue
			}
			taken[v] = struct{}{}
			merged = append(merged, v)
		}
	}
	collect(out)
	collect(in)
	return merged
}
|
||||||
@@ -0,0 +1,112 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func ares(reg, cu, sc string, score float64, weight int, out, in []string) LegalSearchResult {
|
||||||
|
return LegalSearchResult{
|
||||||
|
RegulationShort: reg, CitationUnit: cu, SourceClass: sc, Score: score,
|
||||||
|
AuthorityWeight: weight, ReferencesOut: out, ReferencesIn: in,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_Empty(t *testing.T) {
|
||||||
|
if Assess(nil) != nil {
|
||||||
|
t.Error("empty results → nil assessment")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_BindingPrimary_NoReview(t *testing.T) {
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.05, 100,
|
||||||
|
[]string{"CRA Anhang I", "Art. 14 CRA"}, []string{"Art. 12 CRA"}),
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.80, 100, nil, nil),
|
||||||
|
}
|
||||||
|
a := Assess(results)
|
||||||
|
if a == nil {
|
||||||
|
t.Fatal("nil assessment")
|
||||||
|
}
|
||||||
|
if a.PrimaryNorm != "Art. 13 CRA" || a.PrimaryRegulation != "CRA" {
|
||||||
|
t.Errorf("primary wrong: %+v", a)
|
||||||
|
}
|
||||||
|
if len(a.ConnectedNorms) != 3 { // out(2) + in(1), self excluded, deduped
|
||||||
|
t.Errorf("connected norms: %v", a.ConnectedNorms)
|
||||||
|
}
|
||||||
|
if a.CrossRegime {
|
||||||
|
t.Error("single regime must not be cross-regime")
|
||||||
|
}
|
||||||
|
if a.WinnerMargin < 0.24 || a.WinnerMargin > 0.26 {
|
||||||
|
t.Errorf("margin = %v, want ~0.25", a.WinnerMargin)
|
||||||
|
}
|
||||||
|
if a.HumanReviewFlag {
|
||||||
|
t.Error("clean binding + healthy margin + single regime → no review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_CrossRegimeFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.05, 100, nil, nil),
|
||||||
|
ares("DORA", "Art. 6 DORA", "binding_law", 0.70, 100, nil, nil),
|
||||||
|
})
|
||||||
|
if !a.CrossRegime || !a.HumanReviewFlag {
|
||||||
|
t.Errorf("cross-regime must flag review: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_NonBindingFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("ENISA", "ENISA SBOM", "supervisory_guidance", 0.90, 70, nil, nil),
|
||||||
|
ares("ENISA", "ENISA X", "supervisory_guidance", 0.40, 70, nil, nil),
|
||||||
|
})
|
||||||
|
if !a.HumanReviewFlag {
|
||||||
|
t.Error("non-binding primary → review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_TightMarginFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.00, 100, nil, nil),
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.98, 100, nil, nil),
|
||||||
|
})
|
||||||
|
if a.WinnerMargin >= 0.05 || !a.HumanReviewFlag {
|
||||||
|
t.Errorf("tight margin → review: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_MarginIsNormLevelNotChunkLevel(t *testing.T) {
|
||||||
|
// Two near-identical chunks of the SAME norm at the top, then a distinct norm.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.050, 100, []string{"CRA Anhang I"}, nil),
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.049, 100, nil, nil), // same norm
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.800, 100, nil, nil),
|
||||||
|
}
|
||||||
|
a := Assess(results)
|
||||||
|
if a.WinnerMargin < 0.24 || a.WinnerMargin > 0.26 { // Art.13 vs Art.14, not chunk vs chunk
|
||||||
|
t.Errorf("margin must be norm-level (~0.25), got %v", a.WinnerMargin)
|
||||||
|
}
|
||||||
|
if a.HumanReviewFlag {
|
||||||
|
t.Error("healthy norm-level margin → no review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDistinctNorms(t *testing.T) {
|
||||||
|
got := distinctNorms([]LegalSearchResult{
|
||||||
|
{CitationUnit: "Art. 13 CRA"},
|
||||||
|
{CitationUnit: "Art. 13 CRA"}, // duplicate norm → collapsed
|
||||||
|
{CitationUnit: "Art. 14 CRA"},
|
||||||
|
{CitationUnit: ""}, // no identity → kept
|
||||||
|
{CitationUnit: ""}, // no identity → kept
|
||||||
|
})
|
||||||
|
if len(got) != 4 {
|
||||||
|
t.Errorf("want 4 (2 distinct + 2 unidentified), got %d", len(got))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDedupStrings(t *testing.T) {
|
||||||
|
got := dedupStrings([]string{"a", "b", "", "a"}, []string{"b", "c"}, "self")
|
||||||
|
if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" {
|
||||||
|
t.Errorf("dedup: %v", got)
|
||||||
|
}
|
||||||
|
if len(dedupStrings([]string{"self"}, nil, "self")) != 0 {
|
||||||
|
t.Error("excluded value must be dropped")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -20,6 +20,7 @@ type LegalRAGClient struct {
|
|||||||
httpClient *http.Client
|
httpClient *http.Client
|
||||||
textIndexEnsured map[string]bool
|
textIndexEnsured map[string]bool
|
||||||
hybridEnabled bool
|
hybridEnabled bool
|
||||||
|
graphEnabled bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
|
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
|
||||||
@@ -38,6 +39,11 @@ func NewLegalRAGClient() *LegalRAGClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
hybridEnabled := os.Getenv("RAG_HYBRID_SEARCH") != "false"
|
hybridEnabled := os.Getenv("RAG_HYBRID_SEARCH") != "false"
|
||||||
|
// Graph-Expansion ist OPT-IN: kein gemessener Rang-Nutzen ggue. der Binding-Augmentation,
|
||||||
|
// +1 Qdrant-Call/Suche, Flutungsrisiko ueber Reverse-Kanten. Bleibt als Recall-Sicherheitsnetz
|
||||||
|
// fuer spaetere Luecken (RAG_GRAPH_EXPANSION=true). Die Graph-Kanten werden in der Response
|
||||||
|
// zur Begruendung/Vollstaendigkeit genutzt, nicht zur Pool-Expansion (Default).
|
||||||
|
graphEnabled := os.Getenv("RAG_GRAPH_EXPANSION") == "true"
|
||||||
|
|
||||||
return &LegalRAGClient{
|
return &LegalRAGClient{
|
||||||
qdrantURL: qdrantURL,
|
qdrantURL: qdrantURL,
|
||||||
@@ -47,6 +53,7 @@ func NewLegalRAGClient() *LegalRAGClient {
|
|||||||
collection: "bp_compliance_ce",
|
collection: "bp_compliance_ce",
|
||||||
textIndexEnsured: make(map[string]bool),
|
textIndexEnsured: make(map[string]bool),
|
||||||
hybridEnabled: hybridEnabled,
|
hybridEnabled: hybridEnabled,
|
||||||
|
graphEnabled: graphEnabled,
|
||||||
httpClient: &http.Client{
|
httpClient: &http.Client{
|
||||||
Timeout: 60 * time.Second,
|
Timeout: 60 * time.Second,
|
||||||
},
|
},
|
||||||
@@ -100,6 +107,13 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
hits = mergeDedupHits(hits, bindingHits)
|
hits = mergeDedupHits(hits, bindingHits)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Graph-Augmentation: verbundene Normen (references_out/in) der Top-Hits ueber die
|
||||||
|
// praezise Zitations-Kante in den Pool ziehen — z.B. Art. 13 CRA zieht Anhang I (die
|
||||||
|
// eigentliche Pflichtquelle). Pool-Augmentation only; Re-Rank + topK bleiben.
|
||||||
|
if c.graphEnabled {
|
||||||
|
hits = c.expandViaGraph(ctx, collection, hits)
|
||||||
|
}
|
||||||
|
|
||||||
results := make([]LegalSearchResult, len(hits))
|
results := make([]LegalSearchResult, len(hits))
|
||||||
for i, hit := range hits {
|
for i, hit := range hits {
|
||||||
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
||||||
@@ -131,6 +145,9 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
||||||
SourceClass: getString(hit.Payload, "source_class"),
|
SourceClass: getString(hit.Payload, "source_class"),
|
||||||
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
||||||
|
CitationUnit: getString(hit.Payload, "citation_unit"),
|
||||||
|
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
||||||
|
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,162 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Graph-augmented retrieval: when a top hit cites an annex/article
// (references_out), that connected norm is pulled into the candidate pool via
// the citation graph instead of relying on semantic search to surface it — e.g.
// a hit on CRA Art. 13 pulls in CRA Anhang I. Only forward edges are followed
// by expandViaGraph. This is pool augmentation only: the expansion merely adds
// candidates to the hit list.
const (
	// graphSeedCount is how many top hits seed the expansion.
	graphSeedCount = 5
	// graphMaxExpand caps the connected norms pulled in (avoids pool explosion).
	graphMaxExpand = 15
	// graphHopPenalty is subtracted from the seed score so a one-hop neighbour
	// ranks just below its seed.
	graphHopPenalty = 0.05
)
|
||||||
|
|
||||||
|
// expandViaGraph augments hits with the norms they cite and the norms that cite
|
||||||
|
// them. Best-effort: on any error (or nothing to expand) the original hits are
|
||||||
|
// returned unchanged.
|
||||||
|
func (c *LegalRAGClient) expandViaGraph(ctx context.Context, collection string, hits []qdrantSearchHit) []qdrantSearchHit {
|
||||||
|
if len(hits) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
present := make(map[string]bool, len(hits))
|
||||||
|
for _, h := range hits {
|
||||||
|
if cu := getString(h.Payload, "citation_unit"); cu != "" {
|
||||||
|
present[cu] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
seeds := hits
|
||||||
|
if len(seeds) > graphSeedCount {
|
||||||
|
seeds = seeds[:graphSeedCount]
|
||||||
|
}
|
||||||
|
// Forward edges only (references_out = the detail a hit explicitly points to,
|
||||||
|
// e.g. Art. 13 → Anhang I). Reverse (references_in) has high fan-out for popular
|
||||||
|
// annexes (Anhang I is cited by 23 articles) → pool flooding; it is surfaced as
|
||||||
|
// connected-norm metadata in the Phase 2 response instead of expanding the pool.
|
||||||
|
want := make(map[string]float64) // connected citation_unit -> best seeding score
|
||||||
|
for _, h := range seeds {
|
||||||
|
for _, cu := range getStringSlice(h.Payload, "references_out") {
|
||||||
|
if cu == "" || present[cu] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s, ok := want[cu]; !ok || h.Score > s {
|
||||||
|
want[cu] = h.Score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(want) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
|
||||||
|
units := topByScore(want, graphMaxExpand)
|
||||||
|
fetched, err := c.fetchByCitationUnits(ctx, collection, units)
|
||||||
|
if err != nil || len(fetched) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
neighbours := make([]qdrantSearchHit, 0, len(fetched))
|
||||||
|
for cu, pt := range fetched {
|
||||||
|
neighbours = append(neighbours, qdrantSearchHit{ID: pt.ID, Score: want[cu] - graphHopPenalty, Payload: pt.Payload})
|
||||||
|
}
|
||||||
|
return mergeDedupHits(hits, neighbours)
|
||||||
|
}
|
||||||
|
|
||||||
|
// topByScore returns up to n keys of m with the highest values, best first.
// Deterministic: ties are broken by the key string (ascending) so the cap is
// stable across runs. A negative n is treated as 0 instead of panicking on the
// slice expression. The returned slice is never nil.
func topByScore(m map[string]float64, n int) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		si, sj := m[keys[i]], m[keys[j]]
		if si != sj {
			return si > sj
		}
		return keys[i] < keys[j]
	})
	if n < 0 {
		n = 0
	}
	if len(keys) > n {
		keys = keys[:n]
	}
	return keys
}
|
||||||
|
|
||||||
|
// fetchByCitationUnits loads one representative point (the first chunk) per
|
||||||
|
// citation_unit from the given collection.
|
||||||
|
func (c *LegalRAGClient) fetchByCitationUnits(ctx context.Context, collection string, units []string) (map[string]qdrantScrollPoint, error) {
|
||||||
|
should := make([]map[string]interface{}, 0, len(units))
|
||||||
|
for _, cu := range units {
|
||||||
|
should = append(should, map[string]interface{}{"key": "citation_unit", "match": map[string]interface{}{"value": cu}})
|
||||||
|
}
|
||||||
|
reqBody := map[string]interface{}{
|
||||||
|
"limit": len(units) * 4,
|
||||||
|
"with_payload": true,
|
||||||
|
"with_vectors": false,
|
||||||
|
"filter": map[string]interface{}{"should": should},
|
||||||
|
}
|
||||||
|
jsonBody, err := json.Marshal(reqBody)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
||||||
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
if c.qdrantAPIKey != "" {
|
||||||
|
req.Header.Set("api-key", c.qdrantAPIKey)
|
||||||
|
}
|
||||||
|
resp, err := c.httpClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
return nil, fmt.Errorf("qdrant scroll returned %d: %s", resp.StatusCode, string(body))
|
||||||
|
}
|
||||||
|
var scrollResp qdrantScrollResponse
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out := make(map[string]qdrantScrollPoint, len(units))
|
||||||
|
for _, pt := range scrollResp.Result.Points {
|
||||||
|
cu := getString(pt.Payload, "citation_unit")
|
||||||
|
if cu != "" {
|
||||||
|
if _, seen := out[cu]; !seen {
|
||||||
|
out[cu] = pt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getStringSlice reads m[key] as a list and returns its string elements in
// order. Non-string elements are skipped. It returns nil when the key is
// missing or the value is not a []interface{}; an empty list yields an empty,
// non-nil slice.
func getStringSlice(m map[string]interface{}, key string) []string {
	// A missing key yields a nil interface, which fails the assertion as well.
	raw, isList := m[key].([]interface{})
	if !isList {
		return nil
	}
	strs := make([]string, 0, len(raw))
	for i := range raw {
		if s, isString := raw[i].(string); isString {
			strs = append(strs, s)
		}
	}
	return strs
}
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetStringSlice(t *testing.T) {
|
||||||
|
m := map[string]interface{}{
|
||||||
|
"refs": []interface{}{"a", "b", 3, "c"}, // non-strings are skipped
|
||||||
|
"str": "not-a-list",
|
||||||
|
}
|
||||||
|
got := getStringSlice(m, "refs")
|
||||||
|
if len(got) != 3 || got[0] != "a" || got[2] != "c" {
|
||||||
|
t.Errorf("refs: %v", got)
|
||||||
|
}
|
||||||
|
if getStringSlice(m, "missing") != nil {
|
||||||
|
t.Error("missing key should be nil")
|
||||||
|
}
|
||||||
|
if getStringSlice(m, "str") != nil {
|
||||||
|
t.Error("non-list should be nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTopByScore_DeterministicCap(t *testing.T) {
|
||||||
|
m := map[string]float64{"x": 0.5, "y": 0.9, "z": 0.5, "w": 0.7}
|
||||||
|
got := topByScore(m, 2)
|
||||||
|
if len(got) != 2 || got[0] != "y" || got[1] != "w" {
|
||||||
|
t.Errorf("want [y w], got %v", got)
|
||||||
|
}
|
||||||
|
all := topByScore(m, 10)
|
||||||
|
if all[2] != "x" || all[3] != "z" { // tie 0.5 broken by key string
|
||||||
|
t.Errorf("tie-break not deterministic: %v", all)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandViaGraph_NoSeedsOrRefs(t *testing.T) {
|
||||||
|
c := &LegalRAGClient{} // nil httpClient → must not be called on these paths
|
||||||
|
if out := c.expandViaGraph(context.Background(), "x", nil); out != nil {
|
||||||
|
t.Error("empty hits should return nil")
|
||||||
|
}
|
||||||
|
hits := []qdrantSearchHit{{ID: 1, Score: 0.8, Payload: map[string]interface{}{"citation_unit": "Art. 1 CRA"}}}
|
||||||
|
if out := c.expandViaGraph(context.Background(), "x", hits); len(out) != 1 {
|
||||||
|
t.Errorf("no references → unchanged, got %d", len(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandViaGraph_PullsConnectedNorm(t *testing.T) {
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
|
"result": map[string]interface{}{
|
||||||
|
"points": []map[string]interface{}{
|
||||||
|
{"id": 99, "payload": map[string]interface{}{
|
||||||
|
"citation_unit": "CRA Anhang I", "chunk_text": "Sicherheitsanforderungen",
|
||||||
|
"source_class": "binding_law", "authority_weight": 100, "regulation_short": "CRA",
|
||||||
|
}},
|
||||||
|
},
|
||||||
|
"next_page_offset": nil,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
c := &LegalRAGClient{qdrantURL: srv.URL, httpClient: srv.Client()}
|
||||||
|
hits := []qdrantSearchHit{
|
||||||
|
{ID: 1, Score: 0.70, Payload: map[string]interface{}{
|
||||||
|
"citation_unit": "Art. 13 CRA", "references_out": []interface{}{"CRA Anhang I"},
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
out := c.expandViaGraph(context.Background(), "bp_compliance_ce", hits)
|
||||||
|
if len(out) != 2 {
|
||||||
|
t.Fatalf("want 2 hits (seed + connected annex), got %d", len(out))
|
||||||
|
}
|
||||||
|
var found *qdrantSearchHit
|
||||||
|
for i := range out {
|
||||||
|
if getString(out[i].Payload, "citation_unit") == "CRA Anhang I" {
|
||||||
|
found = &out[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if found == nil {
|
||||||
|
t.Fatal("connected norm CRA Anhang I was not pulled into the pool")
|
||||||
|
}
|
||||||
|
if found.Score < 0.64 || found.Score > 0.66 { // 0.70 seed − 0.05 hop penalty
|
||||||
|
t.Errorf("connected score = %v, want ~0.65", found.Score)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -27,6 +27,27 @@ type LegalSearchResult struct {
|
|||||||
AuthorityWeight int `json:"-"`
|
AuthorityWeight int `json:"-"`
|
||||||
SourceClass string `json:"-"`
|
SourceClass string `json:"-"`
|
||||||
Jurisdiction string `json:"-"`
|
Jurisdiction string `json:"-"`
|
||||||
|
|
||||||
|
// Zitations-Graph (Phase 2) — intern, speist nur die Assessment-Berechnung
|
||||||
|
// (verbundene Normen, Begruendung). Pro-Result-Schema bleibt eingefroren.
|
||||||
|
CitationUnit string `json:"-"`
|
||||||
|
ReferencesOut []string `json:"-"`
|
||||||
|
ReferencesIn []string `json:"-"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// LegalAssessment is the explanation layer attached to a ranked result set. It
// names the primary norm, lists the norms connected to it through the citation
// graph, and reports whether the top results span several regulatory regimes
// and whether a human should review the answer. It is derived from the
// already-ranked results: it explains retrieval and does not alter it.
type LegalAssessment struct {
	// PrimaryNorm is the display label of the top-ranked norm.
	PrimaryNorm string `json:"primary_norm"`
	// PrimaryRegulation is the short regulation name of the primary norm.
	PrimaryRegulation string `json:"primary_regulation"`
	// ConnectedNorms lists norms the primary norm references or is referenced by.
	ConnectedNorms []string `json:"connected_norms"`
	// CrossRegime is true when the top norms belong to more than one regulation.
	CrossRegime bool `json:"cross_regime"`
	// HumanReviewFlag is true when a human review is recommended.
	HumanReviewFlag bool `json:"human_review_flag"`
	// WinnerMargin is the score gap between the top two distinct norms.
	WinnerMargin float64 `json:"winner_margin"`
	// ScoreReasoning is a short human-readable justification.
	ScoreReasoning string `json:"score_reasoning"`
}
|
||||||
|
|
||||||
// LegalContext represents aggregated legal context for an assessment.
|
// LegalContext represents aggregated legal context for an assessment.
|
||||||
|
|||||||
Reference in New Issue
Block a user