feat(ai-sdk): legal-corpus structure endpoint + coverage page
Expose GET /sdk/v1/rag/legal-corpus, which scrolls the eur-lex legal corpus (filtered to a few hundred points regardless of total size) and aggregates each ingested act's composition: distinct articles, annexes, recitals and chunk count. Surface it as a new section on /sdk/coverage so the ingested corpus is no longer a black box — a developer SEES what each act actually contains, not only its name. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// LegalActStructure describes what one ingested eur-lex legal act is made of:
// the number of distinct articles, annexes and recitals, together with the raw
// chunk count. It is the per-act record behind the coverage page, so that a
// developer can see what an act actually contains and not merely its name.
type LegalActStructure struct {
	// RegulationShort is the act's short identifier.
	RegulationShort string `json:"regulation_short"`
	// RegulationName is the act's display name.
	RegulationName string `json:"regulation_name"`
	// Articles is the number of distinct articles.
	Articles int `json:"articles"`
	// Annexes is the number of distinct annexes.
	Annexes int `json:"annexes"`
	// Recitals is the number of distinct recitals.
	Recitals int `json:"recitals"`
	// Chunks is the raw chunk count for the act.
	Chunks int `json:"chunks"`
}
|
||||
|
||||
// eurlexSource is the payload "source" value the corpus scroll filters on.
const eurlexSource = "eur-lex.europa.eu"

// legalStructureCollections lists the collections that hold the clean eur-lex
// legal corpus, i.e. chunks tagged with chunk_scope = section | annex | recital.
var legalStructureCollections = []string{
	"bp_compliance_ce",
	"bp_compliance_datenschutz",
}

// chunkScopeBucket translates a Qdrant chunk_scope value into the name of the
// structure bucket (articles / annexes / recitals) that the chunk counts toward.
var chunkScopeBucket = map[string]string{
	"section": "articles",
	"annex":   "annexes",
	"recital": "recitals",
}
|
||||
|
||||
// CorpusStructure scrolls the eur-lex legal corpus across the legal collections
|
||||
// and aggregates the per-act composition. The source filter keeps it to a few
|
||||
// hundred points regardless of total corpus size. Read-only; a collection that
|
||||
// fails to scroll is skipped rather than failing the whole call.
|
||||
func (c *LegalRAGClient) CorpusStructure(ctx context.Context) ([]LegalActStructure, error) {
|
||||
var all []qdrantScrollPoint
|
||||
for _, coll := range legalStructureCollections {
|
||||
pts, err := c.scrollLegalCorpus(ctx, coll)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
all = append(all, pts...)
|
||||
}
|
||||
return aggregateStructure(all), nil
|
||||
}
|
||||
|
||||
// aggregateStructure counts distinct article labels per (regulation, scope).
|
||||
// Pure → unit-testable without a vector store.
|
||||
func aggregateStructure(points []qdrantScrollPoint) []LegalActStructure {
|
||||
distinct := map[string]map[string]map[string]struct{}{}
|
||||
names := map[string]string{}
|
||||
chunks := map[string]int{}
|
||||
order := []string{}
|
||||
|
||||
for _, pt := range points {
|
||||
reg := getString(pt.Payload, "regulation_short")
|
||||
if reg == "" {
|
||||
continue
|
||||
}
|
||||
if _, seen := names[reg]; !seen {
|
||||
name := getString(pt.Payload, "regulation_name_de")
|
||||
if name == "" {
|
||||
name = reg
|
||||
}
|
||||
names[reg] = name
|
||||
distinct[reg] = map[string]map[string]struct{}{}
|
||||
order = append(order, reg)
|
||||
}
|
||||
chunks[reg]++
|
||||
bucket, ok := chunkScopeBucket[getString(pt.Payload, "chunk_scope")]
|
||||
article := getString(pt.Payload, "article")
|
||||
if !ok || article == "" {
|
||||
continue
|
||||
}
|
||||
if distinct[reg][bucket] == nil {
|
||||
distinct[reg][bucket] = map[string]struct{}{}
|
||||
}
|
||||
distinct[reg][bucket][article] = struct{}{}
|
||||
}
|
||||
|
||||
out := make([]LegalActStructure, 0, len(order))
|
||||
for _, reg := range order {
|
||||
out = append(out, LegalActStructure{
|
||||
RegulationShort: reg,
|
||||
RegulationName: names[reg],
|
||||
Articles: len(distinct[reg]["articles"]),
|
||||
Annexes: len(distinct[reg]["annexes"]),
|
||||
Recitals: len(distinct[reg]["recitals"]),
|
||||
Chunks: chunks[reg],
|
||||
})
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if out[i].Articles != out[j].Articles {
|
||||
return out[i].Articles > out[j].Articles
|
||||
}
|
||||
return out[i].RegulationShort < out[j].RegulationShort
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
// scrollLegalCorpus pages through one collection, filtered to the eur-lex legal
|
||||
// corpus, returning minimal-payload points (no text/vectors).
|
||||
func (c *LegalRAGClient) scrollLegalCorpus(ctx context.Context, collection string) ([]qdrantScrollPoint, error) {
|
||||
var all []qdrantScrollPoint
|
||||
var offset interface{}
|
||||
for {
|
||||
points, next, err := c.scrollLegalPage(ctx, collection, offset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
all = append(all, points...)
|
||||
if next == nil {
|
||||
break
|
||||
}
|
||||
offset = next
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
// scrollLegalPage fetches one page of the filtered scroll and returns the
|
||||
// points plus the next-page offset (nil when exhausted).
|
||||
func (c *LegalRAGClient) scrollLegalPage(ctx context.Context, collection string, offset interface{}) ([]qdrantScrollPoint, interface{}, error) {
|
||||
reqBody := map[string]interface{}{
|
||||
"limit": 500,
|
||||
"with_payload": map[string]interface{}{"include": []string{"regulation_short", "regulation_name_de", "chunk_scope", "article"}},
|
||||
"with_vectors": false,
|
||||
"filter": map[string]interface{}{
|
||||
"must": []map[string]interface{}{
|
||||
{"key": "source", "match": map[string]interface{}{"value": eurlexSource}},
|
||||
},
|
||||
},
|
||||
}
|
||||
if offset != nil {
|
||||
reqBody["offset"] = offset
|
||||
}
|
||||
jsonBody, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
if c.qdrantAPIKey != "" {
|
||||
req.Header.Set("api-key", c.qdrantAPIKey)
|
||||
}
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, nil, fmt.Errorf("qdrant returned %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var scrollResp qdrantScrollResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return scrollResp.Result.Points, scrollResp.Result.NextPageOffset, nil
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func structPoint(reg, name, scope, article string) qdrantScrollPoint {
|
||||
return qdrantScrollPoint{Payload: map[string]interface{}{
|
||||
"regulation_short": reg,
|
||||
"regulation_name_de": name,
|
||||
"chunk_scope": scope,
|
||||
"article": article,
|
||||
}}
|
||||
}
|
||||
|
||||
func TestAggregateStructure_CountsDistinctPerScope(t *testing.T) {
|
||||
points := []qdrantScrollPoint{
|
||||
structPoint("CRA", "Cyber Resilience Act", "section", "13"),
|
||||
structPoint("CRA", "Cyber Resilience Act", "section", "13"), // duplicate article → still 1
|
||||
structPoint("CRA", "Cyber Resilience Act", "section", "14"),
|
||||
structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-I"),
|
||||
structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-VII"),
|
||||
structPoint("DORA", "", "section", "6"), // first sighting has no name →
|
||||
structPoint("DORA", "", "section", "19"), // regulation_name falls back to short
|
||||
structPoint("DORA", "", "recital", ""), // empty article → ignored for distinct
|
||||
structPoint("", "x", "section", "1"), // missing regulation → skipped entirely
|
||||
}
|
||||
|
||||
got := aggregateStructure(points)
|
||||
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("want 2 acts, got %d (%+v)", len(got), got)
|
||||
}
|
||||
// CRA has more articles → sorts first.
|
||||
cra := got[0]
|
||||
if cra.RegulationShort != "CRA" || cra.Articles != 2 || cra.Annexes != 2 || cra.Recitals != 0 || cra.Chunks != 5 {
|
||||
t.Errorf("CRA wrong: %+v", cra)
|
||||
}
|
||||
dora := got[1]
|
||||
if dora.RegulationShort != "DORA" || dora.Articles != 2 || dora.Chunks != 3 {
|
||||
t.Errorf("DORA wrong: %+v", dora)
|
||||
}
|
||||
if dora.RegulationName != "DORA" {
|
||||
t.Errorf("DORA name fallback failed: %q", dora.RegulationName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAggregateStructure_Empty(t *testing.T) {
|
||||
if got := aggregateStructure(nil); len(got) != 0 {
|
||||
t.Errorf("want empty, got %+v", got)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user