feat(ai-sdk): legal-corpus structure endpoint + coverage page

Expose GET /sdk/v1/rag/legal-corpus, which scrolls the eur-lex legal
corpus (filtered to a few hundred points regardless of total size) and
aggregates each ingested act's composition: distinct articles, annexes,
recitals and chunk count. Surface it as a new section on /sdk/coverage so
the ingested corpus is no longer a black box — a developer SEES what each
act actually contains, not only its name.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-23 19:47:17 +02:00
parent b83c3e6e00
commit 4c99773fa1
6 changed files with 352 additions and 3 deletions
@@ -46,6 +46,28 @@ export interface CorpusOverview {
totals: { documents: number; catalog_sources: number }
}
// --- Ingested legal-corpus structure (from the vector store, via the Go SDK).
// Shows WHAT each eur-lex act consists of (articles/annexes/recitals), so the
// ingested corpus is not a black box for developers. ---
/**
 * Composition of one ingested legal act as reported by the SDK's
 * `/sdk/v1/rag/legal-corpus` endpoint. Field names mirror the JSON keys the
 * Go side emits.
 */
export interface LegalActStructure {
/** Short code of the act; the coverage table uses it as the React row key. */
regulation_short: string
/** Display name; the table hides it when it equals `regulation_short`. */
regulation_name: string
/** Number of distinct articles. */
articles: number
/** Number of distinct annexes (rendered as a placeholder when 0). */
annexes: number
/** Number of distinct recitals (rendered as a placeholder when 0). */
recitals: number
/** Raw chunk count for the act. */
chunks: number
}
/**
 * Response body of the SDK's legal-corpus endpoint: one entry per act plus
 * corpus-wide sums.
 */
export interface LegalCorpus {
/** Per-act composition rows, in the order the SDK returns them. */
regulations: LegalActStructure[]
/** Aggregates over `regulations`. */
totals: {
/** Number of acts (shown in the section heading). */
regulations: number
/** Sum of `articles` across all acts. */
articles: number
/** Sum of `annexes` across all acts. */
annexes: number
/** Sum of `recitals` across all acts. */
recitals: number
}
}
// --- Korpus-Dokumente: gruppieren nach Art (Gesetz/Leitfaden/Standard/Urteil)
// + Herausgeber-Familie (DSK, EDPB, OWASP, NIST …). Deterministisch, pure. ---
interface DocCat {
+83 -3
View File
@@ -3,6 +3,7 @@ import Link from 'next/link'
import {
type UseCaseRow,
type CorpusOverview,
type LegalCorpus,
licenseTierBadgeClass,
commercialBadgeClass,
groupUseCases,
@@ -11,28 +12,46 @@ import {
// Base URL of the compliance backend (use-cases + corpus overview); can be
// overridden through COMPLIANCE_BACKEND_URL.
const BACKEND_URL =
process.env.COMPLIANCE_BACKEND_URL || 'http://backend-compliance:8002'
// The legal-corpus structure comes from the Go SDK (it owns the vector store).
// Can be overridden through SDK_URL.
const SDK_URL = process.env.SDK_URL || 'http://ai-compliance-sdk:8090'
// Next.js route segment option. NOTE(review): presumably opts the page out of
// static rendering so each request refetches — confirm against the Next.js docs.
export const dynamic = 'force-dynamic'
// Fetches the ingested legal-corpus structure from the SDK.
//
// Isolated in its own try/catch so a vector-store hiccup degrades to "no
// structure shown" instead of blanking the whole page. The request is also
// bounded by a timeout: callers await this alongside the backend fetches, so
// a hung SDK would otherwise stall the entire page render, not just this
// section.
//
// Returns the parsed response, or null on a non-2xx status, a network error,
// a timeout, or an unparsable body.
async function fetchLegalCorpus(): Promise<LegalCorpus | null> {
  // Generous upper bound for a filtered scroll; tune if the corpus grows.
  const timeoutMs = 10_000
  try {
    const res = await fetch(`${SDK_URL}/sdk/v1/rag/legal-corpus`, {
      cache: 'no-store',
      signal: AbortSignal.timeout(timeoutMs),
    })
    if (!res.ok) {
      return null
    }
    return (await res.json()) as LegalCorpus
  } catch {
    // Covers network failures, the timeout abort and JSON parse errors alike.
    return null
  }
}
// Loads everything the coverage page renders: use cases and the corpus
// overview from the compliance backend, plus the legal-corpus structure from
// the SDK. The requests are issued together via Promise.all. A non-ok backend
// response yields the empty default for that field; if any awaited call
// throws, all fields fall back to their empty defaults.
async function getData(): Promise<{
useCases: UseCaseRow[]
corpus: CorpusOverview | null
legalCorpus: LegalCorpus | null
}> {
try {
const [ucRes, corpusRes] = await Promise.all([
const [ucRes, corpusRes, legalCorpus] = await Promise.all([
fetch(`${BACKEND_URL}/api/compliance/v1/controls/use-cases`, {
cache: 'no-store',
}),
fetch(`${BACKEND_URL}/api/compliance/v1/controls/corpus`, {
cache: 'no-store',
}),
fetchLegalCorpus(),
])
return {
useCases: ucRes.ok ? await ucRes.json() : [],
corpus: corpusRes.ok ? await corpusRes.json() : null,
legalCorpus,
}
} catch {
return { useCases: [], corpus: null }
return { useCases: [], corpus: null, legalCorpus: null }
}
}
@@ -46,7 +65,7 @@ function Stat({ label, value }: { label: string; value: string | number }) {
}
export default async function CoveragePage() {
const { useCases, corpus } = await getData()
const { useCases, corpus, legalCorpus } = await getData()
const groups = groupUseCases(useCases)
const totalRelevant = useCases.reduce((s, u) => s + u.atom_relevant, 0)
const totalAtoms = useCases.reduce((s, u) => s + u.atom_total, 0)
@@ -221,6 +240,67 @@ export default async function CoveragePage() {
</div>
</section>
{legalCorpus?.regulations?.length ? (
<section className="space-y-2">
<h2 className="text-lg font-semibold text-gray-900">
Ingestierter Rechtskorpus Struktur ({legalCorpus.totals.regulations}{' '}
Rechtsakte)
</h2>
<p className="text-xs text-gray-500">
Woraus jeder ingestierte eur-lex-Rechtsakt tatsächlich besteht:
Artikel (§), Anhänge, Erwägungsgründe und retrievbare Chunks direkt
aus dem Vektorspeicher, damit kein Black-Box-Korpus entsteht.
</p>
<div className="overflow-auto rounded-lg border border-gray-200">
<table className="min-w-full divide-y divide-gray-200 text-sm">
<thead className="bg-gray-50 text-left text-xs uppercase text-gray-500">
<tr>
<th className="px-4 py-2">Rechtsakt</th>
<th className="px-4 py-2 text-right">Artikel (§)</th>
<th className="px-4 py-2 text-right">Anhänge</th>
<th className="px-4 py-2 text-right">Erwägungsgründe</th>
<th className="px-4 py-2 text-right">Chunks</th>
</tr>
</thead>
<tbody className="divide-y divide-gray-100 bg-white">
{legalCorpus.regulations.map((r) => (
<tr key={r.regulation_short}>
<td className="px-4 py-2 text-gray-900">
<span className="font-medium">{r.regulation_short}</span>
{r.regulation_name !== r.regulation_short ? (
<span className="ml-2 text-xs text-gray-500">
{r.regulation_name}
</span>
) : null}
</td>
<td className="px-4 py-2 text-right font-semibold">
{r.articles.toLocaleString('de-DE')}
</td>
<td className="px-4 py-2 text-right">
{r.annexes > 0 ? (
r.annexes.toLocaleString('de-DE')
) : (
<span className="text-gray-300"></span>
)}
</td>
<td className="px-4 py-2 text-right text-gray-500">
{r.recitals > 0 ? (
r.recitals.toLocaleString('de-DE')
) : (
<span className="text-gray-300"></span>
)}
</td>
<td className="px-4 py-2 text-right text-gray-500">
{r.chunks.toLocaleString('de-DE')}
</td>
</tr>
))}
</tbody>
</table>
</div>
</section>
) : null}
{corpus?.license_catalog?.length ? (
<section className="space-y-2">
<h2 className="text-lg font-semibold text-gray-900">
@@ -206,3 +206,32 @@ func (h *RAGHandlers) HandleScrollChunks(c *gin.Context) {
"total": len(chunks),
})
}
// LegalCorpusStructure serves the per-act composition of the ingested eur-lex
// corpus (distinct articles, annexes, recitals and the chunk count) together
// with corpus-wide totals, so the coverage page can show WHAT was ingested
// rather than only the act names.
//
// Responds 500 with an "error" field when the aggregation fails, otherwise 200
// with "regulations" and "totals".
// GET /sdk/v1/rag/legal-corpus
func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) {
	regulations, err := h.ragClient.CorpusStructure(c.Request.Context())
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to aggregate legal corpus: " + err.Error()})
		return
	}

	// Sum the per-act counts into the corpus-wide totals.
	var articleSum, annexSum, recitalSum int
	for i := range regulations {
		articleSum += regulations[i].Articles
		annexSum += regulations[i].Annexes
		recitalSum += regulations[i].Recitals
	}

	totals := gin.H{
		"regulations": len(regulations),
		"articles":    articleSum,
		"annexes":     annexSum,
		"recitals":    recitalSum,
	}
	c.JSON(http.StatusOK, gin.H{
		"regulations": regulations,
		"totals":      totals,
	})
}
+1
View File
@@ -161,6 +161,7 @@ func registerRAGRoutes(v1 *gin.RouterGroup, h *handlers.RAGHandlers) {
ragRoutes.GET("/corpus-status", h.CorpusStatus)
ragRoutes.GET("/corpus-versions/:collection", h.CorpusVersionHistory)
ragRoutes.GET("/scroll", h.HandleScrollChunks)
ragRoutes.GET("/legal-corpus", h.LegalCorpusStructure)
}
}
@@ -0,0 +1,167 @@
package ucca
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"sort"
)
// LegalActStructure is the composition of one ingested eur-lex legal act — how
// many distinct articles, annexes and recitals it consists of (plus the raw
// chunk count). Backs the coverage page so the ingested corpus is not a black
// box: a developer SEES what each act actually contains, not only its name.
type LegalActStructure struct {
// RegulationShort is the act's short code, read from the "regulation_short"
// payload field; it identifies the act in the aggregation.
RegulationShort string `json:"regulation_short"`
// RegulationName is the display name from "regulation_name_de", falling
// back to RegulationShort when no name is available.
RegulationName string `json:"regulation_name"`
// Articles counts distinct "article" labels on chunks scoped "section".
Articles int `json:"articles"`
// Annexes counts distinct "article" labels on chunks scoped "annex".
Annexes int `json:"annexes"`
// Recitals counts distinct "article" labels on chunks scoped "recital".
Recitals int `json:"recitals"`
// Chunks counts every point of the act, whatever its scope or label.
Chunks int `json:"chunks"`
}
// eurlexSource is the value the scroll filter requires in a point's "source"
// payload field; it restricts the scroll to the eur-lex legal corpus.
const eurlexSource = "eur-lex.europa.eu"
// legalStructureCollections hold the clean eur-lex legal corpus (chunks tagged
// with chunk_scope = section | annex | recital). CorpusStructure scrolls each
// of them in this order.
var legalStructureCollections = []string{"bp_compliance_ce", "bp_compliance_datenschutz"}
// chunkScopeBucket maps a Qdrant chunk_scope to the structure field it feeds.
// Chunks with any other scope still count as chunks but feed no distinct set.
var chunkScopeBucket = map[string]string{"section": "articles", "annex": "annexes", "recital": "recitals"}
// CorpusStructure scrolls the eur-lex legal corpus across the legal collections
// and aggregates the per-act composition. The source filter keeps it to a few
// hundred points regardless of total corpus size. Read-only.
//
// Failure handling is best-effort per collection: a collection that fails to
// scroll is skipped and the remaining ones are still aggregated. Only when
// every collection fails (vector store unreachable, context cancelled, …) is
// an error returned, so callers can tell "nothing ingested" apart from
// "nothing could be read". The returned error wraps the first failure.
func (c *LegalRAGClient) CorpusStructure(ctx context.Context) ([]LegalActStructure, error) {
	var (
		all      []qdrantScrollPoint
		firstErr error
		failed   int
	)
	for _, coll := range legalStructureCollections {
		pts, err := c.scrollLegalCorpus(ctx, coll)
		if err != nil {
			failed++
			if firstErr == nil {
				firstErr = fmt.Errorf("scrolling collection %q: %w", coll, err)
			}
			continue
		}
		all = append(all, pts...)
	}
	// firstErr != nil also guards the degenerate case of an empty collection
	// list, which is not a failure.
	if firstErr != nil && failed == len(legalStructureCollections) {
		return nil, fmt.Errorf("no legal collection could be read (%d failed): %w", failed, firstErr)
	}
	return aggregateStructure(all), nil
}
// aggregateStructure counts distinct article labels per (regulation, scope)
// and the raw chunk count per regulation.
// Pure → unit-testable without a vector store.
//
// Rules:
//   - points without a regulation_short are skipped entirely;
//   - every remaining point counts as one chunk of its regulation;
//   - a point feeds a distinct set only if its chunk_scope is a key of
//     chunkScopeBucket and its article label is non-empty;
//   - the display name is the first non-empty regulation_name_de seen for the
//     regulation, falling back to the short code only when no point carries
//     one, so the name does not depend on which chunk happens to come first.
//
// The result is sorted by article count (descending), then by short code.
func aggregateStructure(points []qdrantScrollPoint) []LegalActStructure {
	type actTally struct {
		name   string
		chunks int
		labels map[string]map[string]struct{} // bucket → distinct article labels
	}

	tallies := map[string]*actTally{}
	var order []string // first-seen order of the regulations

	for _, pt := range points {
		reg := getString(pt.Payload, "regulation_short")
		if reg == "" {
			continue
		}
		tally, seen := tallies[reg]
		if !seen {
			tally = &actTally{labels: map[string]map[string]struct{}{}}
			tallies[reg] = tally
			order = append(order, reg)
		}
		tally.chunks++

		// Keep looking for a display name until one point provides it.
		if tally.name == "" {
			tally.name = getString(pt.Payload, "regulation_name_de")
		}

		bucket, known := chunkScopeBucket[getString(pt.Payload, "chunk_scope")]
		article := getString(pt.Payload, "article")
		if !known || article == "" {
			continue
		}
		if tally.labels[bucket] == nil {
			tally.labels[bucket] = map[string]struct{}{}
		}
		tally.labels[bucket][article] = struct{}{}
	}

	out := make([]LegalActStructure, 0, len(order))
	for _, reg := range order {
		tally := tallies[reg]
		name := tally.name
		if name == "" {
			name = reg
		}
		out = append(out, LegalActStructure{
			RegulationShort: reg,
			RegulationName:  name,
			Articles:        len(tally.labels["articles"]),
			Annexes:         len(tally.labels["annexes"]),
			Recitals:        len(tally.labels["recitals"]),
			Chunks:          tally.chunks,
		})
	}

	sort.SliceStable(out, func(i, j int) bool {
		if out[i].Articles != out[j].Articles {
			return out[i].Articles > out[j].Articles
		}
		return out[i].RegulationShort < out[j].RegulationShort
	})
	return out
}
// scrollLegalCorpus walks every page of one collection's eur-lex-filtered
// scroll and returns the concatenated points (minimal payload, no
// text/vectors). It stops when a page reports no next offset; the first page
// error aborts the walk and discards what was collected so far.
func (c *LegalRAGClient) scrollLegalCorpus(ctx context.Context, collection string) ([]qdrantScrollPoint, error) {
	var (
		collected []qdrantScrollPoint
		cursor    interface{} // nil requests the first page
	)
	for {
		page, nextCursor, err := c.scrollLegalPage(ctx, collection, cursor)
		if err != nil {
			return nil, err
		}
		collected = append(collected, page...)
		if nextCursor == nil {
			return collected, nil
		}
		cursor = nextCursor
	}
}
// scrollLegalPage fetches one page of the filtered scroll and returns the
// points plus the next-page offset (nil when exhausted).
//
// The request asks for up to 500 points whose "source" payload equals
// eurlexSource, without vectors and with only the payload fields the
// aggregation reads. A nil offset requests the first page. Every error is
// wrapped with the collection name so a failure can be attributed when
// several collections are scrolled.
func (c *LegalRAGClient) scrollLegalPage(ctx context.Context, collection string, offset interface{}) ([]qdrantScrollPoint, interface{}, error) {
	reqBody := map[string]interface{}{
		"limit":        500,
		"with_payload": map[string]interface{}{"include": []string{"regulation_short", "regulation_name_de", "chunk_scope", "article"}},
		"with_vectors": false,
		"filter": map[string]interface{}{
			"must": []map[string]interface{}{
				{"key": "source", "match": map[string]interface{}{"value": eurlexSource}},
			},
		},
	}
	if offset != nil {
		reqBody["offset"] = offset
	}

	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, nil, fmt.Errorf("encoding scroll request for %q: %w", collection, err)
	}

	url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody))
	if err != nil {
		return nil, nil, fmt.Errorf("building scroll request for %q: %w", collection, err)
	}
	req.Header.Set("Content-Type", "application/json")
	if c.qdrantAPIKey != "" {
		req.Header.Set("api-key", c.qdrantAPIKey)
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, nil, fmt.Errorf("scrolling %q: %w", collection, err)
	}
	// Close error is irrelevant for a read-only request.
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		// Best-effort read of the error body for the message; a read failure
		// just yields a shorter message.
		body, _ := io.ReadAll(resp.Body)
		return nil, nil, fmt.Errorf("qdrant returned %d for %q: %s", resp.StatusCode, collection, string(body))
	}

	var scrollResp qdrantScrollResponse
	if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
		return nil, nil, fmt.Errorf("decoding scroll response for %q: %w", collection, err)
	}
	return scrollResp.Result.Points, scrollResp.Result.NextPageOffset, nil
}
@@ -0,0 +1,50 @@
package ucca
import "testing"
// structPoint builds a scroll point carrying exactly the four payload fields
// aggregateStructure reads.
func structPoint(reg, name, scope, article string) qdrantScrollPoint {
	payload := make(map[string]interface{}, 4)
	payload["regulation_short"] = reg
	payload["regulation_name_de"] = name
	payload["chunk_scope"] = scope
	payload["article"] = article
	return qdrantScrollPoint{Payload: payload}
}
// TestAggregateStructure_CountsDistinctPerScope checks distinct counting per
// scope, the chunk tally, the name fallback, the skip rules and the ordering.
func TestAggregateStructure_CountsDistinctPerScope(t *testing.T) {
	const craName = "Cyber Resilience Act"
	points := []qdrantScrollPoint{
		structPoint("CRA", craName, "section", "13"),
		structPoint("CRA", craName, "section", "13"), // same article twice → counted once
		structPoint("CRA", craName, "section", "14"),
		structPoint("CRA", craName, "annex", "Anhang-I"),
		structPoint("CRA", craName, "annex", "Anhang-VII"),
		structPoint("DORA", "", "section", "6"),  // no name on any DORA point →
		structPoint("DORA", "", "section", "19"), // regulation_name falls back to short
		structPoint("DORA", "", "recital", ""),   // empty article → chunk only, no distinct entry
		structPoint("", "x", "section", "1"),     // no regulation → dropped entirely
	}

	acts := aggregateStructure(points)
	if n := len(acts); n != 2 {
		t.Fatalf("want 2 acts, got %d (%+v)", n, acts)
	}

	// Ties on article count are broken by short code, so CRA precedes DORA.
	cra, dora := acts[0], acts[1]

	craOK := cra.RegulationShort == "CRA" &&
		cra.Articles == 2 &&
		cra.Annexes == 2 &&
		cra.Recitals == 0 &&
		cra.Chunks == 5
	if !craOK {
		t.Errorf("CRA wrong: %+v", cra)
	}

	doraOK := dora.RegulationShort == "DORA" && dora.Articles == 2 && dora.Chunks == 3
	if !doraOK {
		t.Errorf("DORA wrong: %+v", dora)
	}
	if dora.RegulationName != "DORA" {
		t.Errorf("DORA name fallback failed: %q", dora.RegulationName)
	}
}
// TestAggregateStructure_Empty verifies that no input points yield no acts.
func TestAggregateStructure_Empty(t *testing.T) {
	got := aggregateStructure(nil)
	if len(got) == 0 {
		return
	}
	t.Errorf("want empty, got %+v", got)
}