diff --git a/admin-compliance/app/sdk/coverage/_helpers.ts b/admin-compliance/app/sdk/coverage/_helpers.ts
index bc56be0f..fb138dc5 100644
--- a/admin-compliance/app/sdk/coverage/_helpers.ts
+++ b/admin-compliance/app/sdk/coverage/_helpers.ts
@@ -46,6 +46,28 @@ export interface CorpusOverview {
   totals: { documents: number; catalog_sources: number }
 }
 
+// --- Ingested legal-corpus structure (from the vector store, via the Go SDK).
+// Shows WHAT each eur-lex act consists of (articles/annexes/recitals), so the
+// ingested corpus is not a black box for developers. ---
+export interface LegalActStructure {
+  regulation_short: string
+  regulation_name: string
+  articles: number
+  annexes: number
+  recitals: number
+  chunks: number
+}
+
+export interface LegalCorpus {
+  regulations: LegalActStructure[]
+  totals: {
+    regulations: number
+    articles: number
+    annexes: number
+    recitals: number
+  }
+}
+
 // --- Korpus-Dokumente: gruppieren nach Art (Gesetz/Leitfaden/Standard/Urteil)
 // + Herausgeber-Familie (DSK, EDPB, OWASP, NIST …). Deterministisch, pure. ---
 interface DocCat {
diff --git a/admin-compliance/app/sdk/coverage/page.tsx b/admin-compliance/app/sdk/coverage/page.tsx
index dc73df1e..7f254b8b 100644
--- a/admin-compliance/app/sdk/coverage/page.tsx
+++ b/admin-compliance/app/sdk/coverage/page.tsx
@@ -3,6 +3,7 @@ import Link from 'next/link'
 import {
   type UseCaseRow,
   type CorpusOverview,
+  type LegalCorpus,
   licenseTierBadgeClass,
   commercialBadgeClass,
   groupUseCases,
@@ -11,28 +12,46 @@ import {
 
 const BACKEND_URL =
   process.env.COMPLIANCE_BACKEND_URL || 'http://backend-compliance:8002'
+// The legal-corpus structure comes from the Go SDK (it owns the vector store).
+const SDK_URL = process.env.SDK_URL || 'http://ai-compliance-sdk:8090'
 
 export const dynamic = 'force-dynamic'
 
+// Fetched from the SDK and isolated in its own try/catch so a vector-store
+// hiccup degrades to "no structure shown" instead of blanking the whole page.
+async function fetchLegalCorpus(): Promise<LegalCorpus | null> {
+  try {
+    const res = await fetch(`${SDK_URL}/sdk/v1/rag/legal-corpus`, {
+      cache: 'no-store',
+    })
+    return res.ok ? await res.json() : null
+  } catch {
+    return null
+  }
+}
+
 async function getData(): Promise<{
   useCases: UseCaseRow[]
   corpus: CorpusOverview | null
+  legalCorpus: LegalCorpus | null
 }> {
   try {
-    const [ucRes, corpusRes] = await Promise.all([
+    const [ucRes, corpusRes, legalCorpus] = await Promise.all([
       fetch(`${BACKEND_URL}/api/compliance/v1/controls/use-cases`, {
         cache: 'no-store',
       }),
       fetch(`${BACKEND_URL}/api/compliance/v1/controls/corpus`, {
         cache: 'no-store',
       }),
+      fetchLegalCorpus(),
     ])
     return {
       useCases: ucRes.ok ? await ucRes.json() : [],
       corpus: corpusRes.ok ? await corpusRes.json() : null,
+      legalCorpus,
     }
   } catch {
-    return { useCases: [], corpus: null }
+    return { useCases: [], corpus: null, legalCorpus: null }
   }
 }
 
@@ -46,7 +65,7 @@ function Stat({ label, value }: { label: string; value: string | number }) {
 }
 
 export default async function CoveragePage() {
-  const { useCases, corpus } = await getData()
+  const { useCases, corpus, legalCorpus } = await getData()
   const groups = groupUseCases(useCases)
   const totalRelevant = useCases.reduce((s, u) => s + u.atom_relevant, 0)
   const totalAtoms = useCases.reduce((s, u) => s + u.atom_total, 0)
@@ -221,6 +240,67 @@ export default async function CoveragePage() {
+      {legalCorpus?.regulations?.length ? (
+

+ Ingestierter Rechtskorpus – Struktur ({legalCorpus.totals.regulations}{' '} + Rechtsakte) +

+

+ Woraus jeder ingestierte eur-lex-Rechtsakt tatsächlich besteht: + Artikel (§), Anhänge, Erwägungsgründe und retrievbare Chunks — direkt + aus dem Vektorspeicher, damit kein Black-Box-Korpus entsteht. +

+
+ + + + + + + + + + + + {legalCorpus.regulations.map((r) => ( + + + + + + + + ))} + +
RechtsaktArtikel (§)AnhängeErwägungsgründeChunks
+ {r.regulation_short} + {r.regulation_name !== r.regulation_short ? ( + + {r.regulation_name} + + ) : null} + + {r.articles.toLocaleString('de-DE')} + + {r.annexes > 0 ? ( + r.annexes.toLocaleString('de-DE') + ) : ( + + )} + + {r.recitals > 0 ? ( + r.recitals.toLocaleString('de-DE') + ) : ( + + )} + + {r.chunks.toLocaleString('de-DE')} +
+
+
+ ) : null} + {corpus?.license_catalog?.length ? (

diff --git a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go
index 97dd6d89..99068d8c 100644
--- a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go
+++ b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go
@@ -206,3 +206,30 @@ func (h *RAGHandlers) HandleScrollChunks(c *gin.Context) {
 		"total": len(chunks),
 	})
 }
+
+// LegalCorpusStructure handles GET /sdk/v1/rag/legal-corpus. It asks the RAG
+// client for the per-act composition of the ingested corpus and responds with
+// those acts plus corpus-wide sums of their article, annex and recital counts.
+// A failure reported by the client is answered with HTTP 500.
+func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) {
+	acts, err := h.ragClient.CorpusStructure(c.Request.Context())
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to aggregate legal corpus: " + err.Error()})
+		return
+	}
+
+	var articleSum, annexSum, recitalSum int
+	for i := range acts {
+		articleSum += acts[i].Articles
+		annexSum += acts[i].Annexes
+		recitalSum += acts[i].Recitals
+	}
+
+	totals := gin.H{
+		"regulations": len(acts),
+		"articles":    articleSum,
+		"annexes":     annexSum,
+		"recitals":    recitalSum,
+	}
+	c.JSON(http.StatusOK, gin.H{"regulations": acts, "totals": totals})
+}
diff --git a/ai-compliance-sdk/internal/app/routes.go b/ai-compliance-sdk/internal/app/routes.go
index e190c39a..396402bb 100644
--- a/ai-compliance-sdk/internal/app/routes.go
+++ b/ai-compliance-sdk/internal/app/routes.go
@@ -161,6 +161,7 @@ func registerRAGRoutes(v1 *gin.RouterGroup, h *handlers.RAGHandlers) {
 		ragRoutes.GET("/corpus-status", h.CorpusStatus)
 		ragRoutes.GET("/corpus-versions/:collection", h.CorpusVersionHistory)
 		ragRoutes.GET("/scroll", h.HandleScrollChunks)
+		ragRoutes.GET("/legal-corpus", h.LegalCorpusStructure)
 	}
 }
 
diff --git a/ai-compliance-sdk/internal/ucca/legal_corpus_structure.go b/ai-compliance-sdk/internal/ucca/legal_corpus_structure.go
new file mode 100644
index 00000000..351ff4d1
--- /dev/null
+++ 
b/ai-compliance-sdk/internal/ucca/legal_corpus_structure.go
@@ -0,0 +1,177 @@
+package ucca
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"sort"
+)
+
+// LegalActStructure is the composition of one ingested eur-lex legal act — how
+// many distinct articles, annexes and recitals it consists of (plus the raw
+// chunk count). Backs the coverage page so the ingested corpus is not a black
+// box: a developer SEES what each act actually contains, not only its name.
+type LegalActStructure struct {
+	RegulationShort string `json:"regulation_short"`
+	RegulationName  string `json:"regulation_name"`
+	Articles        int    `json:"articles"`
+	Annexes         int    `json:"annexes"`
+	Recitals        int    `json:"recitals"`
+	Chunks          int    `json:"chunks"`
+}
+
+const eurlexSource = "eur-lex.europa.eu"
+
+// legalStructureCollections hold the clean eur-lex legal corpus (chunks tagged
+// with chunk_scope = section | annex | recital).
+var legalStructureCollections = []string{"bp_compliance_ce", "bp_compliance_datenschutz"}
+
+// chunkScopeBucket maps a Qdrant chunk_scope to the structure field it feeds.
+var chunkScopeBucket = map[string]string{"section": "articles", "annex": "annexes", "recital": "recitals"}
+
+// CorpusStructure scrolls the eur-lex legal corpus across the legal collections
+// and aggregates the per-act composition. The source filter is meant to keep the
+// scan small regardless of total corpus size. Read-only. A collection that fails
+// to scroll is skipped so the remaining ones still contribute; only when every
+// collection fails is an error returned, so callers can tell an unreachable
+// vector store apart from a corpus that is simply empty.
+func (c *LegalRAGClient) CorpusStructure(ctx context.Context) ([]LegalActStructure, error) {
+	var all []qdrantScrollPoint
+	var lastErr error
+	failed := 0
+	for _, coll := range legalStructureCollections {
+		pts, err := c.scrollLegalCorpus(ctx, coll)
+		if err != nil {
+			// Best effort: remember the failure, keep the other collections.
+			lastErr = fmt.Errorf("scroll %s: %w", coll, err)
+			failed++
+			continue
+		}
+		all = append(all, pts...)
+	}
+	if failed > 0 && failed == len(legalStructureCollections) {
+		return nil, fmt.Errorf("legal corpus unavailable: %w", lastErr)
+	}
+	return aggregateStructure(all), nil
+}
+
+// aggregateStructure counts distinct article labels per (regulation, scope).
+// Pure → unit-testable without a vector store.
+func aggregateStructure(points []qdrantScrollPoint) []LegalActStructure {
+	distinct := map[string]map[string]map[string]struct{}{}
+	names := map[string]string{}
+	chunks := map[string]int{}
+	order := []string{}
+
+	for _, pt := range points {
+		reg := getString(pt.Payload, "regulation_short")
+		if reg == "" {
+			continue
+		}
+		if _, seen := names[reg]; !seen {
+			name := getString(pt.Payload, "regulation_name_de")
+			if name == "" {
+				name = reg
+			}
+			names[reg] = name
+			distinct[reg] = map[string]map[string]struct{}{}
+			order = append(order, reg)
+		}
+		chunks[reg]++
+		bucket, ok := chunkScopeBucket[getString(pt.Payload, "chunk_scope")]
+		article := getString(pt.Payload, "article")
+		if !ok || article == "" {
+			continue
+		}
+		if distinct[reg][bucket] == nil {
+			distinct[reg][bucket] = map[string]struct{}{}
+		}
+		distinct[reg][bucket][article] = struct{}{}
+	}
+
+	out := make([]LegalActStructure, 0, len(order))
+	for _, reg := range order {
+		out = append(out, LegalActStructure{
+			RegulationShort: reg,
+			RegulationName:  names[reg],
+			Articles:        len(distinct[reg]["articles"]),
+			Annexes:         len(distinct[reg]["annexes"]),
+			Recitals:        len(distinct[reg]["recitals"]),
+			Chunks:          chunks[reg],
+		})
+	}
+	sort.SliceStable(out, func(i, j int) bool {
+		if out[i].Articles != out[j].Articles {
+			return out[i].Articles > out[j].Articles
+		}
+		return out[i].RegulationShort < out[j].RegulationShort
+	})
+	return out
+}
+
+// scrollLegalCorpus pages through one collection, filtered to the eur-lex legal
+// corpus, returning minimal-payload points (no text/vectors).
+func (c *LegalRAGClient) scrollLegalCorpus(ctx context.Context, collection string) ([]qdrantScrollPoint, error) {
+	var all []qdrantScrollPoint
+	var offset interface{}
+	for {
+		points, next, err := c.scrollLegalPage(ctx, collection, offset)
+		if err != nil {
+			return nil, err
+		}
+		all = append(all, points...)
+		if next == nil {
+			break
+		}
+		offset = next
+	}
+	return all, nil
+}
+
+// scrollLegalPage fetches one page of the filtered scroll and returns the
+// points plus the next-page offset (nil when exhausted).
+func (c *LegalRAGClient) scrollLegalPage(ctx context.Context, collection string, offset interface{}) ([]qdrantScrollPoint, interface{}, error) {
+	reqBody := map[string]interface{}{
+		"limit":        500,
+		"with_payload": map[string]interface{}{"include": []string{"regulation_short", "regulation_name_de", "chunk_scope", "article"}},
+		"with_vectors": false,
+		"filter": map[string]interface{}{
+			"must": []map[string]interface{}{
+				{"key": "source", "match": map[string]interface{}{"value": eurlexSource}},
+			},
+		},
+	}
+	if offset != nil {
+		reqBody["offset"] = offset
+	}
+	jsonBody, err := json.Marshal(reqBody)
+	if err != nil {
+		return nil, nil, err
+	}
+	url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
+	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
+	if err != nil {
+		return nil, nil, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if c.qdrantAPIKey != "" {
+		req.Header.Set("api-key", c.qdrantAPIKey)
+	}
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, nil, fmt.Errorf("qdrant returned %d: %s", resp.StatusCode, string(body))
+	}
+	var scrollResp qdrantScrollResponse
+	if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
+		return nil, nil, err
+	}
+	return scrollResp.Result.Points, scrollResp.Result.NextPageOffset, nil
+}
diff --git a/ai-compliance-sdk/internal/ucca/legal_corpus_structure_test.go b/ai-compliance-sdk/internal/ucca/legal_corpus_structure_test.go
new file mode 100644
index 00000000..b5baae94
--- /dev/null
+++ b/ai-compliance-sdk/internal/ucca/legal_corpus_structure_test.go
@@ -0,0 +1,50 @@
+package ucca
+
+import "testing"
+
+func structPoint(reg, name, scope, article string) qdrantScrollPoint {
+	return qdrantScrollPoint{Payload: map[string]interface{}{
+		"regulation_short":   reg,
+		"regulation_name_de": name,
+		"chunk_scope":        scope,
+		"article":            article,
+	}}
+}
+
+func TestAggregateStructure_CountsDistinctPerScope(t *testing.T) {
+	points := []qdrantScrollPoint{
+		structPoint("CRA", "Cyber Resilience Act", "section", "13"),
+		structPoint("CRA", "Cyber Resilience Act", "section", "13"), // duplicate article → still 1
+		structPoint("CRA", "Cyber Resilience Act", "section", "14"),
+		structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-I"),
+		structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-VII"),
+		structPoint("DORA", "", "section", "6"),  // first sighting has no name →
+		structPoint("DORA", "", "section", "19"), // regulation_name falls back to short
+		structPoint("DORA", "", "recital", ""),   // empty article → ignored for distinct
+		structPoint("", "x", "section", "1"),     // missing regulation → skipped entirely
+	}
+
+	got := aggregateStructure(points)
+
+	if len(got) != 2 {
+		t.Fatalf("want 2 acts, got %d (%+v)", len(got), got)
+	}
+	// CRA and DORA both have 2 distinct articles → tie broken by short name, CRA first.
+	cra := got[0]
+	if cra.RegulationShort != "CRA" || cra.Articles != 2 || cra.Annexes != 2 || cra.Recitals != 0 || cra.Chunks != 5 {
+		t.Errorf("CRA wrong: %+v", cra)
+	}
+	dora := got[1]
+	if dora.RegulationShort != "DORA" || dora.Articles != 2 || dora.Annexes != 0 || dora.Recitals != 0 || dora.Chunks != 3 {
+		t.Errorf("DORA wrong: %+v", dora)
+	}
+	if dora.RegulationName != "DORA" {
+		t.Errorf("DORA name fallback failed: %q", dora.RegulationName)
+	}
+}
+
+func TestAggregateStructure_Empty(t *testing.T) {
+	if got := aggregateStructure(nil); len(got) != 0 {
+		t.Errorf("want empty, got %+v", got)
+	}
+}