Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 576063515b | |||
| 9cfe6f83b1 | |||
| df7966656a | |||
| 05d75e8039 | |||
| e24a551ee4 | |||
| f11b2e035f | |||
| 230dc05287 | |||
| b83c3e6e00 | |||
| a1f425d43a | |||
| 23c6ac6f32 |
@@ -136,12 +136,14 @@ jobs:
|
|||||||
runs-on: docker
|
runs-on: docker
|
||||||
needs: detect-changes
|
needs: detect-changes
|
||||||
if: github.event_name == 'pull_request' && needs.detect-changes.outputs.sdk == 'true'
|
if: github.event_name == 'pull_request' && needs.detect-changes.outputs.sdk == 'true'
|
||||||
container: golangci/golangci-lint:v1.62-alpine
|
container: golangci/golangci-lint:v1.64.8-alpine
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
run: |
|
run: |
|
||||||
apk add --no-cache git
|
apk add --no-cache git
|
||||||
git clone --depth 1 --branch ${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}} ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git .
|
# Full clone so `main` is a local ref — new-from-merge-base needs the merge base.
|
||||||
|
git clone ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git .
|
||||||
|
git checkout ${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}
|
||||||
- name: Lint ai-compliance-sdk
|
- name: Lint ai-compliance-sdk
|
||||||
run: |
|
run: |
|
||||||
[ -d "ai-compliance-sdk" ] || exit 0
|
[ -d "ai-compliance-sdk" ] || exit 0
|
||||||
|
|||||||
@@ -46,6 +46,28 @@ export interface CorpusOverview {
|
|||||||
totals: { documents: number; catalog_sources: number }
|
totals: { documents: number; catalog_sources: number }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Composition of one ingested eur-lex legal act, as reported by the Go SDK
 * from the vector store. It shows what the act consists of
 * (articles / annexes / recitals) so the ingested corpus is not a black box
 * for developers.
 */
export interface LegalActStructure {
  /** Short identifier of the act; used as the table row key on the coverage page. */
  regulation_short: string
  /** Display name of the act; shown only when it differs from the short form. */
  regulation_name: string
  /** Number of articles (§) in the act. */
  articles: number
  /** Number of annexes in the act. */
  annexes: number
  /** Number of recitals in the act. */
  recitals: number
  /** Number of retrievable chunks stored for the act. */
  chunks: number
}
|
||||||
|
|
||||||
|
/**
 * Response shape of the SDK legal-corpus endpoint: one entry per ingested
 * legal act plus totals over all acts.
 */
export interface LegalCorpus {
  /** Per-act structure rows. */
  regulations: LegalActStructure[]
  /** Number of acts and the summed article / annex / recital counts. */
  totals: Record<'regulations' | 'articles' | 'annexes' | 'recitals', number>
}
|
||||||
|
|
||||||
// --- Korpus-Dokumente: gruppieren nach Art (Gesetz/Leitfaden/Standard/Urteil)
|
// --- Korpus-Dokumente: gruppieren nach Art (Gesetz/Leitfaden/Standard/Urteil)
|
||||||
// + Herausgeber-Familie (DSK, EDPB, OWASP, NIST …). Deterministisch, pure. ---
|
// + Herausgeber-Familie (DSK, EDPB, OWASP, NIST …). Deterministisch, pure. ---
|
||||||
interface DocCat {
|
interface DocCat {
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import Link from 'next/link'
|
|||||||
import {
|
import {
|
||||||
type UseCaseRow,
|
type UseCaseRow,
|
||||||
type CorpusOverview,
|
type CorpusOverview,
|
||||||
|
type LegalCorpus,
|
||||||
licenseTierBadgeClass,
|
licenseTierBadgeClass,
|
||||||
commercialBadgeClass,
|
commercialBadgeClass,
|
||||||
groupUseCases,
|
groupUseCases,
|
||||||
@@ -11,28 +12,46 @@ import {
|
|||||||
|
|
||||||
const BACKEND_URL =
|
const BACKEND_URL =
|
||||||
process.env.COMPLIANCE_BACKEND_URL || 'http://backend-compliance:8002'
|
process.env.COMPLIANCE_BACKEND_URL || 'http://backend-compliance:8002'
|
||||||
|
// The legal-corpus structure comes from the Go SDK (it owns the vector store).
|
||||||
|
const SDK_URL = process.env.SDK_URL || 'http://ai-compliance-sdk:8090'
|
||||||
|
|
||||||
export const dynamic = 'force-dynamic'
|
export const dynamic = 'force-dynamic'
|
||||||
|
|
||||||
|
/** Upper bound for the SDK call so a hanging vector store cannot stall the page render. */
const LEGAL_CORPUS_TIMEOUT_MS = 5000

/**
 * Runtime check for the parts of the SDK payload the page dereferences
 * unconditionally: `regulations` must be an array of objects with numeric
 * `articles` and `chunks`, and `totals` must be an object. Deliberately lenient
 * about the remaining fields, which the page renders defensively.
 */
function isLegalCorpus(value: unknown): value is LegalCorpus {
  if (typeof value !== 'object' || value === null) return false
  const { regulations, totals } = value as {
    regulations?: unknown
    totals?: unknown
  }
  if (typeof totals !== 'object' || totals === null) return false
  if (!Array.isArray(regulations)) return false
  return regulations.every((row: unknown) => {
    if (typeof row !== 'object' || row === null) return false
    const { articles, chunks } = row as { articles?: unknown; chunks?: unknown }
    return typeof articles === 'number' && typeof chunks === 'number'
  })
}

/**
 * Loads the ingested legal-corpus structure from the Go SDK.
 *
 * Isolated from the other page fetches: any failure (network error, timeout,
 * non-2xx status, invalid JSON, unexpected payload shape) resolves to `null`,
 * so a vector-store hiccup degrades to "no structure shown" instead of
 * blanking the whole page.
 *
 * @returns The validated corpus structure, or `null` when it is unavailable.
 */
async function fetchLegalCorpus(): Promise<LegalCorpus | null> {
  const controller = new AbortController()
  const timer = setTimeout(() => controller.abort(), LEGAL_CORPUS_TIMEOUT_MS)
  try {
    const res = await fetch(`${SDK_URL}/sdk/v1/rag/legal-corpus`, {
      cache: 'no-store',
      signal: controller.signal,
    })
    if (!res.ok) return null
    const payload: unknown = await res.json()
    return isLegalCorpus(payload) ? payload : null
  } catch {
    // Includes the AbortError raised when the timeout fires.
    return null
  } finally {
    clearTimeout(timer)
  }
}
|
||||||
|
|
||||||
async function getData(): Promise<{
|
async function getData(): Promise<{
|
||||||
useCases: UseCaseRow[]
|
useCases: UseCaseRow[]
|
||||||
corpus: CorpusOverview | null
|
corpus: CorpusOverview | null
|
||||||
|
legalCorpus: LegalCorpus | null
|
||||||
}> {
|
}> {
|
||||||
try {
|
try {
|
||||||
const [ucRes, corpusRes] = await Promise.all([
|
const [ucRes, corpusRes, legalCorpus] = await Promise.all([
|
||||||
fetch(`${BACKEND_URL}/api/compliance/v1/controls/use-cases`, {
|
fetch(`${BACKEND_URL}/api/compliance/v1/controls/use-cases`, {
|
||||||
cache: 'no-store',
|
cache: 'no-store',
|
||||||
}),
|
}),
|
||||||
fetch(`${BACKEND_URL}/api/compliance/v1/controls/corpus`, {
|
fetch(`${BACKEND_URL}/api/compliance/v1/controls/corpus`, {
|
||||||
cache: 'no-store',
|
cache: 'no-store',
|
||||||
}),
|
}),
|
||||||
|
fetchLegalCorpus(),
|
||||||
])
|
])
|
||||||
return {
|
return {
|
||||||
useCases: ucRes.ok ? await ucRes.json() : [],
|
useCases: ucRes.ok ? await ucRes.json() : [],
|
||||||
corpus: corpusRes.ok ? await corpusRes.json() : null,
|
corpus: corpusRes.ok ? await corpusRes.json() : null,
|
||||||
|
legalCorpus,
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
return { useCases: [], corpus: null }
|
return { useCases: [], corpus: null, legalCorpus: null }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -46,7 +65,7 @@ function Stat({ label, value }: { label: string; value: string | number }) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export default async function CoveragePage() {
|
export default async function CoveragePage() {
|
||||||
const { useCases, corpus } = await getData()
|
const { useCases, corpus, legalCorpus } = await getData()
|
||||||
const groups = groupUseCases(useCases)
|
const groups = groupUseCases(useCases)
|
||||||
const totalRelevant = useCases.reduce((s, u) => s + u.atom_relevant, 0)
|
const totalRelevant = useCases.reduce((s, u) => s + u.atom_relevant, 0)
|
||||||
const totalAtoms = useCases.reduce((s, u) => s + u.atom_total, 0)
|
const totalAtoms = useCases.reduce((s, u) => s + u.atom_total, 0)
|
||||||
@@ -221,6 +240,67 @@ export default async function CoveragePage() {
|
|||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
{legalCorpus?.regulations?.length ? (
|
||||||
|
<section className="space-y-2">
|
||||||
|
<h2 className="text-lg font-semibold text-gray-900">
|
||||||
|
Ingestierter Rechtskorpus – Struktur ({legalCorpus.totals.regulations}{' '}
|
||||||
|
Rechtsakte)
|
||||||
|
</h2>
|
||||||
|
<p className="text-xs text-gray-500">
|
||||||
|
Woraus jeder ingestierte eur-lex-Rechtsakt tatsächlich besteht:
|
||||||
|
Artikel (§), Anhänge, Erwägungsgründe und retrievbare Chunks — direkt
|
||||||
|
aus dem Vektorspeicher, damit kein Black-Box-Korpus entsteht.
|
||||||
|
</p>
|
||||||
|
<div className="overflow-auto rounded-lg border border-gray-200">
|
||||||
|
<table className="min-w-full divide-y divide-gray-200 text-sm">
|
||||||
|
<thead className="bg-gray-50 text-left text-xs uppercase text-gray-500">
|
||||||
|
<tr>
|
||||||
|
<th className="px-4 py-2">Rechtsakt</th>
|
||||||
|
<th className="px-4 py-2 text-right">Artikel (§)</th>
|
||||||
|
<th className="px-4 py-2 text-right">Anhänge</th>
|
||||||
|
<th className="px-4 py-2 text-right">Erwägungsgründe</th>
|
||||||
|
<th className="px-4 py-2 text-right">Chunks</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody className="divide-y divide-gray-100 bg-white">
|
||||||
|
{legalCorpus.regulations.map((r) => (
|
||||||
|
<tr key={r.regulation_short}>
|
||||||
|
<td className="px-4 py-2 text-gray-900">
|
||||||
|
<span className="font-medium">{r.regulation_short}</span>
|
||||||
|
{r.regulation_name !== r.regulation_short ? (
|
||||||
|
<span className="ml-2 text-xs text-gray-500">
|
||||||
|
{r.regulation_name}
|
||||||
|
</span>
|
||||||
|
) : null}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-2 text-right font-semibold">
|
||||||
|
{r.articles.toLocaleString('de-DE')}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-2 text-right">
|
||||||
|
{r.annexes > 0 ? (
|
||||||
|
r.annexes.toLocaleString('de-DE')
|
||||||
|
) : (
|
||||||
|
<span className="text-gray-300">—</span>
|
||||||
|
)}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-2 text-right text-gray-500">
|
||||||
|
{r.recitals > 0 ? (
|
||||||
|
r.recitals.toLocaleString('de-DE')
|
||||||
|
) : (
|
||||||
|
<span className="text-gray-300">—</span>
|
||||||
|
)}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-2 text-right text-gray-500">
|
||||||
|
{r.chunks.toLocaleString('de-DE')}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
) : null}
|
||||||
|
|
||||||
{corpus?.license_catalog?.length ? (
|
{corpus?.license_catalog?.length ? (
|
||||||
<section className="space-y-2">
|
<section className="space-y-2">
|
||||||
<h2 className="text-lg font-semibold text-gray-900">
|
<h2 className="text-lg font-semibold text-gray-900">
|
||||||
|
|||||||
@@ -55,8 +55,7 @@ linters-settings:
|
|||||||
rules:
|
rules:
|
||||||
- name: exported
|
- name: exported
|
||||||
arguments:
|
arguments:
|
||||||
- checkPrivateReceivers: false
|
- disableStutteringCheck
|
||||||
- disableStutteringCheck: true
|
|
||||||
- name: error-return
|
- name: error-return
|
||||||
- name: increment-decrement
|
- name: increment-decrement
|
||||||
- name: var-declaration
|
- name: var-declaration
|
||||||
@@ -83,6 +82,6 @@ issues:
|
|||||||
max-issues-per-linter: 50
|
max-issues-per-linter: 50
|
||||||
max-same-issues: 5
|
max-same-issues: 5
|
||||||
|
|
||||||
# New code only: don't fail on pre-existing issues in files we haven't touched.
|
# New code only: lint lines changed vs main, so pre-existing debt doesn't fail CI.
|
||||||
# Remove this once a clean baseline is established.
|
# Needs the go-lint job to clone with a local `main` ref (see .gitea/workflows/ci.yaml).
|
||||||
new: false
|
new-from-merge-base: main
|
||||||
|
|||||||
@@ -75,9 +75,10 @@ func (h *RAGHandlers) Search(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
c.JSON(http.StatusOK, gin.H{
|
c.JSON(http.StatusOK, gin.H{
|
||||||
"query": req.Query,
|
"query": req.Query,
|
||||||
"results": results,
|
"results": results,
|
||||||
"count": len(results),
|
"count": len(results),
|
||||||
|
"assessment": ucca.Assess(results),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -206,3 +207,32 @@ func (h *RAGHandlers) HandleScrollChunks(c *gin.Context) {
|
|||||||
"total": len(chunks),
|
"total": len(chunks),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LegalCorpusStructure returns the composition (distinct articles, annexes,
|
||||||
|
// recitals + chunk count) of every ingested eur-lex legal act, so the coverage
|
||||||
|
// page can show WHAT was ingested instead of just the act name.
|
||||||
|
// GET /sdk/v1/rag/legal-corpus
|
||||||
|
func (h *RAGHandlers) LegalCorpusStructure(c *gin.Context) {
|
||||||
|
acts, err := h.ragClient.CorpusStructure(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to aggregate legal corpus: " + err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
arts, anns, recs := 0, 0, 0
|
||||||
|
for _, a := range acts {
|
||||||
|
arts += a.Articles
|
||||||
|
anns += a.Annexes
|
||||||
|
recs += a.Recitals
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, gin.H{
|
||||||
|
"regulations": acts,
|
||||||
|
"totals": gin.H{
|
||||||
|
"regulations": len(acts),
|
||||||
|
"articles": arts,
|
||||||
|
"annexes": anns,
|
||||||
|
"recitals": recs,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
@@ -161,6 +161,7 @@ func registerRAGRoutes(v1 *gin.RouterGroup, h *handlers.RAGHandlers) {
|
|||||||
ragRoutes.GET("/corpus-status", h.CorpusStatus)
|
ragRoutes.GET("/corpus-status", h.CorpusStatus)
|
||||||
ragRoutes.GET("/corpus-versions/:collection", h.CorpusVersionHistory)
|
ragRoutes.GET("/corpus-versions/:collection", h.CorpusVersionHistory)
|
||||||
ragRoutes.GET("/scroll", h.HandleScrollChunks)
|
ragRoutes.GET("/scroll", h.HandleScrollChunks)
|
||||||
|
ragRoutes.GET("/legal-corpus", h.LegalCorpusStructure)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,230 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// authorityInfo holds the normative classification of a search result. It is
// used internally for re-ranking only: Phase 1 changes the ordering, not the
// response contract.
type authorityInfo struct {
	// weight: 100 binding, 80 technical_standard, 70 guidance, 0 foreign, 50 unknown.
	weight int
	// sourceClass: binding_law | technical_standard | supervisory_guidance | foreign_law | unknown.
	sourceClass string
	// jurisdiction: DE | EU | CH.
	jurisdiction string
}
|
||||||
|
|
||||||
|
// Marker tables and patterns for classifying results that carry no explicit
// authority tags. Markers are matched as case-sensitive substrings.
var (
	// guidanceMarkers identify supervisory guidance (authorities, guidance
	// bodies and typical guidance document types).
	guidanceMarkers = []string{
		"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
		"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
		"Leitlinie", "Guidance", "Empfehlung", "OECD", "CISA", "Blue Guide",
	}

	// standardMarkers identify technical standards / control frameworks
	// (best-practice controls). They are checked BEFORE guidanceMarkers so that a
	// "BSI Grundschutz" chunk classifies as a standard, not as BSI guidance.
	standardMarkers = []string{
		"NIST", "OWASP", "Grundschutz", "ISO 27001", "ISO/IEC 27001",
		"CSA CCM", "Cloud Controls Matrix", "CIS Benchmark", "CIS Control",
	}

	// foreignMarkers identify foreign (Swiss) law.
	foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}

	// deMarkers identify German sources when inferring the jurisdiction.
	deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}

	// normPattern matches a norm citation: "§" or "Art"/"Art." followed by a digit.
	normPattern = regexp.MustCompile(`(§|Art\.?)\s*\d`)

	// bdsgParagraph captures the paragraph number after "§".
	bdsgParagraph = regexp.MustCompile(`§\s*(\d+)`)
)
|
||||||
|
|
||||||
|
// classifyAuthority derives weight/source-class/jurisdiction. Explicitly tagged payload
|
||||||
|
// values win; otherwise it falls back to the curated category + name markers, so the
|
||||||
|
// not-yet-re-ingested (untagged) corpus is still classified deterministically.
|
||||||
|
func classifyAuthority(r LegalSearchResult) authorityInfo {
|
||||||
|
jur := r.Jurisdiction
|
||||||
|
if jur == "" {
|
||||||
|
jur = inferJurisdiction(r)
|
||||||
|
}
|
||||||
|
if r.SourceClass != "" {
|
||||||
|
w := r.AuthorityWeight
|
||||||
|
if w == 0 && r.SourceClass == "binding_law" {
|
||||||
|
w = 100
|
||||||
|
}
|
||||||
|
return authorityInfo{weight: w, sourceClass: r.SourceClass, jurisdiction: jur}
|
||||||
|
}
|
||||||
|
if r.AuthorityWeight > 0 {
|
||||||
|
return authorityInfo{weight: r.AuthorityWeight, sourceClass: sourceClassFromWeight(r.AuthorityWeight), jurisdiction: jur}
|
||||||
|
}
|
||||||
|
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + " " + r.RegulationCode
|
||||||
|
switch {
|
||||||
|
case containsAny(hay, foreignMarkers):
|
||||||
|
return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"}
|
||||||
|
case r.Category == "standard" || containsAny(hay, standardMarkers):
|
||||||
|
return authorityInfo{weight: 80, sourceClass: "technical_standard", jurisdiction: jur}
|
||||||
|
case r.Category == "guidance" || containsAny(hay, guidanceMarkers):
|
||||||
|
return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur}
|
||||||
|
case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel):
|
||||||
|
return authorityInfo{weight: 100, sourceClass: "binding_law", jurisdiction: jur}
|
||||||
|
default:
|
||||||
|
return authorityInfo{weight: 50, sourceClass: "unknown", jurisdiction: jur}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sourceClassFromWeight maps an authority weight to its source class:
// >= 100 binding_law, >= 80 technical_standard, >= 70 supervisory_guidance,
// <= 0 foreign_law, anything in between unknown.
func sourceClassFromWeight(w int) string {
	if w >= 100 {
		return "binding_law"
	}
	if w >= 80 {
		return "technical_standard"
	}
	if w >= 70 {
		return "supervisory_guidance"
	}
	if w <= 0 {
		return "foreign_law"
	}
	return "unknown"
}
|
||||||
|
|
||||||
|
func inferJurisdiction(r LegalSearchResult) string {
|
||||||
|
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName
|
||||||
|
switch {
|
||||||
|
case containsAny(hay, foreignMarkers):
|
||||||
|
return "CH"
|
||||||
|
case strings.Contains(hay, "§") || containsAny(hay, deMarkers):
|
||||||
|
return "DE"
|
||||||
|
default:
|
||||||
|
return "EU"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Domain routing: separates same-authority but topically foreign norms ---
|
||||||
|
|
||||||
|
// domainDef describes one topical domain used for domain routing.
type domainDef struct {
	// name is the domain identifier.
	name string
	// regs are regulation markers found in a chunk.
	regs []string
	// keywords are query keywords that signal this domain.
	keywords []string
}
|
||||||
|
|
||||||
|
// Deterministic order (slice, not map) — important for stable classification + tests.
|
||||||
|
var domains = []domainDef{
|
||||||
|
{"data_protection",
|
||||||
|
[]string{"DSGVO", "GDPR", "BDSG", "EDPB", "DSK", "BfDI", "BayLfD", "DPF"},
|
||||||
|
[]string{"personenbezogen", "betroffene", "datenschutz", "datenschutzbeauftrag", "dsb",
|
||||||
|
"datenpanne", "auskunft", "loesch", "lösch", "einwilligung", "besondere kategorien", "auftragsverarbeiter"}},
|
||||||
|
{"cyber",
|
||||||
|
[]string{"CRA", "NIS2", "NIS-2", "ENISA", "DORA", "EUCC"},
|
||||||
|
[]string{"security update", "sicherheitsupdate", "sicherheitsaktualisierung", "schwachstelle", "sbom",
|
||||||
|
"cybersicherheit", "konformit", "hersteller", "importeur", "haendler", "händler", "ikt-",
|
||||||
|
"resilienz", "sicherheitsvorfall", "digitalen elementen"}},
|
||||||
|
{"ai",
|
||||||
|
[]string{"AI Act", "KI-VO", "KI-Verordnung"},
|
||||||
|
[]string{"ki-system", "ki-modell", "hochrisiko", "kuenstliche intelligenz", "künstliche intelligenz"}},
|
||||||
|
{"product_safety",
|
||||||
|
[]string{"Maschinenverordnung", "MaschinenVO", "GPSR", "RED", "MDR"},
|
||||||
|
nil},
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryDomain(query string) string {
|
||||||
|
ql := strings.ToLower(query)
|
||||||
|
for _, d := range domains {
|
||||||
|
for _, kw := range d.keywords {
|
||||||
|
if strings.Contains(ql, kw) {
|
||||||
|
return d.name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func chunkDomain(r LegalSearchResult) string {
|
||||||
|
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationCode + " " + r.RegulationName
|
||||||
|
for _, d := range domains {
|
||||||
|
if containsAny(hay, d.regs) {
|
||||||
|
return d.name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// scopeClass flags special sub-regimes that must not win general questions —
|
||||||
|
// BDSG Teil 3 (§§ 45-84) implements the JI directive (law enforcement), not the general regime.
|
||||||
|
func scopeClass(r LegalSearchResult) string {
|
||||||
|
hay := r.ArticleLabel + " " + r.RegulationShort
|
||||||
|
if strings.Contains(hay, "BDSG") {
|
||||||
|
if m := bdsgParagraph.FindStringSubmatch(hay); m != nil {
|
||||||
|
if n, err := strconv.Atoi(m[1]); err == nil && n >= 45 && n <= 84 {
|
||||||
|
return "law_enforcement"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "general"
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Topic ontology: amplifier only (boost), never an override ---
|
||||||
|
|
||||||
|
// topicDef links query keywords of a topic to the norms preferred for it.
type topicDef struct {
	// keywords are the query fragments that signal the topic.
	keywords []string
	// norms are the preferred canonical citation fragments.
	norms []string
}
|
||||||
|
|
||||||
|
var topics = []topicDef{
|
||||||
|
{[]string{"datenschutzbeauftrag", "dsb", "benennung"}, []string{"Art. 37", "§ 38 BDSG"}},
|
||||||
|
{[]string{"stellung des"}, []string{"Art. 38"}},
|
||||||
|
{[]string{"aufgaben des"}, []string{"Art. 39"}},
|
||||||
|
{[]string{"folgenabsch", "dsfa"}, []string{"Art. 35"}},
|
||||||
|
{[]string{"besondere kategorien"}, []string{"Art. 9", "§ 22 BDSG"}},
|
||||||
|
{[]string{"auskunft"}, []string{"Art. 15", "§ 34 BDSG"}},
|
||||||
|
{[]string{"loesch", "lösch"}, []string{"Art. 17", "§ 35 BDSG"}},
|
||||||
|
{[]string{"bussgeld", "geldbusse"}, []string{"Art. 83"}},
|
||||||
|
{[]string{"security update", "sicherheitsupdate", "schwachstelle", "sbom", "cybersicherheitsanforderung"}, []string{"CRA Anhang I"}},
|
||||||
|
{[]string{"meldepflicht", "sicherheitsvorfall"}, []string{"Art. 14 CRA"}},
|
||||||
|
}
|
||||||
|
|
||||||
|
// resultMatchesTopic reports whether the result is a preferred norm of a topic the query hits.
|
||||||
|
func resultMatchesTopic(query string, r LegalSearchResult) bool {
|
||||||
|
ql := strings.ToLower(query)
|
||||||
|
hay := r.ArticleLabel + " " + r.RegulationShort
|
||||||
|
for _, t := range topics {
|
||||||
|
if !containsAnyLower(ql, t.keywords) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, n := range t.norms {
|
||||||
|
if normMatches(hay, n) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normMatches reports whether norm occurs in hay followed by a non-digit
// boundary (or the end of hay), so "Art. 9" matches "Art. 9 DSGVO" but not
// "Art. 90".
//
// Every occurrence is examined, not just the first one: in
// "Art. 90 und Art. 9 DSGVO" the first hit is rejected by the boundary check,
// but the second one still matches.
func normMatches(hay, norm string) bool {
	if norm == "" {
		// Degenerate input: an empty norm is found at offset 0, so only the
		// first byte of hay decides the boundary check.
		return len(hay) == 0 || hay[0] < '0' || hay[0] > '9'
	}
	for start := 0; start+len(norm) <= len(hay); {
		rel := strings.Index(hay[start:], norm)
		if rel < 0 {
			return false
		}
		end := start + rel + len(norm)
		if end == len(hay) || hay[end] < '0' || hay[end] > '9' {
			return true
		}
		// This occurrence runs into a longer number; keep looking behind it.
		start += rel + 1
	}
	return false
}
|
||||||
|
|
||||||
|
// queryIsForeign reports whether the query explicitly targets foreign law
// (Switzerland / Austria), in which case foreign results must not be demoted.
//
// Country and source names are matched as case-insensitive substrings. The
// country code "ch" is matched as a standalone token, so it is recognised at the
// start or end of the query and next to punctuation ("... in der CH?"), but not
// inside ordinary words such as "Pflichten".
func queryIsForeign(query string) bool {
	lowered := strings.ToLower(query)
	for _, marker := range []string{"schweiz", "revdsg", "fedlex", "oesterreich", "österreich"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}

	// Split on ASCII characters that are neither letters nor digits; non-ASCII
	// runes (umlauts etc.) stay part of their word.
	isSeparator := func(r rune) bool {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r > 127:
			return false
		default:
			return true
		}
	}
	for _, token := range strings.FieldsFunc(lowered, isSeparator) {
		if token == "ch" {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// containsAny reports whether hay contains at least one of the markers as a
// case-sensitive substring.
func containsAny(hay string, markers []string) bool {
	for i := 0; i < len(markers); i++ {
		if strings.Contains(hay, markers[i]) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// containsAnyLower reports whether haylower contains at least one marker after
// lower-casing the marker. The haystack is used as given, so the caller must
// pass it already lower-cased.
func containsAnyLower(haylower string, markers []string) bool {
	for i := 0; i < len(markers); i++ {
		needle := strings.ToLower(markers[i])
		if strings.Contains(haylower, needle) {
			return true
		}
	}
	return false
}
|
||||||
@@ -0,0 +1,171 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Re-ranking coefficients (validated in the offline golden harness; Phase A — conservative).
const (
	// authorityCoef is multiplied by weight/100 and added to the semantic score.
	authorityCoef = 0.40
	// jurisdictionGain is granted to binding/guidance from DE or EU.
	jurisdictionGain = 0.05
	// foreignPenalty demotes (does not remove) foreign law on a DE/EU question.
	foreignPenalty = 0.60
	// unknownPenalty is subtracted for results of unknown source class.
	unknownPenalty = 0.08
	// domainMatchGain is granted when chunk and query domain agree.
	domainMatchGain = 0.15
	// offDomainPenalty demotes (does not remove) off-domain binding law.
	offDomainPenalty = 0.10
	// scopePenalty demotes BDSG Teil 3 (law enforcement) on a general data-protection question.
	scopePenalty = 0.25
	// topicGain is an amplifier only.
	topicGain = 0.18
	// supersededPenalty demotes (does not hide) a superseded legacy source (pre-eu-v1).
	supersededPenalty = 0.50
	// intentLiftGain is the epsilon by which a qualifying interpretative source is
	// lifted ABOVE the best binding hit ...
	intentLiftGain = 0.10
	// ... intentLiftMargin: only if that source is semantically competitive with binding.
	intentLiftMargin = 0.05
)
|
||||||
|
|
||||||
|
// guidanceIntentSignals are lower-case query fragments marking a query that
// EXPLICITLY asks for an interpretation / recommendation by a guidance body
// rather than for the binding obligation. Only then may a (semantically
// competitive) guideline outrank the binding norm.
var guidanceIntentSignals = []string{
	// guidance bodies
	"edpb", "europäischer datenschutzausschuss", "europaeischer datenschutzausschuss",
	"dsk", "enisa", "bsi",
	// document types
	"leitlinie", "guideline", "orientierungshilfe",
	// interpretation / recommendation wording
	"auslegung", "empfiehlt", "empfehlung", "sagt", "laut",
}
|
||||||
|
|
||||||
|
// controlIntentSignals are lower-case query fragments marking a query that asks
// HOW to implement / which controls or measures fit, rather than WHAT the
// binding obligation is. Only then may a (semantically competitive)
// technical_standard outrank the binding norm.
var controlIntentSignals = []string{
	// controls and measures
	"control", "controls", "maßnahme", "massnahme", "schutzmaßnahme",
	// implementation wording
	"best practice", "best-practice", "umsetzen", "implementier", "absicher",
	"härt", "haert", "hardening",
	// frameworks
	"nist", "owasp", "grundschutz",
	"ccm", "iso 27001", "isms",
}
|
||||||
|
|
||||||
|
// queryMatchesAny reports whether the lower-cased query contains at least one
// of the signals as a substring. Signals are compared as given, so they are
// expected to be lower-case already.
func queryMatchesAny(query string, signals []string) bool {
	lowered := strings.ToLower(query)
	for i := 0; i < len(signals); i++ {
		if strings.Contains(lowered, signals[i]) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// queryWantsGuidance reports whether the query explicitly asks for guidance/interpretation.
|
||||||
|
func queryWantsGuidance(query string) bool { return queryMatchesAny(query, guidanceIntentSignals) }
|
||||||
|
|
||||||
|
// queryWantsControls reports whether the query asks for implementation controls/measures.
|
||||||
|
func queryWantsControls(query string) bool { return queryMatchesAny(query, controlIntentSignals) }
|
||||||
|
|
||||||
|
// bestBindingSemantic returns the highest RAW semantic score among binding-law
|
||||||
|
// results (0 if none / no intent). Used as the guard threshold so an off-topic
|
||||||
|
// interpretative source cannot ride the intent boost.
|
||||||
|
func bestBindingSemantic(results []LegalSearchResult, wantsIntent bool) float64 {
|
||||||
|
if !wantsIntent {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
best := 0.0
|
||||||
|
for _, r := range results {
|
||||||
|
if classifyAuthority(r).sourceClass == "binding_law" && r.Score > best {
|
||||||
|
best = r.Score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return best
|
||||||
|
}
|
||||||
|
|
||||||
|
// authorityScore computes the normative relevance of a result for a query. It augments the
|
||||||
|
// semantic score with authority/jurisdiction/domain/scope/topic signals. Exposed for tests.
|
||||||
|
func authorityScore(query string, r LegalSearchResult, qDomain string, qForeign bool) float64 {
|
||||||
|
info := classifyAuthority(r)
|
||||||
|
score := r.Score + authorityCoef*float64(info.weight)/100.0
|
||||||
|
|
||||||
|
if r.Superseded {
|
||||||
|
// Alt-Quelle (pre-eu-v1): Default-Fragen sollen die eu-v1-Norm sehen. Demoted,
|
||||||
|
// nicht entfernt — fuer Historie/Uebergangsfragen bleibt sie auffindbar.
|
||||||
|
score -= supersededPenalty
|
||||||
|
}
|
||||||
|
|
||||||
|
if info.jurisdiction == "CH" && !qForeign {
|
||||||
|
score -= foreignPenalty // Fremdrecht bei DE/EU-Frage: demoted, nicht geloescht
|
||||||
|
} else {
|
||||||
|
score += jurisdictionGain
|
||||||
|
}
|
||||||
|
if info.sourceClass == "unknown" {
|
||||||
|
score -= unknownPenalty
|
||||||
|
}
|
||||||
|
if qDomain != "" {
|
||||||
|
switch cd := chunkDomain(r); {
|
||||||
|
case cd == qDomain:
|
||||||
|
score += domainMatchGain
|
||||||
|
case cd != "":
|
||||||
|
score -= offDomainPenalty // off-domain binding: demoted, nicht geloescht
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if qDomain == "data_protection" && scopeClass(r) == "law_enforcement" {
|
||||||
|
score -= scopePenalty
|
||||||
|
}
|
||||||
|
if resultMatchesTopic(query, r) {
|
||||||
|
score += topicGain // Verstaerker, kein Override
|
||||||
|
}
|
||||||
|
return score
|
||||||
|
}
|
||||||
|
|
||||||
|
// rerankByAuthority re-orders results so binding law from the matching jurisdiction/domain
|
||||||
|
// ranks above guidance, foreign and off-domain law — WITHOUT dropping anything (guidance is
|
||||||
|
// kept as interpretation context). The computed score is written back to Score so downstream
|
||||||
|
// merges (e.g. the multi-collection advisor) preserve this order. Pure + deterministic.
|
||||||
|
func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchResult {
|
||||||
|
if len(results) < 2 {
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
qDomain := queryDomain(query)
|
||||||
|
qForeign := queryIsForeign(query)
|
||||||
|
wantsGuidance := queryWantsGuidance(query)
|
||||||
|
wantsControls := queryWantsControls(query)
|
||||||
|
bestBindingSem := bestBindingSemantic(results, wantsGuidance)
|
||||||
|
|
||||||
|
out := make([]LegalSearchResult, len(results))
|
||||||
|
copy(out, results)
|
||||||
|
for i := range out {
|
||||||
|
out[i].Score = authorityScore(query, out[i], qDomain, qForeign)
|
||||||
|
}
|
||||||
|
// Explicit interpretation intent → a competitive guideline may outrank binding (lift
|
||||||
|
// above the best binding FINAL). Explicit implementation intent → boost the CONTROL-POOL
|
||||||
|
// (operational/procedural requirement, control standard, implementation guidance) over
|
||||||
|
// the abstract obligation, soft-ordered by role. Norm questions (neither) stay untouched.
|
||||||
|
if wantsGuidance {
|
||||||
|
liftAboveBinding(out, results, bestBindingSem, "supervisory_guidance")
|
||||||
|
}
|
||||||
|
if wantsControls {
|
||||||
|
applyControlRoles(out)
|
||||||
|
}
|
||||||
|
sort.SliceStable(out, func(a, b int) bool {
|
||||||
|
return out[a].Score > out[b].Score
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// liftAboveBinding lifts a semantically-competitive interpretative source (the given
|
||||||
|
// sourceClass — supervisory_guidance or technical_standard) just ABOVE the best binding
|
||||||
|
// hit, ordered by semantic, so an EXPLICIT guidance/implementation question can return
|
||||||
|
// that source Top-1. A pure norm question (no intent → not called) keeps binding on top.
|
||||||
|
// Sources below the semantic margin are left untouched, so an off-topic source can never
|
||||||
|
// ride the override — and the lift is from the binding FINAL score, so authority/topic/
|
||||||
|
// domain bonuses cannot edge it out.
|
||||||
|
func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) {
|
||||||
|
bestBindingFinal := 0.0
|
||||||
|
for i := range out {
|
||||||
|
if classifyAuthority(out[i]).sourceClass == "binding_law" && out[i].Score > bestBindingFinal {
|
||||||
|
bestBindingFinal = out[i].Score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := range out {
|
||||||
|
// Classify (not raw payload) so the untagged legacy corpus — e.g. NIST ingested
|
||||||
|
// before source_class tagging — is still recognized as its interpretative class.
|
||||||
|
if classifyAuthority(out[i]).sourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
lifted := bestBindingFinal + intentLiftGain + (raw[i].Score - bestBindingSem)
|
||||||
|
if lifted > out[i].Score {
|
||||||
|
out[i].Score = lifted
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,96 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func bindingRes(label, reg, jur string, score float64) LegalSearchResult {
|
||||||
|
return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "binding_law", AuthorityWeight: 100, Jurisdiction: jur, Score: score}
|
||||||
|
}
|
||||||
|
|
||||||
|
func guidanceRes(label, reg string, score float64) LegalSearchResult {
|
||||||
|
return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "supervisory_guidance", AuthorityWeight: 70, Jurisdiction: "EU", Score: score}
|
||||||
|
}
|
||||||
|
|
||||||
|
func foreignRes(label string, score float64) LegalSearchResult {
|
||||||
|
return LegalSearchResult{ArticleLabel: label, RegulationShort: "RevDSG", SourceClass: "foreign_law", AuthorityWeight: 0, Jurisdiction: "CH", Score: score}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acceptance criteria (Phase 1) expressed as ordering tests.
|
||||||
|
func TestRerankByAuthority_Acceptance(t *testing.T) {
|
||||||
|
t.Run("guidance does not overtake semantically competitive binding", func(t *testing.T) {
|
||||||
|
out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{
|
||||||
|
guidanceRes("ENISA Mapping", "ENISA", 0.72),
|
||||||
|
bindingRes("CRA Anhang I", "CRA", "EU", 0.66),
|
||||||
|
})
|
||||||
|
if out[0].RegulationShort != "CRA" {
|
||||||
|
t.Fatalf("binding must rank first over competitive guidance, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("foreign law demoted on DE/EU question but kept", func(t *testing.T) {
|
||||||
|
in := []LegalSearchResult{foreignRes("RevDSG Art 1", 0.85), bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62)}
|
||||||
|
out := rerankByAuthority("Welche Daten sind besonders geschuetzt?", in)
|
||||||
|
if out[0].RegulationShort != "DSGVO" {
|
||||||
|
t.Fatalf("binding EU must beat foreign on a DE/EU query, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
if len(out) != 2 {
|
||||||
|
t.Fatalf("foreign law must be kept, got len=%d", len(out))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("off-domain binding demoted but not removed", func(t *testing.T) {
|
||||||
|
in := []LegalSearchResult{
|
||||||
|
bindingRes("Art. 13 EU MDR", "MDR", "EU", 0.70),
|
||||||
|
bindingRes("Art. 13 CRA", "CRA", "EU", 0.60),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Welche Pflichten hat der Hersteller von Produkten mit digitalen Elementen?", in)
|
||||||
|
if out[0].RegulationShort != "CRA" {
|
||||||
|
t.Fatalf("on-domain CRA must beat off-domain MDR, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
if len(out) != 2 {
|
||||||
|
t.Fatalf("off-domain MDR must be kept, got len=%d", len(out))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("same-regime binding wins over guidance", func(t *testing.T) {
|
||||||
|
out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{
|
||||||
|
bindingRes("Art. 13 CRA", "CRA", "EU", 0.70),
|
||||||
|
guidanceRes("ENISA Mapping", "ENISA", 0.60),
|
||||||
|
})
|
||||||
|
if out[0].RegulationShort != "CRA" {
|
||||||
|
t.Fatalf("binding must win, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("BDSG Teil 3 demoted below DSGVO on general DP question", func(t *testing.T) {
|
||||||
|
in := []LegalSearchResult{
|
||||||
|
bindingRes("§ 48 BDSG", "BDSG", "DE", 0.70), // Teil 3 (law enforcement)
|
||||||
|
bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Was sind besondere Kategorien personenbezogener Daten?", in)
|
||||||
|
if out[0].RegulationShort != "DSGVO" {
|
||||||
|
t.Fatalf("DSGVO must beat BDSG Teil 3 on a general DP question, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("nothing is dropped and topic amplifies", func(t *testing.T) {
|
||||||
|
in := []LegalSearchResult{
|
||||||
|
guidanceRes("ENISA", "ENISA", 0.72),
|
||||||
|
bindingRes("CRA Anhang I", "CRA", "EU", 0.66),
|
||||||
|
foreignRes("RevDSG", 0.5),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Anforderungen an Security Updates?", in)
|
||||||
|
if len(out) != len(in) {
|
||||||
|
t.Fatalf("rerank must preserve all results, got %d want %d", len(out), len(in))
|
||||||
|
}
|
||||||
|
if out[0].ArticleLabel != "CRA Anhang I" {
|
||||||
|
t.Fatalf("topic+authority must lift CRA Anhang I to top, got %q", out[0].ArticleLabel)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("single result returned unchanged", func(t *testing.T) {
|
||||||
|
in := []LegalSearchResult{bindingRes("Art. 1 CRA", "CRA", "EU", 0.5)}
|
||||||
|
if out := rerankByAuthority("x", in); len(out) != 1 {
|
||||||
|
t.Fatalf("len=%d", len(out))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestClassifyAuthority(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
result LegalSearchResult
|
||||||
|
wantW int
|
||||||
|
wantSC string
|
||||||
|
wantJur string
|
||||||
|
}{
|
||||||
|
{"tagged binding EU", LegalSearchResult{AuthorityWeight: 100, SourceClass: "binding_law", Jurisdiction: "EU"}, 100, "binding_law", "EU"},
|
||||||
|
{"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"},
|
||||||
|
{"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"},
|
||||||
|
{"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"},
|
||||||
|
{"untagged NIST standard", LegalSearchResult{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8"}, 80, "technical_standard", "EU"},
|
||||||
|
{"BSI Grundschutz standard beats BSI guidance", LegalSearchResult{RegulationShort: "BSI Grundschutz", ArticleLabel: "BSI Grundschutz Baustein"}, 80, "technical_standard", "DE"},
|
||||||
|
{"weight-only 85 TRGS standard", LegalSearchResult{AuthorityWeight: 85, RegulationShort: "TRGS 529"}, 85, "technical_standard", "EU"},
|
||||||
|
{"tagged technical_standard", LegalSearchResult{AuthorityWeight: 80, SourceClass: "technical_standard", Jurisdiction: "EU"}, 80, "technical_standard", "EU"},
|
||||||
|
{"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"},
|
||||||
|
{"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"},
|
||||||
|
{"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"},
|
||||||
|
{"untagged unknown", LegalSearchResult{RegulationShort: "", ArticleLabel: ""}, 50, "unknown", "EU"},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
got := classifyAuthority(tt.result)
|
||||||
|
if got.weight != tt.wantW || got.sourceClass != tt.wantSC || got.jurisdiction != tt.wantJur {
|
||||||
|
t.Errorf("classifyAuthority() = {%d %s %s}, want {%d %s %s}",
|
||||||
|
got.weight, got.sourceClass, got.jurisdiction, tt.wantW, tt.wantSC, tt.wantJur)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQueryDomain(t *testing.T) {
|
||||||
|
tests := []struct{ q, want string }{
|
||||||
|
{"Welche Anforderungen an Security Updates?", "cyber"},
|
||||||
|
{"Wer braucht einen Datenschutzbeauftragten?", "data_protection"},
|
||||||
|
{"Was sind besondere Kategorien personenbezogener Daten?", "data_protection"},
|
||||||
|
{"Welche Pflichten beim Hochrisiko-KI-System?", "ai"},
|
||||||
|
{"Wie spaet ist es?", ""},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := queryDomain(tt.q); got != tt.want {
|
||||||
|
t.Errorf("queryDomain(%q) = %q, want %q", tt.q, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChunkDomain(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
r LegalSearchResult
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"CRA cyber", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA"}, "cyber"},
|
||||||
|
{"DSGVO dp", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "data_protection"},
|
||||||
|
{"AI Act ai", LegalSearchResult{RegulationShort: "AI Act", ArticleLabel: "Art. 10 AI Act"}, "ai"},
|
||||||
|
{"MDR product", LegalSearchResult{RegulationShort: "MDR", ArticleLabel: "Art. 13 EU MDR"}, "product_safety"},
|
||||||
|
{"unknown", LegalSearchResult{RegulationShort: "XYZ"}, ""},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := chunkDomain(tt.r); got != tt.want {
|
||||||
|
t.Errorf("chunkDomain() = %q, want %q", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestScopeClass(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
r LegalSearchResult
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"BDSG Teil 3 law enforcement", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 48 BDSG"}, "law_enforcement"},
|
||||||
|
{"BDSG general part", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, "general"},
|
||||||
|
{"DSGVO general", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "general"},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := scopeClass(tt.r); got != tt.want {
|
||||||
|
t.Errorf("scopeClass() = %q, want %q", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResultMatchesTopic(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
query string
|
||||||
|
r LegalSearchResult
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{"besondere Kategorien -> Art 9 match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, true},
|
||||||
|
{"besondere Kategorien -> Art 90 no match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 90 DSGVO"}, false},
|
||||||
|
{"security updates -> CRA Anhang I", "Anforderungen an Security Updates?", LegalSearchResult{ArticleLabel: "CRA Anhang I"}, true},
|
||||||
|
{"no topic keyword", "Wie spaet ist es?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, false},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := resultMatchesTopic(tt.query, tt.r); got != tt.want {
|
||||||
|
t.Errorf("resultMatchesTopic() = %v, want %v", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormMatches(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
hay, norm string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{"Art. 9 DSGVO", "Art. 9", true},
|
||||||
|
{"Art. 90 DSGVO", "Art. 9", false},
|
||||||
|
{"§ 38 BDSG", "§ 38 BDSG", true},
|
||||||
|
{"§ 380 BDSG", "§ 38", false},
|
||||||
|
{"Art. 14 CRA", "Art. 14 CRA", true},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := normMatches(tt.hay, tt.norm); got != tt.want {
|
||||||
|
t.Errorf("normMatches(%q,%q) = %v, want %v", tt.hay, tt.norm, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// A chunk's source_role describes its FUNCTION: the duty itself (WHAT has to be
// done), a way to implement it (HOW: operational or procedural requirement,
// control standard, implementation guidance), or a reading aid for the norm
// (interpretation, definition).
//
// source_role is independent of source_class (legal authority): source_class
// drives RANK, source_role drives CONTROL-POOL membership on implementation
// questions. Roles are derived deterministically from markers, so the untagged
// corpus does not have to be re-tagged.
const (
	// The abstract duty (the WHAT).
	roleObligation = "obligation"
	// A concrete binding requirement (CRA Annex I).
	roleOperationalReq = "operational_requirement"
	// A process: notification / registration / DPIA / incident report.
	roleProceduralReq = "procedural_requirement"
	// A best-practice control catalog (NIST/OWASP/ISO/CIS).
	roleControlStandard = "control_standard"
	// Advisory how-to material (ENISA good practices, BSI).
	roleImplGuidance = "implementation_guidance"
	// Interprets the MEANING of the norm (EDPB guideline).
	roleInterpretation = "interpretation"
	// Definitions / scope / recitals.
	roleDefinition = "definition"
)
|
||||||
|
|
||||||
|
// Marker lists consulted by classifyRole to derive a chunk's source_role.

// proceduralMarkers indicate a process duty (notification, registration,
// declaration of conformity, incident reporting, impact assessment).
var proceduralMarkers = []string{
	"Meldung",
	"Meldepflicht",
	"Notification",
	"Notifizierung",
	"Registrierung",
	"Registration",
	"Konformitätserklärung",
	"Declaration of Conformity",
	"Incident",
	"Berichterstattung",
	"Reporting",
	"Folgenabschätzung",
	"DSFA",
	"DPIA",
	"Anzeigepflicht",
}

// annexMarkers indicate an annex / appendix of an act.
var annexMarkers = []string{
	"Anhang",
	"Annex",
	"Appendix",
	"Anlage",
}

// operationalMarkers indicate a concrete requirement.
var operationalMarkers = []string{
	"Anforderung",
	"Requirement",
	"essential",
	"wesentliche",
}

// implMarkers indicate implementation-oriented (how-to) guidance.
var implMarkers = []string{
	"Good Practice",
	"Best Practice",
	"Standards Mapping",
	"Umsetzung",
	"Implementation",
	"Handreichung",
	"Maßnahmenkatalog",
	"ICS",
	"SCADA",
	"Technical Guideline",
	"TIG",
}

// definitionMarkers indicate a definitions provision.
var definitionMarkers = []string{
	"Begriffsbestimmung",
	"Definition",
}
|
||||||
|
|
||||||
|
// classifyRole derives the functional source_role from chunk metadata + the authority
|
||||||
|
// class. technical_standard is always a control_standard; guidance splits into
|
||||||
|
// implementation_guidance (how-to) vs interpretation (meaning); binding splits into
|
||||||
|
// procedural / operational requirement / definition / plain obligation.
|
||||||
|
func classifyRole(r LegalSearchResult) string {
|
||||||
|
cls := classifyAuthority(r).sourceClass
|
||||||
|
hay := strings.ToLower(r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + " " + r.Article)
|
||||||
|
switch {
|
||||||
|
case r.IsRecital:
|
||||||
|
return roleDefinition
|
||||||
|
case cls == "technical_standard":
|
||||||
|
return roleControlStandard
|
||||||
|
case cls == "supervisory_guidance":
|
||||||
|
if containsAnyLower(hay, implMarkers) {
|
||||||
|
return roleImplGuidance
|
||||||
|
}
|
||||||
|
return roleInterpretation
|
||||||
|
case cls == "binding_law":
|
||||||
|
switch {
|
||||||
|
case containsAnyLower(hay, definitionMarkers):
|
||||||
|
return roleDefinition
|
||||||
|
case containsAnyLower(hay, proceduralMarkers):
|
||||||
|
return roleProceduralReq
|
||||||
|
case containsAnyLower(hay, annexMarkers) || containsAnyLower(hay, operationalMarkers):
|
||||||
|
return roleOperationalReq
|
||||||
|
default:
|
||||||
|
return roleObligation
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return roleObligation
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// controlRoleBonus is the soft intra-pool preference (User 2026-06-24):
|
||||||
|
// operational_requirement > procedural_requirement > control_standard > implementation_guidance.
|
||||||
|
var controlRoleBonus = map[string]float64{
|
||||||
|
roleOperationalReq: 0.100,
|
||||||
|
roleProceduralReq: 0.075,
|
||||||
|
roleControlStandard: 0.050,
|
||||||
|
roleImplGuidance: 0.000,
|
||||||
|
}
|
||||||
|
|
||||||
|
// controlPoolGain lifts EVERY control-pool role over the non-control roles (obligation/
|
||||||
|
// interpretation/definition) on an implementation question, so the binding abstract
|
||||||
|
// obligation does not dominate by authority alone. The obligation is not removed — it
|
||||||
|
// stays visible as "Rechtsgrundlage" context below the recommended measures.
|
||||||
|
const controlPoolGain = 0.15
|
||||||
|
|
||||||
|
// applyControlRoles boosts the control-pool (the four implementation roles) for an
|
||||||
|
// EXPLICIT implementation question, soft-ordered op_req > procedural > standard > guidance.
|
||||||
|
// Replaces the earlier "lift technical_standard above binding" — controls are not only
|
||||||
|
// technical_standard, and the binding operational_requirement (e.g. CRA Annex I) should win.
|
||||||
|
func applyControlRoles(out []LegalSearchResult) {
|
||||||
|
for i := range out {
|
||||||
|
if bonus, ok := controlRoleBonus[classifyRole(out[i])]; ok {
|
||||||
|
out[i].Score += controlPoolGain + bonus
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// isControlPoolRole reports whether a role belongs to the control-pool surfaced on
|
||||||
|
// implementation questions (the four "how to implement" roles).
|
||||||
|
func isControlPoolRole(role string) bool {
|
||||||
|
switch role {
|
||||||
|
case roleOperationalReq, roleProceduralReq, roleControlStandard, roleImplGuidance:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// controlRoleOf classifies a raw Qdrant payload into a source_role, so searchControls can
|
||||||
|
// filter its deep dense pull to the control-pool BEFORE hits are mapped to LegalSearchResult.
|
||||||
|
func controlRoleOf(payload map[string]interface{}) string {
|
||||||
|
article := getString(payload, "article")
|
||||||
|
if article == "" {
|
||||||
|
article = getString(payload, "section")
|
||||||
|
}
|
||||||
|
return classifyRole(LegalSearchResult{
|
||||||
|
RegulationShort: getString(payload, "regulation_short"),
|
||||||
|
RegulationName: getString(payload, "regulation_name_de"),
|
||||||
|
ArticleLabel: getString(payload, "article_label"),
|
||||||
|
Article: article,
|
||||||
|
Category: getString(payload, "category"),
|
||||||
|
SourceClass: getString(payload, "source_class"),
|
||||||
|
AuthorityWeight: getInt(payload, "authority_weight"),
|
||||||
|
IsRecital: getBool(payload, "is_recital"),
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestClassifyRole(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
r LegalSearchResult
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"NIST -> control_standard", LegalSearchResult{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8"}, roleControlStandard},
|
||||||
|
{"OWASP -> control_standard", LegalSearchResult{RegulationShort: "OWASP ASVS"}, roleControlStandard},
|
||||||
|
{"CRA Anhang -> operational_requirement", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "CRA Anhang I", Category: "regulation"}, roleOperationalReq},
|
||||||
|
{"CRA Meldepflicht -> procedural_requirement", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 14 CRA Meldepflicht", Category: "regulation"}, roleProceduralReq},
|
||||||
|
{"ENISA Good Practices -> implementation_guidance", LegalSearchResult{RegulationShort: "ENISA Supply Chain Good Practices"}, roleImplGuidance},
|
||||||
|
{"EDPB Leitlinie -> interpretation", LegalSearchResult{RegulationShort: "EDPB DPO", ArticleLabel: "WP243 Leitlinien Datenschutzbeauftragte"}, roleInterpretation},
|
||||||
|
{"DORA article -> obligation", LegalSearchResult{RegulationShort: "DORA", ArticleLabel: "Art. 5 DORA", Category: "regulation"}, roleObligation},
|
||||||
|
{"DSGVO Begriffsbestimmungen -> definition", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 4 DSGVO Begriffsbestimmungen", Category: "regulation"}, roleDefinition},
|
||||||
|
{"recital -> definition", LegalSearchResult{RegulationShort: "CRA", IsRecital: true}, roleDefinition},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := classifyRole(tt.r); got != tt.want {
|
||||||
|
t.Errorf("classifyRole() = %q, want %q", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyControlRoles_PoolPreference(t *testing.T) {
|
||||||
|
// op_req > procedural > control_standard > impl_guidance; non-control roles get no boost.
|
||||||
|
roles := []struct {
|
||||||
|
r LegalSearchResult
|
||||||
|
wantGain float64
|
||||||
|
}{
|
||||||
|
{LegalSearchResult{ArticleLabel: "CRA Anhang I", Category: "regulation"}, controlPoolGain + 0.100},
|
||||||
|
{LegalSearchResult{ArticleLabel: "Art. 14 CRA Meldepflicht", Category: "regulation"}, controlPoolGain + 0.075},
|
||||||
|
{LegalSearchResult{RegulationShort: "NIST SP 800-53"}, controlPoolGain + 0.050},
|
||||||
|
{LegalSearchResult{RegulationShort: "ENISA Good Practices"}, controlPoolGain + 0.000},
|
||||||
|
{LegalSearchResult{ArticleLabel: "Art. 5 DORA", Category: "regulation"}, 0.0}, // obligation: no boost
|
||||||
|
}
|
||||||
|
for _, rc := range roles {
|
||||||
|
out := []LegalSearchResult{rc.r}
|
||||||
|
out[0].Score = 1.0
|
||||||
|
applyControlRoles(out)
|
||||||
|
if got := out[0].Score - 1.0; got < rc.wantGain-1e-9 || got > rc.wantGain+1e-9 {
|
||||||
|
t.Errorf("role %q: gain %.3f, want %.3f", classifyRole(rc.r), got, rc.wantGain)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsControlPoolRole(t *testing.T) {
|
||||||
|
for _, r := range []string{roleOperationalReq, roleProceduralReq, roleControlStandard, roleImplGuidance} {
|
||||||
|
if !isControlPoolRole(r) {
|
||||||
|
t.Errorf("%q should be in the control-pool", r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, r := range []string{roleObligation, roleInterpretation, roleDefinition} {
|
||||||
|
if isControlPoolRole(r) {
|
||||||
|
t.Errorf("%q should NOT be in the control-pool", r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestControlRoleOf_Payload(t *testing.T) {
|
||||||
|
// searchControls filters its deep dense pull by classifying the raw Qdrant payload.
|
||||||
|
nist := map[string]interface{}{"regulation_short": "NIST SP 800-82r3", "article": "AU-8"}
|
||||||
|
if got := controlRoleOf(nist); got != roleControlStandard {
|
||||||
|
t.Errorf("untagged NIST payload role = %q, want control_standard", got)
|
||||||
|
}
|
||||||
|
craAnnex := map[string]interface{}{"regulation_short": "CRA", "article": "Anhang-I", "category": "regulation"}
|
||||||
|
if got := controlRoleOf(craAnnex); got != roleOperationalReq {
|
||||||
|
t.Errorf("CRA Anhang payload role = %q, want operational_requirement", got)
|
||||||
|
}
|
||||||
|
dora := map[string]interface{}{"regulation_short": "DORA", "article_label": "Art. 5 DORA", "category": "regulation"}
|
||||||
|
if got := controlRoleOf(dora); isControlPoolRole(got) {
|
||||||
|
t.Errorf("DORA abstract article role = %q must be excluded from the control-pool", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,167 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LegalActStructure describes what one ingested eur-lex legal act is made of:
// the number of distinct articles, annexes and recitals, plus the raw chunk
// count. It backs the coverage page, so a developer can see the actual content
// of each act instead of only its name.
type LegalActStructure struct {
	// Short code of the act and its display name.
	RegulationShort string `json:"regulation_short"`
	RegulationName  string `json:"regulation_name"`

	// Distinct counts per structural element.
	Articles int `json:"articles"`
	Annexes  int `json:"annexes"`
	Recitals int `json:"recitals"`

	// Raw number of chunks counted for the act.
	Chunks int `json:"chunks"`
}
|
||||||
|
|
||||||
|
// eurlexSource is the payload "source" value that marks a chunk as part of the
// eur-lex legal corpus.
const eurlexSource = "eur-lex.europa.eu"

var (
	// legalStructureCollections lists the collections holding the clean eur-lex
	// legal corpus (chunks tagged with chunk_scope = section | annex | recital).
	legalStructureCollections = []string{
		"bp_compliance_ce",
		"bp_compliance_datenschutz",
	}

	// chunkScopeBucket translates a Qdrant chunk_scope into the structure field
	// that the chunk is counted under.
	chunkScopeBucket = map[string]string{
		"section": "articles",
		"annex":   "annexes",
		"recital": "recitals",
	}
)
|
||||||
|
|
||||||
|
// CorpusStructure scrolls the eur-lex legal corpus across the legal collections
|
||||||
|
// and aggregates the per-act composition. The source filter keeps it to a few
|
||||||
|
// hundred points regardless of total corpus size. Read-only; a collection that
|
||||||
|
// fails to scroll is skipped rather than failing the whole call.
|
||||||
|
func (c *LegalRAGClient) CorpusStructure(ctx context.Context) ([]LegalActStructure, error) {
|
||||||
|
var all []qdrantScrollPoint
|
||||||
|
for _, coll := range legalStructureCollections {
|
||||||
|
pts, err := c.scrollLegalCorpus(ctx, coll)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
all = append(all, pts...)
|
||||||
|
}
|
||||||
|
return aggregateStructure(all), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// aggregateStructure counts distinct article labels per (regulation, scope).
|
||||||
|
// Pure → unit-testable without a vector store.
|
||||||
|
func aggregateStructure(points []qdrantScrollPoint) []LegalActStructure {
|
||||||
|
distinct := map[string]map[string]map[string]struct{}{}
|
||||||
|
names := map[string]string{}
|
||||||
|
chunks := map[string]int{}
|
||||||
|
order := []string{}
|
||||||
|
|
||||||
|
for _, pt := range points {
|
||||||
|
reg := getString(pt.Payload, "regulation_short")
|
||||||
|
if reg == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, seen := names[reg]; !seen {
|
||||||
|
name := getString(pt.Payload, "regulation_name_de")
|
||||||
|
if name == "" {
|
||||||
|
name = reg
|
||||||
|
}
|
||||||
|
names[reg] = name
|
||||||
|
distinct[reg] = map[string]map[string]struct{}{}
|
||||||
|
order = append(order, reg)
|
||||||
|
}
|
||||||
|
chunks[reg]++
|
||||||
|
bucket, ok := chunkScopeBucket[getString(pt.Payload, "chunk_scope")]
|
||||||
|
article := getString(pt.Payload, "article")
|
||||||
|
if !ok || article == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if distinct[reg][bucket] == nil {
|
||||||
|
distinct[reg][bucket] = map[string]struct{}{}
|
||||||
|
}
|
||||||
|
distinct[reg][bucket][article] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]LegalActStructure, 0, len(order))
|
||||||
|
for _, reg := range order {
|
||||||
|
out = append(out, LegalActStructure{
|
||||||
|
RegulationShort: reg,
|
||||||
|
RegulationName: names[reg],
|
||||||
|
Articles: len(distinct[reg]["articles"]),
|
||||||
|
Annexes: len(distinct[reg]["annexes"]),
|
||||||
|
Recitals: len(distinct[reg]["recitals"]),
|
||||||
|
Chunks: chunks[reg],
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.SliceStable(out, func(i, j int) bool {
|
||||||
|
if out[i].Articles != out[j].Articles {
|
||||||
|
return out[i].Articles > out[j].Articles
|
||||||
|
}
|
||||||
|
return out[i].RegulationShort < out[j].RegulationShort
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// scrollLegalCorpus pages through one collection, filtered to the eur-lex legal
|
||||||
|
// corpus, returning minimal-payload points (no text/vectors).
|
||||||
|
func (c *LegalRAGClient) scrollLegalCorpus(ctx context.Context, collection string) ([]qdrantScrollPoint, error) {
|
||||||
|
var all []qdrantScrollPoint
|
||||||
|
var offset interface{}
|
||||||
|
for {
|
||||||
|
points, next, err := c.scrollLegalPage(ctx, collection, offset)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
all = append(all, points...)
|
||||||
|
if next == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
offset = next
|
||||||
|
}
|
||||||
|
return all, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// scrollLegalPage fetches one page of the filtered scroll and returns the
|
||||||
|
// points plus the next-page offset (nil when exhausted).
|
||||||
|
func (c *LegalRAGClient) scrollLegalPage(ctx context.Context, collection string, offset interface{}) ([]qdrantScrollPoint, interface{}, error) {
|
||||||
|
reqBody := map[string]interface{}{
|
||||||
|
"limit": 500,
|
||||||
|
"with_payload": map[string]interface{}{"include": []string{"regulation_short", "regulation_name_de", "chunk_scope", "article"}},
|
||||||
|
"with_vectors": false,
|
||||||
|
"filter": map[string]interface{}{
|
||||||
|
"must": []map[string]interface{}{
|
||||||
|
{"key": "source", "match": map[string]interface{}{"value": eurlexSource}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if offset != nil {
|
||||||
|
reqBody["offset"] = offset
|
||||||
|
}
|
||||||
|
jsonBody, err := json.Marshal(reqBody)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
||||||
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
if c.qdrantAPIKey != "" {
|
||||||
|
req.Header.Set("api-key", c.qdrantAPIKey)
|
||||||
|
}
|
||||||
|
resp, err := c.httpClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
return nil, nil, fmt.Errorf("qdrant returned %d: %s", resp.StatusCode, string(body))
|
||||||
|
}
|
||||||
|
var scrollResp qdrantScrollResponse
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
return scrollResp.Result.Points, scrollResp.Result.NextPageOffset, nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func structPoint(reg, name, scope, article string) qdrantScrollPoint {
|
||||||
|
return qdrantScrollPoint{Payload: map[string]interface{}{
|
||||||
|
"regulation_short": reg,
|
||||||
|
"regulation_name_de": name,
|
||||||
|
"chunk_scope": scope,
|
||||||
|
"article": article,
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAggregateStructure_CountsDistinctPerScope(t *testing.T) {
|
||||||
|
points := []qdrantScrollPoint{
|
||||||
|
structPoint("CRA", "Cyber Resilience Act", "section", "13"),
|
||||||
|
structPoint("CRA", "Cyber Resilience Act", "section", "13"), // duplicate article → still 1
|
||||||
|
structPoint("CRA", "Cyber Resilience Act", "section", "14"),
|
||||||
|
structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-I"),
|
||||||
|
structPoint("CRA", "Cyber Resilience Act", "annex", "Anhang-VII"),
|
||||||
|
structPoint("DORA", "", "section", "6"), // first sighting has no name →
|
||||||
|
structPoint("DORA", "", "section", "19"), // regulation_name falls back to short
|
||||||
|
structPoint("DORA", "", "recital", ""), // empty article → ignored for distinct
|
||||||
|
structPoint("", "x", "section", "1"), // missing regulation → skipped entirely
|
||||||
|
}
|
||||||
|
|
||||||
|
got := aggregateStructure(points)
|
||||||
|
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("want 2 acts, got %d (%+v)", len(got), got)
|
||||||
|
}
|
||||||
|
// CRA has more articles → sorts first.
|
||||||
|
cra := got[0]
|
||||||
|
if cra.RegulationShort != "CRA" || cra.Articles != 2 || cra.Annexes != 2 || cra.Recitals != 0 || cra.Chunks != 5 {
|
||||||
|
t.Errorf("CRA wrong: %+v", cra)
|
||||||
|
}
|
||||||
|
dora := got[1]
|
||||||
|
if dora.RegulationShort != "DORA" || dora.Articles != 2 || dora.Chunks != 3 {
|
||||||
|
t.Errorf("DORA wrong: %+v", dora)
|
||||||
|
}
|
||||||
|
if dora.RegulationName != "DORA" {
|
||||||
|
t.Errorf("DORA name fallback failed: %q", dora.RegulationName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAggregateStructure_Empty(t *testing.T) {
|
||||||
|
if got := aggregateStructure(nil); len(got) != 0 {
|
||||||
|
t.Errorf("want empty, got %+v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,134 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tuning constants for the assessment layer.
const (
	// Upper bound on the connected norms surfaced in the assessment.
	assessConnectedCap = 12

	// Size of the top-N window over which "cross regime" is judged.
	assessCrossRegimeTopN = 5

	// A winner gap tighter than this recommends human review.
	assessReviewMargin = 0.05
)
|
||||||
|
|
||||||
|
// Assess builds the auditable explanation layer over a ranked result set:
|
||||||
|
// primary norm, the norms it connects to (citation graph), cross-regime, a
|
||||||
|
// human-review flag, the winner margin and a short reasoning string. Pure →
|
||||||
|
// unit-testable. It EXPLAINS the ranking, it does not change it. Returns nil for
|
||||||
|
// an empty result set.
|
||||||
|
func Assess(results []LegalSearchResult) *LegalAssessment {
|
||||||
|
if len(results) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Norm-level view: collapse multiple chunks of the same article/annex so the
|
||||||
|
// margin and cross-regime are judged between DISTINCT norms, not near-identical
|
||||||
|
// chunks of one norm (which would make every winner margin ~0).
|
||||||
|
norms := distinctNorms(results)
|
||||||
|
p := norms[0]
|
||||||
|
|
||||||
|
primary := primaryLabel(p)
|
||||||
|
connected := dedupStrings(p.ReferencesOut, p.ReferencesIn, p.CitationUnit)
|
||||||
|
if len(connected) > assessConnectedCap {
|
||||||
|
connected = connected[:assessConnectedCap]
|
||||||
|
}
|
||||||
|
|
||||||
|
window := norms
|
||||||
|
if len(window) > assessCrossRegimeTopN {
|
||||||
|
window = window[:assessCrossRegimeTopN]
|
||||||
|
}
|
||||||
|
regimes := make(map[string]bool)
|
||||||
|
for _, r := range window {
|
||||||
|
if r.RegulationShort != "" {
|
||||||
|
regimes[r.RegulationShort] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
crossRegime := len(regimes) > 1
|
||||||
|
|
||||||
|
margin := 0.0
|
||||||
|
if len(norms) > 1 {
|
||||||
|
margin = norms[0].Score - norms[1].Score
|
||||||
|
}
|
||||||
|
|
||||||
|
primaryBinding := p.SourceClass == "binding_law"
|
||||||
|
humanReview := margin < assessReviewMargin || crossRegime || !primaryBinding
|
||||||
|
|
||||||
|
return &LegalAssessment{
|
||||||
|
PrimaryNorm: primary,
|
||||||
|
PrimaryRegulation: p.RegulationShort,
|
||||||
|
ConnectedNorms: connected,
|
||||||
|
CrossRegime: crossRegime,
|
||||||
|
HumanReviewFlag: humanReview,
|
||||||
|
WinnerMargin: margin,
|
||||||
|
ScoreReasoning: assessReasoning(p, margin, crossRegime, primaryBinding),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func primaryLabel(p LegalSearchResult) string {
|
||||||
|
if p.CitationUnit != "" {
|
||||||
|
return p.CitationUnit
|
||||||
|
}
|
||||||
|
if p.ArticleLabel != "" {
|
||||||
|
return p.ArticleLabel
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(p.RegulationShort + " " + p.Article)
|
||||||
|
}
|
||||||
|
|
||||||
|
// assessReasoning renders a short, human-readable justification (German).
|
||||||
|
func assessReasoning(p LegalSearchResult, margin float64, crossRegime, primaryBinding bool) string {
|
||||||
|
label := primaryLabel(p)
|
||||||
|
parts := make([]string, 0, 4)
|
||||||
|
if primaryBinding {
|
||||||
|
parts = append(parts, fmt.Sprintf("Primärtreffer %s: bindendes Recht (Autorität %d).", label, p.AuthorityWeight))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("Primärtreffer %s ist keine bindende Norm (Leitlinie/Standard) — Quelle prüfen.", label))
|
||||||
|
}
|
||||||
|
if margin > 0 {
|
||||||
|
parts = append(parts, fmt.Sprintf("Vorsprung %.2f vor #2.", margin))
|
||||||
|
}
|
||||||
|
if margin < assessReviewMargin {
|
||||||
|
parts = append(parts, "Knapper Vorsprung — Alternativtreffer prüfen.")
|
||||||
|
}
|
||||||
|
if crossRegime {
|
||||||
|
parts = append(parts, "Mehrere Regime betroffen — Querbezug prüfen.")
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// distinctNorms collapses results that share a citation (multiple chunks of the
|
||||||
|
// same article/annex) to the first — i.e. highest-ranked — occurrence. Results
|
||||||
|
// without any citation identity are each kept, since they cannot be matched.
|
||||||
|
func distinctNorms(results []LegalSearchResult) []LegalSearchResult {
|
||||||
|
seen := make(map[string]bool, len(results))
|
||||||
|
out := make([]LegalSearchResult, 0, len(results))
|
||||||
|
for _, r := range results {
|
||||||
|
key := r.CitationUnit
|
||||||
|
if key == "" {
|
||||||
|
key = r.ArticleLabel
|
||||||
|
}
|
||||||
|
if key != "" {
|
||||||
|
if seen[key] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = true
|
||||||
|
}
|
||||||
|
out = append(out, r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// dedupStrings merges out followed by in into one list without duplicates.
// Empty strings and the exclude value are dropped; the first occurrence of
// every other value is kept in encounter order. The result is never nil.
func dedupStrings(out, in []string, exclude string) []string {
	merged := make([]string, 0, len(out)+len(in))
	visited := make(map[string]struct{}, len(out)+len(in)+1)
	// Pre-marking the excluded value makes it look like an already seen entry.
	visited[exclude] = struct{}{}

	collect := func(candidates []string) {
		for _, candidate := range candidates {
			if candidate == "" {
				continue
			}
			if _, dup := visited[candidate]; dup {
				continue
			}
			visited[candidate] = struct{}{}
			merged = append(merged, candidate)
		}
	}
	collect(out)
	collect(in)
	return merged
}
|
||||||
@@ -0,0 +1,112 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func ares(reg, cu, sc string, score float64, weight int, out, in []string) LegalSearchResult {
|
||||||
|
return LegalSearchResult{
|
||||||
|
RegulationShort: reg, CitationUnit: cu, SourceClass: sc, Score: score,
|
||||||
|
AuthorityWeight: weight, ReferencesOut: out, ReferencesIn: in,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_Empty(t *testing.T) {
|
||||||
|
if Assess(nil) != nil {
|
||||||
|
t.Error("empty results → nil assessment")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_BindingPrimary_NoReview(t *testing.T) {
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.05, 100,
|
||||||
|
[]string{"CRA Anhang I", "Art. 14 CRA"}, []string{"Art. 12 CRA"}),
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.80, 100, nil, nil),
|
||||||
|
}
|
||||||
|
a := Assess(results)
|
||||||
|
if a == nil {
|
||||||
|
t.Fatal("nil assessment")
|
||||||
|
}
|
||||||
|
if a.PrimaryNorm != "Art. 13 CRA" || a.PrimaryRegulation != "CRA" {
|
||||||
|
t.Errorf("primary wrong: %+v", a)
|
||||||
|
}
|
||||||
|
if len(a.ConnectedNorms) != 3 { // out(2) + in(1), self excluded, deduped
|
||||||
|
t.Errorf("connected norms: %v", a.ConnectedNorms)
|
||||||
|
}
|
||||||
|
if a.CrossRegime {
|
||||||
|
t.Error("single regime must not be cross-regime")
|
||||||
|
}
|
||||||
|
if a.WinnerMargin < 0.24 || a.WinnerMargin > 0.26 {
|
||||||
|
t.Errorf("margin = %v, want ~0.25", a.WinnerMargin)
|
||||||
|
}
|
||||||
|
if a.HumanReviewFlag {
|
||||||
|
t.Error("clean binding + healthy margin + single regime → no review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_CrossRegimeFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.05, 100, nil, nil),
|
||||||
|
ares("DORA", "Art. 6 DORA", "binding_law", 0.70, 100, nil, nil),
|
||||||
|
})
|
||||||
|
if !a.CrossRegime || !a.HumanReviewFlag {
|
||||||
|
t.Errorf("cross-regime must flag review: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_NonBindingFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("ENISA", "ENISA SBOM", "supervisory_guidance", 0.90, 70, nil, nil),
|
||||||
|
ares("ENISA", "ENISA X", "supervisory_guidance", 0.40, 70, nil, nil),
|
||||||
|
})
|
||||||
|
if !a.HumanReviewFlag {
|
||||||
|
t.Error("non-binding primary → review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_TightMarginFlagsReview(t *testing.T) {
|
||||||
|
a := Assess([]LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.00, 100, nil, nil),
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.98, 100, nil, nil),
|
||||||
|
})
|
||||||
|
if a.WinnerMargin >= 0.05 || !a.HumanReviewFlag {
|
||||||
|
t.Errorf("tight margin → review: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAssess_MarginIsNormLevelNotChunkLevel(t *testing.T) {
|
||||||
|
// Two near-identical chunks of the SAME norm at the top, then a distinct norm.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.050, 100, []string{"CRA Anhang I"}, nil),
|
||||||
|
ares("CRA", "Art. 13 CRA", "binding_law", 1.049, 100, nil, nil), // same norm
|
||||||
|
ares("CRA", "Art. 14 CRA", "binding_law", 0.800, 100, nil, nil),
|
||||||
|
}
|
||||||
|
a := Assess(results)
|
||||||
|
if a.WinnerMargin < 0.24 || a.WinnerMargin > 0.26 { // Art.13 vs Art.14, not chunk vs chunk
|
||||||
|
t.Errorf("margin must be norm-level (~0.25), got %v", a.WinnerMargin)
|
||||||
|
}
|
||||||
|
if a.HumanReviewFlag {
|
||||||
|
t.Error("healthy norm-level margin → no review")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDistinctNorms(t *testing.T) {
|
||||||
|
got := distinctNorms([]LegalSearchResult{
|
||||||
|
{CitationUnit: "Art. 13 CRA"},
|
||||||
|
{CitationUnit: "Art. 13 CRA"}, // duplicate norm → collapsed
|
||||||
|
{CitationUnit: "Art. 14 CRA"},
|
||||||
|
{CitationUnit: ""}, // no identity → kept
|
||||||
|
{CitationUnit: ""}, // no identity → kept
|
||||||
|
})
|
||||||
|
if len(got) != 4 {
|
||||||
|
t.Errorf("want 4 (2 distinct + 2 unidentified), got %d", len(got))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDedupStrings(t *testing.T) {
|
||||||
|
got := dedupStrings([]string{"a", "b", "", "a"}, []string{"b", "c"}, "self")
|
||||||
|
if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" {
|
||||||
|
t.Errorf("dedup: %v", got)
|
||||||
|
}
|
||||||
|
if len(dedupStrings([]string{"self"}, nil, "self")) != 0 {
|
||||||
|
t.Error("excluded value must be dropped")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -20,6 +20,7 @@ type LegalRAGClient struct {
|
|||||||
httpClient *http.Client
|
httpClient *http.Client
|
||||||
textIndexEnsured map[string]bool
|
textIndexEnsured map[string]bool
|
||||||
hybridEnabled bool
|
hybridEnabled bool
|
||||||
|
graphEnabled bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
|
// NewLegalRAGClient creates a new Legal RAG client using Ollama bge-m3 embeddings.
|
||||||
@@ -38,6 +39,11 @@ func NewLegalRAGClient() *LegalRAGClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
hybridEnabled := os.Getenv("RAG_HYBRID_SEARCH") != "false"
|
hybridEnabled := os.Getenv("RAG_HYBRID_SEARCH") != "false"
|
||||||
|
// Graph-Expansion ist OPT-IN: kein gemessener Rang-Nutzen ggue. der Binding-Augmentation,
|
||||||
|
// +1 Qdrant-Call/Suche, Flutungsrisiko ueber Reverse-Kanten. Bleibt als Recall-Sicherheitsnetz
|
||||||
|
// fuer spaetere Luecken (RAG_GRAPH_EXPANSION=true). Die Graph-Kanten werden in der Response
|
||||||
|
// zur Begruendung/Vollstaendigkeit genutzt, nicht zur Pool-Expansion (Default).
|
||||||
|
graphEnabled := os.Getenv("RAG_GRAPH_EXPANSION") == "true"
|
||||||
|
|
||||||
return &LegalRAGClient{
|
return &LegalRAGClient{
|
||||||
qdrantURL: qdrantURL,
|
qdrantURL: qdrantURL,
|
||||||
@@ -47,6 +53,7 @@ func NewLegalRAGClient() *LegalRAGClient {
|
|||||||
collection: "bp_compliance_ce",
|
collection: "bp_compliance_ce",
|
||||||
textIndexEnsured: make(map[string]bool),
|
textIndexEnsured: make(map[string]bool),
|
||||||
hybridEnabled: hybridEnabled,
|
hybridEnabled: hybridEnabled,
|
||||||
|
graphEnabled: graphEnabled,
|
||||||
httpClient: &http.Client{
|
httpClient: &http.Client{
|
||||||
Timeout: 60 * time.Second,
|
Timeout: 60 * time.Second,
|
||||||
},
|
},
|
||||||
@@ -93,6 +100,29 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
hits = denseHits
|
hits = denseHits
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stratified: den binding_law-Pool ERGAENZEN (nicht ersetzen), damit die Pflichtquelle
|
||||||
|
// immer Kandidat ist — Guidance bleibt als Auslegungskontext erhalten. Best-effort:
|
||||||
|
// Fehler beim Binding-Query degradieren still auf den semantischen Pool.
|
||||||
|
if bindingHits, bErr := c.searchBinding(ctx, collection, embedding, topK); bErr == nil {
|
||||||
|
hits = mergeDedupHits(hits, bindingHits)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Control-Augmentation: bei expliziter Umsetzungsfrage einen tiefen dense-Pool ziehen und
|
||||||
|
// nur die Control-Pool-Rollen behalten — so werden NIST/CRA-Anhang (dense rank ~8-9, unter
|
||||||
|
// dem kleinen top-K) Kandidaten. Re-Rank/applyControlRoles ordnen sie danach.
|
||||||
|
if queryWantsControls(query) {
|
||||||
|
if controlHits, cErr := c.searchControls(ctx, collection, embedding); cErr == nil {
|
||||||
|
hits = mergeDedupHits(hits, controlHits)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Graph-Augmentation: verbundene Normen (references_out/in) der Top-Hits ueber die
|
||||||
|
// praezise Zitations-Kante in den Pool ziehen — z.B. Art. 13 CRA zieht Anhang I (die
|
||||||
|
// eigentliche Pflichtquelle). Pool-Augmentation only; Re-Rank + topK bleiben.
|
||||||
|
if c.graphEnabled {
|
||||||
|
hits = c.expandViaGraph(ctx, collection, hits)
|
||||||
|
}
|
||||||
|
|
||||||
results := make([]LegalSearchResult, len(hits))
|
results := make([]LegalSearchResult, len(hits))
|
||||||
for i, hit := range hits {
|
for i, hit := range hits {
|
||||||
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
||||||
@@ -121,12 +151,45 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
Pages: getIntSlice(hit.Payload, "pages"),
|
Pages: getIntSlice(hit.Payload, "pages"),
|
||||||
SourceURL: getString(hit.Payload, "source"),
|
SourceURL: getString(hit.Payload, "source"),
|
||||||
Score: hit.Score,
|
Score: hit.Score,
|
||||||
|
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
||||||
|
SourceClass: getString(hit.Payload, "source_class"),
|
||||||
|
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
||||||
|
CitationUnit: getString(hit.Payload, "citation_unit"),
|
||||||
|
ReferencesOut: getStringSlice(hit.Payload, "references_out"),
|
||||||
|
ReferencesIn: getStringSlice(hit.Payload, "references_in"),
|
||||||
|
Superseded: getString(hit.Payload, "status") == "superseded",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach
|
||||||
|
// oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only,
|
||||||
|
// Response-Schema unveraendert. Score traegt den Authority-Score, damit nachgelagerte
|
||||||
|
// Multi-Collection-Merges (Advisor) die Ordnung bewahren.
|
||||||
|
results = rerankByAuthority(query, results)
|
||||||
|
if topK > 0 && len(results) > topK {
|
||||||
|
results = results[:topK]
|
||||||
|
}
|
||||||
|
|
||||||
return results, nil
|
return results, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mergeDedupHits concatenates two hit lists, keeping the first occurrence of each point ID.
|
||||||
|
func mergeDedupHits(primary, extra []qdrantSearchHit) []qdrantSearchHit {
|
||||||
|
seen := make(map[string]bool, len(primary)+len(extra))
|
||||||
|
out := make([]qdrantSearchHit, 0, len(primary)+len(extra))
|
||||||
|
for _, list := range [][]qdrantSearchHit{primary, extra} {
|
||||||
|
for _, h := range list {
|
||||||
|
id := fmt.Sprint(h.ID)
|
||||||
|
if seen[id] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[id] = true
|
||||||
|
out = append(out, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
// FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt.
|
// FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt.
|
||||||
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
|
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
|
||||||
if lc == nil || len(lc.Results) == 0 {
|
if lc == nil || len(lc.Results) == 0 {
|
||||||
|
|||||||
@@ -0,0 +1,162 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Graph-augmented retrieval pulls the norms that a top hit cites
// (references_out) into the candidate pool through the citation graph rather
// than relying on semantic search to surface them. Example: a hit on CRA
// Art. 13 pulls in CRA Anhang I. It only augments the pool.

// graphSeedCount is how many of the leading hits seed the expansion.
const graphSeedCount = 5

// graphMaxExpand caps the number of connected norms pulled in, to keep the
// pool from exploding.
const graphMaxExpand = 15

// graphHopPenalty is subtracted from the seeding score so a one-hop
// neighbour scores just below its seed.
const graphHopPenalty = 0.05
|
||||||
|
|
||||||
|
// expandViaGraph augments hits with the norms they cite and the norms that cite
|
||||||
|
// them. Best-effort: on any error (or nothing to expand) the original hits are
|
||||||
|
// returned unchanged.
|
||||||
|
func (c *LegalRAGClient) expandViaGraph(ctx context.Context, collection string, hits []qdrantSearchHit) []qdrantSearchHit {
|
||||||
|
if len(hits) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
present := make(map[string]bool, len(hits))
|
||||||
|
for _, h := range hits {
|
||||||
|
if cu := getString(h.Payload, "citation_unit"); cu != "" {
|
||||||
|
present[cu] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
seeds := hits
|
||||||
|
if len(seeds) > graphSeedCount {
|
||||||
|
seeds = seeds[:graphSeedCount]
|
||||||
|
}
|
||||||
|
// Forward edges only (references_out = the detail a hit explicitly points to,
|
||||||
|
// e.g. Art. 13 → Anhang I). Reverse (references_in) has high fan-out for popular
|
||||||
|
// annexes (Anhang I is cited by 23 articles) → pool flooding; it is surfaced as
|
||||||
|
// connected-norm metadata in the Phase 2 response instead of expanding the pool.
|
||||||
|
want := make(map[string]float64) // connected citation_unit -> best seeding score
|
||||||
|
for _, h := range seeds {
|
||||||
|
for _, cu := range getStringSlice(h.Payload, "references_out") {
|
||||||
|
if cu == "" || present[cu] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s, ok := want[cu]; !ok || h.Score > s {
|
||||||
|
want[cu] = h.Score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(want) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
|
||||||
|
units := topByScore(want, graphMaxExpand)
|
||||||
|
fetched, err := c.fetchByCitationUnits(ctx, collection, units)
|
||||||
|
if err != nil || len(fetched) == 0 {
|
||||||
|
return hits
|
||||||
|
}
|
||||||
|
neighbours := make([]qdrantSearchHit, 0, len(fetched))
|
||||||
|
for cu, pt := range fetched {
|
||||||
|
neighbours = append(neighbours, qdrantSearchHit{ID: pt.ID, Score: want[cu] - graphHopPenalty, Payload: pt.Payload})
|
||||||
|
}
|
||||||
|
return mergeDedupHits(hits, neighbours)
|
||||||
|
}
|
||||||
|
|
||||||
|
// topByScore returns up to n keys of m, ordered by descending value. Ties are
// broken by ascending key string so the cut-off is stable across runs.
// A non-positive n yields an empty, non-nil slice instead of panicking on
// the slice expression.
func topByScore(m map[string]float64, n int) []string {
	if n <= 0 {
		return []string{}
	}
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		if m[keys[i]] != m[keys[j]] {
			return m[keys[i]] > m[keys[j]]
		}
		return keys[i] < keys[j]
	})
	if len(keys) > n {
		keys = keys[:n]
	}
	return keys
}
|
||||||
|
|
||||||
|
// fetchByCitationUnits loads one representative point (the first chunk) per
|
||||||
|
// citation_unit from the given collection.
|
||||||
|
func (c *LegalRAGClient) fetchByCitationUnits(ctx context.Context, collection string, units []string) (map[string]qdrantScrollPoint, error) {
|
||||||
|
should := make([]map[string]interface{}, 0, len(units))
|
||||||
|
for _, cu := range units {
|
||||||
|
should = append(should, map[string]interface{}{"key": "citation_unit", "match": map[string]interface{}{"value": cu}})
|
||||||
|
}
|
||||||
|
reqBody := map[string]interface{}{
|
||||||
|
"limit": len(units) * 4,
|
||||||
|
"with_payload": true,
|
||||||
|
"with_vectors": false,
|
||||||
|
"filter": map[string]interface{}{"should": should},
|
||||||
|
}
|
||||||
|
jsonBody, err := json.Marshal(reqBody)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
||||||
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
if c.qdrantAPIKey != "" {
|
||||||
|
req.Header.Set("api-key", c.qdrantAPIKey)
|
||||||
|
}
|
||||||
|
resp, err := c.httpClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
return nil, fmt.Errorf("qdrant scroll returned %d: %s", resp.StatusCode, string(body))
|
||||||
|
}
|
||||||
|
var scrollResp qdrantScrollResponse
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out := make(map[string]qdrantScrollPoint, len(units))
|
||||||
|
for _, pt := range scrollResp.Result.Points {
|
||||||
|
cu := getString(pt.Payload, "citation_unit")
|
||||||
|
if cu != "" {
|
||||||
|
if _, seen := out[cu]; !seen {
|
||||||
|
out[cu] = pt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getStringSlice extracts a []string from a payload list field.
//
// A JSON-decoded list ([]interface{}) is accepted, with non-string elements
// skipped. A []string value, as found in payloads built in-process rather
// than decoded from JSON, is accepted too and returned as a copy. A missing
// key or any other value type yields nil.
func getStringSlice(m map[string]interface{}, key string) []string {
	switch list := m[key].(type) {
	case []interface{}:
		out := make([]string, 0, len(list))
		for _, item := range list {
			if s, ok := item.(string); ok {
				out = append(out, s)
			}
		}
		return out
	case []string:
		// Copy so callers cannot alias the payload's backing array.
		return append(make([]string, 0, len(list)), list...)
	default:
		return nil
	}
}
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetStringSlice(t *testing.T) {
|
||||||
|
m := map[string]interface{}{
|
||||||
|
"refs": []interface{}{"a", "b", 3, "c"}, // non-strings are skipped
|
||||||
|
"str": "not-a-list",
|
||||||
|
}
|
||||||
|
got := getStringSlice(m, "refs")
|
||||||
|
if len(got) != 3 || got[0] != "a" || got[2] != "c" {
|
||||||
|
t.Errorf("refs: %v", got)
|
||||||
|
}
|
||||||
|
if getStringSlice(m, "missing") != nil {
|
||||||
|
t.Error("missing key should be nil")
|
||||||
|
}
|
||||||
|
if getStringSlice(m, "str") != nil {
|
||||||
|
t.Error("non-list should be nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTopByScore_DeterministicCap(t *testing.T) {
|
||||||
|
m := map[string]float64{"x": 0.5, "y": 0.9, "z": 0.5, "w": 0.7}
|
||||||
|
got := topByScore(m, 2)
|
||||||
|
if len(got) != 2 || got[0] != "y" || got[1] != "w" {
|
||||||
|
t.Errorf("want [y w], got %v", got)
|
||||||
|
}
|
||||||
|
all := topByScore(m, 10)
|
||||||
|
if all[2] != "x" || all[3] != "z" { // tie 0.5 broken by key string
|
||||||
|
t.Errorf("tie-break not deterministic: %v", all)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandViaGraph_NoSeedsOrRefs(t *testing.T) {
|
||||||
|
c := &LegalRAGClient{} // nil httpClient → must not be called on these paths
|
||||||
|
if out := c.expandViaGraph(context.Background(), "x", nil); out != nil {
|
||||||
|
t.Error("empty hits should return nil")
|
||||||
|
}
|
||||||
|
hits := []qdrantSearchHit{{ID: 1, Score: 0.8, Payload: map[string]interface{}{"citation_unit": "Art. 1 CRA"}}}
|
||||||
|
if out := c.expandViaGraph(context.Background(), "x", hits); len(out) != 1 {
|
||||||
|
t.Errorf("no references → unchanged, got %d", len(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandViaGraph_PullsConnectedNorm(t *testing.T) {
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
|
"result": map[string]interface{}{
|
||||||
|
"points": []map[string]interface{}{
|
||||||
|
{"id": 99, "payload": map[string]interface{}{
|
||||||
|
"citation_unit": "CRA Anhang I", "chunk_text": "Sicherheitsanforderungen",
|
||||||
|
"source_class": "binding_law", "authority_weight": 100, "regulation_short": "CRA",
|
||||||
|
}},
|
||||||
|
},
|
||||||
|
"next_page_offset": nil,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
c := &LegalRAGClient{qdrantURL: srv.URL, httpClient: srv.Client()}
|
||||||
|
hits := []qdrantSearchHit{
|
||||||
|
{ID: 1, Score: 0.70, Payload: map[string]interface{}{
|
||||||
|
"citation_unit": "Art. 13 CRA", "references_out": []interface{}{"CRA Anhang I"},
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
out := c.expandViaGraph(context.Background(), "bp_compliance_ce", hits)
|
||||||
|
if len(out) != 2 {
|
||||||
|
t.Fatalf("want 2 hits (seed + connected annex), got %d", len(out))
|
||||||
|
}
|
||||||
|
var found *qdrantSearchHit
|
||||||
|
for i := range out {
|
||||||
|
if getString(out[i].Payload, "citation_unit") == "CRA Anhang I" {
|
||||||
|
found = &out[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if found == nil {
|
||||||
|
t.Fatal("connected norm CRA Anhang I was not pulled into the pool")
|
||||||
|
}
|
||||||
|
if found.Score < 0.64 || found.Score > 0.66 { // 0.70 seed − 0.05 hop penalty
|
||||||
|
t.Errorf("connected score = %v, want ~0.65", found.Score)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -185,6 +185,55 @@ func (c *LegalRAGClient) searchDense(ctx context.Context, collection string, emb
|
|||||||
searchReq.Filter = &qdrantFilter{Should: conditions}
|
searchReq.Filter = &qdrantFilter{Should: conditions}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return c.doPointsSearch(ctx, collection, searchReq)
|
||||||
|
}
|
||||||
|
|
||||||
|
// searchBinding fetches the top binding_law hits (authority-stratified pool) so the
|
||||||
|
// obligation source is always a candidate even when guidance dominates semantically.
|
||||||
|
// It AUGMENTS the semantic pool — guidance is preserved as interpretation context.
|
||||||
|
func (c *LegalRAGClient) searchBinding(ctx context.Context, collection string, embedding []float64, topK int) ([]qdrantSearchHit, error) {
|
||||||
|
searchReq := qdrantSearchRequest{
|
||||||
|
Vector: embedding,
|
||||||
|
Limit: topK,
|
||||||
|
WithPayload: true,
|
||||||
|
Filter: &qdrantFilter{Must: []qdrantCondition{
|
||||||
|
{Key: "source_class", Match: qdrantMatch{Value: "binding_law"}},
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
|
||||||
|
return c.doPointsSearch(ctx, collection, searchReq)
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
	// controlPoolDepth is the number of dense hits requested by the control
	// pull. Per the authors' measurement, the relevant control sources for an
	// EU-cyber control query sit at dense rank ~8-9 (NIST, CRA Annex), below
	// the client's small top-K; a fixed depth of 60 is meant to surface them
	// reliably.
	controlPoolDepth = 60
)
|
||||||
|
|
||||||
|
// searchControls fetches a DEEP dense pool and keeps only the control-pool roles, so control
|
||||||
|
// sources that the small top-K (hybrid) search misses become candidates on an implementation
|
||||||
|
// question. Role is derived in code (no source_role tag needed). AUGMENTS the pool — the
|
||||||
|
// caller gates it on control-intent.
|
||||||
|
func (c *LegalRAGClient) searchControls(ctx context.Context, collection string, embedding []float64) ([]qdrantSearchHit, error) {
|
||||||
|
searchReq := qdrantSearchRequest{
|
||||||
|
Vector: embedding,
|
||||||
|
Limit: controlPoolDepth,
|
||||||
|
WithPayload: true,
|
||||||
|
}
|
||||||
|
hits, err := c.doPointsSearch(ctx, collection, searchReq)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
kept := make([]qdrantSearchHit, 0, len(hits))
|
||||||
|
for _, h := range hits {
|
||||||
|
if isControlPoolRole(controlRoleOf(h.Payload)) {
|
||||||
|
kept = append(kept, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return kept, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// doPointsSearch issues a POST /points/search and decodes the hits.
|
||||||
|
func (c *LegalRAGClient) doPointsSearch(ctx context.Context, collection string, searchReq qdrantSearchRequest) ([]qdrantSearchHit, error) {
|
||||||
jsonBody, err := json.Marshal(searchReq)
|
jsonBody, err := json.Marshal(searchReq)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to marshal search request: %w", err)
|
return nil, fmt.Errorf("failed to marshal search request: %w", err)
|
||||||
|
|||||||
@@ -0,0 +1,135 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func intentRes(reg, sourceClass string, sem float64, weight int) LegalSearchResult {
|
||||||
|
return LegalSearchResult{
|
||||||
|
RegulationShort: reg, SourceClass: sourceClass, Score: sem,
|
||||||
|
AuthorityWeight: weight, Jurisdiction: "EU",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQueryWantsGuidance(t *testing.T) {
|
||||||
|
wants := []string{
|
||||||
|
"Was empfiehlt der EDPB zum DSB?",
|
||||||
|
"Was sagt die ENISA zu Security Updates?",
|
||||||
|
"laut DSK ...",
|
||||||
|
"Orientierungshilfe zur DSFA",
|
||||||
|
"Welche BSI-Empfehlung gilt?",
|
||||||
|
"Auslegung der Aufsichtsbehörde",
|
||||||
|
}
|
||||||
|
plain := []string{
|
||||||
|
"Ab wann braucht man einen Datenschutzbeauftragten?",
|
||||||
|
"Welche Anforderungen bestehen an Security Updates?",
|
||||||
|
}
|
||||||
|
for _, q := range wants {
|
||||||
|
if !queryWantsGuidance(q) {
|
||||||
|
t.Errorf("should detect interpretation intent: %q", q)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, q := range plain {
|
||||||
|
if queryWantsGuidance(q) {
|
||||||
|
t.Errorf("should NOT detect intent (norm question): %q", q)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_NormQuestion_BindingStaysTop(t *testing.T) {
|
||||||
|
// No intent signal → binding wins even though guidance is semantically higher.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
intentRes("EDPB DPO", "supervisory_guidance", 0.64, 70),
|
||||||
|
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Ab wann braucht man einen Datenschutzbeauftragten?", results)
|
||||||
|
if out[0].SourceClass != "binding_law" {
|
||||||
|
t.Errorf("norm question: binding must stay Top-1, got %s", out[0].SourceClass)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_InterpretationQuestion_GuidanceMayWin(t *testing.T) {
|
||||||
|
// Explicit intent + guidance semantically competitive → guidance wins.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
intentRes("EDPB DPO", "supervisory_guidance", 0.64, 70),
|
||||||
|
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Was empfiehlt der EDPB zum Datenschutzbeauftragten?", results)
|
||||||
|
if out[0].SourceClass != "supervisory_guidance" {
|
||||||
|
t.Errorf("interpretation question: guidance should win Top-1, got %s", out[0].SourceClass)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_OffTopicGuidance_BlockedByGuard(t *testing.T) {
|
||||||
|
// Intent present, but guidance semantic is far below the best binding hit →
|
||||||
|
// the margin guard keeps binding on top (no off-topic guideline override).
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
intentRes("EDPB DPO", "supervisory_guidance", 0.40, 70),
|
||||||
|
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Was empfiehlt der EDPB zum Datenschutzbeauftragten?", results)
|
||||||
|
if out[0].SourceClass != "binding_law" {
|
||||||
|
t.Errorf("off-topic guidance must not win even with intent, got %s", out[0].SourceClass)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQueryWantsControls(t *testing.T) {
|
||||||
|
wants := []string{
|
||||||
|
"Welche Controls passen zu Security Updates?",
|
||||||
|
"Welche Maßnahmen sollten wir umsetzen?",
|
||||||
|
"Wie härten wir den Server ab?",
|
||||||
|
"Gibt es NIST-Controls dafür?",
|
||||||
|
"OWASP Best Practice für Logging?",
|
||||||
|
"BSI Grundschutz Bausteine",
|
||||||
|
}
|
||||||
|
plain := []string{
|
||||||
|
"Welche Anforderungen bestehen an Security Updates?",
|
||||||
|
"Ab wann braucht man einen Datenschutzbeauftragten?",
|
||||||
|
}
|
||||||
|
for _, q := range wants {
|
||||||
|
if !queryWantsControls(q) {
|
||||||
|
t.Errorf("should detect control/implementation intent: %q", q)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, q := range plain {
|
||||||
|
if queryWantsControls(q) {
|
||||||
|
t.Errorf("should NOT detect control intent (norm question): %q", q)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_ControlQuestion_OperationalReqTop(t *testing.T) {
|
||||||
|
// User priority for implementation questions: operational_requirement (binding concrete,
|
||||||
|
// CRA Anhang I) > control_standard (NIST). Both are in the control-pool; op_req wins.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8", SourceClass: "technical_standard", AuthorityWeight: 80, Jurisdiction: "EU", Score: 0.60},
|
||||||
|
{RegulationShort: "CRA", ArticleLabel: "CRA Anhang I", Category: "regulation", Score: 0.58},
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Welche Controls und Massnahmen passen zu Security Updates?", results)
|
||||||
|
if out[0].RegulationShort != "CRA" {
|
||||||
|
t.Errorf("operational_requirement (CRA Anhang I) should be Top-1 over control_standard, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_NormQuestion_BindingOverStandard(t *testing.T) {
|
||||||
|
// "Anforderungen" → no control intent → binding obligation stays Top-1 over the standard.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
intentRes("NIST SP 800-82", "technical_standard", 0.62, 80),
|
||||||
|
intentRes("CRA", "binding_law", 0.58, 100),
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Welche Anforderungen bestehen an Security Updates?", results)
|
||||||
|
if out[0].SourceClass != "binding_law" {
|
||||||
|
t.Errorf("norm question: binding must stay Top-1 over standard, got %s", out[0].SourceClass)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRerank_ControlQuestion_PoolBeatsBareObligation(t *testing.T) {
|
||||||
|
// A control-pool source (NIST control_standard) outranks an abstract obligation with no
|
||||||
|
// domain/topic advantage, because the implementation intent boosts the control-pool.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8", SourceClass: "technical_standard", AuthorityWeight: 80, Jurisdiction: "EU", Score: 0.55},
|
||||||
|
{RegulationShort: "XYZ", ArticleLabel: "Art. 5 XYZ", Category: "regulation", Score: 0.58},
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Welche Controls und Massnahmen passen zu Security Updates?", results)
|
||||||
|
if out[0].RegulationShort != "NIST SP 800-82r3" {
|
||||||
|
t.Errorf("control_standard should beat a bare abstract obligation on a control question, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -225,6 +225,18 @@ func getIntSlice(m map[string]interface{}, key string) []int {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getInt(m map[string]interface{}, key string) int {
|
||||||
|
if v, ok := m[key]; ok {
|
||||||
|
switch n := v.(type) {
|
||||||
|
case float64:
|
||||||
|
return int(n)
|
||||||
|
case int:
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func contains(slice []string, item string) bool {
|
func contains(slice []string, item string) bool {
|
||||||
for _, s := range slice {
|
for _, s := range slice {
|
||||||
if s == item {
|
if s == item {
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
// A superseded alt-source must rank below the same result when it is NOT
|
||||||
|
// superseded (the eu-v1 norm), but only demoted — the penalty is finite, so it
|
||||||
|
// stays in the pool and remains findable for history/transition questions.
|
||||||
|
func TestAuthorityScore_SupersededIsDemotedNotRemoved(t *testing.T) {
|
||||||
|
fresh := LegalSearchResult{
|
||||||
|
Score: 0.65, SourceClass: "binding_law", AuthorityWeight: 100,
|
||||||
|
Jurisdiction: "EU", RegulationShort: "CRA", Article: "13",
|
||||||
|
}
|
||||||
|
old := fresh
|
||||||
|
old.Superseded = true
|
||||||
|
|
||||||
|
sFresh := authorityScore("CRA Sicherheitsupdates Hersteller", fresh, "", false)
|
||||||
|
sOld := authorityScore("CRA Sicherheitsupdates Hersteller", old, "", false)
|
||||||
|
|
||||||
|
if sOld >= sFresh {
|
||||||
|
t.Errorf("superseded must score lower: fresh=%.3f superseded=%.3f", sFresh, sOld)
|
||||||
|
}
|
||||||
|
gap := sFresh - sOld
|
||||||
|
if gap < supersededPenalty-0.001 || gap > supersededPenalty+0.001 {
|
||||||
|
t.Errorf("demotion should equal supersededPenalty (%.2f), got %.3f", supersededPenalty, gap)
|
||||||
|
}
|
||||||
|
// Still a positive, finite score → present in the pool, not hidden.
|
||||||
|
if sOld <= -1 {
|
||||||
|
t.Errorf("superseded score collapsed (%.3f) — must remain findable", sOld)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -399,8 +399,9 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: should not reach dense search
|
// /points/search is now the stratified binding-law augmentation query (it AUGMENTS
|
||||||
t.Error("Unexpected dense search call when hybrid succeeded")
|
// the hybrid pool, it is not a dense fallback). Return empty so the hybrid hit
|
||||||
|
// remains the sole result for this test.
|
||||||
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{}})
|
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{}})
|
||||||
}))
|
}))
|
||||||
defer qdrantMock.Close()
|
defer qdrantMock.Close()
|
||||||
@@ -446,6 +447,59 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestSearch_StratifiedBindingRerank verifies that the binding-law pool augments the
|
||||||
|
// semantic pool and that authority re-ranking lifts binding law above higher-semantic guidance.
|
||||||
|
func TestSearch_StratifiedBindingRerank(t *testing.T) {
|
||||||
|
ollamaMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
json.NewEncoder(w).Encode(ollamaEmbeddingResponse{Embedding: make([]float64, 1024)})
|
||||||
|
}))
|
||||||
|
defer ollamaMock.Close()
|
||||||
|
|
||||||
|
qdrantMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if strings.Contains(r.URL.Path, "/index") {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
w.Write([]byte(`{"result":{"status":"completed"}}`))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.Contains(r.URL.Path, "/points/query") {
|
||||||
|
json.NewEncoder(w).Encode(qdrantQueryResponse{Result: []qdrantSearchHit{
|
||||||
|
{ID: "g1", Score: 0.72, Payload: map[string]interface{}{
|
||||||
|
"chunk_text": "ENISA guidance", "regulation_short": "ENISA",
|
||||||
|
"article_label": "ENISA CRA Mapping", "source_class": "supervisory_guidance",
|
||||||
|
"authority_weight": float64(70), "jurisdiction": "EU",
|
||||||
|
}},
|
||||||
|
}})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// /points/search = stratified binding-law pool (source_class=binding_law)
|
||||||
|
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{
|
||||||
|
{ID: "b1", Score: 0.66, Payload: map[string]interface{}{
|
||||||
|
"chunk_text": "CRA Anhang I requirement", "regulation_short": "CRA",
|
||||||
|
"article_label": "CRA Anhang I", "source_class": "binding_law",
|
||||||
|
"authority_weight": float64(100), "jurisdiction": "EU",
|
||||||
|
}},
|
||||||
|
}})
|
||||||
|
}))
|
||||||
|
defer qdrantMock.Close()
|
||||||
|
|
||||||
|
client := &LegalRAGClient{
|
||||||
|
qdrantURL: qdrantMock.URL, ollamaURL: ollamaMock.URL, embeddingModel: "bge-m3",
|
||||||
|
collection: "bp_compliance_ce", textIndexEnsured: make(map[string]bool),
|
||||||
|
hybridEnabled: true, httpClient: http.DefaultClient,
|
||||||
|
}
|
||||||
|
|
||||||
|
results, err := client.Search(context.Background(), "Was gilt hier?", nil, 5)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("search failed: %v", err)
|
||||||
|
}
|
||||||
|
if len(results) != 2 {
|
||||||
|
t.Fatalf("expected 2 merged results (guidance + binding), got %d", len(results))
|
||||||
|
}
|
||||||
|
if results[0].RegulationShort != "CRA" {
|
||||||
|
t.Errorf("binding CRA must rank first over higher-semantic guidance, got %q", results[0].RegulationShort)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHybridSearch_FallbackToDense(t *testing.T) {
|
func TestHybridSearch_FallbackToDense(t *testing.T) {
|
||||||
var requestedPaths []string
|
var requestedPaths []string
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,38 @@ type LegalSearchResult struct {
|
|||||||
Pages []int `json:"pages,omitempty"`
|
Pages []int `json:"pages,omitempty"`
|
||||||
SourceURL string `json:"source_url"`
|
SourceURL string `json:"source_url"`
|
||||||
Score float64 `json:"score"`
|
Score float64 `json:"score"`
|
||||||
|
|
||||||
|
// Interne Felder fuer das Authority-Re-Ranking (Phase 1) — NICHT serialisiert
|
||||||
|
// (json:"-"), daher kein Contract-Change. Aus dem Qdrant-Payload befuellt und nur
|
||||||
|
// fuer die Sortierung in rerankByAuthority verwendet.
|
||||||
|
AuthorityWeight int `json:"-"`
|
||||||
|
SourceClass string `json:"-"`
|
||||||
|
Jurisdiction string `json:"-"`
|
||||||
|
|
||||||
|
// Zitations-Graph (Phase 2) — intern, speist nur die Assessment-Berechnung
|
||||||
|
// (verbundene Normen, Begruendung). Pro-Result-Schema bleibt eingefroren.
|
||||||
|
CitationUnit string `json:"-"`
|
||||||
|
ReferencesOut []string `json:"-"`
|
||||||
|
ReferencesIn []string `json:"-"`
|
||||||
|
|
||||||
|
// Supersede-Status (status="superseded", use_for_primary=false) — Alt-Quelle,
|
||||||
|
// die fuer Default-Fragen demoted wird (nicht versteckt; fuer Historie auffindbar).
|
||||||
|
Superseded bool `json:"-"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// LegalAssessment is the auditable explanation layer over a ranked result set:
|
||||||
|
// which norm is primary, which norms connect to it via the citation graph,
|
||||||
|
// whether the answer crosses regulatory regimes, and whether a human should
|
||||||
|
// review. Computed from the already-ranked results — it EXPLAINS retrieval, it
|
||||||
|
// does not change it (graph edges for reasoning/completeness, not pool-expansion).
|
||||||
|
type LegalAssessment struct {
|
||||||
|
PrimaryNorm string `json:"primary_norm"`
|
||||||
|
PrimaryRegulation string `json:"primary_regulation"`
|
||||||
|
ConnectedNorms []string `json:"connected_norms"`
|
||||||
|
CrossRegime bool `json:"cross_regime"`
|
||||||
|
HumanReviewFlag bool `json:"human_review_flag"`
|
||||||
|
WinnerMargin float64 `json:"winner_margin"`
|
||||||
|
ScoreReasoning string `json:"score_reasoning"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// LegalContext represents aggregated legal context for an assessment.
|
// LegalContext represents aggregated legal context for an assessment.
|
||||||
|
|||||||
Reference in New Issue
Block a user