fix(ucca): guidance-intent erkennt direkt benannte Guidance-Dokumente
CI / detect-changes (pull_request) Successful in 7s
CI / branch-name (pull_request) Successful in 1s
CI / guardrail-integrity (pull_request) Successful in 5s
CI / secret-scan (pull_request) Successful in 9s
CI / dep-audit (pull_request) Failing after 56s
CI / sbom-scan (pull_request) Failing after 58s
CI / build-sha-integrity (pull_request) Successful in 7s
CI / validate-canonical-controls (pull_request) Successful in 7s
CI / loc-budget (pull_request) Successful in 21s
CI / go-lint (pull_request) Successful in 48s
CI / python-lint (pull_request) Failing after 17s
CI / nodejs-lint (pull_request) Failing after 1m9s
CI / nodejs-build (pull_request) Successful in 3m2s
CI / test-go (pull_request) Successful in 1m3s
CI / iace-gt-coverage (pull_request) Successful in 18s
CI / test-python-backend (pull_request) Successful in 28s
CI / test-python-document-crawler (pull_request) Successful in 14s
CI / test-python-dsms-gateway (pull_request) Successful in 11s

queryWantsGuidance verfehlte rein dokument-namige Fragen ("Welche Kriterien
nennt WP248 ...", "Was sagt GL 07/2020 ..."): guidanceIntentSignals enthielt
zwar Herausgeber (edpb/dsk/enisa) und Verben (empfiehlt/laut), aber keine
Working-Paper-/Guideline-Identifier. Dadurch loeste der Authority-Lift nicht
aus -> binding_law (bzw. im homogenen Korpus sogar off-domain MaschVO/CRA)
verdraengte die Guidance aus den Top-K.

Fix: WP2xx / GL 0x / "working paper" als Guidance-Signal ergaenzt. Generisch
ueber alle WP-/GL-Dokumente, KEINE doc-spezifische Regel (Query->Intent, nicht
Query->konkretes Dokument).

Validierung (homogener Build-Korpus, bge-m3 + Qdrant Cosine):
- 10 Hard Cases: 8/10 -> 10/10 (WP248/WP260 zurueck in Top-8)
- ComplianceBench-100: 0/100 Norm-Fragen veraendert (Freeze-Regression gruen),
  18/18 Guidance-Intent-Fragen verbessert (binding -> korrekte Guidance-Klasse)
- Hybrid == Dense: beide Suchmodi liefern dieselben Ergebnisse (Keyword-RRF war NICHT die Ursache, das Lift-Gate war es)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-29 14:44:53 +02:00
parent e54f3cde94
commit 4818fc51c2
2 changed files with 109 additions and 0 deletions
@@ -28,6 +28,10 @@ var guidanceIntentSignals = []string{
"edpb", "europäischer datenschutzausschuss", "europaeischer datenschutzausschuss", "edpb", "europäischer datenschutzausschuss", "europaeischer datenschutzausschuss",
"dsk", "enisa", "bsi", "leitlinie", "guideline", "orientierungshilfe", "dsk", "enisa", "bsi", "leitlinie", "guideline", "orientierungshilfe",
"auslegung", "empfiehlt", "empfehlung", "sagt", "laut", "auslegung", "empfiehlt", "empfehlung", "sagt", "laut",
// Guidance-Dokumente direkt benannt (WP29-Working-Papers WP2xx + EDPB-Guidelines "GL 0x/20xx"):
// "Welche Kriterien nennt WP248 ..." / "Was sagt GL 07/2020 ..." tragen Guidance-Intent ohne
// die Verben oben. Fix: queryWantsGuidance verfehlte rein-doc-namige Formulierungen.
"wp2", "wp 2", "wp29", "working paper", "gl 0",
} }
// controlIntentSignals mark a query that asks HOW to implement / which controls or
@@ -0,0 +1,105 @@
package ucca
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"testing"
)
// TestGuidanceFixE2E runs the 10 hard cases through the real LegalRAGClient against the
// homogeneous build collection and checks that the expected document shows up in the
// returned hits.
//
// The test is skipped unless RUN_E2E=1. The collection defaults to
// bp_compliance_kb_2026_1_build and can be overridden via E2E_COLLECTION. For every case one
// report line is printed to stdout (rank of the expected document, OK/FAIL, top-1 hit), so
// runs with different RAG_HYBRID_SEARCH settings can be compared side by side.
//
// A case whose expected code does not appear in any returned hit marks the test as failed.
// The remaining cases still run, so the printed report is always complete. A search error
// aborts the test immediately.
func TestGuidanceFixE2E(t *testing.T) {
	if os.Getenv("RUN_E2E") != "1" {
		t.Skip("set RUN_E2E=1 + QDRANT_URL/OLLAMA_URL to run")
	}

	// topK is the result count passed to SearchCollection; the "rank_in_top8" label in the
	// report line below refers to this value.
	const topK = 8

	c := NewLegalRAGClient()
	coll := os.Getenv("E2E_COLLECTION")
	if coll == "" {
		coll = "bp_compliance_kb_2026_1_build"
	}

	// expect is matched as an upper-case substring of "<RegulationCode> <ArticleLabel>".
	cases := []struct{ id, q, expect string }{
		{"GQ-0012", "Welche neun Kriterien nennt WP248 fuer ein voraussichtlich hohes Risiko?", "WP248"},
		{"GQ-0013", "Ab wie vielen der WP248-Kriterien ist in der Regel eine Datenschutz-Folgenabschaetzung erforderlich?", "WP248"},
		{"GQ-0023", "Welche Anforderungen stellt WP260 an eine klare und einfache Sprache?", "WP260"},
		{"GQ-0024", "Was versteht WP260 unter Layered Privacy Notices?", "WP260"},
		{"GQ-0054", "Welche grundlegenden Cybersecurity-Anforderungen enthaelt Annex I Part I?", "CRA"},
		{"GQ-0060", "Wann muss eine aktiv ausgenutzte Schwachstelle gemeldet werden?", "CRA"},
		{"GQ-0074", "Benoetigt eine SPS ohne Netzwerkanschluss eine CRA-Bewertung?", "CRA"},
		{"GQ-0079", "Welche grundlegenden Sicherheits- und Gesundheitsschutzanforderungen enthaelt Anhang III?", "MASCHVO"},
		{"GQ-0091", "Welche Anforderungen gelten fuer wesentliche Veraenderungen einer Maschine?", "MASCHVO"},
		{"GQ-0070", "Wie greifen CRA und Maschinenverordnung bei einer vernetzten Maschine ineinander?", "CRA"},
	}

	fmt.Printf("\n### hybrid=%v collection=%s\n", os.Getenv("RAG_HYBRID_SEARCH") != "false", coll)
	for _, tc := range cases {
		res, err := c.SearchCollection(context.Background(), coll, tc.q, nil, topK)
		if err != nil {
			t.Fatalf("%s: %v", tc.id, err)
		}

		// 1-based position of the first hit carrying the expected code; -1 when absent.
		rank := -1
		for i, r := range res {
			lab := strings.ToUpper(r.RegulationCode + " " + r.ArticleLabel)
			if strings.Contains(lab, tc.expect) {
				rank = i + 1
				break
			}
		}

		top1 := ""
		if len(res) > 0 {
			top1 = res[0].RegulationCode + " (" + res[0].SourceClass + ")"
		}

		status := "FAIL"
		if rank > 0 {
			status = "OK"
		}
		fmt.Printf("%-9s expect=%-8s rank_in_top8=%-2d %-5s top1=%s\n", tc.id, tc.expect, rank, status, top1)

		// Previously a miss was only printed and the test still passed. Errorf (not Fatalf)
		// records the failure while letting the remaining cases be reported.
		if rank < 0 {
			t.Errorf("%s: expected %q within top-%d results, top1=%q (query %q)", tc.id, tc.expect, topK, top1, tc.q)
		}
	}
}
// TestBenchE2E runs every question of the ComplianceBench file named by E2E_BENCH_FILE
// through the real client and prints, per question, the ordered regulation codes of the
// returned hits as one "BENCH|<id>|<code>;<code>;..." line on stdout.
//
// The output is meant to be diffed between a BEFORE and an AFTER run: the guidance fix is
// expected to change only guidance-intent questions and leave norm questions untouched
// (Knowledge-Freeze regression guard). The test itself does not compare anything.
//
// The test is skipped unless RUN_E2E=1 and E2E_BENCH_FILE is set. The collection defaults to
// bp_compliance_kb_2026_1_build and can be overridden via E2E_COLLECTION. The bench file must
// be a JSON object with a "questions" array of {"id", "question"} entries. A file that
// yields no questions fails the test, because an empty dump would make the BEFORE/AFTER
// diff trivially identical.
func TestBenchE2E(t *testing.T) {
	if os.Getenv("RUN_E2E") != "1" {
		t.Skip("set RUN_E2E=1 + E2E_BENCH_FILE")
	}
	path := os.Getenv("E2E_BENCH_FILE")
	if path == "" {
		t.Skip("E2E_BENCH_FILE not set")
	}

	raw, err := os.ReadFile(path)
	if err != nil {
		// The *PathError returned by os.ReadFile already names the file.
		t.Fatalf("read bench file: %v", err)
	}
	var bench struct {
		Questions []struct {
			ID       string `json:"id"`
			Question string `json:"question"`
		} `json:"questions"`
	}
	if err := json.Unmarshal(raw, &bench); err != nil {
		t.Fatalf("parse bench file %s: %v", path, err)
	}
	// json.Unmarshal silently leaves Questions empty when the key is missing or misspelled;
	// treat that as a configuration error instead of producing an empty dump.
	if len(bench.Questions) == 0 {
		t.Fatalf("bench file %s contains no questions (expected a top-level \"questions\" array)", path)
	}

	c := NewLegalRAGClient()
	coll := os.Getenv("E2E_COLLECTION")
	if coll == "" {
		coll = "bp_compliance_kb_2026_1_build"
	}

	fmt.Printf("### BENCH n=%d hybrid=%v\n", len(bench.Questions), os.Getenv("RAG_HYBRID_SEARCH") != "false")
	for _, q := range bench.Questions {
		res, err := c.SearchCollection(context.Background(), coll, q.Question, nil, 8)
		if err != nil {
			t.Fatalf("%s: %v", q.ID, err)
		}
		codes := make([]string, 0, len(res))
		for _, r := range res {
			// ";" is the list separator of the output line, so it must not occur inside a code.
			codes = append(codes, strings.ReplaceAll(r.RegulationCode, ";", ","))
		}
		fmt.Printf("BENCH|%s|%s\n", q.ID, strings.Join(codes, ";"))
	}
}