package ucca import ( "context" "encoding/json" "fmt" "os" "strings" "testing" ) // TestGuidanceFixE2E runs the 10 hard cases through the REAL LegalRAGClient against the // homogeneous build collection. Guarded by RUN_E2E=1. Reports the rank of the expected // document within the returned top-K — proving whether the guidanceIntentSignals fix lifts // guidance (WP248/WP260) back into the prompt. Toggle RAG_HYBRID_SEARCH to compare modes. func TestGuidanceFixE2E(t *testing.T) { if os.Getenv("RUN_E2E") != "1" { t.Skip("set RUN_E2E=1 + QDRANT_URL/OLLAMA_URL to run") } c := NewLegalRAGClient() coll := os.Getenv("E2E_COLLECTION") if coll == "" { coll = "bp_compliance_kb_2026_1_build" } cases := []struct{ id, q, expect string }{ {"GQ-0012", "Welche neun Kriterien nennt WP248 fuer ein voraussichtlich hohes Risiko?", "WP248"}, {"GQ-0013", "Ab wie vielen der WP248-Kriterien ist in der Regel eine Datenschutz-Folgenabschaetzung erforderlich?", "WP248"}, {"GQ-0023", "Welche Anforderungen stellt WP260 an eine klare und einfache Sprache?", "WP260"}, {"GQ-0024", "Was versteht WP260 unter Layered Privacy Notices?", "WP260"}, {"GQ-0054", "Welche grundlegenden Cybersecurity-Anforderungen enthaelt Annex I Part I?", "CRA"}, {"GQ-0060", "Wann muss eine aktiv ausgenutzte Schwachstelle gemeldet werden?", "CRA"}, {"GQ-0074", "Benoetigt eine SPS ohne Netzwerkanschluss eine CRA-Bewertung?", "CRA"}, {"GQ-0079", "Welche grundlegenden Sicherheits- und Gesundheitsschutzanforderungen enthaelt Anhang III?", "MASCHVO"}, {"GQ-0091", "Welche Anforderungen gelten fuer wesentliche Veraenderungen einer Maschine?", "MASCHVO"}, {"GQ-0070", "Wie greifen CRA und Maschinenverordnung bei einer vernetzten Maschine ineinander?", "CRA"}, } fmt.Printf("\n### hybrid=%v collection=%s\n", os.Getenv("RAG_HYBRID_SEARCH") != "false", coll) for _, tc := range cases { res, err := c.SearchCollection(context.Background(), coll, tc.q, nil, 8) if err != nil { t.Fatalf("%s: %v", tc.id, err) } rank := -1 for i, r := range res { lab := strings.ToUpper(r.RegulationCode + " " + r.ArticleLabel) if strings.Contains(lab, tc.expect) { rank = i + 1 break } } top1 := "" if len(res) > 0 { top1 = res[0].RegulationCode + " (" + res[0].SourceClass + ")" } status := "FAIL" if rank > 0 { status = "OK" } fmt.Printf("%-9s expect=%-8s rank_in_top8=%-2d %-5s top1=%s\n", tc.id, tc.expect, rank, status, top1) } } // TestBenchE2E runs the FULL ComplianceBench (E2E_BENCH_FILE) through the real client and // prints, per question, the ordered top-8 regulation codes. Diffing BEFORE vs AFTER proves // the fix only perturbs guidance-intent queries (gated on queryWantsGuidance) and never the // norm questions — the Knowledge-Freeze regression guard. func TestBenchE2E(t *testing.T) { if os.Getenv("RUN_E2E") != "1" { t.Skip("set RUN_E2E=1 + E2E_BENCH_FILE") } path := os.Getenv("E2E_BENCH_FILE") if path == "" { t.Skip("E2E_BENCH_FILE not set") } raw, err := os.ReadFile(path) if err != nil { t.Fatal(err) } var bench struct { Questions []struct { ID string `json:"id"` Question string `json:"question"` } `json:"questions"` } if err := json.Unmarshal(raw, &bench); err != nil { t.Fatal(err) } c := NewLegalRAGClient() coll := os.Getenv("E2E_COLLECTION") if coll == "" { coll = "bp_compliance_kb_2026_1_build" } fmt.Printf("### BENCH n=%d hybrid=%v\n", len(bench.Questions), os.Getenv("RAG_HYBRID_SEARCH") != "false") for _, q := range bench.Questions { res, err := c.SearchCollection(context.Background(), coll, q.Question, nil, 8) if err != nil { t.Fatalf("%s: %v", q.ID, err) } codes := make([]string, 0, len(res)) for _, r := range res { codes = append(codes, strings.ReplaceAll(r.RegulationCode, ";", ",")) } fmt.Printf("BENCH|%s|%s\n", q.ID, strings.Join(codes, ";")) } }