feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)
Adds the semantic judgment layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the LLM call itself is not deterministic.
- RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,104 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHeuristicJudge_Verdicts(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
zone, meas float64
|
||||
scenario float64
|
||||
wantVerdict string
|
||||
}{
|
||||
{"high scenario overlap -> duplicate", 0, 0.3, 0.6, VerdictDuplicate},
|
||||
{"high zone+measure -> duplicate", 0.6, 0.6, 0.1, VerdictDuplicate},
|
||||
{"identical measures, no text -> distinct", 0, 1.0, 0.0, VerdictDistinct},
|
||||
{"shared measures, low text -> uncertain", 0, 0.67, 0.19, VerdictUncertain},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
c := DedupCandidate{ZoneJaccard: tt.zone, MeasureJaccard: tt.meas, ScenarioJaccard: tt.scenario}
|
||||
v, conf, _ := HeuristicJudge{}.Judge(context.Background(), c, PatternMatch{}, PatternMatch{})
|
||||
if v != tt.wantVerdict {
|
||||
t.Errorf("verdict: want %s, got %s", tt.wantVerdict, v)
|
||||
}
|
||||
if conf != "low" {
|
||||
t.Errorf("heuristic confidence must be low, got %s", conf)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildJudgePrompt_ContainsKeyFacts(t *testing.T) {
|
||||
a := PatternMatch{PatternID: "HPa", PatternName: "Heisse Flaeche", HazardCats: []string{"thermal_hazard"},
|
||||
ZoneDE: "Boiler", ScenarioDE: "Beruehrung heisser Boiler", SuggestedMeasureIDs: []string{"M071"}}
|
||||
b := PatternMatch{PatternID: "HPb", PatternName: "Heisses Spuelgut", HazardCats: []string{"thermal_hazard"},
|
||||
ZoneDE: "Spuelgut", ScenarioDE: "Beruehrung heisses Geschirr", SuggestedMeasureIDs: []string{"M071"}}
|
||||
system, user := BuildJudgePrompt("Geschirrspuelmaschine", a, b)
|
||||
|
||||
for _, want := range []string{"EN ISO 12100", "JSON", "verdict"} {
|
||||
if !strings.Contains(system, want) {
|
||||
t.Errorf("system prompt missing %q", want)
|
||||
}
|
||||
}
|
||||
for _, want := range []string{"Geschirrspuelmaschine", "HPa", "HPb", "Boiler", "Spuelgut", "thermal_hazard"} {
|
||||
if !strings.Contains(user, want) {
|
||||
t.Errorf("user prompt missing %q", want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fakeCompleter is a canned test double: Complete hands back the configured
// out and err values regardless of what it is called with.
type fakeCompleter struct {
	// out is the text returned by Complete.
	out string
	// err is the error returned by Complete.
	err error
}

// Complete ignores the context and both string arguments and returns the
// preconfigured out/err pair unchanged.
func (fc fakeCompleter) Complete(_ context.Context, _, _ string) (string, error) {
	return fc.out, fc.err
}
|
||||
|
||||
func TestLLMJudge_ParsesAndDegrades(t *testing.T) {
|
||||
cand := DedupCandidate{KeepPattern: "HPa", DropPattern: "HPb"}
|
||||
|
||||
// Well-formed JSON, even wrapped in chatter, parses.
|
||||
j := LLMJudge{Completer: fakeCompleter{out: "Sicher. {\"verdict\":\"distinct\",\"confidence\":\"high\",\"rationale\":\"andere Wirkorte\"}"}, MachineClass: "x"}
|
||||
if v, conf, r := j.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictDistinct || conf != "high" || r != "andere Wirkorte" {
|
||||
t.Errorf("parse: got %s/%s/%q", v, conf, r)
|
||||
}
|
||||
|
||||
// Unknown verdict value normalises to uncertain.
|
||||
j2 := LLMJudge{Completer: fakeCompleter{out: `{"verdict":"maybe","confidence":"medium","rationale":"x"}`}}
|
||||
if v, _, _ := j2.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain {
|
||||
t.Errorf("unknown verdict must normalise to uncertain, got %s", v)
|
||||
}
|
||||
|
||||
// Transport error degrades gracefully, never panics.
|
||||
j3 := LLMJudge{Completer: fakeCompleter{err: errors.New("connection refused")}}
|
||||
if v, _, r := j3.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain || !strings.Contains(r, "LLM error") {
|
||||
t.Errorf("error path: got %s / %q", v, r)
|
||||
}
|
||||
|
||||
// Garbage (no JSON) degrades to uncertain.
|
||||
j4 := LLMJudge{Completer: fakeCompleter{out: "no json here"}}
|
||||
if v, _, _ := j4.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain {
|
||||
t.Errorf("garbage must degrade to uncertain, got %s", v)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderProposalQueue_ShowsActions(t *testing.T) {
|
||||
proposals := []JudgedProposal{
|
||||
{
|
||||
Candidate: DedupCandidate{KeepPattern: "HP807", DropPattern: "HP033", Category: "update_failure", Score: 0.32},
|
||||
Screen: ScreenResult{RecallBefore: 1, RecallAfter: 1},
|
||||
Verdict: VerdictDuplicate, Confidence: "medium", Rationale: "same update failure", Judge: "llm",
|
||||
},
|
||||
}
|
||||
out := RenderProposalQueue("Geschirrspuelmaschine", proposals)
|
||||
for _, want := range []string{"HP807", "HP033", "update_failure", "supersession", "Propose-only"} {
|
||||
if !strings.Contains(out, want) {
|
||||
t.Errorf("queue missing %q\n%s", want, out)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user