feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)

Adds the semantic judgment layer on top of the slice-1 detector + GT wall.
DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge
  (deterministic default/fallback, used in tests) and LLMJudge (offline, over the
  shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to
  "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt; prompt construction is
  unit-tested deterministically even though the LLM call itself is not deterministic.
- RenderProposalQueue: markdown human-review queue with a suggested action per
  candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM
judge" for exactly the two recall-safe near-dupes (HP807/HP033 update,
HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs
unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-25 08:56:04 +02:00
parent 8674b2cd9a
commit 0ce4794767
4 changed files with 351 additions and 17 deletions
@@ -1,6 +1,7 @@
package iace
import (
"context"
"encoding/json"
"os"
"path/filepath"
@@ -196,33 +197,41 @@ func TestWarewashing_DedupProposer(t *testing.T) {
}
hazards, mits, kept := warewashingEngineOutput()
byID := map[string]PatternMatch{}
for _, pm := range kept {
byID[pm.PatternID] = pm
}
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
// to over-surface, because the deterministic GT wall below (and a human, and in
// slice 2 an LLM) is the precision filter — not the detector.
// to over-surface, because the deterministic GT wall below (and a human, and the
// LLM judge) is the precision filter — not the detector.
candidates := FindDedupCandidates(kept, 0.25)
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
safe, blocked := 0, 0
// Deterministic judge in the test; the dev-time CLI swaps in LLMJudge.
judge := HeuristicJudge{}
var judged []JudgedProposal
blocked := 0
for _, c := range candidates {
sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName)
var verdict string
switch {
case sr.RecallAfter < sr.RecallBefore:
verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern)
blocked++
case sr.DistinctGT:
verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern)
blocked++
default:
verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
}
t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
verdict, c.KeepPattern, c.DropPattern, c.Score,
sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
// The wall must be sound: Safe implies recall preserved AND not distinct.
if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
if !sr.Safe {
t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern)
}
v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern])
judged = append(judged, JudgedProposal{
Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(),
})
}
}
t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
safe, blocked)
t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged))
t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
len(judged), judge.Name(), blocked)
}