feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)

Adds the semantic judgment layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the LLM call itself is not deterministic.
- RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-25 08:56:04 +02:00
parent 8674b2cd9a
commit 0ce4794767
4 changed files with 351 additions and 17 deletions
@@ -1,6 +1,7 @@
 package iace

 import (
+	"context"
 	"encoding/json"
 	"os"
 	"path/filepath"
@@ -196,33 +197,41 @@ func TestWarewashing_DedupProposer(t *testing.T) {
 	}

 	hazards, mits, kept := warewashingEngineOutput()
+	byID := map[string]PatternMatch{}
+	for _, pm := range kept {
+		byID[pm.PatternID] = pm
+	}
 	// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
-	// to over-surface, because the deterministic GT wall below (and a human, and in
-	// slice 2 an LLM) is the precision filter — not the detector.
+	// to over-surface, because the deterministic GT wall below (and a human, and the
+	// LLM judge) is the precision filter — not the detector.
 	candidates := FindDedupCandidates(kept, 0.25)
 	t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))

-	safe, blocked := 0, 0
+	// Deterministic judge in the test; the dev-time CLI swaps in LLMJudge.
+	judge := HeuristicJudge{}
+	var judged []JudgedProposal
+	blocked := 0
 	for _, c := range candidates {
 		sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName)
-		var verdict string
 		switch {
 		case sr.RecallAfter < sr.RecallBefore:
-			verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
+			t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern)
+			blocked++
 		case sr.DistinctGT:
-			verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
+			t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern)
+			blocked++
 		default:
-			verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
-		}
-		t.Logf("[%s] keep %s / drop %s  score=%.2f  recall %.1f%%->%.1f%%  | %s",
-			verdict, c.KeepPattern, c.DropPattern, c.Score,
-			sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
-
-		// The wall must be sound: Safe implies recall preserved AND not distinct.
-		if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
-			t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
+			if !sr.Safe {
+				t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern)
+			}
+			v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern])
+			judged = append(judged, JudgedProposal{
+				Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(),
+			})
 		}
 	}
-	t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
-		safe, blocked)
+
+	t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged))
+	t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
+		len(judged), judge.Name(), blocked)
 }