Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/proposer_queue.go
T
Benjamin Admin 0ce4794767 feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)
Adds the semantic judgement layer on top of the slice-1 detector + GT wall.
DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge
  (deterministic default/fallback, used in tests) and LLMJudge (offline, over the
  shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to
  "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested
  deterministically even though the call is not.
- RenderProposalQueue: markdown human-review queue with a suggested action per
  candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM
judge" for exactly the two recall-safe near-dupes (HP807/HP033 update,
HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs
unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-26 10:27:01 +02:00

48 lines
1.9 KiB
Go

package iace
import (
"fmt"
"strings"
)
// RenderProposalQueue renders the judged dedup proposals for one machine as a
// markdown review queue and returns it as a single string.
//
// The output starts with a title naming machine and a line stating how many
// proposals there are, followed by one numbered section per proposal in slice
// order. Each section lists the keep/drop patterns with the judge, verdict and
// confidence, the category with its score and Jaccard values (as whole
// percentages), the GT recall before and after the drop, both names, the
// judge's rationale, and a suggested action. The function only builds text:
// every entry is a suggestion for a human reviewer, and nothing is applied.
func RenderProposalQueue(machine string, proposals []JudgedProposal) string {
	var out strings.Builder

	// Document header: title, then the candidate count and the propose-only reminder.
	fmt.Fprintf(&out, "# Dedup proposal queue — %s\n\n", machine)
	fmt.Fprintf(&out, "%d candidate(s) survived the deterministic GT wall. Propose-only — nothing is applied automatically.\n\n", len(proposals))

	for idx := range proposals {
		prop := proposals[idx]
		cand := prop.Candidate
		gate := prop.Screen

		// Section heading; entries are numbered from 1.
		fmt.Fprintf(&out, "## %d. keep %s ⊃ drop %s [%s → %s (%s)]\n",
			idx+1, cand.KeepPattern, cand.DropPattern, prop.Judge, prop.Verdict, prop.Confidence)

		// Similarity figures: the Jaccard fractions are scaled to percentages.
		measuresPct := cand.MeasureJaccard * 100
		zonePct := cand.ZoneJaccard * 100
		scenarioPct := cand.ScenarioJaccard * 100
		fmt.Fprintf(&out, "- category %s · score %.2f (measures %.0f%%, zone %.0f%%, scenario %.0f%%)\n",
			cand.Category, cand.Score, measuresPct, zonePct, scenarioPct)

		// Recall before/after the drop, also scaled to percentages.
		recallBeforePct := gate.RecallBefore * 100
		recallAfterPct := gate.RecallAfter * 100
		fmt.Fprintf(&out, "- GT recall %.1f%% → %.1f%% when %s is dropped (wall: %s)\n",
			recallBeforePct, recallAfterPct, cand.DropPattern, wallNote(gate))

		fmt.Fprintf(&out, "- keep: %s\n- drop: %s\n", cand.KeepHazardName, cand.DropName)
		fmt.Fprintf(&out, "- judge rationale: %s\n", prop.Rationale)

		// The trailing blank line separates this section from the next one.
		fmt.Fprintf(&out, "- suggested action: %s\n\n", suggestedAction(prop))
	}

	return out.String()
}
// wallNote returns the short note printed after "wall:" in a queue entry.
// It is the fixed text "recall-safe" unless s.DistinctGT is set, in which case
// the note names the keep-side and drop-side GT values taken from s.
func wallNote(s ScreenResult) string {
	if !s.DistinctGT {
		return "recall-safe"
	}
	note := fmt.Sprintf("distinct GT %s vs %s", s.KeepGT, s.DropGT)
	return note
}
// suggestedAction maps a proposal's verdict to the one-line recommendation
// shown to the reviewer. A duplicate verdict recommends adding the drop
// pattern to a supersession set, a distinct verdict recommends keeping both,
// and any other verdict asks for further review. It only produces text.
func suggestedAction(p JudgedProposal) string {
	if p.Verdict == VerdictDuplicate {
		dropped := p.Candidate.DropPattern
		return fmt.Sprintf("add %s to a supersession set, then a human confirms + commits + pins a GT case", dropped)
	}
	if p.Verdict == VerdictDistinct {
		return "keep both — judge considers them distinct hazards"
	}
	// Anything else (e.g. an undecided verdict) is left to a reviewer.
	return "needs human (or higher-confidence LLM) review — no automatic action"
}