feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)
Adds the semantic judgment layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the call is not.
- RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
package iace
|
package iace
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -196,33 +197,41 @@ func TestWarewashing_DedupProposer(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
hazards, mits, kept := warewashingEngineOutput()
|
hazards, mits, kept := warewashingEngineOutput()
|
||||||
|
byID := map[string]PatternMatch{}
|
||||||
|
for _, pm := range kept {
|
||||||
|
byID[pm.PatternID] = pm
|
||||||
|
}
|
||||||
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
|
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
|
||||||
// to over-surface, because the deterministic GT wall below (and a human, and in
|
// to over-surface, because the deterministic GT wall below (and a human, and the
|
||||||
// slice 2 an LLM) is the precision filter — not the detector.
|
// LLM judge) is the precision filter — not the detector.
|
||||||
candidates := FindDedupCandidates(kept, 0.25)
|
candidates := FindDedupCandidates(kept, 0.25)
|
||||||
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
|
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
|
||||||
|
|
||||||
safe, blocked := 0, 0
|
// Deterministic judge in the test; the dev-time CLI swaps in LLMJudge.
|
||||||
|
judge := HeuristicJudge{}
|
||||||
|
var judged []JudgedProposal
|
||||||
|
blocked := 0
|
||||||
for _, c := range candidates {
|
for _, c := range candidates {
|
||||||
sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName)
|
sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName)
|
||||||
var verdict string
|
|
||||||
switch {
|
switch {
|
||||||
case sr.RecallAfter < sr.RecallBefore:
|
case sr.RecallAfter < sr.RecallBefore:
|
||||||
verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
|
t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern)
|
||||||
|
blocked++
|
||||||
case sr.DistinctGT:
|
case sr.DistinctGT:
|
||||||
verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
|
t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern)
|
||||||
|
blocked++
|
||||||
default:
|
default:
|
||||||
verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
|
if !sr.Safe {
|
||||||
|
t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern)
|
||||||
|
}
|
||||||
|
v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern])
|
||||||
|
judged = append(judged, JudgedProposal{
|
||||||
|
Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(),
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
|
|
||||||
verdict, c.KeepPattern, c.DropPattern, c.Score,
|
|
||||||
sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
|
|
||||||
|
|
||||||
// The wall must be sound: Safe implies recall preserved AND not distinct.
|
t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged))
|
||||||
if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
|
t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
|
||||||
t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
|
len(judged), judge.Name(), blocked)
|
||||||
}
|
|
||||||
}
|
|
||||||
t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
|
|
||||||
safe, blocked)
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,174 @@
|
|||||||
|
package iace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/breakpilot/ai-compliance-sdk/internal/llm"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Semantic judgement over RECALL-SAFE dedup candidates (P2 slice 2). DEV-TIME,
|
||||||
|
// propose-only. The deterministic GT wall (proposer_screen.go) has already
|
||||||
|
// removed candidates that would drop recall or that credit different GT entries;
|
||||||
|
// the judge only adds an opinion on whether the survivors are truly the same
|
||||||
|
// hazard, plus a rationale, for the human review queue. It NEVER mutates anything.
|
||||||
|
//
|
||||||
|
// The judge is pluggable behind CandidateJudge so the runtime/tests stay
|
||||||
|
// deterministic (HeuristicJudge) while the dev-time CLI can plug in the
|
||||||
|
// non-deterministic LLM (LLMJudge over the shared llm.ProviderRegistry).
|
||||||
|
|
||||||
|
// Verdict values a CandidateJudge can return for a dedup candidate.
const (
	// VerdictDuplicate means the judge considers both patterns the same hazard.
	VerdictDuplicate = "duplicate"
	// VerdictDistinct means the judge considers them different hazards.
	VerdictDistinct = "distinct"
	// VerdictUncertain means the judge could not decide; it is also the
	// fallback for LLM transport/parse failures and unknown verdict values.
	VerdictUncertain = "uncertain"
)
|
||||||
|
|
||||||
|
// JudgedProposal is one candidate with its GT-wall result and the judge's opinion.
|
||||||
|
type JudgedProposal struct {
|
||||||
|
Candidate DedupCandidate `json:"candidate"`
|
||||||
|
Screen ScreenResult `json:"screen"`
|
||||||
|
Verdict string `json:"verdict"`
|
||||||
|
Confidence string `json:"confidence"`
|
||||||
|
Rationale string `json:"rationale"`
|
||||||
|
Judge string `json:"judge"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// CandidateJudge decides whether two near-duplicate patterns are the same hazard.
|
||||||
|
type CandidateJudge interface {
|
||||||
|
Name() string
|
||||||
|
Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (verdict, confidence, rationale string)
|
||||||
|
}
|
||||||
|
|
||||||
|
// HeuristicJudge is the deterministic default/fallback. It only ever returns "low"
|
||||||
|
// confidence — it is a placeholder for the LLM, and it deliberately punts to
|
||||||
|
// "uncertain" on the hard cases (low text overlap, shared measures) so the queue
|
||||||
|
// makes clear exactly where the LLM earns its keep.
|
||||||
|
type HeuristicJudge struct{}
|
||||||
|
|
||||||
|
func (HeuristicJudge) Name() string { return "heuristic" }
|
||||||
|
|
||||||
|
func (HeuristicJudge) Judge(_ context.Context, c DedupCandidate, _, _ PatternMatch) (string, string, string) {
|
||||||
|
switch {
|
||||||
|
case c.ScenarioJaccard >= 0.5 || (c.ZoneJaccard >= 0.5 && c.MeasureJaccard >= 0.5):
|
||||||
|
return VerdictDuplicate, "low", "structural: high scenario, or combined zone+measure, overlap"
|
||||||
|
case c.MeasureJaccard >= 0.99 && c.ZoneJaccard == 0 && c.ScenarioJaccard < 0.3:
|
||||||
|
return VerdictDistinct, "low", "structural: identical measures but no zone/scenario overlap — likely distinct hazards sharing generic measures"
|
||||||
|
default:
|
||||||
|
return VerdictUncertain, "low", "structural signal inconclusive — needs the LLM judge"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LLMJudge asks an offline model to make the semantic call. Non-deterministic, so
|
||||||
|
// it lives only in the dev-time tool, never in tests or the runtime. It degrades
|
||||||
|
// to "uncertain" on any transport or parse error — it must never break the run.
|
||||||
|
type LLMJudge struct {
|
||||||
|
Completer LLMCompleter
|
||||||
|
MachineClass string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (LLMJudge) Name() string { return "llm" }
|
||||||
|
|
||||||
|
func (j LLMJudge) Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (string, string, string) {
|
||||||
|
system, user := BuildJudgePrompt(j.MachineClass, a, b)
|
||||||
|
raw, err := j.Completer.Complete(ctx, system, user)
|
||||||
|
if err != nil {
|
||||||
|
return VerdictUncertain, "low", "LLM error: " + err.Error()
|
||||||
|
}
|
||||||
|
return parseJudgeJSON(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
// BuildJudgePrompt is the real LLM artifact — built and unit-tested deterministically
|
||||||
|
// even though the call itself is not. It frames the ISO 12100 same-vs-distinct
|
||||||
|
// question and forces a JSON answer.
|
||||||
|
func BuildJudgePrompt(machineClass string, a, b PatternMatch) (system, user string) {
|
||||||
|
system = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. " +
|
||||||
|
"Entscheide, ob zwei generierte Gefaehrdungen fuer DIESE Maschine DIESELBE Gefaehrdung " +
|
||||||
|
"beschreiben (Dublette) oder fachlich VERSCHIEDENE Gefaehrdungen sind, die nur zufaellig " +
|
||||||
|
"dieselben Schutzmassnahmen teilen. Verschieden, wenn Wirkort, Ausloeser oder " +
|
||||||
|
"Schadensmechanismus abweichen — auch bei gleicher Kategorie und gleichen Massnahmen. " +
|
||||||
|
"Antworte AUSSCHLIESSLICH als JSON: " +
|
||||||
|
`{"verdict":"duplicate|distinct|uncertain","confidence":"high|medium|low","rationale":"..."}.`
|
||||||
|
user = fmt.Sprintf(`Maschinenklasse: %s
|
||||||
|
|
||||||
|
Gefaehrdung A (%s):
|
||||||
|
Name: %s
|
||||||
|
Kategorie: %s
|
||||||
|
Zone: %s
|
||||||
|
Szenario: %s
|
||||||
|
Ausloeser: %s
|
||||||
|
Schaden: %s
|
||||||
|
Massnahmen: %s
|
||||||
|
|
||||||
|
Gefaehrdung B (%s):
|
||||||
|
Name: %s
|
||||||
|
Kategorie: %s
|
||||||
|
Zone: %s
|
||||||
|
Szenario: %s
|
||||||
|
Ausloeser: %s
|
||||||
|
Schaden: %s
|
||||||
|
Massnahmen: %s
|
||||||
|
|
||||||
|
Sind A und B dieselbe Gefaehrdung fuer diese Maschine?`,
|
||||||
|
machineClass,
|
||||||
|
a.PatternID, a.PatternName, primaryCat(a), a.ZoneDE, a.ScenarioDE, a.TriggerDE, a.HarmDE, strings.Join(a.SuggestedMeasureIDs, ", "),
|
||||||
|
b.PatternID, b.PatternName, primaryCat(b), b.ZoneDE, b.ScenarioDE, b.TriggerDE, b.HarmDE, strings.Join(b.SuggestedMeasureIDs, ", "))
|
||||||
|
return system, user
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseJudgeJSON(raw string) (verdict, confidence, rationale string) {
|
||||||
|
start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}")
|
||||||
|
if start < 0 || end <= start {
|
||||||
|
return VerdictUncertain, "low", "unparseable LLM output"
|
||||||
|
}
|
||||||
|
var v struct {
|
||||||
|
Verdict string `json:"verdict"`
|
||||||
|
Confidence string `json:"confidence"`
|
||||||
|
Rationale string `json:"rationale"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal([]byte(raw[start:end+1]), &v); err != nil {
|
||||||
|
return VerdictUncertain, "low", "unparseable LLM JSON: " + err.Error()
|
||||||
|
}
|
||||||
|
switch v.Verdict {
|
||||||
|
case VerdictDuplicate, VerdictDistinct, VerdictUncertain:
|
||||||
|
default:
|
||||||
|
v.Verdict = VerdictUncertain
|
||||||
|
}
|
||||||
|
if v.Confidence == "" {
|
||||||
|
v.Confidence = "low"
|
||||||
|
}
|
||||||
|
return v.Verdict, v.Confidence, v.Rationale
|
||||||
|
}
|
||||||
|
|
||||||
|
// LLMCompleter is the minimal text-in/text-out the LLM judge needs. Tests pass a
// stub; the dev-time tool passes a registry-backed adapter (NewRegistryCompleter).
type LLMCompleter interface {
	// Complete sends the system and user prompt to the model and returns the
	// raw text of its answer, or an error if the call failed.
	Complete(ctx context.Context, system, user string) (string, error)
}
|
||||||
|
|
||||||
|
type registryCompleter struct {
|
||||||
|
reg *llm.ProviderRegistry
|
||||||
|
model string
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewRegistryCompleter adapts the shared llm.ProviderRegistry to LLMCompleter so
|
||||||
|
// the proposer can reuse the platform's offline model wiring (e.g. self-hosted qwen).
|
||||||
|
func NewRegistryCompleter(reg *llm.ProviderRegistry, model string) LLMCompleter {
|
||||||
|
return ®istryCompleter{reg: reg, model: model}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rc *registryCompleter) Complete(ctx context.Context, system, user string) (string, error) {
|
||||||
|
resp, err := rc.reg.Chat(ctx, &llm.ChatRequest{
|
||||||
|
Model: rc.model,
|
||||||
|
Messages: []llm.Message{
|
||||||
|
{Role: "system", Content: system},
|
||||||
|
{Role: "user", Content: user},
|
||||||
|
},
|
||||||
|
Temperature: 0,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return resp.Message.Content, nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
package iace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestHeuristicJudge_Verdicts(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
zone, meas float64
|
||||||
|
scenario float64
|
||||||
|
wantVerdict string
|
||||||
|
}{
|
||||||
|
{"high scenario overlap -> duplicate", 0, 0.3, 0.6, VerdictDuplicate},
|
||||||
|
{"high zone+measure -> duplicate", 0.6, 0.6, 0.1, VerdictDuplicate},
|
||||||
|
{"identical measures, no text -> distinct", 0, 1.0, 0.0, VerdictDistinct},
|
||||||
|
{"shared measures, low text -> uncertain", 0, 0.67, 0.19, VerdictUncertain},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
c := DedupCandidate{ZoneJaccard: tt.zone, MeasureJaccard: tt.meas, ScenarioJaccard: tt.scenario}
|
||||||
|
v, conf, _ := HeuristicJudge{}.Judge(context.Background(), c, PatternMatch{}, PatternMatch{})
|
||||||
|
if v != tt.wantVerdict {
|
||||||
|
t.Errorf("verdict: want %s, got %s", tt.wantVerdict, v)
|
||||||
|
}
|
||||||
|
if conf != "low" {
|
||||||
|
t.Errorf("heuristic confidence must be low, got %s", conf)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildJudgePrompt_ContainsKeyFacts(t *testing.T) {
|
||||||
|
a := PatternMatch{PatternID: "HPa", PatternName: "Heisse Flaeche", HazardCats: []string{"thermal_hazard"},
|
||||||
|
ZoneDE: "Boiler", ScenarioDE: "Beruehrung heisser Boiler", SuggestedMeasureIDs: []string{"M071"}}
|
||||||
|
b := PatternMatch{PatternID: "HPb", PatternName: "Heisses Spuelgut", HazardCats: []string{"thermal_hazard"},
|
||||||
|
ZoneDE: "Spuelgut", ScenarioDE: "Beruehrung heisses Geschirr", SuggestedMeasureIDs: []string{"M071"}}
|
||||||
|
system, user := BuildJudgePrompt("Geschirrspuelmaschine", a, b)
|
||||||
|
|
||||||
|
for _, want := range []string{"EN ISO 12100", "JSON", "verdict"} {
|
||||||
|
if !strings.Contains(system, want) {
|
||||||
|
t.Errorf("system prompt missing %q", want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, want := range []string{"Geschirrspuelmaschine", "HPa", "HPb", "Boiler", "Spuelgut", "thermal_hazard"} {
|
||||||
|
if !strings.Contains(user, want) {
|
||||||
|
t.Errorf("user prompt missing %q", want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fakeCompleter is a canned LLMCompleter for tests: it ignores the prompts and
// always returns the configured output and error.
type fakeCompleter struct {
	out string
	err error
}

// Complete returns the canned output and error.
func (f fakeCompleter) Complete(_ context.Context, _, _ string) (string, error) { return f.out, f.err }
|
||||||
|
|
||||||
|
func TestLLMJudge_ParsesAndDegrades(t *testing.T) {
|
||||||
|
cand := DedupCandidate{KeepPattern: "HPa", DropPattern: "HPb"}
|
||||||
|
|
||||||
|
// Well-formed JSON, even wrapped in chatter, parses.
|
||||||
|
j := LLMJudge{Completer: fakeCompleter{out: "Sicher. {\"verdict\":\"distinct\",\"confidence\":\"high\",\"rationale\":\"andere Wirkorte\"}"}, MachineClass: "x"}
|
||||||
|
if v, conf, r := j.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictDistinct || conf != "high" || r != "andere Wirkorte" {
|
||||||
|
t.Errorf("parse: got %s/%s/%q", v, conf, r)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unknown verdict value normalises to uncertain.
|
||||||
|
j2 := LLMJudge{Completer: fakeCompleter{out: `{"verdict":"maybe","confidence":"medium","rationale":"x"}`}}
|
||||||
|
if v, _, _ := j2.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain {
|
||||||
|
t.Errorf("unknown verdict must normalise to uncertain, got %s", v)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transport error degrades gracefully, never panics.
|
||||||
|
j3 := LLMJudge{Completer: fakeCompleter{err: errors.New("connection refused")}}
|
||||||
|
if v, _, r := j3.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain || !strings.Contains(r, "LLM error") {
|
||||||
|
t.Errorf("error path: got %s / %q", v, r)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Garbage (no JSON) degrades to uncertain.
|
||||||
|
j4 := LLMJudge{Completer: fakeCompleter{out: "no json here"}}
|
||||||
|
if v, _, _ := j4.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain {
|
||||||
|
t.Errorf("garbage must degrade to uncertain, got %s", v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderProposalQueue_ShowsActions(t *testing.T) {
|
||||||
|
proposals := []JudgedProposal{
|
||||||
|
{
|
||||||
|
Candidate: DedupCandidate{KeepPattern: "HP807", DropPattern: "HP033", Category: "update_failure", Score: 0.32},
|
||||||
|
Screen: ScreenResult{RecallBefore: 1, RecallAfter: 1},
|
||||||
|
Verdict: VerdictDuplicate, Confidence: "medium", Rationale: "same update failure", Judge: "llm",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
out := RenderProposalQueue("Geschirrspuelmaschine", proposals)
|
||||||
|
for _, want := range []string{"HP807", "HP033", "update_failure", "supersession", "Propose-only"} {
|
||||||
|
if !strings.Contains(out, want) {
|
||||||
|
t.Errorf("queue missing %q\n%s", want, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
package iace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RenderProposalQueue turns judged dedup proposals into the human-review queue
|
||||||
|
// (markdown). Deterministic. Nothing here applies a change — every entry is a
|
||||||
|
// suggestion for a human to confirm, edit, commit, and pin with a GT case.
|
||||||
|
func RenderProposalQueue(machine string, proposals []JudgedProposal) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "# Dedup proposal queue — %s\n\n", machine)
|
||||||
|
fmt.Fprintf(&b, "%d candidate(s) survived the deterministic GT wall. Propose-only — nothing is applied automatically.\n\n", len(proposals))
|
||||||
|
|
||||||
|
for i, p := range proposals {
|
||||||
|
c := p.Candidate
|
||||||
|
fmt.Fprintf(&b, "## %d. keep %s ⊃ drop %s [%s → %s (%s)]\n",
|
||||||
|
i+1, c.KeepPattern, c.DropPattern, p.Judge, p.Verdict, p.Confidence)
|
||||||
|
fmt.Fprintf(&b, "- category %s · score %.2f (measures %.0f%%, zone %.0f%%, scenario %.0f%%)\n",
|
||||||
|
c.Category, c.Score, c.MeasureJaccard*100, c.ZoneJaccard*100, c.ScenarioJaccard*100)
|
||||||
|
fmt.Fprintf(&b, "- GT recall %.1f%% → %.1f%% when %s is dropped (wall: %s)\n",
|
||||||
|
p.Screen.RecallBefore*100, p.Screen.RecallAfter*100, c.DropPattern, wallNote(p.Screen))
|
||||||
|
fmt.Fprintf(&b, "- keep: %s\n- drop: %s\n", c.KeepHazardName, c.DropName)
|
||||||
|
fmt.Fprintf(&b, "- judge rationale: %s\n", p.Rationale)
|
||||||
|
fmt.Fprintf(&b, "- suggested action: %s\n\n", suggestedAction(p))
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func wallNote(s ScreenResult) string {
|
||||||
|
if s.DistinctGT {
|
||||||
|
return fmt.Sprintf("distinct GT %s vs %s", s.KeepGT, s.DropGT)
|
||||||
|
}
|
||||||
|
return "recall-safe"
|
||||||
|
}
|
||||||
|
|
||||||
|
func suggestedAction(p JudgedProposal) string {
|
||||||
|
switch p.Verdict {
|
||||||
|
case VerdictDuplicate:
|
||||||
|
return fmt.Sprintf("add %s to a supersession set, then a human confirms + commits + pins a GT case", p.Candidate.DropPattern)
|
||||||
|
case VerdictDistinct:
|
||||||
|
return "keep both — judge considers them distinct hazards"
|
||||||
|
default:
|
||||||
|
return "needs human (or higher-confidence LLM) review — no automatic action"
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user