From 0ce4794767535cc075128325a19b43c237859905 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 08:56:04 +0200 Subject: [PATCH] feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the semantic judgement layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime. - CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run. - BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the call is not. - RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review). On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3. 
Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/gt_warewashing_test.go | 43 +++-- .../internal/iace/proposer_judge.go | 174 ++++++++++++++++++ .../internal/iace/proposer_judge_test.go | 104 +++++++++++ .../internal/iace/proposer_queue.go | 47 +++++ 4 files changed, 351 insertions(+), 17 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/proposer_judge.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_judge_test.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_queue.go diff --git a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go index 71e8d960..c42ff010 100644 --- a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go +++ b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go @@ -1,6 +1,7 @@ package iace import ( + "context" "encoding/json" "os" "path/filepath" @@ -196,33 +197,41 @@ func TestWarewashing_DedupProposer(t *testing.T) { } hazards, mits, kept := warewashingEngineOutput() + byID := map[string]PatternMatch{} + for _, pm := range kept { + byID[pm.PatternID] = pm + } // 0.25 is a deliberately permissive candidate threshold: the proposer is meant - // to over-surface, because the deterministic GT wall below (and a human, and in - // slice 2 an LLM) is the precision filter — not the detector. + // to over-surface, because the deterministic GT wall below (and a human, and the + // LLM judge) is the precision filter — not the detector. candidates := FindDedupCandidates(kept, 0.25) t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept)) - safe, blocked := 0, 0 + // Deterministic judge in the test; the dev-time CLI swaps in LLMJudge. 
+ judge := HeuristicJudge{} + var judged []JudgedProposal + blocked := 0 for _, c := range candidates { sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName) - var verdict string switch { case sr.RecallAfter < sr.RecallBefore: - verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1 + t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern) + blocked++ case sr.DistinctGT: - verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1 + t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern) + blocked++ default: - verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1 - } - t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s", - verdict, c.KeepPattern, c.DropPattern, c.Score, - sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale) - - // The wall must be sound: Safe implies recall preserved AND not distinct. - if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) { - t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern) + if !sr.Safe { + t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern) + } + v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern]) + judged = append(judged, JudgedProposal{ + Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(), + }) } } - t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied", - safe, blocked) + + t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged)) + t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied", + len(judged), judge.Name(), blocked) } diff --git a/ai-compliance-sdk/internal/iace/proposer_judge.go 
b/ai-compliance-sdk/internal/iace/proposer_judge.go new file mode 100644 index 00000000..d068656e --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_judge.go @@ -0,0 +1,174 @@ +package iace + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/breakpilot/ai-compliance-sdk/internal/llm" +) + +// Semantic judgement over RECALL-SAFE dedup candidates (P2 slice 2). DEV-TIME, +// propose-only. The deterministic GT wall (proposer_screen.go) has already +// removed candidates that would drop recall or that credit different GT entries; +// the judge only adds an opinion on whether the survivors are truly the same +// hazard, plus a rationale, for the human review queue. It NEVER mutates anything. +// +// The judge is pluggable behind CandidateJudge so the runtime/tests stay +// deterministic (HeuristicJudge) while the dev-time CLI can plug in the +// non-deterministic LLM (LLMJudge over the shared llm.ProviderRegistry). + +const ( + VerdictDuplicate = "duplicate" + VerdictDistinct = "distinct" + VerdictUncertain = "uncertain" +) + +// JudgedProposal is one candidate with its GT-wall result and the judge's opinion. +type JudgedProposal struct { + Candidate DedupCandidate `json:"candidate"` + Screen ScreenResult `json:"screen"` + Verdict string `json:"verdict"` + Confidence string `json:"confidence"` + Rationale string `json:"rationale"` + Judge string `json:"judge"` +} + +// CandidateJudge decides whether two near-duplicate patterns are the same hazard. +type CandidateJudge interface { + Name() string + Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (verdict, confidence, rationale string) +} + +// HeuristicJudge is the deterministic default/fallback. It only ever returns "low" +// confidence — it is a placeholder for the LLM, and it deliberately punts to +// "uncertain" on the hard cases (low text overlap, shared measures) so the queue +// makes clear exactly where the LLM earns its keep. 
+type HeuristicJudge struct{} + +func (HeuristicJudge) Name() string { return "heuristic" } + +func (HeuristicJudge) Judge(_ context.Context, c DedupCandidate, _, _ PatternMatch) (string, string, string) { + switch { + case c.ScenarioJaccard >= 0.5 || (c.ZoneJaccard >= 0.5 && c.MeasureJaccard >= 0.5): + return VerdictDuplicate, "low", "structural: high scenario, or combined zone+measure, overlap" + case c.MeasureJaccard >= 0.99 && c.ZoneJaccard == 0 && c.ScenarioJaccard < 0.3: + return VerdictDistinct, "low", "structural: identical measures but no zone/scenario overlap — likely distinct hazards sharing generic measures" + default: + return VerdictUncertain, "low", "structural signal inconclusive — needs the LLM judge" + } +} + +// LLMJudge asks an offline model to make the semantic call. Non-deterministic, so +// it lives only in the dev-time tool, never in tests or the runtime. It degrades +// to "uncertain" on any transport or parse error — it must never break the run. +type LLMJudge struct { + Completer LLMCompleter + MachineClass string +} + +func (LLMJudge) Name() string { return "llm" } + +func (j LLMJudge) Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (string, string, string) { + system, user := BuildJudgePrompt(j.MachineClass, a, b) + raw, err := j.Completer.Complete(ctx, system, user) + if err != nil { + return VerdictUncertain, "low", "LLM error: " + err.Error() + } + return parseJudgeJSON(raw) +} + +// BuildJudgePrompt is the real LLM artifact — built and unit-tested deterministically +// even though the call itself is not. It frames the ISO 12100 same-vs-distinct +// question and forces a JSON answer. +func BuildJudgePrompt(machineClass string, a, b PatternMatch) (system, user string) { + system = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. 
" + + "Entscheide, ob zwei generierte Gefaehrdungen fuer DIESE Maschine DIESELBE Gefaehrdung " + + "beschreiben (Dublette) oder fachlich VERSCHIEDENE Gefaehrdungen sind, die nur zufaellig " + + "dieselben Schutzmassnahmen teilen. Verschieden, wenn Wirkort, Ausloeser oder " + + "Schadensmechanismus abweichen — auch bei gleicher Kategorie und gleichen Massnahmen. " + + "Antworte AUSSCHLIESSLICH als JSON: " + + `{"verdict":"duplicate|distinct|uncertain","confidence":"high|medium|low","rationale":"..."}.` + user = fmt.Sprintf(`Maschinenklasse: %s + +Gefaehrdung A (%s): + Name: %s + Kategorie: %s + Zone: %s + Szenario: %s + Ausloeser: %s + Schaden: %s + Massnahmen: %s + +Gefaehrdung B (%s): + Name: %s + Kategorie: %s + Zone: %s + Szenario: %s + Ausloeser: %s + Schaden: %s + Massnahmen: %s + +Sind A und B dieselbe Gefaehrdung fuer diese Maschine?`, + machineClass, + a.PatternID, a.PatternName, primaryCat(a), a.ZoneDE, a.ScenarioDE, a.TriggerDE, a.HarmDE, strings.Join(a.SuggestedMeasureIDs, ", "), + b.PatternID, b.PatternName, primaryCat(b), b.ZoneDE, b.ScenarioDE, b.TriggerDE, b.HarmDE, strings.Join(b.SuggestedMeasureIDs, ", ")) + return system, user +} + +func parseJudgeJSON(raw string) (verdict, confidence, rationale string) { + start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}") + if start < 0 || end <= start { + return VerdictUncertain, "low", "unparseable LLM output" + } + var v struct { + Verdict string `json:"verdict"` + Confidence string `json:"confidence"` + Rationale string `json:"rationale"` + } + if err := json.Unmarshal([]byte(raw[start:end+1]), &v); err != nil { + return VerdictUncertain, "low", "unparseable LLM JSON: " + err.Error() + } + switch v.Verdict { + case VerdictDuplicate, VerdictDistinct, VerdictUncertain: + default: + v.Verdict = VerdictUncertain + } + if v.Confidence == "" { + v.Confidence = "low" + } + return v.Verdict, v.Confidence, v.Rationale +} + +// LLMCompleter is the minimal text-in/text-out the LLM judge needs. 
Tests pass a +// stub; the dev-time tool passes a registry-backed adapter (NewRegistryCompleter). +type LLMCompleter interface { + Complete(ctx context.Context, system, user string) (string, error) +} + +type registryCompleter struct { + reg *llm.ProviderRegistry + model string +} + +// NewRegistryCompleter adapts the shared llm.ProviderRegistry to LLMCompleter so +// the proposer can reuse the platform's offline model wiring (e.g. self-hosted qwen). +func NewRegistryCompleter(reg *llm.ProviderRegistry, model string) LLMCompleter { + return &registryCompleter{reg: reg, model: model} +} + +func (rc *registryCompleter) Complete(ctx context.Context, system, user string) (string, error) { + resp, err := rc.reg.Chat(ctx, &llm.ChatRequest{ + Model: rc.model, + Messages: []llm.Message{ + {Role: "system", Content: system}, + {Role: "user", Content: user}, + }, + Temperature: 0, + }) + if err != nil { + return "", err + } + return resp.Message.Content, nil +} diff --git a/ai-compliance-sdk/internal/iace/proposer_judge_test.go b/ai-compliance-sdk/internal/iace/proposer_judge_test.go new file mode 100644 index 00000000..fdfc043a --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_judge_test.go @@ -0,0 +1,104 @@ +package iace + +import ( + "context" + "errors" + "strings" + "testing" +) + +func TestHeuristicJudge_Verdicts(t *testing.T) { + tests := []struct { + name string + zone, meas float64 + scenario float64 + wantVerdict string + }{ + {"high scenario overlap -> duplicate", 0, 0.3, 0.6, VerdictDuplicate}, + {"high zone+measure -> duplicate", 0.6, 0.6, 0.1, VerdictDuplicate}, + {"identical measures, no text -> distinct", 0, 1.0, 0.0, VerdictDistinct}, + {"shared measures, low text -> uncertain", 0, 0.67, 0.19, VerdictUncertain}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := DedupCandidate{ZoneJaccard: tt.zone, MeasureJaccard: tt.meas, ScenarioJaccard: tt.scenario} + v, conf, _ := HeuristicJudge{}.Judge(context.Background(), c, 
PatternMatch{}, PatternMatch{}) + if v != tt.wantVerdict { + t.Errorf("verdict: want %s, got %s", tt.wantVerdict, v) + } + if conf != "low" { + t.Errorf("heuristic confidence must be low, got %s", conf) + } + }) + } +} + +func TestBuildJudgePrompt_ContainsKeyFacts(t *testing.T) { + a := PatternMatch{PatternID: "HPa", PatternName: "Heisse Flaeche", HazardCats: []string{"thermal_hazard"}, + ZoneDE: "Boiler", ScenarioDE: "Beruehrung heisser Boiler", SuggestedMeasureIDs: []string{"M071"}} + b := PatternMatch{PatternID: "HPb", PatternName: "Heisses Spuelgut", HazardCats: []string{"thermal_hazard"}, + ZoneDE: "Spuelgut", ScenarioDE: "Beruehrung heisses Geschirr", SuggestedMeasureIDs: []string{"M071"}} + system, user := BuildJudgePrompt("Geschirrspuelmaschine", a, b) + + for _, want := range []string{"EN ISO 12100", "JSON", "verdict"} { + if !strings.Contains(system, want) { + t.Errorf("system prompt missing %q", want) + } + } + for _, want := range []string{"Geschirrspuelmaschine", "HPa", "HPb", "Boiler", "Spuelgut", "thermal_hazard"} { + if !strings.Contains(user, want) { + t.Errorf("user prompt missing %q", want) + } + } +} + +type fakeCompleter struct { + out string + err error +} + +func (f fakeCompleter) Complete(_ context.Context, _, _ string) (string, error) { return f.out, f.err } + +func TestLLMJudge_ParsesAndDegrades(t *testing.T) { + cand := DedupCandidate{KeepPattern: "HPa", DropPattern: "HPb"} + + // Well-formed JSON, even wrapped in chatter, parses. + j := LLMJudge{Completer: fakeCompleter{out: "Sicher. {\"verdict\":\"distinct\",\"confidence\":\"high\",\"rationale\":\"andere Wirkorte\"}"}, MachineClass: "x"} + if v, conf, r := j.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictDistinct || conf != "high" || r != "andere Wirkorte" { + t.Errorf("parse: got %s/%s/%q", v, conf, r) + } + + // Unknown verdict value normalises to uncertain. 
+ j2 := LLMJudge{Completer: fakeCompleter{out: `{"verdict":"maybe","confidence":"medium","rationale":"x"}`}} + if v, _, _ := j2.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain { + t.Errorf("unknown verdict must normalise to uncertain, got %s", v) + } + + // Transport error degrades gracefully, never panics. + j3 := LLMJudge{Completer: fakeCompleter{err: errors.New("connection refused")}} + if v, _, r := j3.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain || !strings.Contains(r, "LLM error") { + t.Errorf("error path: got %s / %q", v, r) + } + + // Garbage (no JSON) degrades to uncertain. + j4 := LLMJudge{Completer: fakeCompleter{out: "no json here"}} + if v, _, _ := j4.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain { + t.Errorf("garbage must degrade to uncertain, got %s", v) + } +} + +func TestRenderProposalQueue_ShowsActions(t *testing.T) { + proposals := []JudgedProposal{ + { + Candidate: DedupCandidate{KeepPattern: "HP807", DropPattern: "HP033", Category: "update_failure", Score: 0.32}, + Screen: ScreenResult{RecallBefore: 1, RecallAfter: 1}, + Verdict: VerdictDuplicate, Confidence: "medium", Rationale: "same update failure", Judge: "llm", + }, + } + out := RenderProposalQueue("Geschirrspuelmaschine", proposals) + for _, want := range []string{"HP807", "HP033", "update_failure", "supersession", "Propose-only"} { + if !strings.Contains(out, want) { + t.Errorf("queue missing %q\n%s", want, out) + } + } +} diff --git a/ai-compliance-sdk/internal/iace/proposer_queue.go b/ai-compliance-sdk/internal/iace/proposer_queue.go new file mode 100644 index 00000000..6d7a1aa3 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_queue.go @@ -0,0 +1,47 @@ +package iace + +import ( + "fmt" + "strings" +) + +// RenderProposalQueue turns judged dedup proposals into the human-review queue +// (markdown). Deterministic. 
Nothing here applies a change — every entry is a +// suggestion for a human to confirm, edit, commit, and pin with a GT case. +func RenderProposalQueue(machine string, proposals []JudgedProposal) string { + var b strings.Builder + fmt.Fprintf(&b, "# Dedup proposal queue — %s\n\n", machine) + fmt.Fprintf(&b, "%d candidate(s) survived the deterministic GT wall. Propose-only — nothing is applied automatically.\n\n", len(proposals)) + + for i, p := range proposals { + c := p.Candidate + fmt.Fprintf(&b, "## %d. keep %s ⊃ drop %s [%s → %s (%s)]\n", + i+1, c.KeepPattern, c.DropPattern, p.Judge, p.Verdict, p.Confidence) + fmt.Fprintf(&b, "- category %s · score %.2f (measures %.0f%%, zone %.0f%%, scenario %.0f%%)\n", + c.Category, c.Score, c.MeasureJaccard*100, c.ZoneJaccard*100, c.ScenarioJaccard*100) + fmt.Fprintf(&b, "- GT recall %.1f%% → %.1f%% when %s is dropped (wall: %s)\n", + p.Screen.RecallBefore*100, p.Screen.RecallAfter*100, c.DropPattern, wallNote(p.Screen)) + fmt.Fprintf(&b, "- keep: %s\n- drop: %s\n", c.KeepHazardName, c.DropName) + fmt.Fprintf(&b, "- judge rationale: %s\n", p.Rationale) + fmt.Fprintf(&b, "- suggested action: %s\n\n", suggestedAction(p)) + } + return b.String() +} + +func wallNote(s ScreenResult) string { + if s.DistinctGT { + return fmt.Sprintf("distinct GT %s vs %s", s.KeepGT, s.DropGT) + } + return "recall-safe" +} + +func suggestedAction(p JudgedProposal) string { + switch p.Verdict { + case VerdictDuplicate: + return fmt.Sprintf("add %s to a supersession set, then a human confirms + commits + pins a GT case", p.Candidate.DropPattern) + case VerdictDistinct: + return "keep both — judge considers them distinct hazards" + default: + return "needs human (or higher-confidence LLM) review — no automatic action" + } +}