feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2)
Adds the semantic judgement layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime.

- CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run.
- BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the call is not.
- RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review).

On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/breakpilot/ai-compliance-sdk/internal/llm"
|
||||
)
|
||||
|
||||
// Semantic judgement over RECALL-SAFE dedup candidates (P2 slice 2). DEV-TIME,
|
||||
// propose-only. The deterministic GT wall (proposer_screen.go) has already
|
||||
// removed candidates that would drop recall or that credit different GT entries;
|
||||
// the judge only adds an opinion on whether the survivors are truly the same
|
||||
// hazard, plus a rationale, for the human review queue. It NEVER mutates anything.
|
||||
//
|
||||
// The judge is pluggable behind CandidateJudge so the runtime/tests stay
|
||||
// deterministic (HeuristicJudge) while the dev-time CLI can plug in the
|
||||
// non-deterministic LLM (LLMJudge over the shared llm.ProviderRegistry).
|
||||
|
||||
// Verdicts a CandidateJudge can return for a pair of near-duplicate patterns.
const (
	// VerdictDuplicate marks the two patterns as the same hazard.
	VerdictDuplicate = "duplicate"
	// VerdictDistinct marks the two patterns as different hazards.
	VerdictDistinct = "distinct"
	// VerdictUncertain marks a pair the judge could not decide; it is also the
	// value the LLM judge falls back to on transport or parse errors.
	VerdictUncertain = "uncertain"
)
|
||||
|
||||
// JudgedProposal is one candidate with its GT-wall result and the judge's opinion.
|
||||
type JudgedProposal struct {
|
||||
Candidate DedupCandidate `json:"candidate"`
|
||||
Screen ScreenResult `json:"screen"`
|
||||
Verdict string `json:"verdict"`
|
||||
Confidence string `json:"confidence"`
|
||||
Rationale string `json:"rationale"`
|
||||
Judge string `json:"judge"`
|
||||
}
|
||||
|
||||
// CandidateJudge decides whether two near-duplicate patterns are the same hazard.
|
||||
type CandidateJudge interface {
|
||||
Name() string
|
||||
Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (verdict, confidence, rationale string)
|
||||
}
|
||||
|
||||
// HeuristicJudge is the deterministic default and fallback judge. It stands in
// for the LLM: every answer carries "low" confidence, and the hard cases (low
// text overlap, shared measures) are deliberately answered with "uncertain" so
// the review queue shows exactly where the LLM judge is needed.
type HeuristicJudge struct{}

// Name returns the identifier of the heuristic judge.
func (HeuristicJudge) Name() string {
	return "heuristic"
}
|
||||
|
||||
func (HeuristicJudge) Judge(_ context.Context, c DedupCandidate, _, _ PatternMatch) (string, string, string) {
|
||||
switch {
|
||||
case c.ScenarioJaccard >= 0.5 || (c.ZoneJaccard >= 0.5 && c.MeasureJaccard >= 0.5):
|
||||
return VerdictDuplicate, "low", "structural: high scenario, or combined zone+measure, overlap"
|
||||
case c.MeasureJaccard >= 0.99 && c.ZoneJaccard == 0 && c.ScenarioJaccard < 0.3:
|
||||
return VerdictDistinct, "low", "structural: identical measures but no zone/scenario overlap — likely distinct hazards sharing generic measures"
|
||||
default:
|
||||
return VerdictUncertain, "low", "structural signal inconclusive — needs the LLM judge"
|
||||
}
|
||||
}
|
||||
|
||||
// LLMJudge asks an offline model to make the semantic call. Non-deterministic, so
|
||||
// it lives only in the dev-time tool, never in tests or the runtime. It degrades
|
||||
// to "uncertain" on any transport or parse error — it must never break the run.
|
||||
type LLMJudge struct {
|
||||
Completer LLMCompleter
|
||||
MachineClass string
|
||||
}
|
||||
|
||||
func (LLMJudge) Name() string { return "llm" }
|
||||
|
||||
func (j LLMJudge) Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (string, string, string) {
|
||||
system, user := BuildJudgePrompt(j.MachineClass, a, b)
|
||||
raw, err := j.Completer.Complete(ctx, system, user)
|
||||
if err != nil {
|
||||
return VerdictUncertain, "low", "LLM error: " + err.Error()
|
||||
}
|
||||
return parseJudgeJSON(raw)
|
||||
}
|
||||
|
||||
// BuildJudgePrompt returns the system and user prompts sent to the LLM judge. It is
|
||||
// built and unit-tested deterministically even though the model call itself is not.
|
||||
// The system prompt (German) frames the ISO 12100 same-vs-distinct question and
// demands a JSON-only answer; the user prompt lists machineClass followed by the
// ID, name, category, zone, scenario, trigger, harm and measure IDs of a and b.
// NOTE(review): the bare `|` / `||||` lines throughout this function look like
// extraction residue rather than code; those inside the raw string would end up in
// the prompt text. The literal's original whitespace cannot be recovered from here,
// so the code is left untouched — confirm against the repository copy.
|
||||
func BuildJudgePrompt(machineClass string, a, b PatternMatch) (system, user string) {
|
||||
// System prompt: role, decision rule, and the required JSON answer shape.
system = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. " +
|
||||
"Entscheide, ob zwei generierte Gefaehrdungen fuer DIESE Maschine DIESELBE Gefaehrdung " +
|
||||
"beschreiben (Dublette) oder fachlich VERSCHIEDENE Gefaehrdungen sind, die nur zufaellig " +
|
||||
"dieselben Schutzmassnahmen teilen. Verschieden, wenn Wirkort, Ausloeser oder " +
|
||||
"Schadensmechanismus abweichen — auch bei gleicher Kategorie und gleichen Massnahmen. " +
|
||||
"Antworte AUSSCHLIESSLICH als JSON: " +
|
||||
`{"verdict":"duplicate|distinct|uncertain","confidence":"high|medium|low","rationale":"..."}.`
|
||||
// User prompt: one block per pattern, filled from the arguments listed after the template.
user = fmt.Sprintf(`Maschinenklasse: %s
|
||||
|
||||
Gefaehrdung A (%s):
|
||||
Name: %s
|
||||
Kategorie: %s
|
||||
Zone: %s
|
||||
Szenario: %s
|
||||
Ausloeser: %s
|
||||
Schaden: %s
|
||||
Massnahmen: %s
|
||||
|
||||
Gefaehrdung B (%s):
|
||||
Name: %s
|
||||
Kategorie: %s
|
||||
Zone: %s
|
||||
Szenario: %s
|
||||
Ausloeser: %s
|
||||
Schaden: %s
|
||||
Massnahmen: %s
|
||||
|
||||
Sind A und B dieselbe Gefaehrdung fuer diese Maschine?`,
|
||||
machineClass,
|
||||
// Category comes from primaryCat (defined elsewhere); measure IDs are comma-joined.
a.PatternID, a.PatternName, primaryCat(a), a.ZoneDE, a.ScenarioDE, a.TriggerDE, a.HarmDE, strings.Join(a.SuggestedMeasureIDs, ", "),
|
||||
b.PatternID, b.PatternName, primaryCat(b), b.ZoneDE, b.ScenarioDE, b.TriggerDE, b.HarmDE, strings.Join(b.SuggestedMeasureIDs, ", "))
|
||||
return system, user
|
||||
}
|
||||
|
||||
func parseJudgeJSON(raw string) (verdict, confidence, rationale string) {
|
||||
start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}")
|
||||
if start < 0 || end <= start {
|
||||
return VerdictUncertain, "low", "unparseable LLM output"
|
||||
}
|
||||
var v struct {
|
||||
Verdict string `json:"verdict"`
|
||||
Confidence string `json:"confidence"`
|
||||
Rationale string `json:"rationale"`
|
||||
}
|
||||
if err := json.Unmarshal([]byte(raw[start:end+1]), &v); err != nil {
|
||||
return VerdictUncertain, "low", "unparseable LLM JSON: " + err.Error()
|
||||
}
|
||||
switch v.Verdict {
|
||||
case VerdictDuplicate, VerdictDistinct, VerdictUncertain:
|
||||
default:
|
||||
v.Verdict = VerdictUncertain
|
||||
}
|
||||
if v.Confidence == "" {
|
||||
v.Confidence = "low"
|
||||
}
|
||||
return v.Verdict, v.Confidence, v.Rationale
|
||||
}
|
||||
|
||||
// LLMCompleter is the smallest text-in/text-out surface the LLM judge depends
// on. Tests supply a stub; the dev-time tool supplies the registry-backed
// adapter returned by NewRegistryCompleter.
type LLMCompleter interface {
	// Complete sends the system and user prompts to a model and returns the
	// model's text reply, or an error if the completion failed.
	Complete(ctx context.Context, system, user string) (string, error)
}
|
||||
|
||||
type registryCompleter struct {
|
||||
reg *llm.ProviderRegistry
|
||||
model string
|
||||
}
|
||||
|
||||
// NewRegistryCompleter adapts the shared llm.ProviderRegistry to LLMCompleter so
|
||||
// the proposer can reuse the platform's offline model wiring (e.g. self-hosted qwen).
|
||||
func NewRegistryCompleter(reg *llm.ProviderRegistry, model string) LLMCompleter {
|
||||
return ®istryCompleter{reg: reg, model: model}
|
||||
}
|
||||
|
||||
func (rc *registryCompleter) Complete(ctx context.Context, system, user string) (string, error) {
|
||||
resp, err := rc.reg.Chat(ctx, &llm.ChatRequest{
|
||||
Model: rc.model,
|
||||
Messages: []llm.Message{
|
||||
{Role: "system", Content: system},
|
||||
{Role: "user", Content: user},
|
||||
},
|
||||
Temperature: 0,
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return resp.Message.Content, nil
|
||||
}
|
||||
Reference in New Issue
Block a user