// breakpilot-compliance/ai-compliance-sdk/internal/iace/proposer_judge.go

package iace

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/breakpilot/ai-compliance-sdk/internal/llm"
)

// Semantic judgement over RECALL-SAFE dedup candidates (P2 slice 2). DEV-TIME,
// propose-only. The deterministic GT wall (proposer_screen.go) has already
// removed candidates that would drop recall or that credit different GT entries;
// the judge only adds an opinion on whether the survivors are truly the same
// hazard, plus a rationale, for the human review queue. It NEVER mutates anything.
//
// The judge is pluggable behind CandidateJudge so the runtime/tests stay
// deterministic (HeuristicJudge) while the dev-time CLI can plug in the
// non-deterministic LLM (LLMJudge over the shared llm.ProviderRegistry).

// Verdict strings a judge can return for a dedup candidate. They are also the
// only values accepted in the "verdict" field of the LLM's JSON answer.
const (
	VerdictDuplicate = "duplicate" // both patterns describe the same hazard
	VerdictDistinct  = "distinct"  // the patterns are different hazards
	VerdictUncertain = "uncertain" // no call made; also the fallback for LLM errors and unparseable output
)

// JudgedProposal is one candidate with its GT-wall result and the judge's opinion.
//
// Fields:
//   - Candidate: the dedup candidate that was judged.
//   - Screen: the result of the deterministic GT screen for that candidate.
//   - Verdict: the judge's verdict (presumably one of the Verdict* constants —
//     confirm against the code that fills this struct).
//   - Confidence: the judge's self-reported confidence; the LLM prompt asks for
//     "high", "medium" or "low".
//   - Rationale: free-text explanation for the human review queue.
//   - Judge: presumably the Name() of the CandidateJudge that produced the
//     verdict — TODO confirm at the call site.
type JudgedProposal struct {
	Candidate  DedupCandidate `json:"candidate"`
	Screen     ScreenResult   `json:"screen"`
	Verdict    string         `json:"verdict"`
	Confidence string         `json:"confidence"`
	Rationale  string         `json:"rationale"`
	Judge      string         `json:"judge"`
}

// CandidateJudge decides whether two near-duplicate patterns are the same hazard.
// The implementations in this file are HeuristicJudge (deterministic) and
// LLMJudge (model-backed).
type CandidateJudge interface {
	// Name returns a short identifier for the implementation
	// (the judges in this file return "heuristic" and "llm").
	Name() string
	// Judge gives an opinion on candidate c, whose two patterns are a and b.
	// It returns a verdict, a confidence label and a human-readable rationale;
	// there is no error return, so implementations report failures through
	// the returned strings.
	Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (verdict, confidence, rationale string)
}

// HeuristicJudge is the deterministic default and fallback judge. It looks only
// at the structural Jaccard scores carried by the candidate, always reports
// "low" confidence, and answers "uncertain" whenever those scores are
// inconclusive, which marks the cases where the LLM judge is needed.
type HeuristicJudge struct{}

// Name identifies this judge as "heuristic".
func (HeuristicJudge) Name() string { return "heuristic" }

// Judge classifies c from its scenario, zone and measure overlap alone; the
// context and both pattern matches are ignored. It returns the verdict, the
// confidence (always "low") and a short rationale.
func (HeuristicJudge) Judge(_ context.Context, c DedupCandidate, _, _ PatternMatch) (string, string, string) {
	const confidence = "low"

	// Strong scenario overlap on its own, or zone and measure overlap
	// together, is read as the same hazard.
	zoneAndMeasures := c.ZoneJaccard >= 0.5 && c.MeasureJaccard >= 0.5
	if c.ScenarioJaccard >= 0.5 || zoneAndMeasures {
		return VerdictDuplicate, confidence, "structural: high scenario, or combined zone+measure, overlap"
	}

	// Practically identical measures with no zone overlap and little scenario
	// overlap: the measures are probably generic ones shared by chance.
	onlyMeasuresShared := c.MeasureJaccard >= 0.99 && c.ZoneJaccard == 0 && c.ScenarioJaccard < 0.3
	if onlyMeasuresShared {
		return VerdictDistinct, confidence, "structural: identical measures but no zone/scenario overlap — likely distinct hazards sharing generic measures"
	}

	return VerdictUncertain, confidence, "structural signal inconclusive — needs the LLM judge"
}

// LLMJudge asks an offline model to make the semantic call. Non-deterministic, so
// it lives only in the dev-time tool, never in tests or the runtime. It degrades
// to "uncertain" on a missing completer and on any transport or parse error —
// it must never break the run.
type LLMJudge struct {
	Completer    LLMCompleter // model backend; when nil, Judge answers "uncertain"
	MachineClass string       // machine class inserted into the prompt
}

// Name identifies this judge as "llm".
func (LLMJudge) Name() string { return "llm" }

// Judge builds the same-vs-distinct prompt for patterns a and b, sends it to
// the completer and parses the JSON answer. The candidate c itself is not
// consulted. It returns the verdict, the confidence and the rationale; every
// failure is reported as (VerdictUncertain, "low", reason) instead of an error.
func (j LLMJudge) Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (string, string, string) {
	// A zero-value LLMJudge has no backend; calling through the nil interface
	// would panic and abort the whole proposer run.
	if j.Completer == nil {
		return VerdictUncertain, "low", "LLM error: no completer configured"
	}
	system, user := BuildJudgePrompt(j.MachineClass, a, b)
	raw, err := j.Completer.Complete(ctx, system, user)
	if err != nil {
		return VerdictUncertain, "low", "LLM error: " + err.Error()
	}
	return parseJudgeJSON(raw)
}

// BuildJudgePrompt assembles the two messages sent to the LLM judge. It is pure
// string construction, so it can be unit-tested even though the model call is
// not deterministic.
//
// system is a fixed German instruction: act as an EN ISO 12100 machine-safety
// expert, decide whether two generated hazards are the same hazard or distinct
// ones, and answer only with a JSON object holding verdict, confidence and
// rationale. user lists the machine class followed by the same set of fields
// for hazard A and hazard B and closes with the same-hazard question.
func BuildJudgePrompt(machineClass string, a, b PatternMatch) (system, user string) {
	const (
		instructions = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. " +
			"Entscheide, ob zwei generierte Gefaehrdungen fuer DIESE Maschine DIESELBE Gefaehrdung " +
			"beschreiben (Dublette) oder fachlich VERSCHIEDENE Gefaehrdungen sind, die nur zufaellig " +
			"dieselben Schutzmassnahmen teilen. Verschieden, wenn Wirkort, Ausloeser oder " +
			"Schadensmechanismus abweichen — auch bei gleicher Kategorie und gleichen Massnahmen. " +
			"Antworte AUSSCHLIESSLICH als JSON: " +
			`{"verdict":"duplicate|distinct|uncertain","confidence":"high|medium|low","rationale":"..."}.`

		// One hazard section; the first verb is the label ("A" or "B").
		hazardFormat = "Gefaehrdung %s (%s):\n" +
			"  Name: %s\n" +
			"  Kategorie: %s\n" +
			"  Zone: %s\n" +
			"  Szenario: %s\n" +
			"  Ausloeser: %s\n" +
			"  Schaden: %s\n" +
			"  Massnahmen: %s\n"

		question = "Sind A und B dieselbe Gefaehrdung fuer diese Maschine?"
	)

	// Both hazards are rendered through the same layout so A and B cannot
	// drift apart.
	describe := func(label string, p PatternMatch) string {
		return fmt.Sprintf(hazardFormat,
			label, p.PatternID, p.PatternName, primaryCat(p),
			p.ZoneDE, p.ScenarioDE, p.TriggerDE, p.HarmDE,
			strings.Join(p.SuggestedMeasureIDs, ", "))
	}

	sectionA := describe("A", a)
	sectionB := describe("B", b)

	system = instructions
	user = "Maschinenklasse: " + machineClass + "\n\n" +
		sectionA + "\n" +
		sectionB + "\n" +
		question
	return system, user
}

// parseJudgeJSON extracts the judge's answer from raw model output.
//
// The JSON object is taken as the span from the first "{" to the last "}", so
// surrounding chatter or code fences are tolerated. It returns the verdict,
// the confidence and the rationale. Nothing here fails hard:
//   - no braces or invalid JSON yields (VerdictUncertain, "low", reason);
//   - verdict and confidence are matched after trimming whitespace and
//     lower-casing, because models do not reliably echo the exact spelling;
//   - an unrecognised verdict becomes VerdictUncertain with "low" confidence,
//     since the stated confidence referred to an answer that was discarded;
//   - a missing or unrecognised confidence becomes "low".
func parseJudgeJSON(raw string) (verdict, confidence, rationale string) {
	start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}")
	if start < 0 || end <= start {
		return VerdictUncertain, "low", "unparseable LLM output"
	}
	var v struct {
		Verdict    string `json:"verdict"`
		Confidence string `json:"confidence"`
		Rationale  string `json:"rationale"`
	}
	if err := json.Unmarshal([]byte(raw[start:end+1]), &v); err != nil {
		return VerdictUncertain, "low", "unparseable LLM JSON: " + err.Error()
	}

	verdict = strings.ToLower(strings.TrimSpace(v.Verdict))
	switch verdict {
	case VerdictDuplicate, VerdictDistinct, VerdictUncertain:
	default:
		return VerdictUncertain, "low", v.Rationale
	}

	// Only the three labels requested in the prompt may reach the review queue.
	confidence = strings.ToLower(strings.TrimSpace(v.Confidence))
	switch confidence {
	case "high", "medium", "low":
	default:
		confidence = "low"
	}
	return verdict, confidence, v.Rationale
}

// LLMCompleter is the minimal text-in/text-out the LLM judge needs. Tests pass a
// stub; the dev-time tool passes a registry-backed adapter (NewRegistryCompleter).
type LLMCompleter interface {
	// Complete sends one system message and one user message to the model
	// and returns the text of its reply, or an error if the call failed.
	Complete(ctx context.Context, system, user string) (string, error)
}

// registryCompleter implements LLMCompleter on top of an llm.ProviderRegistry.
type registryCompleter struct {
	reg   *llm.ProviderRegistry // registry the chat request is sent through
	model string                // model name placed in every chat request
}

// NewRegistryCompleter wraps the shared llm.ProviderRegistry in an LLMCompleter,
// letting the proposer reuse the platform's offline model wiring (e.g. a
// self-hosted qwen). Every completion is requested for the given model.
func NewRegistryCompleter(reg *llm.ProviderRegistry, model string) LLMCompleter {
	rc := new(registryCompleter)
	rc.reg = reg
	rc.model = model
	return rc
}

// Complete sends system and user as a two-message chat (roles "system" and
// "user") to the configured model with temperature 0 and returns the content
// of the reply message. An error from the registry is returned unchanged
// together with an empty string.
func (rc *registryCompleter) Complete(ctx context.Context, system, user string) (string, error) {
	messages := []llm.Message{
		{Role: "system", Content: system},
		{Role: "user", Content: user},
	}
	request := &llm.ChatRequest{
		Model:       rc.model,
		Messages:    messages,
		Temperature: 0,
	}

	reply, err := rc.reg.Chat(ctx, request)
	if err != nil {
		return "", err
	}
	return reply.Message.Content, nil
}