Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_kistenhub_test.go
T
Benjamin Admin 8440ddfecb feat(ai-sdk): runnable iace-audit propose CLI + live LLM wiring (P2 slice 3)
Makes the offline proposer runnable end-to-end.

- BuildProposerInput (proposer_input.go): non-test engine->hazards path. The
  PatternMatch->Hazard converter is lifted out of the GT test files into
  production scope so both the tests and the CLI share one pipeline.
- iace-audit propose <narrative.json> [<ground-truth.json>]: detect candidates ->
  GT-screen survivors (when a ground truth is given) -> judge (HeuristicJudge by
  default, LLMJudge over ollama when IACE_PROPOSE_LLM=1) -> write the human-review
  queue to audit-reports/proposals.{md,json}. Propose-only.

Smoke run on a dishwasher narrative: 32 fired -> 3 candidates -> queue with a
confident duplicate, a confident distinct, and one punted to the LLM judge; GT
wall recall-safe. Live qwen is opt-in via env; the heuristic default keeps the
tool runnable (and CI deterministic) without a model. Proposal types 2-4
(foreign-framing gates, vocab->tag, coverage blind spots) remain for slice 4.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-26 10:27:01 +02:00

144 lines
4.5 KiB
Go

package iace
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"testing"
)
// TestKistenhub_GTCoverage benchmarks the current pattern engine and measure
// library against the Kistenhubgeraet ground-truth fixture and logs the
// recall/precision split. Everything runs in memory; no DB is required.
//
// Input composition:
//   - Component C014 (Hubwerk) contributes the lift-relevant tags
//     (crush_point, gravity_risk, person_under_load).
//   - Energy sources EN01 (electric) and EN03 (potential/gravity) satisfy
//     the RequiredEnergyTags ("gravitational") of HP2100-2102.
//   - MachineTypes {lift, hoist, scissor_lift, elevator} gate the
//     lift-bridge patterns.
//
// Only an unreadable or unparsable fixture fails the test. Coverage figures
// are logged rather than asserted, so run with `go test -v` to read them and
// treat the output as a reproducible benchmark when the lift-bridge library
// changes.
func TestKistenhub_GTCoverage(t *testing.T) {
	fixture := filepath.Join("testdata", "ground_truth_kistenhub.json")
	data, err := os.ReadFile(fixture)
	if err != nil {
		t.Fatalf("read GT: %v", err)
	}

	var truth GroundTruth
	if err = json.Unmarshal(data, &truth); err != nil {
		t.Fatalf("parse GT: %v", err)
	}
	t.Logf("Loaded %d GT entries from %s", len(truth.Entries), fixture)

	query := MatchInput{
		ComponentLibraryIDs: []string{"C014"},
		EnergySourceIDs:     []string{"EN01", "EN03"},
		LifecyclePhases: []string{
			"normal_operation", "maintenance", "cleaning",
			"setup", "transport", "manual_operation",
		},
		CustomTags: []string{
			"lift", "hoist", "scissor_lift", "manual_lift",
			"mobile_machine", "hand_operated",
		},
		OperationalStates: []string{"normal_operation", "maintenance", "manual_operation"},
		HumanRoles:        []string{"operator", "maintenance_tech"},
		MachineTypes:      []string{"lift", "hoist", "scissor_lift", "elevator"},
	}

	// Keep the engine in a variable so Match can be called regardless of
	// whether it is declared on a value or a pointer receiver.
	engine := NewPatternEngine()
	out := engine.Match(query)
	t.Logf("Pattern engine matched %d patterns", len(out.MatchedPatterns))

	hazards, mitigations := patternsToHazardsAndMitigations(out)
	result := CompareBenchmark(&truth, hazards, mitigations)

	// Precision = matched pairs / engine hazards; stays 0 when the engine
	// produced nothing, which avoids a division by zero.
	hits := len(result.MatchedPairs)
	var precision float64
	if result.TotalEngine > 0 {
		precision = float64(hits) / float64(result.TotalEngine)
	}

	t.Logf("=== Kistenhub-GT Benchmark Result ===")
	t.Logf("Hazard Coverage: %.1f%% (%d/%d, %d missing)",
		result.CoverageScore*100, hits, result.TotalGT, len(result.MissingFromEngine))
	t.Logf("Measure Coverage: %.1f%%", result.MeasureCoverage*100)
	t.Logf("Engine Hazards: %d (%d extra)", result.TotalEngine, len(result.ExtraInEngine))
	t.Logf("Precision: %.1f%%", precision*100)

	t.Logf("\n--- Category breakdown ---")
	for _, row := range result.CategoryBreakdown {
		t.Logf(" %-50s %d/%d (%.0f%%)", row.Category, row.MatchCount, row.GTCount, row.Coverage*100)
	}

	if missing := len(result.MissingFromEngine); missing > 0 {
		t.Logf("\n--- Missing from engine (%d) ---", missing)
		for _, entry := range result.MissingFromEngine {
			t.Logf(" GT %s [%s]: %q — %q",
				entry.Nr, abbrev(entry.HazardGroup, 25), abbrev(entry.HazardType, 30), abbrev(entry.HazardCause, 60))
		}
	}

	// Presence maps for the lift-bridge IDs: every tracked ID starts out as
	// "not seen" and is flipped once the engine output mentions it.
	liftPatterns := make(map[string]bool, 3)
	for _, id := range []string{"HP2100", "HP2101", "HP2102"} {
		liftPatterns[id] = false
	}
	liftMeasures := make(map[string]bool, 5)
	for _, id := range []string{"M600", "M601", "M602", "M603", "M604"} {
		liftMeasures[id] = false
	}

	for _, pm := range out.MatchedPatterns {
		if _, tracked := liftPatterns[pm.PatternID]; !tracked {
			continue
		}
		liftPatterns[pm.PatternID] = true
	}
	for _, sm := range out.SuggestedMeasures {
		if _, tracked := liftMeasures[sm.MeasureID]; !tracked {
			continue
		}
		liftMeasures[sm.MeasureID] = true
	}

	t.Logf("\n--- Lift-Bridge verification (SHA c771d8e from 2026-05-22) ---")
	t.Logf("HP2100-2102 fired: %s", formatPresence(liftPatterns))
	t.Logf("M600-M604 fired: %s", formatPresence(liftMeasures))

	// Informational only: a silent lift bridge is reported, not failed.
	if countTrue(liftPatterns) == 0 {
		t.Log("WARNING: none of the lift-bridge patterns fired — check tag composition")
	}
}
// abbrev shortens s for log output so that the result holds at most max
// characters (runes, not bytes). A string that already fits is returned
// unchanged; otherwise the first max-1 characters are kept and a trailing
// "…" marks the cut, so the ellipsis counts toward the limit.
//
// Truncation is rune-aware: multibyte characters such as German umlauts are
// never split, so the result stays valid UTF-8 whenever s is. A max of zero
// or less yields the empty string instead of panicking on a negative slice
// bound.
func abbrev(s string, max int) string {
	if max <= 0 {
		return ""
	}
	// Fast path: the byte length is an upper bound on the rune count, so a
	// string that fits in bytes needs no decoding.
	if len(s) <= max {
		return s
	}
	runes := []rune(s)
	if len(runes) <= max {
		return s
	}
	return string(runes[:max-1]) + "…"
}
// formatPresence renders a presence map as a single line for logging. Keys
// are emitted in ascending order so the output is deterministic; each key is
// prefixed with "✓" when its value is true and "✗" otherwise, and followed by
// one space (so a non-empty result ends in a space). An empty or nil map
// yields the empty string.
func formatPresence(m map[string]bool) string {
	ids := make([]string, 0, len(m))
	for id := range m {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	marks := map[bool]string{false: "✗", true: "✓"}
	line := ""
	for i := 0; i < len(ids); i++ {
		id := ids[i]
		line = fmt.Sprintf("%s%s%s ", line, marks[m[id]], id)
	}
	return line
}
// countTrue reports how many keys in m are mapped to true. A nil or empty
// map gives 0.
func countTrue(m map[string]bool) int {
	var total int
	for key := range m {
		if !m[key] {
			continue
		}
		total++
	}
	return total
}