feat(ai-sdk): vocab->tag proposer (P2 slice 5, type 3)

Extends Method C: for each unknown narrative token that pattern text names, suggest
keyword_dictionary tags drawn from the RequiredComponentTags shared by the patterns
that name it (ranked by frequency, kept only when shared by >=40% of those patterns,
top 3).
Surfaces real dictionary gaps like "zwischenkreis" -> stored_energy and
"updates" -> has_software, which close coverage without hand-editing the dict.

Two precision fixes to Method C while here:
- patternsMentioning now matches WHOLE WORDS, not substrings — substring matching
  flagged fragments like "stehen" inside "entstehen" and produced nonsensical
  tag suggestions.
- a token is only proposed with a tag if one is shared by >=40% of its naming
  patterns, so diffuse common verbs (spread across categories) drop out.

Wired into iace-audit propose -> audit-reports/vocab.{md,json}. Residual
common-verb noise is left to the human/LLM filter rather than a hand-grown
stopword list. Type 4 (coverage blind spots) + P3 (pin accepted proposals into a
GT case) remain for slice 6.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-25 09:51:12 +02:00
parent 662aec209a
commit c13aa9183a
4 changed files with 143 additions and 7 deletions
@@ -6,8 +6,10 @@ import (
"fmt"
"os"
"strconv"
"strings"
"github.com/breakpilot/ai-compliance-sdk/internal/iace"
"github.com/breakpilot/ai-compliance-sdk/internal/iace/audit"
"github.com/breakpilot/ai-compliance-sdk/internal/llm"
)
@@ -89,12 +91,25 @@ func cmdPropose(args []string) {
writeText("audit-reports/framing.md", iace.RenderFramingQueue(in.MachineType, framing))
writeJSON("audit-reports/framing.json", framing)
// Type 3: vocab->tag proposals (unknown narrative tokens that pattern text
// names as a whole word, with a dominant shared required tag).
vocab := audit.RunVocabulary(map[string]any{"narrative": in.Narrative})
var vgaps []audit.DictionarySuggestion
for _, s := range vocab.SuggestedDictionaryEntries {
if len(s.SuggestedTags) > 0 {
vgaps = append(vgaps, s)
}
}
writeText("audit-reports/vocab.md", renderVocabQueue(in.MachineType, vgaps))
writeJSON("audit-reports/vocab.json", vgaps)
printSummary("Method P — Dedup Proposer ("+judge.Name()+")", map[string]int{
"fired_patterns": len(fired),
"candidates": len(candidates),
"in_queue": len(proposals),
"gt_blocked": blocked,
"framing_flags": len(framing),
"vocab_gaps": len(vgaps),
})
if gt == nil {
fmt.Fprintln(os.Stderr, "note: no ground truth provided — GT wall NOT applied (candidates not recall-screened)")
@@ -145,3 +160,19 @@ func envFloat(key string, def float64) float64 {
}
return def
}
// renderVocabQueue renders the vocab→tag review queue as a Markdown document.
//
// machine is interpolated into the title line. The intro paragraph reports the
// number of entries. Each entry then becomes a numbered section that shows the
// token, all of its suggested tags, the pattern IDs listed for it, and a
// suggested-action line. The action line names only the first suggested tag,
// or the placeholder "<tag>" when the entry carries no suggestions.
func renderVocabQueue(machine string, entries []audit.DictionarySuggestion) string {
	var out strings.Builder

	fmt.Fprintf(&out, "# Vocab→tag review queue — %s\n\n", machine)
	fmt.Fprintf(&out, "%d unknown token(s) appear in pattern text but map to no dictionary tag. Propose-only — a human (or the LLM) confirms the tag, then adds a keyword_dictionary entry and pins a GT case.\n\n", len(entries))

	for idx := range entries {
		entry := &entries[idx]

		// The heading lists every suggestion; the action line commits to one.
		allTags := strings.Join(entry.SuggestedTags, ", ")
		patternList := strings.Join(entry.PatternIDs, ", ")

		primary := "<tag>"
		if len(entry.SuggestedTags) != 0 {
			primary = entry.SuggestedTags[0]
		}

		fmt.Fprintf(&out, "## %d. \"%s\" → suggested tag(s): %s\n", idx+1, entry.Token, allTags)
		fmt.Fprintf(&out, "- named by %d pattern(s): %s\n", len(entry.PatternIDs), patternList)
		fmt.Fprintf(&out, "- suggested action: add keyword_dictionary entry {%q → %s} so narratives mentioning it trigger those patterns; human confirms\n\n", entry.Token, primary)
	}

	return out.String()
}