c13aa9183a
Extends Method C: for each unknown narrative token that pattern text names, suggest
the keyword_dictionary tag = the RequiredComponentTags shared by the naming
patterns (ranked by frequency, kept only when shared by >=40% of them, top 3).
Surfaces real dictionary gaps like "zwischenkreis" -> stored_energy and
"updates" -> has_software, which close coverage without hand-editing the dict.
Two precision fixes to Method C while here:
- patternsMentioning now matches WHOLE WORDS, not substrings — substring matching
flagged fragments like "stehen" inside "entstehen" and produced nonsensical
tag suggestions.
- a token is only proposed with a tag if one is shared by >=40% of its naming
patterns, so diffuse common verbs (spread across categories) drop out.
Wired into iace-audit propose -> audit-reports/vocab.{md,json}. Residual
common-verb noise is left to the human/LLM filter rather than a hand-grown
stopword list. Type 4 (coverage blind spots) + P3 (pin accepted proposals into a
GT case) remain for slice 6.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
219 lines
6.3 KiB
Go
219 lines
6.3 KiB
Go
package audit
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/breakpilot/ai-compliance-sdk/internal/iace"
|
|
)
|
|
|
|
// runVocabularyImpl takes a limits-form payload (the structured machine
|
|
// description filled in by the engineer) and asks: which of its words
|
|
// are unknown to the keyword dictionary yet appear in any pattern's
|
|
// scenario/trigger/harm/zone text? Each such word is a dictionary gap —
|
|
// the engineer typed a term that some pattern is waiting for, but the
|
|
// parser cannot translate it into a tag.
|
|
func init() {
|
|
runVocabularyImpl = runVocabulary
|
|
}
|
|
|
|
// tokenRE matches runs of at least four letters, where a letter is ASCII
// a-z/A-Z or one of the German characters ä, ö, ü, ß, Ä, Ö, Ü. Anything
// shorter, and any digit or punctuation, is never part of a token.
var tokenRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)

// stopWords lists German + English filler words that show up in any
// narrative but carry no engineering meaning. Kept short on purpose — we
// only want to drop obvious filler.
//
// Keys are lowercase. Tokens extracted with tokenRE keep their umlauts and ß,
// so every word that has both an ASCII transliteration and an umlaut spelling
// is listed in both forms; otherwise "können" would pass the filter while
// "koennen" is dropped.
//
// Entries shorter than four letters cannot be produced by tokenRE as it
// stands; they are kept so the list stays valid if that minimum is lowered.
var stopWords = map[string]bool{
	"oder": true, "und": true, "auch": true, "wenn": true, "wird": true,
	"werden": true, "kann": true, "koennen": true, "soll": true, "muss": true,
	"sind": true, "eine": true, "einer": true, "einem": true, "einen": true,
	"diese": true, "dieser": true, "dieses": true, "diesem": true, "diesen": true,
	"durch": true, "nach": true, "ueber": true, "unter": true, "zwischen": true,
	"nicht": true, "ohne": true, "fuer": true, "bzw": true, "etc": true,
	"sowie": true, "siehe": true, "etwa": true, "ggf": true, "the": true,
	"with": true, "from": true, "this": true, "that": true, "have": true,
	"insbesondere": true, "ausschliesslich": true, "ebenfalls": true,
	"jeweils": true, "weitere": true, "weiteren": true, "weiterer": true,
	// Umlaut/ß spellings of the transliterated entries above.
	"können": true, "über": true, "ausschließlich": true,
}
|
|
|
|
func runVocabulary(form map[string]any) VocabularyReport {
|
|
limits, ok := form["limits_form"].(map[string]any)
|
|
if !ok {
|
|
// Form may already be the inner object
|
|
limits = form
|
|
}
|
|
|
|
tokens := map[string]bool{}
|
|
for _, v := range limits {
|
|
extractTokens(v, tokens)
|
|
}
|
|
report := VocabularyReport{UniqueTokens: len(tokens)}
|
|
|
|
dictTokens := dictionaryVocabulary()
|
|
|
|
for tok := range tokens {
|
|
if stopWords[tok] {
|
|
continue
|
|
}
|
|
if dictTokenHit(tok, dictTokens) {
|
|
report.KnownTokens = append(report.KnownTokens, tok)
|
|
} else {
|
|
report.UnknownTokens = append(report.UnknownTokens, tok)
|
|
}
|
|
}
|
|
sort.Strings(report.KnownTokens)
|
|
sort.Strings(report.UnknownTokens)
|
|
|
|
// For each unknown token check if any pattern names it
|
|
patterns := iace.AllPatterns()
|
|
byID := make(map[string]iace.HazardPattern, len(patterns))
|
|
for _, p := range patterns {
|
|
byID[p.ID] = p
|
|
}
|
|
for _, tok := range report.UnknownTokens {
|
|
hits := patternsMentioning(tok, patterns)
|
|
if len(hits) == 0 {
|
|
continue
|
|
}
|
|
report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{
|
|
Token: tok,
|
|
PatternIDs: hits,
|
|
SuggestedTags: suggestTagsFor(hits, byID),
|
|
})
|
|
}
|
|
sort.Slice(report.SuggestedDictionaryEntries, func(i, j int) bool {
|
|
return len(report.SuggestedDictionaryEntries[i].PatternIDs) > len(report.SuggestedDictionaryEntries[j].PatternIDs)
|
|
})
|
|
return report
|
|
}
|
|
|
|
func extractTokens(v any, out map[string]bool) {
|
|
switch x := v.(type) {
|
|
case string:
|
|
for _, m := range tokenRE.FindAllString(x, -1) {
|
|
out[strings.ToLower(m)] = true
|
|
}
|
|
case []any:
|
|
for _, e := range x {
|
|
extractTokens(e, out)
|
|
}
|
|
case map[string]any:
|
|
for _, e := range x {
|
|
extractTokens(e, out)
|
|
}
|
|
}
|
|
}
|
|
|
|
// dictionaryVocabulary builds the lowercase set of all keyword strings
|
|
// that the parser will recognize, including normalized forms (umlauts
|
|
// replaced like in the keyword dictionary).
|
|
func dictionaryVocabulary() map[string]bool {
|
|
out := map[string]bool{}
|
|
for _, kw := range iace.GetKeywordDictionary() {
|
|
for _, k := range kw.Keywords {
|
|
out[strings.ToLower(k)] = true
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// dictTokenHit reports whether the token would be matched by any dictionary
// entry. Dictionary entries can be substrings, so the dict is treated as a
// set of stem-like matchers: a token is "known" if it equals a dict word, OR
// contains a dict word as a substring, OR is itself a substring of a dict
// word.
//
// Empty strings never match. strings.Contains(s, "") is true for every s, so
// without this guard a single empty dictionary keyword (or an empty token)
// would mark everything as known and hide every real dictionary gap.
//
// tok is expected to be lowercase, like the keys of dict.
func dictTokenHit(tok string, dict map[string]bool) bool {
	if tok == "" {
		return false
	}
	// Fast path: exact match.
	if dict[tok] {
		return true
	}
	for d := range dict {
		if d == "" {
			continue
		}
		if strings.Contains(tok, d) || strings.Contains(d, tok) {
			return true
		}
	}
	return false
}
|
|
|
|
// patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/harm/
|
|
// zone text names the token as a WHOLE WORD. Whole-word (not substring) matching
|
|
// is essential: a substring match flags common fragments like "stehen" inside
|
|
// "entstehen", producing spurious hits and nonsensical tag suggestions.
|
|
func patternsMentioning(tok string, patterns []iace.HazardPattern) []string {
|
|
tokLower := strings.ToLower(tok)
|
|
seen := map[string]bool{}
|
|
var out []string
|
|
for _, p := range patterns {
|
|
hay := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE)
|
|
matched := false
|
|
for _, w := range tokenRE.FindAllString(hay, -1) {
|
|
if w == tokLower {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
if !matched || seen[p.ID] {
|
|
continue
|
|
}
|
|
seen[p.ID] = true
|
|
out = append(out, p.ID)
|
|
if len(out) >= 8 {
|
|
break
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// suggestTagsFor returns the RequiredComponentTags shared across the naming
|
|
// patterns, ranked by how many of them require each tag (ties broken by name),
|
|
// top 3. These are the candidate tags a dictionary entry for the token should
|
|
// emit so a narrative mentioning the token can trigger those patterns.
|
|
func suggestTagsFor(ids []string, byID map[string]iace.HazardPattern) []string {
|
|
freq := map[string]int{}
|
|
total := 0
|
|
for _, id := range ids {
|
|
p, ok := byID[id]
|
|
if !ok {
|
|
continue
|
|
}
|
|
total++
|
|
seen := map[string]bool{}
|
|
for _, tag := range p.RequiredComponentTags {
|
|
if seen[tag] {
|
|
continue
|
|
}
|
|
seen[tag] = true
|
|
freq[tag]++
|
|
}
|
|
}
|
|
if total == 0 {
|
|
return nil
|
|
}
|
|
type tf struct {
|
|
tag string
|
|
n int
|
|
}
|
|
ranked := make([]tf, 0, len(freq))
|
|
for t, n := range freq {
|
|
ranked = append(ranked, tf{t, n})
|
|
}
|
|
sort.Slice(ranked, func(i, j int) bool {
|
|
if ranked[i].n != ranked[j].n {
|
|
return ranked[i].n > ranked[j].n
|
|
}
|
|
return ranked[i].tag < ranked[j].tag
|
|
})
|
|
// Only suggest a tag shared by >= 40% of the naming patterns. Diffuse tokens
|
|
// (common verbs spread across categories) get no dominant tag and are dropped.
|
|
var out []string
|
|
for _, x := range ranked {
|
|
if float64(x.n)/float64(total) < 0.4 {
|
|
break
|
|
}
|
|
out = append(out, x.tag)
|
|
if len(out) >= 3 {
|
|
break
|
|
}
|
|
}
|
|
return out
|
|
}
|