From df7966656aa888480a467272f64fb4f862652840 Mon Sep 17 00:00:00 2001 From: Benjamin_Boenisch Date: Wed, 24 Jun 2026 10:15:17 +0000 Subject: [PATCH] feat(ai-sdk): classify NIST/OWASP/Grundschutz as technical_standard (#37) --- ai-compliance-sdk/internal/ucca/authority.go | 16 +++++++++++++--- .../internal/ucca/authority_rerank.go | 8 +++++--- .../internal/ucca/authority_test.go | 4 ++++ .../internal/ucca/legal_rag_intent_test.go | 14 ++++++++++++++ 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/ai-compliance-sdk/internal/ucca/authority.go b/ai-compliance-sdk/internal/ucca/authority.go index 53994f5a..715f80c7 100644 --- a/ai-compliance-sdk/internal/ucca/authority.go +++ b/ai-compliance-sdk/internal/ucca/authority.go @@ -9,8 +9,8 @@ import ( // authorityInfo is the normative classification of a search result, used internally // for re-ranking only (Phase 1 changes ordering, not the response contract). type authorityInfo struct { - weight int // 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown - sourceClass string // binding_law | supervisory_guidance | foreign_law | unknown + weight int // 100 binding, 80 technical_standard, 70 guidance, 0 foreign, 50 unknown + sourceClass string // binding_law | technical_standard | supervisory_guidance | foreign_law | unknown jurisdiction string // DE | EU | CH } @@ -18,7 +18,13 @@ var ( guidanceMarkers = []string{ "DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC", "Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss", - "Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide", + "Leitlinie", "Guidance", "Empfehlung", "OECD", "CISA", "Blue Guide", + } + // Technical standards / control frameworks (best-practice controls). Checked BEFORE + // guidanceMarkers so a "BSI Grundschutz" chunk classifies as a standard, not BSI guidance. 
+ standardMarkers = []string{ + "NIST", "OWASP", "Grundschutz", "ISO 27001", "ISO/IEC 27001", + "CSA CCM", "Cloud Controls Matrix", "CIS Benchmark", "CIS Control", } foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"} deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"} @@ -48,6 +54,8 @@ func classifyAuthority(r LegalSearchResult) authorityInfo { switch { case containsAny(hay, foreignMarkers): return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"} + case r.Category == "standard" || containsAny(hay, standardMarkers): + return authorityInfo{weight: 80, sourceClass: "technical_standard", jurisdiction: jur} case r.Category == "guidance" || containsAny(hay, guidanceMarkers): return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur} case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel): @@ -61,6 +69,8 @@ func sourceClassFromWeight(w int) string { switch { case w >= 100: return "binding_law" + case w >= 80: + return "technical_standard" case w >= 70: return "supervisory_guidance" case w <= 0: diff --git a/ai-compliance-sdk/internal/ucca/authority_rerank.go b/ai-compliance-sdk/internal/ucca/authority_rerank.go index 1360b1b5..e5cbf463 100644 --- a/ai-compliance-sdk/internal/ucca/authority_rerank.go +++ b/ai-compliance-sdk/internal/ucca/authority_rerank.go @@ -64,7 +64,7 @@ func bestBindingSemantic(results []LegalSearchResult, wantsIntent bool) float64 } best := 0.0 for _, r := range results { - if r.SourceClass == "binding_law" && r.Score > best { + if classifyAuthority(r).sourceClass == "binding_law" && r.Score > best { best = r.Score } } @@ -152,12 +152,14 @@ func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchR func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) { bestBindingFinal := 0.0 for i := range out { - if out[i].SourceClass == "binding_law" && out[i].Score > 
 bestBindingFinal { + if classifyAuthority(out[i]).sourceClass == "binding_law" && out[i].Score > bestBindingFinal { bestBindingFinal = out[i].Score } } for i := range out { - if out[i].SourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin { + // Classify (not raw payload) so the untagged legacy corpus — e.g. NIST ingested + // before source_class tagging — is still recognized as its interpretative class. + if classifyAuthority(out[i]).sourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin { continue } lifted := bestBindingFinal + intentLiftGain + (raw[i].Score - bestBindingSem) diff --git a/ai-compliance-sdk/internal/ucca/authority_test.go b/ai-compliance-sdk/internal/ucca/authority_test.go index d4109c65..5e63e2a6 100644 --- a/ai-compliance-sdk/internal/ucca/authority_test.go +++ b/ai-compliance-sdk/internal/ucca/authority_test.go @@ -14,6 +14,10 @@ func TestClassifyAuthority(t *testing.T) { {"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"}, {"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"}, {"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"}, + {"untagged NIST standard", LegalSearchResult{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8"}, 80, "technical_standard", "EU"}, + {"BSI Grundschutz standard beats BSI guidance", LegalSearchResult{RegulationShort: "BSI Grundschutz", ArticleLabel: "BSI Grundschutz Baustein"}, 80, "technical_standard", "DE"}, + {"weight-only 85 TRGS standard", LegalSearchResult{AuthorityWeight: 85, RegulationShort: "TRGS 529"}, 85, "technical_standard", "EU"}, + {"tagged technical_standard", LegalSearchResult{AuthorityWeight: 80, SourceClass: "technical_standard", Jurisdiction: "EU"}, 80, 
"technical_standard", "EU"}, {"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"}, {"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"}, {"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"}, diff --git a/ai-compliance-sdk/internal/ucca/legal_rag_intent_test.go b/ai-compliance-sdk/internal/ucca/legal_rag_intent_test.go index 25050b47..7c36c3d8 100644 --- a/ai-compliance-sdk/internal/ucca/legal_rag_intent_test.go +++ b/ai-compliance-sdk/internal/ucca/legal_rag_intent_test.go @@ -132,3 +132,17 @@ func TestRerank_OffTopicStandard_BlockedByGuard(t *testing.T) { t.Errorf("off-topic standard must not win even with control intent, got %s", out[0].SourceClass) } } + +func TestRerank_ControlQuestion_UntaggedNISTLifted(t *testing.T) { + // The existing NIST corpus is UNtagged (no source_class). It must still be classified + // technical_standard via markers and lifted on a control question — the whole reason + // the lift path classifies instead of trusting the raw payload field. + results := []LegalSearchResult{ + {RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8", Score: 0.62}, + {RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation", Score: 0.58}, + } + out := rerankByAuthority("Welche Controls passen zu Security Updates?", results) + if out[0].RegulationShort != "NIST SP 800-82r3" { + t.Errorf("untagged NIST should be lifted Top-1 on a control question, got %q", out[0].RegulationShort) + } +}