feat(ai-sdk): classify NIST/OWASP/Grundschutz as technical_standard (#37)
CI / detect-changes (push) Successful in 4s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 6s
CI / validate-canonical-controls (push) Successful in 3s
CI / loc-budget (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Successful in 1m0s
CI / iace-gt-coverage (push) Successful in 14s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 4s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 6s
CI / validate-canonical-controls (push) Successful in 3s
CI / loc-budget (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Successful in 1m0s
CI / iace-gt-coverage (push) Successful in 14s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
This commit was merged in pull request #37.
This commit is contained in:
@@ -9,8 +9,8 @@ import (
|
|||||||
// authorityInfo is the normative classification of a search result, used internally
|
// authorityInfo is the normative classification of a search result, used internally
|
||||||
// for re-ranking only (Phase 1 changes ordering, not the response contract).
|
// for re-ranking only (Phase 1 changes ordering, not the response contract).
|
||||||
type authorityInfo struct {
|
type authorityInfo struct {
|
||||||
weight int // 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown
|
weight int // 100 binding, 80 technical_standard, 70 guidance, 0 foreign, 50 unknown
|
||||||
sourceClass string // binding_law | supervisory_guidance | foreign_law | unknown
|
sourceClass string // binding_law | technical_standard | supervisory_guidance | foreign_law | unknown
|
||||||
jurisdiction string // DE | EU | CH
|
jurisdiction string // DE | EU | CH
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -18,7 +18,13 @@ var (
|
|||||||
guidanceMarkers = []string{
|
guidanceMarkers = []string{
|
||||||
"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
|
"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
|
||||||
"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
|
"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
|
||||||
"Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide",
|
"Leitlinie", "Guidance", "Empfehlung", "OECD", "CISA", "Blue Guide",
|
||||||
|
}
|
||||||
|
// Technical standards / control frameworks (best-practice controls). Checked BEFORE
|
||||||
|
// guidanceMarkers so a "BSI Grundschutz" chunk classifies as a standard, not BSI guidance.
|
||||||
|
standardMarkers = []string{
|
||||||
|
"NIST", "OWASP", "Grundschutz", "ISO 27001", "ISO/IEC 27001",
|
||||||
|
"CSA CCM", "Cloud Controls Matrix", "CIS Benchmark", "CIS Control",
|
||||||
}
|
}
|
||||||
foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}
|
foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}
|
||||||
deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}
|
deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}
|
||||||
@@ -48,6 +54,8 @@ func classifyAuthority(r LegalSearchResult) authorityInfo {
|
|||||||
switch {
|
switch {
|
||||||
case containsAny(hay, foreignMarkers):
|
case containsAny(hay, foreignMarkers):
|
||||||
return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"}
|
return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"}
|
||||||
|
case r.Category == "standard" || containsAny(hay, standardMarkers):
|
||||||
|
return authorityInfo{weight: 80, sourceClass: "technical_standard", jurisdiction: jur}
|
||||||
case r.Category == "guidance" || containsAny(hay, guidanceMarkers):
|
case r.Category == "guidance" || containsAny(hay, guidanceMarkers):
|
||||||
return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur}
|
return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur}
|
||||||
case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel):
|
case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel):
|
||||||
@@ -61,6 +69,8 @@ func sourceClassFromWeight(w int) string {
|
|||||||
switch {
|
switch {
|
||||||
case w >= 100:
|
case w >= 100:
|
||||||
return "binding_law"
|
return "binding_law"
|
||||||
|
case w >= 80:
|
||||||
|
return "technical_standard"
|
||||||
case w >= 70:
|
case w >= 70:
|
||||||
return "supervisory_guidance"
|
return "supervisory_guidance"
|
||||||
case w <= 0:
|
case w <= 0:
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func bestBindingSemantic(results []LegalSearchResult, wantsIntent bool) float64
|
|||||||
}
|
}
|
||||||
best := 0.0
|
best := 0.0
|
||||||
for _, r := range results {
|
for _, r := range results {
|
||||||
if r.SourceClass == "binding_law" && r.Score > best {
|
if classifyAuthority(r).sourceClass == "binding_law" && r.Score > best {
|
||||||
best = r.Score
|
best = r.Score
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -152,12 +152,14 @@ func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchR
|
|||||||
func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) {
|
func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) {
|
||||||
bestBindingFinal := 0.0
|
bestBindingFinal := 0.0
|
||||||
for i := range out {
|
for i := range out {
|
||||||
if out[i].SourceClass == "binding_law" && out[i].Score > bestBindingFinal {
|
if classifyAuthority(out[i]).sourceClass == "binding_law" && out[i].Score > bestBindingFinal {
|
||||||
bestBindingFinal = out[i].Score
|
bestBindingFinal = out[i].Score
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i := range out {
|
for i := range out {
|
||||||
if out[i].SourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin {
|
// Classify (not raw payload) so the untagged legacy corpus — e.g. NIST ingested
|
||||||
|
// before source_class tagging — is still recognized as its interpretative class.
|
||||||
|
if classifyAuthority(out[i]).sourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
lifted := bestBindingFinal + intentLiftGain + (raw[i].Score - bestBindingSem)
|
lifted := bestBindingFinal + intentLiftGain + (raw[i].Score - bestBindingSem)
|
||||||
|
|||||||
@@ -14,6 +14,10 @@ func TestClassifyAuthority(t *testing.T) {
|
|||||||
{"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"},
|
{"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"},
|
||||||
{"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"},
|
{"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"},
|
||||||
{"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"},
|
{"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"},
|
||||||
|
{"untagged NIST standard", LegalSearchResult{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8"}, 80, "technical_standard", "EU"},
|
||||||
|
{"BSI Grundschutz standard beats BSI guidance", LegalSearchResult{RegulationShort: "BSI Grundschutz", ArticleLabel: "BSI Grundschutz Baustein"}, 80, "technical_standard", "DE"},
|
||||||
|
{"weight-only 85 TRGS standard", LegalSearchResult{AuthorityWeight: 85, RegulationShort: "TRGS 529"}, 85, "technical_standard", "EU"},
|
||||||
|
{"tagged technical_standard", LegalSearchResult{AuthorityWeight: 80, SourceClass: "technical_standard", Jurisdiction: "EU"}, 80, "technical_standard", "EU"},
|
||||||
{"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"},
|
{"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"},
|
||||||
{"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"},
|
{"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"},
|
||||||
{"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"},
|
{"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"},
|
||||||
|
|||||||
@@ -132,3 +132,17 @@ func TestRerank_OffTopicStandard_BlockedByGuard(t *testing.T) {
|
|||||||
t.Errorf("off-topic standard must not win even with control intent, got %s", out[0].SourceClass)
|
t.Errorf("off-topic standard must not win even with control intent, got %s", out[0].SourceClass)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRerank_ControlQuestion_UntaggedNISTLifted(t *testing.T) {
|
||||||
|
// The existing NIST corpus is UNtagged (no source_class). It must still be classified
|
||||||
|
// technical_standard via markers and lifted on a control question — the whole reason
|
||||||
|
// the lift path classifies instead of trusting the raw payload field.
|
||||||
|
results := []LegalSearchResult{
|
||||||
|
{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8", Score: 0.62},
|
||||||
|
{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation", Score: 0.58},
|
||||||
|
}
|
||||||
|
out := rerankByAuthority("Welche Controls passen zu Security Updates?", results)
|
||||||
|
if out[0].RegulationShort != "NIST SP 800-82r3" {
|
||||||
|
t.Errorf("untagged NIST should be lifted Top-1 on a control question, got %q", out[0].RegulationShort)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user