Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c28c532958 | |||
| 989d9f6f91 | |||
| 4c99773fa1 |
@@ -9,8 +9,8 @@ import (
|
||||
// authorityInfo is the normative classification of a search result, used internally
|
||||
// for re-ranking only (Phase 1 changes ordering, not the response contract).
|
||||
type authorityInfo struct {
|
||||
weight int // 100 binding, 80 technical_standard, 70 guidance, 0 foreign, 50 unknown
|
||||
sourceClass string // binding_law | technical_standard | supervisory_guidance | foreign_law | unknown
|
||||
weight int // 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown
|
||||
sourceClass string // binding_law | supervisory_guidance | foreign_law | unknown
|
||||
jurisdiction string // DE | EU | CH
|
||||
}
|
||||
|
||||
@@ -18,13 +18,7 @@ var (
|
||||
guidanceMarkers = []string{
|
||||
"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
|
||||
"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
|
||||
"Leitlinie", "Guidance", "Empfehlung", "OECD", "CISA", "Blue Guide",
|
||||
}
|
||||
// Technical standards / control frameworks (best-practice controls). Checked BEFORE
|
||||
// guidanceMarkers so a "BSI Grundschutz" chunk classifies as a standard, not BSI guidance.
|
||||
standardMarkers = []string{
|
||||
"NIST", "OWASP", "Grundschutz", "ISO 27001", "ISO/IEC 27001",
|
||||
"CSA CCM", "Cloud Controls Matrix", "CIS Benchmark", "CIS Control",
|
||||
"Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide",
|
||||
}
|
||||
foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}
|
||||
deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}
|
||||
@@ -54,8 +48,6 @@ func classifyAuthority(r LegalSearchResult) authorityInfo {
|
||||
switch {
|
||||
case containsAny(hay, foreignMarkers):
|
||||
return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"}
|
||||
case r.Category == "standard" || containsAny(hay, standardMarkers):
|
||||
return authorityInfo{weight: 80, sourceClass: "technical_standard", jurisdiction: jur}
|
||||
case r.Category == "guidance" || containsAny(hay, guidanceMarkers):
|
||||
return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur}
|
||||
case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel):
|
||||
@@ -69,8 +61,6 @@ func sourceClassFromWeight(w int) string {
|
||||
switch {
|
||||
case w >= 100:
|
||||
return "binding_law"
|
||||
case w >= 80:
|
||||
return "technical_standard"
|
||||
case w >= 70:
|
||||
return "supervisory_guidance"
|
||||
case w <= 0:
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
package ucca
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
import "sort"
|
||||
|
||||
// Re-ranking coefficients (validated in the offline golden harness; Phase A — conservative).
|
||||
const (
|
||||
@@ -16,61 +13,8 @@ const (
|
||||
scopePenalty = 0.25 // BDSG Teil 3 (law enforcement) on a general DP question
|
||||
topicGain = 0.18 // amplifier only
|
||||
supersededPenalty = 0.50 // superseded Alt-Quelle (pre-eu-v1): demoted, nicht versteckt
|
||||
intentLiftGain = 0.10 // epsilon a qualifying interpretative source is lifted ABOVE the best binding
|
||||
intentLiftMargin = 0.05 // ...only if that source is semantically competitive with binding
|
||||
)
|
||||
|
||||
// guidanceIntentSignals mark a query that EXPLICITLY asks for an interpretation /
// recommendation by a guidance body, rather than for the binding obligation. Only
// then may a (semantically competitive) guideline outrank the binding norm.
//
// Entries are matched as case-insensitive substrings of the query.
var guidanceIntentSignals = []string{
	"edpb", "europäischer datenschutzausschuss", "europaeischer datenschutzausschuss",
	"dsk", "enisa", "bsi", "leitlinie", "guideline", "orientierungshilfe",
	"auslegung", "empfiehlt", "empfehlung", "sagt", "laut",
}

// controlIntentSignals mark a query that asks HOW to implement / which controls or
// measures fit — rather than WHAT the binding obligation is. Only then may a
// (semantically competitive) technical_standard outrank the binding norm.
//
// Entries are matched as case-insensitive substrings of the query; several are
// word stems ("implementier", "absicher", "härt") rather than whole words.
var controlIntentSignals = []string{
	"control", "controls", "maßnahme", "massnahme", "schutzmaßnahme",
	"best practice", "best-practice", "umsetzen", "implementier", "absicher",
	"härt", "haert", "hardening", "nist", "owasp", "grundschutz",
	"ccm", "iso 27001", "isms",
}

// queryMatchesAny reports whether query contains at least one entry of signals.
//
// The comparison is case-insensitive on both sides, so a signal written with
// capitals (e.g. "ENISA") still matches. Empty signals are ignored: the empty
// string is a substring of every query and would otherwise flag every query as
// carrying intent. A nil or empty signals slice yields false.
//
// NOTE(review): this is plain substring matching, so a short signal also matches
// inside a longer word (e.g. "bsi" occurs in "website"). Confirm that is acceptable
// before adding further short signals.
func queryMatchesAny(query string, signals []string) bool {
	q := strings.ToLower(query)
	for _, sig := range signals {
		if sig == "" {
			continue
		}
		if strings.Contains(q, strings.ToLower(sig)) {
			return true
		}
	}
	return false
}

// queryWantsGuidance reports whether the query explicitly asks for guidance/interpretation,
// i.e. whether it contains any of guidanceIntentSignals.
func queryWantsGuidance(query string) bool { return queryMatchesAny(query, guidanceIntentSignals) }

// queryWantsControls reports whether the query asks for implementation controls/measures,
// i.e. whether it contains any of controlIntentSignals.
func queryWantsControls(query string) bool { return queryMatchesAny(query, controlIntentSignals) }
|
||||
|
||||
// bestBindingSemantic returns the highest RAW semantic score among binding-law
|
||||
// results (0 if none / no intent). Used as the guard threshold so an off-topic
|
||||
// interpretative source cannot ride the intent boost.
|
||||
func bestBindingSemantic(results []LegalSearchResult, wantsIntent bool) float64 {
|
||||
if !wantsIntent {
|
||||
return 0
|
||||
}
|
||||
best := 0.0
|
||||
for _, r := range results {
|
||||
if classifyAuthority(r).sourceClass == "binding_law" && r.Score > best {
|
||||
best = r.Score
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// authorityScore computes the normative relevance of a result for a query. It augments the
|
||||
// semantic score with authority/jurisdiction/domain/scope/topic signals. Exposed for tests.
|
||||
func authorityScore(query string, r LegalSearchResult, qDomain string, qForeign bool) float64 {
|
||||
@@ -118,53 +62,14 @@ func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchR
|
||||
}
|
||||
qDomain := queryDomain(query)
|
||||
qForeign := queryIsForeign(query)
|
||||
wantsGuidance := queryWantsGuidance(query)
|
||||
wantsControls := queryWantsControls(query)
|
||||
bestBindingSem := bestBindingSemantic(results, wantsGuidance || wantsControls)
|
||||
|
||||
out := make([]LegalSearchResult, len(results))
|
||||
copy(out, results)
|
||||
for i := range out {
|
||||
out[i].Score = authorityScore(query, out[i], qDomain, qForeign)
|
||||
}
|
||||
// Explicit interpretation intent → a competitive guideline may outrank binding;
|
||||
// explicit implementation intent → a competitive technical_standard may. Both lift
|
||||
// ABOVE the best binding FINAL, so a pure norm question (neither intent) is untouched.
|
||||
if wantsGuidance {
|
||||
liftAboveBinding(out, results, bestBindingSem, "supervisory_guidance")
|
||||
}
|
||||
if wantsControls {
|
||||
liftAboveBinding(out, results, bestBindingSem, "technical_standard")
|
||||
}
|
||||
sort.SliceStable(out, func(a, b int) bool {
|
||||
return out[a].Score > out[b].Score
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
// liftAboveBinding lifts a semantically-competitive interpretative source (the given
|
||||
// sourceClass — supervisory_guidance or technical_standard) just ABOVE the best binding
|
||||
// hit, ordered by semantic, so an EXPLICIT guidance/implementation question can return
|
||||
// that source Top-1. A pure norm question (no intent → not called) keeps binding on top.
|
||||
// Sources below the semantic margin are left untouched, so an off-topic source can never
|
||||
// ride the override — and the lift is from the binding FINAL score, so authority/topic/
|
||||
// domain bonuses cannot edge it out.
|
||||
func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) {
|
||||
bestBindingFinal := 0.0
|
||||
for i := range out {
|
||||
if classifyAuthority(out[i]).sourceClass == "binding_law" && out[i].Score > bestBindingFinal {
|
||||
bestBindingFinal = out[i].Score
|
||||
}
|
||||
}
|
||||
for i := range out {
|
||||
// Classify (not raw payload) so the untagged legacy corpus — e.g. NIST ingested
|
||||
// before source_class tagging — is still recognized as its interpretative class.
|
||||
if classifyAuthority(out[i]).sourceClass != sourceClass || raw[i].Score < bestBindingSem-intentLiftMargin {
|
||||
continue
|
||||
}
|
||||
lifted := bestBindingFinal + intentLiftGain + (raw[i].Score - bestBindingSem)
|
||||
if lifted > out[i].Score {
|
||||
out[i].Score = lifted
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,10 +14,6 @@ func TestClassifyAuthority(t *testing.T) {
|
||||
{"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"},
|
||||
{"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"},
|
||||
{"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"},
|
||||
{"untagged NIST standard", LegalSearchResult{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8"}, 80, "technical_standard", "EU"},
|
||||
{"BSI Grundschutz standard beats BSI guidance", LegalSearchResult{RegulationShort: "BSI Grundschutz", ArticleLabel: "BSI Grundschutz Baustein"}, 80, "technical_standard", "DE"},
|
||||
{"weight-only 85 TRGS standard", LegalSearchResult{AuthorityWeight: 85, RegulationShort: "TRGS 529"}, 85, "technical_standard", "EU"},
|
||||
{"tagged technical_standard", LegalSearchResult{AuthorityWeight: 80, SourceClass: "technical_standard", Jurisdiction: "EU"}, 80, "technical_standard", "EU"},
|
||||
{"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"},
|
||||
{"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"},
|
||||
{"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"},
|
||||
|
||||
@@ -1,148 +0,0 @@
|
||||
package ucca
|
||||
|
||||
import "testing"
|
||||
|
||||
func intentRes(reg, sourceClass string, sem float64, weight int) LegalSearchResult {
|
||||
return LegalSearchResult{
|
||||
RegulationShort: reg, SourceClass: sourceClass, Score: sem,
|
||||
AuthorityWeight: weight, Jurisdiction: "EU",
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryWantsGuidance(t *testing.T) {
|
||||
wants := []string{
|
||||
"Was empfiehlt der EDPB zum DSB?",
|
||||
"Was sagt die ENISA zu Security Updates?",
|
||||
"laut DSK ...",
|
||||
"Orientierungshilfe zur DSFA",
|
||||
"Welche BSI-Empfehlung gilt?",
|
||||
"Auslegung der Aufsichtsbehörde",
|
||||
}
|
||||
plain := []string{
|
||||
"Ab wann braucht man einen Datenschutzbeauftragten?",
|
||||
"Welche Anforderungen bestehen an Security Updates?",
|
||||
}
|
||||
for _, q := range wants {
|
||||
if !queryWantsGuidance(q) {
|
||||
t.Errorf("should detect interpretation intent: %q", q)
|
||||
}
|
||||
}
|
||||
for _, q := range plain {
|
||||
if queryWantsGuidance(q) {
|
||||
t.Errorf("should NOT detect intent (norm question): %q", q)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_NormQuestion_BindingStaysTop(t *testing.T) {
|
||||
// No intent signal → binding wins even though guidance is semantically higher.
|
||||
results := []LegalSearchResult{
|
||||
intentRes("EDPB DPO", "supervisory_guidance", 0.64, 70),
|
||||
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Ab wann braucht man einen Datenschutzbeauftragten?", results)
|
||||
if out[0].SourceClass != "binding_law" {
|
||||
t.Errorf("norm question: binding must stay Top-1, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_InterpretationQuestion_GuidanceMayWin(t *testing.T) {
|
||||
// Explicit intent + guidance semantically competitive → guidance wins.
|
||||
results := []LegalSearchResult{
|
||||
intentRes("EDPB DPO", "supervisory_guidance", 0.64, 70),
|
||||
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Was empfiehlt der EDPB zum Datenschutzbeauftragten?", results)
|
||||
if out[0].SourceClass != "supervisory_guidance" {
|
||||
t.Errorf("interpretation question: guidance should win Top-1, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_OffTopicGuidance_BlockedByGuard(t *testing.T) {
|
||||
// Intent present, but guidance semantic is far below the best binding hit →
|
||||
// the margin guard keeps binding on top (no off-topic guideline override).
|
||||
results := []LegalSearchResult{
|
||||
intentRes("EDPB DPO", "supervisory_guidance", 0.40, 70),
|
||||
intentRes("DSGVO", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Was empfiehlt der EDPB zum Datenschutzbeauftragten?", results)
|
||||
if out[0].SourceClass != "binding_law" {
|
||||
t.Errorf("off-topic guidance must not win even with intent, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryWantsControls(t *testing.T) {
|
||||
wants := []string{
|
||||
"Welche Controls passen zu Security Updates?",
|
||||
"Welche Maßnahmen sollten wir umsetzen?",
|
||||
"Wie härten wir den Server ab?",
|
||||
"Gibt es NIST-Controls dafür?",
|
||||
"OWASP Best Practice für Logging?",
|
||||
"BSI Grundschutz Bausteine",
|
||||
}
|
||||
plain := []string{
|
||||
"Welche Anforderungen bestehen an Security Updates?",
|
||||
"Ab wann braucht man einen Datenschutzbeauftragten?",
|
||||
}
|
||||
for _, q := range wants {
|
||||
if !queryWantsControls(q) {
|
||||
t.Errorf("should detect control/implementation intent: %q", q)
|
||||
}
|
||||
}
|
||||
for _, q := range plain {
|
||||
if queryWantsControls(q) {
|
||||
t.Errorf("should NOT detect control intent (norm question): %q", q)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_ControlQuestion_StandardMayWin(t *testing.T) {
|
||||
// Explicit implementation intent + standard semantically competitive → standard wins.
|
||||
results := []LegalSearchResult{
|
||||
intentRes("NIST SP 800-82", "technical_standard", 0.62, 80),
|
||||
intentRes("CRA", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Welche Controls passen zu Security Updates?", results)
|
||||
if out[0].SourceClass != "technical_standard" {
|
||||
t.Errorf("control question: technical_standard should win Top-1, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_NormQuestion_BindingOverStandard(t *testing.T) {
|
||||
// "Anforderungen" → no control intent → binding stays Top-1 over the standard.
|
||||
results := []LegalSearchResult{
|
||||
intentRes("NIST SP 800-82", "technical_standard", 0.62, 80),
|
||||
intentRes("CRA", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Welche Anforderungen bestehen an Security Updates?", results)
|
||||
if out[0].SourceClass != "binding_law" {
|
||||
t.Errorf("norm question: binding must stay Top-1 over standard, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_OffTopicStandard_BlockedByGuard(t *testing.T) {
|
||||
// Control intent present, but the standard is semantically far below binding →
|
||||
// the margin guard keeps binding Top-1 (no off-topic standard override).
|
||||
results := []LegalSearchResult{
|
||||
intentRes("NIST SP 800-82", "technical_standard", 0.40, 80),
|
||||
intentRes("CRA", "binding_law", 0.58, 100),
|
||||
}
|
||||
out := rerankByAuthority("Welche Controls passen zu Security Updates?", results)
|
||||
if out[0].SourceClass != "binding_law" {
|
||||
t.Errorf("off-topic standard must not win even with control intent, got %s", out[0].SourceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRerank_ControlQuestion_UntaggedNISTLifted(t *testing.T) {
|
||||
// The existing NIST corpus is UNtagged (no source_class). It must still be classified
|
||||
// technical_standard via markers and lifted on a control question — the whole reason
|
||||
// the lift path classifies instead of trusting the raw payload field.
|
||||
results := []LegalSearchResult{
|
||||
{RegulationShort: "NIST SP 800-82r3", ArticleLabel: "AU-8", Score: 0.62},
|
||||
{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation", Score: 0.58},
|
||||
}
|
||||
out := rerankByAuthority("Welche Controls passen zu Security Updates?", results)
|
||||
if out[0].RegulationShort != "NIST SP 800-82r3" {
|
||||
t.Errorf("untagged NIST should be lifted Top-1 on a control question, got %q", out[0].RegulationShort)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user