Files
breakpilot-compliance/ai-compliance-sdk/internal/ucca/knowledge_space.go
T

149 lines
7.0 KiB
Go

package ucca
import "strings"
// KnowledgeSpace is the CHIP-level knowledge domain used by the clarity gate's
// concentration signal + the user-facing context chips. It is deliberately RICHER
// than the 4 authority domains in authority.go (data_protection/cyber/ai/
// product_safety), which drive the EU-primary/subsidiarity rerank. The clarity
// gate must reflect the FULL corpus breadth (arbeitsschutz, arbeitsrecht,
// wirtschaftsrecht, finanz, ...) so a broad query surfaces as broad. Kept separate
// + additive so the tuned authority rerank stays untouched. Corpus-grounded from
// the 463 real regulation codes (0.3% fall through to "sonstiges").
// knowledgeSpaceExact resolves short or ambiguous regulation codes by EXACT
// string match. Keys are upper-case; substring matching would misfire on 2-3
// character codes such as "OR", "AO" or "BGB", so those codes live here.
//
// The table is assembled once from a per-space code list so that every space id
// is spelled in exactly one place. A code listed under two spaces would silently
// take the later one, so keep each code in a single group.
var knowledgeSpaceExact = func() map[string]string {
	groups := []struct {
		space string
		codes []string
	}{
		{"wirtschaftsrecht", []string{
			"HGB", "BGB", "AO", "OR", "ABGB", "UGB", "IFRS", "BAO",
			"GMBHG", "AKTG", "INSO", "USTG", "GOBD", "EGBGB", "GEWO", "URHG",
		}},
		{"datenschutz", []string{
			"DPF", "TKG", "TMG", "DDG", "DSG", "DSV", "DSM", "SCC", "EPRIVACY",
			"SCHREMS II", "CH_REVDSG", "PLANET49", "GOOGLE FONTS",
		}},
		{"digitale_dienste", []string{
			"DSA", "DMA", "DGA", "EHDS", "EIDAS", "EIDAS 2.0",
			"DATA ACT", "DATAACT", "DIGITAL CONTENT",
		}},
		{"produktsicherheit", []string{
			"MVO", "MACHINERY", "MASCHVO", "MASCHINENVO", "GPSR", "PID", "EAA",
			"BFSG", "ELEKTROG", "VERPACKG", "BATTVO", "BATTDG", "EU MDR",
		}},
		{"finanz", []string{
			"DORA", "PSD2", "MICA", "AMLR", "VAIT", "BAIT", "GWG",
		}},
		{"verbraucherschutz", []string{
			"UWG", "UCPD", "VSBG", "PANGV", "DL-INFOV", "OMNIBUS", "UWG AT",
			"PRODHAFTG", "PRODUKTHAFTUNGS-RL",
		}},
		{"arbeitsrecht", []string{"ARG"}},
	}

	total := 0
	for _, g := range groups {
		total += len(g.codes)
	}

	table := make(map[string]string, total)
	for _, g := range groups {
		for _, code := range g.codes {
			table[code] = g.space
		}
	}
	return table
}()
// KnowledgeSpaceLabel gives the user-facing chip label for each knowledge-space
// id. Ids without an entry here yield "" on lookup.
var KnowledgeSpaceLabel = map[string]string{
	"datenschutz":       "Datenschutz",
	"cyber":             "Cybersecurity",
	"ki":                "KI",
	"produktsicherheit": "Produktsicherheit",
	"arbeitsschutz":     "Arbeitsschutz",
	"arbeitsrecht":      "Arbeitsrecht",
	"wirtschaftsrecht":  "Wirtschaftsrecht",
	"finanz":            "Finanzregulierung",
	"digitale_dienste":  "Digitale Dienste",
	"verbraucherschutz": "Verbraucherschutz",
	"lieferkette":       "Lieferkette/Nachhaltigkeit",
	"hinweisgeber":      "Hinweisgeberschutz",
	"sonstiges":         "Sonstiges",
}
// KnowledgeSpaceOf maps a regulation_code to a knowledge-space id.
//
// The code is trimmed and upper-cased before matching. An empty code or the
// literal "NONE" returns "" (untagged, not a knowledge space). Otherwise the
// exact table knowledgeSpaceExact is consulted first, then the ordered
// prefix/substring rules below; the FIRST matching rule wins, and anything that
// matches nothing returns "sonstiges".
//
// Robust to code variants: MVO/MASCHVO/MASCHINENVO -> produktsicherheit;
// DSK SDM / SDM B51 -> datenschutz.
func KnowledgeSpaceOf(code string) string {
	c := strings.ToUpper(strings.TrimSpace(code))
	if c == "" || c == "NONE" {
		return ""
	}
	if space, ok := knowledgeSpaceExact[c]; ok {
		return space
	}

	// has reports whether the normalized code contains any of subs.
	has := func(subs ...string) bool {
		for _, s := range subs {
			if strings.Contains(c, s) {
				return true
			}
		}
		return false
	}
	// pre reports whether the normalized code starts with any of subs.
	pre := func(subs ...string) bool {
		for _, s := range subs {
			if strings.HasPrefix(c, s) {
				return true
			}
		}
		return false
	}

	// Rule order is significant: earlier cases shadow later ones.
	switch {
	case pre("TRGS", "TRBS", "ASR", "OSHA") || has("ARBSCHG", "GEFAHRSTOFF"):
		return "arbeitsschutz"
	case has("AI ACT", "KI-VO", "KI VERORDNUNG", "GPAI", "AI RMF", "HLEG AI", "GENAI", "OECD AI", "AI PRINCIPLES", "OH KI", "KI BEHOERDEN", "KI SICHERHEIT", "POS KI"):
		return "ki"
	// "SDM" covers codes such as "SDM B51" that lack the "DSK" prefix; without
	// it they fell through to "sonstiges".
	case pre("DSGVO", "BDSG", "TDDDG", "DSK", "SDM", "EDPB", "WP24", "WP25", "WP26", "DSFA", "BFDI", "BAYLDA", "BAYLFB", "EDPS") || has("DATENSCHUTZ", "LOESCHKONZEPT", "LOESCHUNG", "VVT", "TELEMEDIEN", "EU US DPF", "BESCHAEFTIGTENDATEN"):
		return "datenschutz"
	case has("CRA", "NIS2", "NISG", "BSIG", "BSI-TR", "BSI_KRITIS", "KRITIS", "ENISA", "NIST", "OWASP", "EUCSA", "EUCC", "CISA", "CYCLONEDX", "SPDX", "SLSA", "OPENTELEMETRY", "CVSS", "SECURE BY DESIGN"):
		return "cyber"
	case has("MACHINERY", "MASCH", "BLUE GUIDE", "FDA HFE"):
		return "produktsicherheit"
	// "TAXONOMY" also captures "EU TAXONOMY" codes, so they resolve here.
	case has("LKSG", "CSDDD", "CSRD", "TAXONOMY"):
		return "lieferkette"
	case has("HINSCHG", "GESCHGEHG"):
		return "hinweisgeber"
	case pre("BAG ", "BAG_") || has("ARBVG", "AZG", "ARBZG", "BETRVG", "KSCHG", "MUSCHG", "AGG", "MILOG", "TZBFG", "NACHWG", "BURLG", "611A", "PAY TRANSPARENCY", "ANGG", "MUTTERSCHUTZ"):
		return "arbeitsrecht"
	case has("ECOMMERCE", "ECG", "MEDIENG", "VERBRAUCHERRECHTE", "DIGITAL CONTENT"):
		return "verbraucherschutz"
	// Court-decision prefixes. The former has("EU TAXONOMY") alternative was
	// unreachable (shadowed by the lieferkette rule) and has been dropped.
	case pre("EUGH", "BVERFG", "BVGE", "BGH", "OGH"):
		return "wirtschaftsrecht"
	default:
		return "sonstiges"
	}
}
// ScopeResults implements G1 scope-gating: hits whose regulation code resolves
// (via KnowledgeSpaceOf) to the given scope are moved to the FRONT of the result
// set, followed by all remaining hits as backfill. Relative order inside each of
// the two partitions is preserved.
//
// When topK > 0 the reordered list is cut to at most topK entries; topK <= 0
// means no cap. An empty scope returns the input slice itself, unchanged and
// uncapped. For a non-empty scope a new slice is returned and the input is not
// modified.
func ScopeResults(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
	if scope == "" {
		return results
	}

	// Classify every hit once so both passes below agree on membership.
	inScope := make([]bool, len(results))
	for i := range results {
		inScope[i] = KnowledgeSpaceOf(results[i].RegulationCode) == scope
	}

	// First pass emits the scoped hits, second pass the backfill.
	ordered := make([]LegalSearchResult, 0, len(results))
	for _, wantScoped := range []bool{true, false} {
		for i, hit := range inScope {
			if hit == wantScoped {
				ordered = append(ordered, results[i])
			}
		}
	}

	if topK > 0 && topK < len(ordered) {
		return ordered[:topK]
	}
	return ordered
}
// FilterByKnowledgeSpace keeps ONLY the results whose regulation code resolves
// (via KnowledgeSpaceOf) to the given knowledge space — a HARD scope with no
// off-domain backfill. Used by E5 context scoping: once the user has explicitly
// picked a domain chip, off-domain regulations (MDR/UStG/eIDAS) must not
// reappear in the evidence.
//
// Input order is preserved. When topK > 0 the filtered list is cut to at most
// topK entries. If nothing matches, the original input is returned instead so
// the answer is never left without evidence; an empty scope also returns the
// input as-is.
//
// NOTE(review): the no-match fallback and the empty-scope path return the input
// without applying topK — confirm callers expect that.
func FilterByKnowledgeSpace(results []LegalSearchResult, scope string, topK int) []LegalSearchResult {
	if scope == "" {
		return results
	}

	kept := make([]LegalSearchResult, 0, len(results))
	for i := range results {
		if KnowledgeSpaceOf(results[i].RegulationCode) != scope {
			continue
		}
		kept = append(kept, results[i])
	}

	switch {
	case len(kept) == 0:
		// No hit in the chosen domain: hand back everything we were given.
		return results
	case topK > 0 && len(kept) > topK:
		return kept[:topK]
	default:
		return kept
	}
}