Files
breakpilot-compliance/ai-compliance-sdk/internal/ucca/concept_ontology.go
T
Claude 0903e3a8d1
CI / detect-changes (push) Successful in 5s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 5s
CI / validate-canonical-controls (push) Successful in 4s
CI / loc-budget (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m0s
CI / test-go (push) Successful in 59s
CI / iace-gt-coverage (push) Successful in 17s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
perf(ai-sdk): embed query once across router fan-out + fold umlauts in intent/concept matching
Authority Router re-embedded the query per collection (6x); on dev the embed
endpoint (OVH) is remote so that was 6 round-trips = 7-12s per /retrieve. Embed
once, reuse via ctx across the concurrent per-collection searches.
DetectIntent + ConceptNorms now fold ä/ö/ü/ß to ae/oe/ue/ss so ASCII (Pruefe) and
umlaut (Prüfe) inputs both match.
2026-07-01 19:03:11 +02:00

98 lines
3.9 KiB
Go

package ucca
import (
"sort"
"strings"
)
// conceptNorm is one entry of the legal concept ontology: a curated bridge
// from the words users actually type to the norms that govern them.
//
// Everyday terms ("Datenschutzerklärung", "Cookie Banner") rarely match the
// titles of the governing articles (Art. 12/13/14 DSGVO, § 25 TDDDG), and
// embedding similarity does not make that leap. The ontology therefore encodes
// the mapping by hand as domain knowledge. It is not a hardcoded fallback: its
// purpose is to surface the normatively load-bearing units inside documents
// that retrieval has already found.
//
// Field order is significant for any positional composite literal of this type.
type conceptNorm struct {
	// keywords are the trigger terms for this concept.
	keywords []string
	// normIDs are the load-bearing norm identifiers the concept maps to.
	normIDs []string
}
// legalConceptOntology is the curated concept -> norm table. Each entry lists
// the trigger keywords of one legal concept together with the norm_ids that
// concept should pull in. Entry order and the order of normIDs inside an entry
// are meaningful: ConceptNorms emits norm_ids in exactly this order.
var legalConceptOntology = []conceptNorm{
	{
		keywords: []string{"datenschutzerklärung", "datenschutzerklaerung", "privacy policy", "datenschutzhinweise", "datenschutzinformation"},
		normIDs:  []string{"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14"},
	},
	{
		keywords: []string{"cookie banner", "cookie-banner", "cookies", "cookie", "tracking"},
		normIDs:  []string{"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7"},
	},
	{
		keywords: []string{"dsfa", "folgenabschätzung", "folgenabschaetzung", "datenschutz-folgenabschätzung"},
		normIDs:  []string{"EU-DSGVO-Art35", "EU-DSGVO-Art36"},
	},
	{
		keywords: []string{"auskunft", "auskunftsrecht", "auskunftsersuchen"},
		normIDs:  []string{"EU-DSGVO-Art15"},
	},
	{
		keywords: []string{"löschung", "loeschung", "vergessenwerden", "recht auf vergessen"},
		normIDs:  []string{"EU-DSGVO-Art17"},
	},
	{
		keywords: []string{"datenübertragbarkeit", "datenuebertragbarkeit", "portabilität", "portabilitaet"},
		normIDs:  []string{"EU-DSGVO-Art20"},
	},
	{
		keywords: []string{"widerspruch", "widerspruchsrecht"},
		normIDs:  []string{"EU-DSGVO-Art21"},
	},
	{
		keywords: []string{"datenpanne", "datenschutzverletzung", "data breach", "verletzung des schutzes"},
		normIDs:  []string{"EU-DSGVO-Art33", "EU-DSGVO-Art34"},
	},

	// E4 quick curation (2026-07-01): abbreviations resolved by E2 pull in
	// their core norms via the spelled-out terms below.
	{
		keywords: []string{"technische und organisatorische maßnahmen", "technische und organisatorische massnahmen"},
		normIDs:  []string{"EU-DSGVO-Art32", "EU-DSGVO-Art25", "EU-DSGVO-Art5"},
	},
	{
		keywords: []string{"verzeichnis von verarbeitungstätigkeiten", "verzeichnis von verarbeitungstaetigkeiten", "verarbeitungsverzeichnis"},
		normIDs:  []string{"EU-DSGVO-Art30"},
	},
	{
		keywords: []string{"auftragsverarbeitungsvertrag", "auftragsverarbeitung", "auftragsverarbeiter"},
		normIDs:  []string{"EU-DSGVO-Art28"},
	},
	{
		keywords: []string{"datenschutzbeauftragt"},
		normIDs:  []string{"EU-DSGVO-Art37", "EU-DSGVO-Art38", "EU-DSGVO-Art39"},
	},
}
// ConceptNorms reports the load-bearing norm_ids for every legal concept that
// the query mentions.
//
// Both the query and each ontology keyword are passed through normalizeGerman
// before a substring comparison, so a concept counts as mentioned as soon as
// any one of its normalized keywords occurs anywhere in the normalized query.
// Norm IDs are emitted in ontology order and each ID appears at most once.
// The result is an empty, non-nil slice when no concept is mentioned.
func ConceptNorms(query string) []string {
	folded := normalizeGerman(query)
	emitted := make(map[string]struct{})
	norms := []string{}

	for _, entry := range legalConceptOntology {
		// Stop scanning keywords at the first hit: one match is enough to
		// pull in the entry's norms.
		mentioned := false
		for i := 0; i < len(entry.keywords) && !mentioned; i++ {
			mentioned = strings.Contains(folded, normalizeGerman(entry.keywords[i]))
		}
		if !mentioned {
			continue
		}

		for _, id := range entry.normIDs {
			if _, dup := emitted[id]; dup {
				continue
			}
			emitted[id] = struct{}{}
			norms = append(norms, id)
		}
	}
	return norms
}
// InjectConceptNorms folds concept-injected norm units into the retrieved
// results so that the load-bearing norms are visible in the evidence set.
//
// When injected is empty, results is returned as-is (same slice, neither
// re-sorted nor truncated). Otherwise a new slice is built from results plus
// every injected unit whose CitationUnit is non-empty and not already present
// (either in results or earlier in injected). The combined slice is then
// stably sorted by descending Score and, if topK > 0, cut to at most topK
// entries.
//
// The ranking relies entirely on the Score carried by each unit: callers are
// expected to give injected units a score just below the top hit so they rank
// high without displacing it. This function does not assign or adjust scores.
func InjectConceptNorms(results, injected []LegalSearchResult, topK int) []LegalSearchResult {
	if len(injected) == 0 {
		return results
	}

	// Work on a copy so the caller's results slice is never reordered.
	combined := make([]LegalSearchResult, len(results), len(results)+len(injected))
	copy(combined, results)

	known := make(map[string]bool, len(results)+len(injected))
	for i := range combined {
		if unit := combined[i].CitationUnit; unit != "" {
			known[unit] = true
		}
	}

	for _, cand := range injected {
		unit := cand.CitationUnit
		if unit == "" || known[unit] {
			continue
		}
		known[unit] = true
		combined = append(combined, cand)
	}

	// Stable sort keeps retrieved hits ahead of injected ones on equal scores.
	sort.SliceStable(combined, func(a, b int) bool {
		return combined[a].Score > combined[b].Score
	})

	if topK > 0 && topK < len(combined) {
		return combined[:topK]
	}
	return combined
}