0903e3a8d1
CI / detect-changes (push) Successful in 5s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 5s
CI / validate-canonical-controls (push) Successful in 4s
CI / loc-budget (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m0s
CI / test-go (push) Successful in 59s
CI / iace-gt-coverage (push) Successful in 17s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Authority Router re-embedded the query per collection (6x); on dev the embed endpoint (OVH) is remote so that was 6 round-trips = 7-12s per /retrieve. Embed once, reuse via ctx across the concurrent per-collection searches. DetectIntent + ConceptNorms now fold ae/oe/ue/ss so ASCII (Pruefe) and umlaut (Prüfe) inputs both match.
98 lines
3.9 KiB
Go
98 lines
3.9 KiB
Go
package ucca
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// conceptNorm is one entry of the Legal Concept Ontology — the curated domain
// bridge used by the Concept->Norm recall injector. The words users type
// ("Datenschutzerklärung", "Cookie Banner") are rarely identical to the titles
// of the articles that actually govern them (Art. 12/13/14 DSGVO, § 25 TDDDG).
// Embedding similarity misses this leap, so these bridges are curated by hand:
// concept keyword -> load-bearing norm_ids. This is NOT a fallback to
// hardcoding — it is domain knowledge that surfaces the normatively
// load-bearing units within the (already correctly retrieved) documents.
//
// Keep the field order stable: any unkeyed composite literal of this type
// depends on it.
type conceptNorm struct {
	// keywords are the phrases that name the concept. A query names the
	// concept as soon as it contains any one of them (see ConceptNorms).
	keywords []string
	// normIDs are the norm_ids surfaced for the concept; ConceptNorms emits
	// them in this order.
	normIDs []string
}
|
|
|
|
var legalConceptOntology = []conceptNorm{
|
|
{[]string{"datenschutzerklärung", "datenschutzerklaerung", "privacy policy", "datenschutzhinweise", "datenschutzinformation"},
|
|
[]string{"EU-DSGVO-Art12", "EU-DSGVO-Art13", "EU-DSGVO-Art14"}},
|
|
{[]string{"cookie banner", "cookie-banner", "cookies", "cookie", "tracking"},
|
|
[]string{"DE-TDDDG-§25", "EU-DSGVO-Art6", "EU-DSGVO-Art7"}},
|
|
{[]string{"dsfa", "folgenabschätzung", "folgenabschaetzung", "datenschutz-folgenabschätzung"},
|
|
[]string{"EU-DSGVO-Art35", "EU-DSGVO-Art36"}},
|
|
{[]string{"auskunft", "auskunftsrecht", "auskunftsersuchen"},
|
|
[]string{"EU-DSGVO-Art15"}},
|
|
{[]string{"löschung", "loeschung", "vergessenwerden", "recht auf vergessen"},
|
|
[]string{"EU-DSGVO-Art17"}},
|
|
{[]string{"datenübertragbarkeit", "datenuebertragbarkeit", "portabilität", "portabilitaet"},
|
|
[]string{"EU-DSGVO-Art20"}},
|
|
{[]string{"widerspruch", "widerspruchsrecht"},
|
|
[]string{"EU-DSGVO-Art21"}},
|
|
{[]string{"datenpanne", "datenschutzverletzung", "data breach", "verletzung des schutzes"},
|
|
[]string{"EU-DSGVO-Art33", "EU-DSGVO-Art34"}},
|
|
// E4-Quick-Curation (2026-07-01): resolved abbreviations (E2) pull their core norms.
|
|
{[]string{"technische und organisatorische maßnahmen", "technische und organisatorische massnahmen"},
|
|
[]string{"EU-DSGVO-Art32", "EU-DSGVO-Art25", "EU-DSGVO-Art5"}},
|
|
{[]string{"verzeichnis von verarbeitungstätigkeiten", "verzeichnis von verarbeitungstaetigkeiten", "verarbeitungsverzeichnis"},
|
|
[]string{"EU-DSGVO-Art30"}},
|
|
{[]string{"auftragsverarbeitungsvertrag", "auftragsverarbeitung", "auftragsverarbeiter"},
|
|
[]string{"EU-DSGVO-Art28"}},
|
|
{[]string{"datenschutzbeauftragt"},
|
|
[]string{"EU-DSGVO-Art37", "EU-DSGVO-Art38", "EU-DSGVO-Art39"}},
|
|
}
|
|
|
|
// ConceptNorms returns the load-bearing norm_ids for the concepts named in the
|
|
// query (dedup, order-preserving). Empty if no concept is named.
|
|
func ConceptNorms(query string) []string {
|
|
q := normalizeGerman(query)
|
|
seen := map[string]bool{}
|
|
out := []string{}
|
|
for _, cn := range legalConceptOntology {
|
|
for _, kw := range cn.keywords {
|
|
if strings.Contains(q, normalizeGerman(kw)) {
|
|
for _, nid := range cn.normIDs {
|
|
if !seen[nid] {
|
|
seen[nid] = true
|
|
out = append(out, nid)
|
|
}
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// InjectConceptNorms merges concept-injected norm units into the results so the
|
|
// load-bearing norms are VISIBLE in the evidence set. Dedups by citation_unit
|
|
// (skips norms already retrieved), then re-sorts by score — the injected units
|
|
// carry a just-below-top score so they surface high WITHOUT displacing the top
|
|
// document hit (inject, don't blindly dominate). Caps at topK.
|
|
func InjectConceptNorms(results, injected []LegalSearchResult, topK int) []LegalSearchResult {
|
|
if len(injected) == 0 {
|
|
return results
|
|
}
|
|
present := map[string]bool{}
|
|
for _, r := range results {
|
|
if r.CitationUnit != "" {
|
|
present[r.CitationUnit] = true
|
|
}
|
|
}
|
|
merged := append([]LegalSearchResult{}, results...)
|
|
for _, in := range injected {
|
|
if in.CitationUnit != "" && !present[in.CitationUnit] {
|
|
merged = append(merged, in)
|
|
present[in.CitationUnit] = true
|
|
}
|
|
}
|
|
sort.SliceStable(merged, func(i, j int) bool { return merged[i].Score > merged[j].Score })
|
|
if topK > 0 && len(merged) > topK {
|
|
merged = merged[:topK]
|
|
}
|
|
return merged
|
|
}
|