package ucca

import (
	"regexp"
	"strconv"
	"strings"
)

// authorityInfo is the normative classification of a search result, used internally
// for re-ranking only (Phase 1 changes ordering, not the response contract).
type authorityInfo struct {
	weight       int    // 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown
	sourceClass  string // binding_law | supervisory_guidance | foreign_law | unknown
	jurisdiction string // DE | EU | CH
}

var (
	// guidanceMarkers are case-sensitive substrings; when one occurs in a result's
	// label/regulation fields, classifyAuthority treats the result as supervisory_guidance.
	guidanceMarkers = []string{
		"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
		"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
		"Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide",
	}

	// foreignMarkers are case-sensitive substrings that mark a result as foreign law
	// with jurisdiction CH.
	foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}

	// deMarkers are case-sensitive substrings from which inferJurisdiction infers DE.
	deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}

	// normPattern matches "§", "Art" or "Art." followed by optional whitespace and a
	// digit; classifyAuthority applies it to the article label to detect binding law.
	normPattern = regexp.MustCompile(`(§|Art\.?)\s*\d`)

	// bdsgParagraph captures the digits following a "§"; scopeClass uses the first
	// match as the paragraph number.
	bdsgParagraph = regexp.MustCompile(`§\s*(\d+)`)
)

// classifyAuthority derives weight/source-class/jurisdiction. Explicitly tagged payload
// values win; otherwise it falls back to the curated category + name markers, so the
// not-yet-re-ingested (untagged) corpus is still classified deterministically.
func classifyAuthority(r LegalSearchResult) authorityInfo { jur := r.Jurisdiction if jur == "" { jur = inferJurisdiction(r) } if r.SourceClass != "" { w := r.AuthorityWeight if w == 0 && r.SourceClass == "binding_law" { w = 100 } return authorityInfo{weight: w, sourceClass: r.SourceClass, jurisdiction: jur} } if r.AuthorityWeight > 0 { return authorityInfo{weight: r.AuthorityWeight, sourceClass: sourceClassFromWeight(r.AuthorityWeight), jurisdiction: jur} } hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + " " + r.RegulationCode switch { case containsAny(hay, foreignMarkers): return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"} case r.Category == "guidance" || containsAny(hay, guidanceMarkers): return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur} case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel): return authorityInfo{weight: 100, sourceClass: "binding_law", jurisdiction: jur} default: return authorityInfo{weight: 50, sourceClass: "unknown", jurisdiction: jur} } } func sourceClassFromWeight(w int) string { switch { case w >= 100: return "binding_law" case w >= 70: return "supervisory_guidance" case w <= 0: return "foreign_law" default: return "unknown" } } func inferJurisdiction(r LegalSearchResult) string { hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName switch { case containsAny(hay, foreignMarkers): return "CH" case strings.Contains(hay, "§") || containsAny(hay, deMarkers): return "DE" default: return "EU" } } // --- Domain routing: separates same-authority but topically foreign norms --- type domainDef struct { name string regs []string // regulation markers found in a chunk keywords []string // query keywords that signal this domain } // Deterministic order (slice, not map) — important for stable classification + tests. 
var domains = []domainDef{ {"data_protection", []string{"DSGVO", "GDPR", "BDSG", "EDPB", "DSK", "BfDI", "BayLfD", "DPF"}, []string{"personenbezogen", "betroffene", "datenschutz", "datenschutzbeauftrag", "dsb", "datenpanne", "auskunft", "loesch", "lösch", "einwilligung", "besondere kategorien", "auftragsverarbeiter"}}, {"cyber", []string{"CRA", "NIS2", "NIS-2", "ENISA", "DORA", "EUCC"}, []string{"security update", "sicherheitsupdate", "sicherheitsaktualisierung", "schwachstelle", "sbom", "cybersicherheit", "konformit", "hersteller", "importeur", "haendler", "händler", "ikt-", "resilienz", "sicherheitsvorfall", "digitalen elementen"}}, {"ai", []string{"AI Act", "KI-VO", "KI-Verordnung"}, []string{"ki-system", "ki-modell", "hochrisiko", "kuenstliche intelligenz", "künstliche intelligenz"}}, {"product_safety", []string{"Maschinenverordnung", "MaschinenVO", "GPSR", "RED", "MDR"}, nil}, } func queryDomain(query string) string { ql := strings.ToLower(query) for _, d := range domains { for _, kw := range d.keywords { if strings.Contains(ql, kw) { return d.name } } } return "" } func chunkDomain(r LegalSearchResult) string { hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationCode + " " + r.RegulationName for _, d := range domains { if containsAny(hay, d.regs) { return d.name } } return "" } // scopeClass flags special sub-regimes that must not win general questions — // BDSG Teil 3 (§§ 45-84) implements the JI directive (law enforcement), not the general regime. 
func scopeClass(r LegalSearchResult) string { hay := r.ArticleLabel + " " + r.RegulationShort if strings.Contains(hay, "BDSG") { if m := bdsgParagraph.FindStringSubmatch(hay); m != nil { if n, err := strconv.Atoi(m[1]); err == nil && n >= 45 && n <= 84 { return "law_enforcement" } } } return "general" } // --- Topic ontology: amplifier only (boost), never an override --- type topicDef struct { keywords []string norms []string // preferred canonical citation fragments } var topics = []topicDef{ {[]string{"datenschutzbeauftrag", "dsb", "benennung"}, []string{"Art. 37", "§ 38 BDSG"}}, {[]string{"stellung des"}, []string{"Art. 38"}}, {[]string{"aufgaben des"}, []string{"Art. 39"}}, {[]string{"folgenabsch", "dsfa"}, []string{"Art. 35"}}, {[]string{"besondere kategorien"}, []string{"Art. 9", "§ 22 BDSG"}}, {[]string{"auskunft"}, []string{"Art. 15", "§ 34 BDSG"}}, {[]string{"loesch", "lösch"}, []string{"Art. 17", "§ 35 BDSG"}}, {[]string{"bussgeld", "geldbusse"}, []string{"Art. 83"}}, {[]string{"security update", "sicherheitsupdate", "schwachstelle", "sbom", "cybersicherheitsanforderung"}, []string{"CRA Anhang I"}}, {[]string{"meldepflicht", "sicherheitsvorfall"}, []string{"Art. 14 CRA"}}, } // resultMatchesTopic reports whether the result is a preferred norm of a topic the query hits. func resultMatchesTopic(query string, r LegalSearchResult) bool { ql := strings.ToLower(query) hay := r.ArticleLabel + " " + r.RegulationShort for _, t := range topics { if !containsAnyLower(ql, t.keywords) { continue } for _, n := range t.norms { if normMatches(hay, n) { return true } } } return false } // normMatches checks that norm appears in hay with a non-digit boundary, so "Art. 9" // matches "Art. 9 DSGVO" but not "Art. 90". 
// Every occurrence of norm is examined, so an earlier "Art. 90" does not hide a later
// "Art. 9" in the same string.
func normMatches(hay, norm string) bool {
	for from := 0; from <= len(hay); {
		rel := strings.Index(hay[from:], norm)
		if rel < 0 {
			return false
		}
		end := from + rel + len(norm)
		// Accept when the match ends the string or is not directly followed by a digit.
		if end >= len(hay) || hay[end] < '0' || hay[end] > '9' {
			return true
		}
		// This occurrence is a prefix of a longer number; resume just past its start.
		from += rel + 1
	}
	return false
}

// queryIsForeign reports whether the query refers to a foreign (Swiss or Austrian)
// legal regime. Country and source names match as case-insensitive substrings; the
// country code "ch" matches only as a standalone token, so it is recognised at the
// start or end of the query and next to punctuation, but not inside words such as
// "Rechtsprechung".
func queryIsForeign(query string) bool {
	lowered := strings.ToLower(query)
	if containsAnyLower(lowered, []string{"schweiz", "revdsg", "fedlex", "oesterreich", "österreich"}) {
		return true
	}

	// Split on ASCII characters that are neither letters nor digits; non-ASCII runes
	// (e.g. umlauts) stay part of their token.
	tokens := strings.FieldsFunc(lowered, func(c rune) bool {
		return c < 0x80 && !(c >= 'a' && c <= 'z') && !(c >= '0' && c <= '9')
	})
	for _, tok := range tokens {
		if tok == "ch" {
			return true
		}
	}
	return false
}

// containsAny reports whether hay contains at least one of markers as a
// case-sensitive substring. It returns false for an empty marker list.
func containsAny(hay string, markers []string) bool {
	for _, m := range markers {
		if strings.Contains(hay, m) {
			return true
		}
	}
	return false
}

// containsAnyLower reports whether haylower contains at least one of markers after
// lower-casing the marker. The caller must pass haylower already lower-cased; it is
// not normalised here.
func containsAnyLower(haylower string, markers []string) bool {
	for _, m := range markers {
		if strings.Contains(haylower, strings.ToLower(m)) {
			return true
		}
	}
	return false
}