// breakpilot-compliance/ai-compliance-sdk/internal/ucca/authority_rerank.go

package ucca

import (
	"sort"
	"strings"
)

// Re-ranking coefficients (validated in the offline golden harness; Phase A — conservative).
// authorityScore adds/subtracts every value below except the two intentLift* entries,
// which are only consumed by liftAboveBinding.
const (
	authorityCoef     = 0.40 // * weight/100
	jurisdictionGain  = 0.05 // binding/guidance from DE or EU (applied to every result that does not take foreignPenalty)
	foreignPenalty    = 0.60 // foreign law on a DE/EU question (demoted, not removed)
	unknownPenalty    = 0.08 // result whose source class is "unknown"
	domainMatchGain   = 0.15 // chunk domain equals the query domain
	offDomainPenalty  = 0.10 // off-domain binding (demoted, not removed)
	scopePenalty      = 0.25 // BDSG Teil 3 (law enforcement) on a general DP question
	topicGain         = 0.18 // amplifier only
	supersededPenalty = 0.50 // superseded legacy source (pre-eu-v1): demoted, not hidden
	intentLiftGain    = 0.10 // epsilon a qualifying interpretative source is lifted ABOVE the best binding
	intentLiftMargin  = 0.05 // ...only if that source is semantically competitive with binding
)

// guidanceIntentSignals mark a query that EXPLICITLY asks for an interpretation /
// recommendation by a guidance body, rather than for the binding obligation. Only
// then may a (semantically competitive) guideline outrank the binding norm.
//
// Entries are lowercase: queryMatchesAny lowercases the query and then performs a
// plain substring test against each entry.
// NOTE(review): because the test is a substring test, short entries also fire inside
// unrelated words — e.g. "bsi" in "website", "sagt" in "untersagt", "laut" in
// "lautet". Confirm this is acceptable or tighten the matching.
var guidanceIntentSignals = []string{
	"edpb", "europäischer datenschutzausschuss", "europaeischer datenschutzausschuss",
	"dsk", "enisa", "bsi", "leitlinie", "guideline", "orientierungshilfe",
	"auslegung", "empfiehlt", "empfehlung", "sagt", "laut",
}

// controlIntentSignals mark a query that asks HOW to implement / which controls or
// measures fit — rather than WHAT the binding obligation is. Only then may a
// (semantically competitive) technical_standard outrank the binding norm.
//
// Entries are lowercase: queryMatchesAny lowercases the query and then performs a
// plain substring test against each entry. Several entries ("implementier",
// "absicher", "härt") look like word stems meant to match inflected forms —
// presumably intentional; verify before switching to whole-word matching.
// NOTE(review): the substring test also makes "control" fire on "controller" (as in
// "data controller"). Confirm this is acceptable or tighten the matching.
var controlIntentSignals = []string{
	"control", "controls", "maßnahme", "massnahme", "schutzmaßnahme",
	"best practice", "best-practice", "umsetzen", "implementier", "absicher",
	"härt", "haert", "hardening", "nist", "owasp", "grundschutz",
	"ccm", "iso 27001", "isms",
}

// queryMatchesAny reports whether the lowercased query contains at least one of the
// given signals as a substring. The signals themselves are compared as-is, so they
// must already be lowercase to be able to match. An empty or nil signal list never
// matches.
func queryMatchesAny(query string, signals []string) bool {
	lowered := strings.ToLower(query)
	for i := 0; i < len(signals); i++ {
		if !strings.Contains(lowered, signals[i]) {
			continue
		}
		return true
	}
	return false
}

// queryWantsGuidance reports whether the query contains any guidanceIntentSignals
// entry, i.e. explicitly asks for guidance/interpretation.
func queryWantsGuidance(query string) bool {
	return queryMatchesAny(query, guidanceIntentSignals)
}

// queryWantsControls reports whether the query contains any controlIntentSignals
// entry, i.e. asks for implementation controls/measures.
func queryWantsControls(query string) bool {
	return queryMatchesAny(query, controlIntentSignals)
}

// bestBindingSemantic returns the highest RAW semantic score (Score as passed in)
// among the results classified as "binding_law". It returns 0 when wantsIntent is
// false, when there is no binding-law result, or when no binding-law score is
// positive. The value serves as the guard threshold so an off-topic interpretative
// source cannot ride the intent boost.
func bestBindingSemantic(results []LegalSearchResult, wantsIntent bool) float64 {
	var top float64
	if !wantsIntent {
		return top
	}
	for i := range results {
		if classifyAuthority(results[i]).sourceClass != "binding_law" {
			continue
		}
		if results[i].Score > top {
			top = results[i].Score
		}
	}
	return top
}

// authorityScore computes the normative relevance of a result for a query: the
// result's semantic Score plus/minus authority, jurisdiction, domain, scope and topic
// adjustments. qDomain is the query's domain ("" disables the domain adjustment) and
// qForeign tells whether the query itself targets foreign law. Exposed for tests.
func authorityScore(query string, r LegalSearchResult, qDomain string, qForeign bool) float64 {
	auth := classifyAuthority(r)
	total := r.Score + authorityCoef*float64(auth.weight)/100.0

	// Superseded legacy source (pre-eu-v1): default questions should see the eu-v1
	// norm. Demoted, not removed — it stays findable for history/transition questions.
	if r.Superseded {
		total -= supersededPenalty
	}

	// Everything except CH law on a non-foreign question receives the jurisdiction
	// gain; CH law on a DE/EU question is demoted, not deleted.
	if auth.jurisdiction != "CH" || qForeign {
		total += jurisdictionGain
	} else {
		total -= foreignPenalty
	}

	if auth.sourceClass == "unknown" {
		total -= unknownPenalty
	}

	// Domain adjustment only applies when the query has a domain; a chunk without a
	// domain is left neutral.
	if qDomain != "" {
		if domain := chunkDomain(r); domain == qDomain {
			total += domainMatchGain
		} else if domain != "" {
			total -= offDomainPenalty // off-domain source: demoted, not deleted
		}
	}

	// Law-enforcement-scoped chunks are demoted on data-protection questions.
	if qDomain == "data_protection" && scopeClass(r) == "law_enforcement" {
		total -= scopePenalty
	}

	// Topic match is an amplifier, not an override.
	if resultMatchesTopic(query, r) {
		total += topicGain
	}
	return total
}

// rerankByAuthority re-orders results so binding law from the matching jurisdiction/domain
// ranks above guidance, foreign and off-domain law — WITHOUT dropping anything (guidance is
// kept as interpretation context). The computed score is written back to Score so downstream
// merges (e.g. the multi-collection advisor) preserve this order. Pure + deterministic.
//
// The input slice is never mutated: for non-empty input a re-scored, re-sorted copy is
// returned. Empty (or nil) input is returned as-is.
//
// A single result is re-scored as well. It needs no re-ordering, but leaving its raw
// semantic Score in place would put it on a different scale than results coming from
// lists with two or more hits, which breaks score-based merging downstream.
func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchResult {
	if len(results) == 0 {
		return results
	}
	qDomain := queryDomain(query)
	qForeign := queryIsForeign(query)
	wantsGuidance := queryWantsGuidance(query)
	wantsControls := queryWantsControls(query)
	// Threshold is taken from the RAW scores, before they are overwritten below.
	bestBindingSem := bestBindingSemantic(results, wantsGuidance || wantsControls)

	out := make([]LegalSearchResult, len(results))
	copy(out, results)
	for i := range out {
		out[i].Score = authorityScore(query, out[i], qDomain, qForeign)
	}
	// Explicit interpretation intent → a competitive guideline may outrank binding;
	// explicit implementation intent → a competitive technical_standard may. Both lift
	// ABOVE the best binding FINAL, so a pure norm question (neither intent) is untouched.
	if wantsGuidance {
		liftAboveBinding(out, results, bestBindingSem, "supervisory_guidance")
	}
	if wantsControls {
		liftAboveBinding(out, results, bestBindingSem, "technical_standard")
	}
	// Stable sort keeps the incoming order for equal scores (determinism).
	sort.SliceStable(out, func(a, b int) bool {
		return out[a].Score > out[b].Score
	})
	return out
}

// liftAboveBinding raises, in place, the Score of every entry in out that is
// classified as sourceClass (supervisory_guidance or technical_standard) and whose RAW
// score is semantically competitive, to just ABOVE the best binding_law FINAL score.
// This lets an EXPLICIT guidance/implementation question return that source Top-1; a
// pure norm question never calls this function, so binding stays on top.
//
//   - out holds the already re-scored results and is the slice that gets modified.
//   - raw holds the same results, index-aligned with out, with their original semantic
//     scores.
//   - bestBindingSem is the best raw semantic score among binding-law results.
//
// Entries more than intentLiftMargin below bestBindingSem are left untouched, so an
// off-topic source cannot ride the override. Lifted entries keep their relative
// semantic order because the raw distance to bestBindingSem is carried over. A score is
// only ever raised, never lowered. When no binding-law entry has a positive final
// score, the reference value is 0.
func liftAboveBinding(out, raw []LegalSearchResult, bestBindingSem float64, sourceClass string) {
	var bindingTop float64
	for i := range out {
		if classifyAuthority(out[i]).sourceClass != "binding_law" {
			continue
		}
		if out[i].Score > bindingTop {
			bindingTop = out[i].Score
		}
	}

	floor := bestBindingSem - intentLiftMargin
	for i := range out {
		// Go through classifyAuthority (not the raw payload) so the untagged legacy
		// corpus — e.g. NIST ingested before source_class tagging — is still recognized
		// as its interpretative class.
		if classifyAuthority(out[i]).sourceClass != sourceClass {
			continue
		}
		if raw[i].Score < floor {
			continue
		}
		candidate := bindingTop + intentLiftGain + (raw[i].Score - bestBindingSem)
		if candidate > out[i].Score {
			out[i].Score = candidate
		}
	}
}