breakpilot-compliance/ai-compliance-sdk/internal/ucca/control_role.go

package ucca

import "strings"

// source_role is the FUNCTIONAL role of a chunk — WHAT must be done (obligation),
// HOW to implement it (operational/procedural requirement, control standard,
// implementation guidance), or how to READ the norm (interpretation/definition).
// It is ORTHOGONAL to source_class (legal authority): source_class decides RANK,
// source_role decides CONTROL-POOL membership for implementation questions.
// Derived deterministically from markers, so the untagged corpus needs no re-tag.
//
// The constants below are the complete set of values classifyRole can return.
// Four of them (operational_requirement, procedural_requirement, control_standard,
// implementation_guidance) form the control-pool; the other three do not.
const (
	roleObligation      = "obligation"              // the abstract duty (the WHAT)
	roleOperationalReq  = "operational_requirement" // concrete binding requirement (CRA Annex I)
	roleProceduralReq   = "procedural_requirement"  // a process: notification/registration/DPIA/incident report
	roleControlStandard = "control_standard"        // best-practice control catalog (NIST/OWASP/ISO/CIS)
	roleImplGuidance    = "implementation_guidance" // advisory how-to (ENISA good practices, BSI)
	roleInterpretation  = "interpretation"          // interprets the norm's MEANING (EDPB guideline)
	roleDefinition      = "definition"              // definitions / scope / recitals
)

// Marker vocabularies used by classifyRole. Each list mixes German and English
// spellings; classifyRole tests them against a lower-cased haystack built from the
// hit's article label, regulation short name, regulation name and article.
//
// NOTE(review): very short markers such as "ICS" and "TIG" may also hit inside
// longer words if containsAnyLower does plain substring matching — confirm against
// that helper before extending these lists.

// proceduralMarkers flag a process duty: notification, registration, declaration
// of conformity, incident reporting, impact assessment.
var proceduralMarkers = []string{
	"Meldung", "Meldepflicht", "Notification",
	"Notifizierung", "Registrierung", "Registration",
	"Konformitätserklärung", "Declaration of Conformity", "Incident",
	"Berichterstattung", "Reporting", "Folgenabschätzung",
	"DSFA", "DPIA", "Anzeigepflicht",
}

// annexMarkers flag content that lives in an annex/appendix of a norm.
var annexMarkers = []string{
	"Anhang", "Annex", "Appendix", "Anlage",
}

// operationalMarkers flag concrete requirement wording.
var operationalMarkers = []string{
	"Anforderung", "Requirement", "essential", "wesentliche",
}

// implMarkers flag advisory how-to material within guidance documents.
var implMarkers = []string{
	"Good Practice", "Best Practice", "Standards Mapping",
	"Umsetzung", "Implementation", "Handreichung",
	"Maßnahmenkatalog", "ICS", "SCADA",
	"Technical Guideline", "TIG",
}

// definitionMarkers flag definition articles.
var definitionMarkers = []string{
	"Begriffsbestimmung", "Definition",
}

// classifyRole maps a search hit to its functional source_role.
//
// Decision order (first match wins):
//  1. recital                      -> definition
//  2. class "technical_standard"   -> control_standard
//  3. class "supervisory_guidance" -> implementation_guidance if an implMarker
//     matches, otherwise interpretation
//  4. class "binding_law"          -> definition, then procedural_requirement,
//     then operational_requirement (annex OR operational marker), else obligation
//  5. any other class              -> obligation
//
// Markers are tested against the lower-cased concatenation of ArticleLabel,
// RegulationShort, RegulationName and Article, joined by single spaces.
func classifyRole(r LegalSearchResult) string {
	authority := classifyAuthority(r).sourceClass

	if r.IsRecital {
		return roleDefinition
	}
	if authority == "technical_standard" {
		return roleControlStandard
	}
	if authority != "supervisory_guidance" && authority != "binding_law" {
		return roleObligation
	}

	// Only the two marker-driven classes reach this point.
	text := strings.ToLower(strings.Join(
		[]string{r.ArticleLabel, r.RegulationShort, r.RegulationName, r.Article},
		" ",
	))

	if authority == "supervisory_guidance" {
		if containsAnyLower(text, implMarkers) {
			return roleImplGuidance
		}
		return roleInterpretation
	}

	// binding_law: the check order is the precedence between marker families.
	if containsAnyLower(text, definitionMarkers) {
		return roleDefinition
	}
	if containsAnyLower(text, proceduralMarkers) {
		return roleProceduralReq
	}
	if containsAnyLower(text, annexMarkers) || containsAnyLower(text, operationalMarkers) {
		return roleOperationalReq
	}
	return roleObligation
}

// controlRoleBonus is the soft intra-pool preference (User 2026-06-24):
// operational_requirement > procedural_requirement > control_standard > implementation_guidance.
//
// The map doubles as the control-pool membership set for applyControlRoles: a role
// receives controlPoolGain plus its bonus only if it has a key here. That is why
// implementation_guidance is listed explicitly even though its bonus is zero.
var controlRoleBonus = map[string]float64{
	roleOperationalReq:  0.100,
	roleProceduralReq:   0.075,
	roleControlStandard: 0.050,
	roleImplGuidance:    0.000, // zero bonus, but the key keeps the role in the pool
}

// controlPoolGain lifts EVERY control-pool role over the non-control roles (obligation/
// interpretation/definition) on an implementation question, so the binding abstract
// obligation does not dominate by authority alone. The obligation is not removed — it
// stays visible as "Rechtsgrundlage" (legal basis) context below the recommended measures.
//
// applyControlRoles adds this flat gain, together with the per-role controlRoleBonus,
// to the Score of every hit whose role has an entry in controlRoleBonus.
const controlPoolGain = 0.15

// applyControlRoles raises the score of every control-pool hit in out, in place.
//
// A hit belongs to the pool when its classified role has an entry in
// controlRoleBonus; such a hit gains controlPoolGain plus that role's bonus, which
// yields the soft order op_req > procedural > standard > guidance. Hits with any
// other role are left unchanged. Intended for EXPLICIT implementation questions.
//
// This replaces the earlier "lift technical_standard above binding" rule: controls
// are not only technical_standard, and a binding operational_requirement
// (e.g. CRA Annex I) should win.
func applyControlRoles(out []LegalSearchResult) {
	for idx := range out {
		hit := &out[idx]
		bonus, inPool := controlRoleBonus[classifyRole(*hit)]
		if !inPool {
			continue
		}
		hit.Score += controlPoolGain + bonus
	}
}

// isControlPoolRole reports whether role is one of the four "how to implement"
// roles that make up the control-pool surfaced on implementation questions:
// operational_requirement, procedural_requirement, control_standard or
// implementation_guidance. Every other value, including the empty string, is false.
func isControlPoolRole(role string) bool {
	return role == roleOperationalReq ||
		role == roleProceduralReq ||
		role == roleControlStandard ||
		role == roleImplGuidance
}

// controlRoleOf classifies a raw Qdrant payload into a source_role, so searchControls can
// filter its deep dense pull to the control-pool BEFORE hits are mapped to LegalSearchResult.
func controlRoleOf(payload map[string]interface{}) string {
	article := getString(payload, "article")
	if article == "" {
		article = getString(payload, "section")
	}
	return classifyRole(LegalSearchResult{
		RegulationShort: getString(payload, "regulation_short"),
		RegulationName:  getString(payload, "regulation_name_de"),
		ArticleLabel:    getString(payload, "article_label"),
		Article:         article,
		Category:        getString(payload, "category"),
		SourceClass:     getString(payload, "source_class"),
		AuthorityWeight: getInt(payload, "authority_weight"),
		IsRecital:       getBool(payload, "is_recital"),
	})
}

// ensureControlDiversity makes the top-K of a control question surface at least one
// operational_requirement and one control_standard WHEN the pool contains them and a
// REDUNDANT implementation_guidance slot is available to give up — without ever
// forcing the promoted hit to Top-1.
//
// For each of the two wanted roles that is missing from results[:topK], the
// best-ranked hit of that role from the tail (index >= topK) is copied over the
// lowest-ranked implementation_guidance slot of the head. A guidance slot counts as
// redundant only while at least two guidance hits are in the head: the last
// remaining guidance hit is never overwritten. This keeps guidance (e.g. ENISA good
// practices) in the list with its earned semantic lead, and guarantees the
// overwritten slot is never index 0. So an implementation question shows the
// relevant source ROLES (binding requirement + standard + guidance) side by side
// instead of one role flooding the list.
//
// Parameters:
//   - results: hits in rank order; modified IN PLACE.
//   - topK: number of leading hits the caller will return.
//
// Returns the same slice. The promoted hit's original (now duplicate) position stays
// in the tail and is expected to be dropped by the caller's truncation to topK.
// If topK <= 0 or topK >= len(results) the slice is returned untouched.
func ensureControlDiversity(results []LegalSearchResult, topK int) []LegalSearchResult {
	if topK <= 0 || topK >= len(results) {
		return results // everything is already returned — nothing to promote
	}

	// Classify every hit once; roleAt is kept in sync with the promotions below.
	roleAt := make([]string, len(results))
	for i := range results {
		roleAt[i] = classifyRole(results[i])
	}

	// Record which roles the head already shows and how many guidance slots it has.
	present := make(map[string]bool, topK)
	guidanceInHead := 0
	for _, role := range roleAt[:topK] {
		present[role] = true
		if role == roleImplGuidance {
			guidanceInHead++
		}
	}

	for _, want := range []string{roleOperationalReq, roleControlStandard} {
		if present[want] {
			continue
		}
		if guidanceInHead < 2 {
			// No redundant guidance left to sacrifice. Overwriting the last guidance
			// hit would drop that role from the head and could put the promoted hit
			// at Top-1, so leave the head untouched.
			break
		}

		// Best-ranked tail hit of the missing role.
		src := -1
		for i := topK; i < len(results); i++ {
			if roleAt[i] == want {
				src = i
				break
			}
		}
		if src < 0 {
			continue // role absent from the whole pool — nothing to promote
		}

		// Lowest-ranked guidance slot in the head. With two or more guidance hits in
		// the head this is always found at index >= 1.
		dst := -1
		for j := topK - 1; j > 0; j-- {
			if roleAt[j] == roleImplGuidance {
				dst = j
				break
			}
		}
		if dst < 0 {
			continue // defensive: counter and roleAt disagree — do not touch the head
		}

		results[dst] = results[src]
		roleAt[dst] = want
		present[want] = true
		guidanceInHead--
	}
	return results
}