Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5f8009e844 | |||
| 079bb56922 | |||
| 24bb449a79 | |||
| 8af9584d09 | |||
| ce6b4c58e3 | |||
| f6d018234b | |||
| 32e45f0797 | |||
| 9d79cf1576 |
@@ -35,25 +35,6 @@ Dies ist ein **Legal RAG**. Eine falsch zitierte Fundstelle ist schlimmer als ga
|
|||||||
- **Interne IDs** (Control-IDs wie SEC-xxxx, MC-/M-Nummern) gehoeren NICHT in die Nutzerantwort
|
- **Interne IDs** (Control-IDs wie SEC-xxxx, MC-/M-Nummern) gehoeren NICHT in die Nutzerantwort
|
||||||
als Hauptaussage — fuehre die Pflicht im Klartext, eine ID hoechstens in Klammern nachgestellt.
|
als Hauptaussage — fuehre die Pflicht im Klartext, eine ID hoechstens in Klammern nachgestellt.
|
||||||
|
|
||||||
## Korpus-Autoritaet & Aktualitaet — der Kontext schlaegt dein Gedaechtnis (KRITISCH)
|
|
||||||
Gesetze aendern sich nach deinem Trainingsstand. Der bereitgestellte RAG-/Controls-Kontext bildet
|
|
||||||
den AKTUELLEN Rechtsstand ab — dein Trainingswissen kann veraltet sein. Diese Regel gilt fuer
|
|
||||||
FAKTEN, nicht nur fuer Fundstellen (ergaenzt **Quellentreue**).
|
|
||||||
- Rechtliche **Fakten** (Schwellenwerte, Fristen, Zahlen, ob/ab-wann eine Pflicht gilt,
|
|
||||||
Zustaendigkeiten) nimmst du AUSSCHLIESSLICH aus dem bereitgestellten Kontext. Dein Trainingswissen
|
|
||||||
dient nur fuer Sprache, Struktur und Schlussfolgerung — **niemals als Rechtsquelle**.
|
|
||||||
- Steht ein gefragter Fakt NICHT im Kontext: gib KEINE aus dem Gedaechtnis erinnerte Zahl/Frist/
|
|
||||||
Schwelle aus — auch nicht beilaeufig im Fliesstext ohne Fundstelle. Sag offen, dass du ihn aus
|
|
||||||
deinen geprueften Quellen nicht belegen kannst, nenne Pflicht/Thema allgemein, und biete den
|
|
||||||
naechsten Schritt an (gezielt nachschlagen / mit DSB oder Anwalt verifizieren).
|
|
||||||
- **Konflikt-Transparenz**: Weicht der Kontext von dem ab, was dir "gelaeufig" vorkommt, gewinnt
|
|
||||||
IMMER der Kontext. Mach es ruhig transparent — z.B. "Die aktuelle Quelle nennt 20; eine evtl.
|
|
||||||
aeltere, gelaeufige Annahme (10) gilt hier nicht."
|
|
||||||
- **Co-Pilot-Ton, keine Roboter-Verweigerung**: formuliere "Aus meinen geprueften Quellen kann ich
|
|
||||||
X nicht belegen — ich kann es gezielt nachschlagen, oder du klaerst es mit deinem DSB/Anwalt"
|
|
||||||
statt eines harten "Nein". Du bleibst hilfreicher Begleiter, gibst dem Nutzer aber keine
|
|
||||||
ungesicherte Rechtsangabe als Tatsache mit.
|
|
||||||
|
|
||||||
## Kompetenzbereich
|
## Kompetenzbereich
|
||||||
- DSGVO Art. 1-99 + Erwaegungsgruende
|
- DSGVO Art. 1-99 + Erwaegungsgruende
|
||||||
- BDSG (Bundesdatenschutzgesetz)
|
- BDSG (Bundesdatenschutzgesetz)
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ export async function POST(request: NextRequest) {
|
|||||||
let systemContent = soulPrompt || FALLBACK_SYSTEM_PROMPT
|
let systemContent = soulPrompt || FALLBACK_SYSTEM_PROMPT
|
||||||
if (validCountry) systemContent += countryBlock(validCountry)
|
if (validCountry) systemContent += countryBlock(validCountry)
|
||||||
if (ragContext) {
|
if (ragContext) {
|
||||||
systemContent += `\n\n## Relevanter Kontext aus dem RAG-System (deine EINZIGEN Rechtsquellen)\n\nDies sind deine einzigen zulaessigen Rechtsquellen. Triff keine konkrete Rechtsaussage (Zahl, Frist, Schwelle, Pflicht, Fundstelle), die nicht hier oder im Controls-Block belegt ist — sonst sage offen, dass du sie aus deinen Quellen nicht belegen kannst. Verweise in deiner Antwort auf die jeweilige Quelle:\n\n${ragContext}`
|
systemContent += `\n\n## Relevanter Kontext aus dem RAG-System\n\nNutze die folgenden Quellen fuer deine Antwort. Verweise in deiner Antwort auf die jeweilige Quelle:\n\n${ragContext}`
|
||||||
}
|
}
|
||||||
if (controlsContext) systemContent += `\n\n${controlsContext}`
|
if (controlsContext) systemContent += `\n\n${controlsContext}`
|
||||||
systemContent += `\n\n## Aktueller SDK-Schritt\nDer Nutzer befindet sich im SDK-Schritt: ${currentStep}`
|
systemContent += `\n\n## Aktueller SDK-Schritt\nDer Nutzer befindet sich im SDK-Schritt: ${currentStep}`
|
||||||
|
|||||||
@@ -1,220 +0,0 @@
|
|||||||
package ucca
|
|
||||||
|
|
||||||
import (
|
|
||||||
"regexp"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// authorityInfo holds the normative classification computed for one search result.
// It is consumed internally by the re-ranking code only: Phase 1 changes the ordering
// of results, not the response contract.
type authorityInfo struct {
	// weight is the authority weight: 100 binding_law, 70 guidance, 0 foreign_law, 50 unknown.
	weight int
	// sourceClass is one of binding_law | supervisory_guidance | foreign_law | unknown.
	sourceClass string
	// jurisdiction is one of DE | EU | CH.
	jurisdiction string
}
|
|
||||||
|
|
||||||
// guidanceMarkers are case-sensitive substrings; classifyAuthority treats an untagged
// result whose label/name fields contain one of them as supervisory guidance.
var guidanceMarkers = []string{
	"DSK", "EDPB", "BfDI", "BFDI", "BayLfD", "Baylfb", "ENISA", "BSI", "EUCC",
	"Standards Mapping", "Kpnr", "Orientierungshilfe", "Handreichung", "Beschluss",
	"Leitlinie", "Guidance", "Empfehlung", "NIST", "OECD", "CISA", "Blue Guide",
}

// foreignMarkers are case-sensitive substrings that mark a result as foreign (CH) law.
var foreignMarkers = []string{"RevDSG", "fedlex", "(CH)"}

// deMarkers are case-sensitive substrings that make inferJurisdiction answer "DE".
var deMarkers = []string{"BDSG", "DSK", "BfDI", "BFDI", "BayLfD", "Baylfb", "BSI"}

// normPattern matches a norm citation: "§" or "Art"/"Art." followed by optional
// whitespace and a digit.
var normPattern = regexp.MustCompile(`(§|Art\.?)\s*\d`)

// bdsgParagraph captures the paragraph number that follows a "§" sign.
var bdsgParagraph = regexp.MustCompile(`§\s*(\d+)`)
|
|
||||||
|
|
||||||
// classifyAuthority derives weight/source-class/jurisdiction. Explicitly tagged payload
|
|
||||||
// values win; otherwise it falls back to the curated category + name markers, so the
|
|
||||||
// not-yet-re-ingested (untagged) corpus is still classified deterministically.
|
|
||||||
func classifyAuthority(r LegalSearchResult) authorityInfo {
|
|
||||||
jur := r.Jurisdiction
|
|
||||||
if jur == "" {
|
|
||||||
jur = inferJurisdiction(r)
|
|
||||||
}
|
|
||||||
if r.SourceClass != "" {
|
|
||||||
w := r.AuthorityWeight
|
|
||||||
if w == 0 && r.SourceClass == "binding_law" {
|
|
||||||
w = 100
|
|
||||||
}
|
|
||||||
return authorityInfo{weight: w, sourceClass: r.SourceClass, jurisdiction: jur}
|
|
||||||
}
|
|
||||||
if r.AuthorityWeight > 0 {
|
|
||||||
return authorityInfo{weight: r.AuthorityWeight, sourceClass: sourceClassFromWeight(r.AuthorityWeight), jurisdiction: jur}
|
|
||||||
}
|
|
||||||
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName + " " + r.RegulationCode
|
|
||||||
switch {
|
|
||||||
case containsAny(hay, foreignMarkers):
|
|
||||||
return authorityInfo{weight: 0, sourceClass: "foreign_law", jurisdiction: "CH"}
|
|
||||||
case r.Category == "guidance" || containsAny(hay, guidanceMarkers):
|
|
||||||
return authorityInfo{weight: 70, sourceClass: "supervisory_guidance", jurisdiction: jur}
|
|
||||||
case r.Category == "regulation" || r.Category == "eu_recht" || normPattern.MatchString(r.ArticleLabel):
|
|
||||||
return authorityInfo{weight: 100, sourceClass: "binding_law", jurisdiction: jur}
|
|
||||||
default:
|
|
||||||
return authorityInfo{weight: 50, sourceClass: "unknown", jurisdiction: jur}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// sourceClassFromWeight maps an authority weight to a source class:
// >= 100 binding_law, 70..99 supervisory_guidance, 1..69 unknown, <= 0 foreign_law.
func sourceClassFromWeight(w int) string {
	if w >= 100 {
		return "binding_law"
	}
	if w >= 70 {
		return "supervisory_guidance"
	}
	if w > 0 {
		return "unknown"
	}
	return "foreign_law"
}
|
|
||||||
|
|
||||||
func inferJurisdiction(r LegalSearchResult) string {
|
|
||||||
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationName
|
|
||||||
switch {
|
|
||||||
case containsAny(hay, foreignMarkers):
|
|
||||||
return "CH"
|
|
||||||
case strings.Contains(hay, "§") || containsAny(hay, deMarkers):
|
|
||||||
return "DE"
|
|
||||||
default:
|
|
||||||
return "EU"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Domain routing: separates same-authority but topically foreign norms ---
|
|
||||||
|
|
||||||
// domainDef describes one regulatory domain used for domain routing.
type domainDef struct {
	name     string   // domain identifier returned by queryDomain / chunkDomain
	regs     []string // regulation markers found in a chunk
	keywords []string // query keywords that signal this domain
}

// domains lists the known domains. It is a slice rather than a map so that the
// iteration order — and therefore classification and tests — stays deterministic.
var domains = []domainDef{
	{
		name: "data_protection",
		regs: []string{"DSGVO", "GDPR", "BDSG", "EDPB", "DSK", "BfDI", "BayLfD", "DPF"},
		keywords: []string{
			"personenbezogen", "betroffene", "datenschutz", "datenschutzbeauftrag", "dsb",
			"datenpanne", "auskunft", "loesch", "lösch", "einwilligung", "besondere kategorien",
			"auftragsverarbeiter",
		},
	},
	{
		name: "cyber",
		regs: []string{"CRA", "NIS2", "NIS-2", "ENISA", "DORA", "EUCC"},
		keywords: []string{
			"security update", "sicherheitsupdate", "sicherheitsaktualisierung", "schwachstelle", "sbom",
			"cybersicherheit", "konformit", "hersteller", "importeur", "haendler", "händler", "ikt-",
			"resilienz", "sicherheitsvorfall", "digitalen elementen",
		},
	},
	{
		name: "ai",
		regs: []string{"AI Act", "KI-VO", "KI-Verordnung"},
		keywords: []string{
			"ki-system", "ki-modell", "hochrisiko", "kuenstliche intelligenz", "künstliche intelligenz",
		},
	},
	{
		// No query keywords: this domain is only ever assigned to chunks.
		name: "product_safety",
		regs: []string{"Maschinenverordnung", "MaschinenVO", "GPSR", "RED", "MDR"},
	},
}
|
|
||||||
|
|
||||||
func queryDomain(query string) string {
|
|
||||||
ql := strings.ToLower(query)
|
|
||||||
for _, d := range domains {
|
|
||||||
for _, kw := range d.keywords {
|
|
||||||
if strings.Contains(ql, kw) {
|
|
||||||
return d.name
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func chunkDomain(r LegalSearchResult) string {
|
|
||||||
hay := r.ArticleLabel + " " + r.RegulationShort + " " + r.RegulationCode + " " + r.RegulationName
|
|
||||||
for _, d := range domains {
|
|
||||||
if containsAny(hay, d.regs) {
|
|
||||||
return d.name
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// scopeClass flags special sub-regimes that must not win general questions —
|
|
||||||
// BDSG Teil 3 (§§ 45-84) implements the JI directive (law enforcement), not the general regime.
|
|
||||||
func scopeClass(r LegalSearchResult) string {
|
|
||||||
hay := r.ArticleLabel + " " + r.RegulationShort
|
|
||||||
if strings.Contains(hay, "BDSG") {
|
|
||||||
if m := bdsgParagraph.FindStringSubmatch(hay); m != nil {
|
|
||||||
if n, err := strconv.Atoi(m[1]); err == nil && n >= 45 && n <= 84 {
|
|
||||||
return "law_enforcement"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return "general"
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Topic ontology: amplifier only (boost), never an override ---
|
|
||||||
|
|
||||||
// topicDef ties query keywords to the canonical norms that answer that topic.
type topicDef struct {
	keywords []string // query keywords (compared lower-cased) that select the topic
	norms    []string // preferred canonical citation fragments
}

// topics is the topic ontology. A hit only boosts a result; it never overrides ranking.
var topics = []topicDef{
	{keywords: []string{"datenschutzbeauftrag", "dsb", "benennung"}, norms: []string{"Art. 37", "§ 38 BDSG"}},
	{keywords: []string{"stellung des"}, norms: []string{"Art. 38"}},
	{keywords: []string{"aufgaben des"}, norms: []string{"Art. 39"}},
	{keywords: []string{"folgenabsch", "dsfa"}, norms: []string{"Art. 35"}},
	{keywords: []string{"besondere kategorien"}, norms: []string{"Art. 9", "§ 22 BDSG"}},
	{keywords: []string{"auskunft"}, norms: []string{"Art. 15", "§ 34 BDSG"}},
	{keywords: []string{"loesch", "lösch"}, norms: []string{"Art. 17", "§ 35 BDSG"}},
	{keywords: []string{"bussgeld", "geldbusse"}, norms: []string{"Art. 83"}},
	{
		keywords: []string{"security update", "sicherheitsupdate", "schwachstelle", "sbom", "cybersicherheitsanforderung"},
		norms:    []string{"CRA Anhang I"},
	},
	{keywords: []string{"meldepflicht", "sicherheitsvorfall"}, norms: []string{"Art. 14 CRA"}},
}
|
|
||||||
|
|
||||||
// resultMatchesTopic reports whether the result is a preferred norm of a topic the query hits.
|
|
||||||
func resultMatchesTopic(query string, r LegalSearchResult) bool {
|
|
||||||
ql := strings.ToLower(query)
|
|
||||||
hay := r.ArticleLabel + " " + r.RegulationShort
|
|
||||||
for _, t := range topics {
|
|
||||||
if !containsAnyLower(ql, t.keywords) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
for _, n := range t.norms {
|
|
||||||
if normMatches(hay, n) {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// normMatches reports whether norm occurs in hay at a position where it is not
// immediately followed by a digit, so "Art. 9" matches "Art. 9 DSGVO" but not "Art. 90".
//
// Every occurrence is examined, not just the first one: a label such as
// "Art. 90, Art. 9 DSGVO" therefore still matches "Art. 9". An empty norm never matches.
func normMatches(hay, norm string) bool {
	if norm == "" {
		return false
	}
	for from := 0; from < len(hay); {
		rel := strings.Index(hay[from:], norm)
		if rel < 0 {
			return false
		}
		end := from + rel + len(norm)
		// Accept the occurrence when it ends the string or is followed by a non-digit.
		if end >= len(hay) || hay[end] < '0' || hay[end] > '9' {
			return true
		}
		// Rejected (e.g. "Art. 9" inside "Art. 90"): continue just behind its start.
		from += rel + 1
	}
	return false
}
|
|
||||||
|
|
||||||
// queryIsForeign reports whether the query explicitly refers to a foreign regime
// (Switzerland or Austria markers). Matching is case-insensitive.
//
// The long markers are matched as substrings. The country code "ch" is matched as a
// standalone token delimited by any ASCII non-alphanumeric character or the string
// boundary, so "Was gilt in CH?", "(CH)" and a leading or trailing "CH" are recognised
// while words that merely contain "ch" (e.g. "rechtlich") are not.
func queryIsForeign(query string) bool {
	lowered := strings.ToLower(query)
	for _, marker := range []string{"schweiz", "revdsg", "fedlex", "oesterreich", "österreich"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	// Split on ASCII punctuation/whitespace; non-ASCII runes (umlauts) stay inside words.
	tokens := strings.FieldsFunc(lowered, func(r rune) bool {
		alnum := r >= 'a' && r <= 'z' || r >= '0' && r <= '9'
		return r < 0x80 && !alnum
	})
	for _, token := range tokens {
		if token == "ch" {
			return true
		}
	}
	return false
}
|
|
||||||
|
|
||||||
// containsAny reports whether hay contains at least one of the markers as a substring.
// The comparison is case-sensitive.
func containsAny(hay string, markers []string) bool {
	for i := 0; i < len(markers); i++ {
		if strings.Contains(hay, markers[i]) {
			return true
		}
	}
	return false
}
|
|
||||||
|
|
||||||
// containsAnyLower reports whether haylower contains the lower-cased form of at least
// one marker. The haystack is used as given, so callers pass it already lower-cased.
func containsAnyLower(haylower string, markers []string) bool {
	found := false
	for i := 0; i < len(markers) && !found; i++ {
		found = strings.Contains(haylower, strings.ToLower(markers[i]))
	}
	return found
}
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
package ucca
|
|
||||||
|
|
||||||
import "sort"
|
|
||||||
|
|
||||||
// Re-ranking coefficients (validated in the offline golden harness; Phase A — conservative).

// authorityCoef scales the authority weight: the score gains authorityCoef * weight/100.
const authorityCoef = 0.40

// jurisdictionGain is the bonus for binding law / guidance from DE or EU.
const jurisdictionGain = 0.05

// foreignPenalty demotes (does not remove) foreign law on a DE/EU question.
const foreignPenalty = 0.60

// unknownPenalty is subtracted from results whose source class is unknown.
const unknownPenalty = 0.08

// domainMatchGain rewards a chunk whose domain equals the query domain.
const domainMatchGain = 0.15

// offDomainPenalty demotes (does not remove) off-domain binding law.
const offDomainPenalty = 0.10

// scopePenalty demotes BDSG Teil 3 (law enforcement) on a general data-protection question.
const scopePenalty = 0.25

// topicGain is the topic boost; it amplifies only and never overrides.
const topicGain = 0.18
|
|
||||||
|
|
||||||
// authorityScore computes the normative relevance of a result for a query. It augments the
|
|
||||||
// semantic score with authority/jurisdiction/domain/scope/topic signals. Exposed for tests.
|
|
||||||
func authorityScore(query string, r LegalSearchResult, qDomain string, qForeign bool) float64 {
|
|
||||||
info := classifyAuthority(r)
|
|
||||||
score := r.Score + authorityCoef*float64(info.weight)/100.0
|
|
||||||
|
|
||||||
if info.jurisdiction == "CH" && !qForeign {
|
|
||||||
score -= foreignPenalty // Fremdrecht bei DE/EU-Frage: demoted, nicht geloescht
|
|
||||||
} else {
|
|
||||||
score += jurisdictionGain
|
|
||||||
}
|
|
||||||
if info.sourceClass == "unknown" {
|
|
||||||
score -= unknownPenalty
|
|
||||||
}
|
|
||||||
if qDomain != "" {
|
|
||||||
switch cd := chunkDomain(r); {
|
|
||||||
case cd == qDomain:
|
|
||||||
score += domainMatchGain
|
|
||||||
case cd != "":
|
|
||||||
score -= offDomainPenalty // off-domain binding: demoted, nicht geloescht
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if qDomain == "data_protection" && scopeClass(r) == "law_enforcement" {
|
|
||||||
score -= scopePenalty
|
|
||||||
}
|
|
||||||
if resultMatchesTopic(query, r) {
|
|
||||||
score += topicGain // Verstaerker, kein Override
|
|
||||||
}
|
|
||||||
return score
|
|
||||||
}
|
|
||||||
|
|
||||||
// rerankByAuthority re-orders results so binding law from the matching jurisdiction/domain
|
|
||||||
// ranks above guidance, foreign and off-domain law — WITHOUT dropping anything (guidance is
|
|
||||||
// kept as interpretation context). The computed score is written back to Score so downstream
|
|
||||||
// merges (e.g. the multi-collection advisor) preserve this order. Pure + deterministic.
|
|
||||||
func rerankByAuthority(query string, results []LegalSearchResult) []LegalSearchResult {
|
|
||||||
if len(results) < 2 {
|
|
||||||
return results
|
|
||||||
}
|
|
||||||
qDomain := queryDomain(query)
|
|
||||||
qForeign := queryIsForeign(query)
|
|
||||||
|
|
||||||
out := make([]LegalSearchResult, len(results))
|
|
||||||
copy(out, results)
|
|
||||||
for i := range out {
|
|
||||||
out[i].Score = authorityScore(query, out[i], qDomain, qForeign)
|
|
||||||
}
|
|
||||||
sort.SliceStable(out, func(a, b int) bool {
|
|
||||||
return out[a].Score > out[b].Score
|
|
||||||
})
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
package ucca
|
|
||||||
|
|
||||||
import "testing"
|
|
||||||
|
|
||||||
func bindingRes(label, reg, jur string, score float64) LegalSearchResult {
|
|
||||||
return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "binding_law", AuthorityWeight: 100, Jurisdiction: jur, Score: score}
|
|
||||||
}
|
|
||||||
|
|
||||||
func guidanceRes(label, reg string, score float64) LegalSearchResult {
|
|
||||||
return LegalSearchResult{ArticleLabel: label, RegulationShort: reg, SourceClass: "supervisory_guidance", AuthorityWeight: 70, Jurisdiction: "EU", Score: score}
|
|
||||||
}
|
|
||||||
|
|
||||||
func foreignRes(label string, score float64) LegalSearchResult {
|
|
||||||
return LegalSearchResult{ArticleLabel: label, RegulationShort: "RevDSG", SourceClass: "foreign_law", AuthorityWeight: 0, Jurisdiction: "CH", Score: score}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Acceptance criteria (Phase 1) expressed as ordering tests.
|
|
||||||
func TestRerankByAuthority_Acceptance(t *testing.T) {
|
|
||||||
t.Run("guidance does not overtake semantically competitive binding", func(t *testing.T) {
|
|
||||||
out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{
|
|
||||||
guidanceRes("ENISA Mapping", "ENISA", 0.72),
|
|
||||||
bindingRes("CRA Anhang I", "CRA", "EU", 0.66),
|
|
||||||
})
|
|
||||||
if out[0].RegulationShort != "CRA" {
|
|
||||||
t.Fatalf("binding must rank first over competitive guidance, got %q", out[0].RegulationShort)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("foreign law demoted on DE/EU question but kept", func(t *testing.T) {
|
|
||||||
in := []LegalSearchResult{foreignRes("RevDSG Art 1", 0.85), bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62)}
|
|
||||||
out := rerankByAuthority("Welche Daten sind besonders geschuetzt?", in)
|
|
||||||
if out[0].RegulationShort != "DSGVO" {
|
|
||||||
t.Fatalf("binding EU must beat foreign on a DE/EU query, got %q", out[0].RegulationShort)
|
|
||||||
}
|
|
||||||
if len(out) != 2 {
|
|
||||||
t.Fatalf("foreign law must be kept, got len=%d", len(out))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("off-domain binding demoted but not removed", func(t *testing.T) {
|
|
||||||
in := []LegalSearchResult{
|
|
||||||
bindingRes("Art. 13 EU MDR", "MDR", "EU", 0.70),
|
|
||||||
bindingRes("Art. 13 CRA", "CRA", "EU", 0.60),
|
|
||||||
}
|
|
||||||
out := rerankByAuthority("Welche Pflichten hat der Hersteller von Produkten mit digitalen Elementen?", in)
|
|
||||||
if out[0].RegulationShort != "CRA" {
|
|
||||||
t.Fatalf("on-domain CRA must beat off-domain MDR, got %q", out[0].RegulationShort)
|
|
||||||
}
|
|
||||||
if len(out) != 2 {
|
|
||||||
t.Fatalf("off-domain MDR must be kept, got len=%d", len(out))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("same-regime binding wins over guidance", func(t *testing.T) {
|
|
||||||
out := rerankByAuthority("Was gilt hier?", []LegalSearchResult{
|
|
||||||
bindingRes("Art. 13 CRA", "CRA", "EU", 0.70),
|
|
||||||
guidanceRes("ENISA Mapping", "ENISA", 0.60),
|
|
||||||
})
|
|
||||||
if out[0].RegulationShort != "CRA" {
|
|
||||||
t.Fatalf("binding must win, got %q", out[0].RegulationShort)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("BDSG Teil 3 demoted below DSGVO on general DP question", func(t *testing.T) {
|
|
||||||
in := []LegalSearchResult{
|
|
||||||
bindingRes("§ 48 BDSG", "BDSG", "DE", 0.70), // Teil 3 (law enforcement)
|
|
||||||
bindingRes("Art. 9 DSGVO", "DSGVO", "EU", 0.62),
|
|
||||||
}
|
|
||||||
out := rerankByAuthority("Was sind besondere Kategorien personenbezogener Daten?", in)
|
|
||||||
if out[0].RegulationShort != "DSGVO" {
|
|
||||||
t.Fatalf("DSGVO must beat BDSG Teil 3 on a general DP question, got %q", out[0].RegulationShort)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("nothing is dropped and topic amplifies", func(t *testing.T) {
|
|
||||||
in := []LegalSearchResult{
|
|
||||||
guidanceRes("ENISA", "ENISA", 0.72),
|
|
||||||
bindingRes("CRA Anhang I", "CRA", "EU", 0.66),
|
|
||||||
foreignRes("RevDSG", 0.5),
|
|
||||||
}
|
|
||||||
out := rerankByAuthority("Anforderungen an Security Updates?", in)
|
|
||||||
if len(out) != len(in) {
|
|
||||||
t.Fatalf("rerank must preserve all results, got %d want %d", len(out), len(in))
|
|
||||||
}
|
|
||||||
if out[0].ArticleLabel != "CRA Anhang I" {
|
|
||||||
t.Fatalf("topic+authority must lift CRA Anhang I to top, got %q", out[0].ArticleLabel)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("single result returned unchanged", func(t *testing.T) {
|
|
||||||
in := []LegalSearchResult{bindingRes("Art. 1 CRA", "CRA", "EU", 0.5)}
|
|
||||||
if out := rerankByAuthority("x", in); len(out) != 1 {
|
|
||||||
t.Fatalf("len=%d", len(out))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@@ -1,125 +0,0 @@
|
|||||||
package ucca
|
|
||||||
|
|
||||||
import "testing"
|
|
||||||
|
|
||||||
func TestClassifyAuthority(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
result LegalSearchResult
|
|
||||||
wantW int
|
|
||||||
wantSC string
|
|
||||||
wantJur string
|
|
||||||
}{
|
|
||||||
{"tagged binding EU", LegalSearchResult{AuthorityWeight: 100, SourceClass: "binding_law", Jurisdiction: "EU"}, 100, "binding_law", "EU"},
|
|
||||||
{"tagged guidance DE", LegalSearchResult{AuthorityWeight: 70, SourceClass: "supervisory_guidance", Jurisdiction: "DE"}, 70, "supervisory_guidance", "DE"},
|
|
||||||
{"tagged foreign CH", LegalSearchResult{AuthorityWeight: 0, SourceClass: "foreign_law", Jurisdiction: "CH"}, 0, "foreign_law", "CH"},
|
|
||||||
{"untagged ENISA guidance", LegalSearchResult{RegulationShort: "ENISA", ArticleLabel: "ENISA CRA Standards Mapping"}, 70, "supervisory_guidance", "EU"},
|
|
||||||
{"untagged CRA binding", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA", Category: "regulation"}, 100, "binding_law", "EU"},
|
|
||||||
{"untagged BDSG binding DE", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, 100, "binding_law", "DE"},
|
|
||||||
{"untagged RevDSG foreign", LegalSearchResult{RegulationShort: "RevDSG", ArticleLabel: "RevDSG (CH)"}, 0, "foreign_law", "CH"},
|
|
||||||
{"untagged unknown", LegalSearchResult{RegulationShort: "", ArticleLabel: ""}, 50, "unknown", "EU"},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
got := classifyAuthority(tt.result)
|
|
||||||
if got.weight != tt.wantW || got.sourceClass != tt.wantSC || got.jurisdiction != tt.wantJur {
|
|
||||||
t.Errorf("classifyAuthority() = {%d %s %s}, want {%d %s %s}",
|
|
||||||
got.weight, got.sourceClass, got.jurisdiction, tt.wantW, tt.wantSC, tt.wantJur)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestQueryDomain(t *testing.T) {
|
|
||||||
tests := []struct{ q, want string }{
|
|
||||||
{"Welche Anforderungen an Security Updates?", "cyber"},
|
|
||||||
{"Wer braucht einen Datenschutzbeauftragten?", "data_protection"},
|
|
||||||
{"Was sind besondere Kategorien personenbezogener Daten?", "data_protection"},
|
|
||||||
{"Welche Pflichten beim Hochrisiko-KI-System?", "ai"},
|
|
||||||
{"Wie spaet ist es?", ""},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
if got := queryDomain(tt.q); got != tt.want {
|
|
||||||
t.Errorf("queryDomain(%q) = %q, want %q", tt.q, got, tt.want)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestChunkDomain(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
r LegalSearchResult
|
|
||||||
want string
|
|
||||||
}{
|
|
||||||
{"CRA cyber", LegalSearchResult{RegulationShort: "CRA", ArticleLabel: "Art. 13 CRA"}, "cyber"},
|
|
||||||
{"DSGVO dp", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "data_protection"},
|
|
||||||
{"AI Act ai", LegalSearchResult{RegulationShort: "AI Act", ArticleLabel: "Art. 10 AI Act"}, "ai"},
|
|
||||||
{"MDR product", LegalSearchResult{RegulationShort: "MDR", ArticleLabel: "Art. 13 EU MDR"}, "product_safety"},
|
|
||||||
{"unknown", LegalSearchResult{RegulationShort: "XYZ"}, ""},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
if got := chunkDomain(tt.r); got != tt.want {
|
|
||||||
t.Errorf("chunkDomain() = %q, want %q", got, tt.want)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestScopeClass(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
r LegalSearchResult
|
|
||||||
want string
|
|
||||||
}{
|
|
||||||
{"BDSG Teil 3 law enforcement", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 48 BDSG"}, "law_enforcement"},
|
|
||||||
{"BDSG general part", LegalSearchResult{RegulationShort: "BDSG", ArticleLabel: "§ 38 BDSG"}, "general"},
|
|
||||||
{"DSGVO general", LegalSearchResult{RegulationShort: "DSGVO", ArticleLabel: "Art. 9 DSGVO"}, "general"},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
if got := scopeClass(tt.r); got != tt.want {
|
|
||||||
t.Errorf("scopeClass() = %q, want %q", got, tt.want)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestResultMatchesTopic(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
query string
|
|
||||||
r LegalSearchResult
|
|
||||||
want bool
|
|
||||||
}{
|
|
||||||
{"besondere Kategorien -> Art 9 match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, true},
|
|
||||||
{"besondere Kategorien -> Art 90 no match", "Was sind besondere Kategorien?", LegalSearchResult{ArticleLabel: "Art. 90 DSGVO"}, false},
|
|
||||||
{"security updates -> CRA Anhang I", "Anforderungen an Security Updates?", LegalSearchResult{ArticleLabel: "CRA Anhang I"}, true},
|
|
||||||
{"no topic keyword", "Wie spaet ist es?", LegalSearchResult{ArticleLabel: "Art. 9 DSGVO"}, false},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
if got := resultMatchesTopic(tt.query, tt.r); got != tt.want {
|
|
||||||
t.Errorf("resultMatchesTopic() = %v, want %v", got, tt.want)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestNormMatches(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
hay, norm string
|
|
||||||
want bool
|
|
||||||
}{
|
|
||||||
{"Art. 9 DSGVO", "Art. 9", true},
|
|
||||||
{"Art. 90 DSGVO", "Art. 9", false},
|
|
||||||
{"§ 38 BDSG", "§ 38 BDSG", true},
|
|
||||||
{"§ 380 BDSG", "§ 38", false},
|
|
||||||
{"Art. 14 CRA", "Art. 14 CRA", true},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
if got := normMatches(tt.hay, tt.norm); got != tt.want {
|
|
||||||
t.Errorf("normMatches(%q,%q) = %v, want %v", tt.hay, tt.norm, got, tt.want)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -93,13 +93,6 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
hits = denseHits
|
hits = denseHits
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stratified: den binding_law-Pool ERGAENZEN (nicht ersetzen), damit die Pflichtquelle
|
|
||||||
// immer Kandidat ist — Guidance bleibt als Auslegungskontext erhalten. Best-effort:
|
|
||||||
// Fehler beim Binding-Query degradieren still auf den semantischen Pool.
|
|
||||||
if bindingHits, bErr := c.searchBinding(ctx, collection, embedding, topK); bErr == nil {
|
|
||||||
hits = mergeDedupHits(hits, bindingHits)
|
|
||||||
}
|
|
||||||
|
|
||||||
results := make([]LegalSearchResult, len(hits))
|
results := make([]LegalSearchResult, len(hits))
|
||||||
for i, hit := range hits {
|
for i, hit := range hits {
|
||||||
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
||||||
@@ -128,41 +121,12 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
Pages: getIntSlice(hit.Payload, "pages"),
|
Pages: getIntSlice(hit.Payload, "pages"),
|
||||||
SourceURL: getString(hit.Payload, "source"),
|
SourceURL: getString(hit.Payload, "source"),
|
||||||
Score: hit.Score,
|
Score: hit.Score,
|
||||||
AuthorityWeight: getInt(hit.Payload, "authority_weight"),
|
|
||||||
SourceClass: getString(hit.Payload, "source_class"),
|
|
||||||
Jurisdiction: getString(hit.Payload, "jurisdiction"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Authority-aware Re-Ranking: bindendes Recht der passenden Jurisdiktion/Domaene nach
|
|
||||||
// oben, Guidance/Fremdrecht/Off-Domain runter (nichts wird geloescht). Reihenfolge only,
|
|
||||||
// Response-Schema unveraendert. Score traegt den Authority-Score, damit nachgelagerte
|
|
||||||
// Multi-Collection-Merges (Advisor) die Ordnung bewahren.
|
|
||||||
results = rerankByAuthority(query, results)
|
|
||||||
if topK > 0 && len(results) > topK {
|
|
||||||
results = results[:topK]
|
|
||||||
}
|
|
||||||
|
|
||||||
return results, nil
|
return results, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeDedupHits concatenates two hit lists, keeping the first occurrence of each point ID.
|
|
||||||
func mergeDedupHits(primary, extra []qdrantSearchHit) []qdrantSearchHit {
|
|
||||||
seen := make(map[string]bool, len(primary)+len(extra))
|
|
||||||
out := make([]qdrantSearchHit, 0, len(primary)+len(extra))
|
|
||||||
for _, list := range [][]qdrantSearchHit{primary, extra} {
|
|
||||||
for _, h := range list {
|
|
||||||
id := fmt.Sprint(h.ID)
|
|
||||||
if seen[id] {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
seen[id] = true
|
|
||||||
out = append(out, h)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt.
|
// FormatLegalContextForPrompt formats the legal context for inclusion in an LLM prompt.
|
||||||
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
|
func (c *LegalRAGClient) FormatLegalContextForPrompt(lc *LegalContext) string {
|
||||||
if lc == nil || len(lc.Results) == 0 {
|
if lc == nil || len(lc.Results) == 0 {
|
||||||
|
|||||||
@@ -185,27 +185,6 @@ func (c *LegalRAGClient) searchDense(ctx context.Context, collection string, emb
|
|||||||
searchReq.Filter = &qdrantFilter{Should: conditions}
|
searchReq.Filter = &qdrantFilter{Should: conditions}
|
||||||
}
|
}
|
||||||
|
|
||||||
return c.doPointsSearch(ctx, collection, searchReq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// searchBinding fetches the top binding_law hits (authority-stratified pool) so the
|
|
||||||
// obligation source is always a candidate even when guidance dominates semantically.
|
|
||||||
// It AUGMENTS the semantic pool — guidance is preserved as interpretation context.
|
|
||||||
func (c *LegalRAGClient) searchBinding(ctx context.Context, collection string, embedding []float64, topK int) ([]qdrantSearchHit, error) {
|
|
||||||
searchReq := qdrantSearchRequest{
|
|
||||||
Vector: embedding,
|
|
||||||
Limit: topK,
|
|
||||||
WithPayload: true,
|
|
||||||
Filter: &qdrantFilter{Must: []qdrantCondition{
|
|
||||||
{Key: "source_class", Match: qdrantMatch{Value: "binding_law"}},
|
|
||||||
}},
|
|
||||||
}
|
|
||||||
|
|
||||||
return c.doPointsSearch(ctx, collection, searchReq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// doPointsSearch issues a POST /points/search and decodes the hits.
|
|
||||||
func (c *LegalRAGClient) doPointsSearch(ctx context.Context, collection string, searchReq qdrantSearchRequest) ([]qdrantSearchHit, error) {
|
|
||||||
jsonBody, err := json.Marshal(searchReq)
|
jsonBody, err := json.Marshal(searchReq)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to marshal search request: %w", err)
|
return nil, fmt.Errorf("failed to marshal search request: %w", err)
|
||||||
|
|||||||
@@ -225,18 +225,6 @@ func getIntSlice(m map[string]interface{}, key string) []int {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// getInt returns the numeric value stored under key in m as an int.
//
// JSON-decoded payloads carry numbers as float64, which is truncated toward
// zero. Values that were put into the map as Go integers (int, int32, int64)
// or as float32 are accepted as well. A missing key, a nil map, or a value of
// any other type yields 0.
func getInt(m map[string]interface{}, key string) int {
	// Indexing a nil map or a missing key yields a nil interface value, which
	// matches none of the cases below and therefore falls through to 0.
	switch n := m[key].(type) {
	case float64:
		return int(n)
	case float32:
		return int(n)
	case int:
		return n
	case int64:
		return int(n)
	case int32:
		return int(n)
	}
	return 0
}
|
|
||||||
|
|
||||||
func contains(slice []string, item string) bool {
|
func contains(slice []string, item string) bool {
|
||||||
for _, s := range slice {
|
for _, s := range slice {
|
||||||
if s == item {
|
if s == item {
|
||||||
|
|||||||
@@ -399,9 +399,8 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// /points/search is now the stratified binding-law augmentation query (it AUGMENTS
|
// Fallback: should not reach dense search
|
||||||
// the hybrid pool, it is not a dense fallback). Return empty so the hybrid hit
|
t.Error("Unexpected dense search call when hybrid succeeded")
|
||||||
// remains the sole result for this test.
|
|
||||||
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{}})
|
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{}})
|
||||||
}))
|
}))
|
||||||
defer qdrantMock.Close()
|
defer qdrantMock.Close()
|
||||||
@@ -447,59 +446,6 @@ func TestHybridSearch_UsesQueryAPI(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestSearch_StratifiedBindingRerank verifies that the binding-law pool augments the
|
|
||||||
// semantic pool and that authority re-ranking lifts binding law above higher-semantic guidance.
|
|
||||||
func TestSearch_StratifiedBindingRerank(t *testing.T) {
|
|
||||||
ollamaMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
json.NewEncoder(w).Encode(ollamaEmbeddingResponse{Embedding: make([]float64, 1024)})
|
|
||||||
}))
|
|
||||||
defer ollamaMock.Close()
|
|
||||||
|
|
||||||
qdrantMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
if strings.Contains(r.URL.Path, "/index") {
|
|
||||||
w.WriteHeader(http.StatusOK)
|
|
||||||
w.Write([]byte(`{"result":{"status":"completed"}}`))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if strings.Contains(r.URL.Path, "/points/query") {
|
|
||||||
json.NewEncoder(w).Encode(qdrantQueryResponse{Result: []qdrantSearchHit{
|
|
||||||
{ID: "g1", Score: 0.72, Payload: map[string]interface{}{
|
|
||||||
"chunk_text": "ENISA guidance", "regulation_short": "ENISA",
|
|
||||||
"article_label": "ENISA CRA Mapping", "source_class": "supervisory_guidance",
|
|
||||||
"authority_weight": float64(70), "jurisdiction": "EU",
|
|
||||||
}},
|
|
||||||
}})
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// /points/search = stratified binding-law pool (source_class=binding_law)
|
|
||||||
json.NewEncoder(w).Encode(qdrantSearchResponse{Result: []qdrantSearchHit{
|
|
||||||
{ID: "b1", Score: 0.66, Payload: map[string]interface{}{
|
|
||||||
"chunk_text": "CRA Anhang I requirement", "regulation_short": "CRA",
|
|
||||||
"article_label": "CRA Anhang I", "source_class": "binding_law",
|
|
||||||
"authority_weight": float64(100), "jurisdiction": "EU",
|
|
||||||
}},
|
|
||||||
}})
|
|
||||||
}))
|
|
||||||
defer qdrantMock.Close()
|
|
||||||
|
|
||||||
client := &LegalRAGClient{
|
|
||||||
qdrantURL: qdrantMock.URL, ollamaURL: ollamaMock.URL, embeddingModel: "bge-m3",
|
|
||||||
collection: "bp_compliance_ce", textIndexEnsured: make(map[string]bool),
|
|
||||||
hybridEnabled: true, httpClient: http.DefaultClient,
|
|
||||||
}
|
|
||||||
|
|
||||||
results, err := client.Search(context.Background(), "Was gilt hier?", nil, 5)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("search failed: %v", err)
|
|
||||||
}
|
|
||||||
if len(results) != 2 {
|
|
||||||
t.Fatalf("expected 2 merged results (guidance + binding), got %d", len(results))
|
|
||||||
}
|
|
||||||
if results[0].RegulationShort != "CRA" {
|
|
||||||
t.Errorf("binding CRA must rank first over higher-semantic guidance, got %q", results[0].RegulationShort)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHybridSearch_FallbackToDense(t *testing.T) {
|
func TestHybridSearch_FallbackToDense(t *testing.T) {
|
||||||
var requestedPaths []string
|
var requestedPaths []string
|
||||||
|
|
||||||
|
|||||||
@@ -20,13 +20,6 @@ type LegalSearchResult struct {
|
|||||||
Pages []int `json:"pages,omitempty"`
|
Pages []int `json:"pages,omitempty"`
|
||||||
SourceURL string `json:"source_url"`
|
SourceURL string `json:"source_url"`
|
||||||
Score float64 `json:"score"`
|
Score float64 `json:"score"`
|
||||||
|
|
||||||
// Interne Felder fuer das Authority-Re-Ranking (Phase 1) — NICHT serialisiert
|
|
||||||
// (json:"-"), daher kein Contract-Change. Aus dem Qdrant-Payload befuellt und nur
|
|
||||||
// fuer die Sortierung in rerankByAuthority verwendet.
|
|
||||||
AuthorityWeight int `json:"-"`
|
|
||||||
SourceClass string `json:"-"`
|
|
||||||
Jurisdiction string `json:"-"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// LegalContext represents aggregated legal context for an assessment.
|
// LegalContext represents aggregated legal context for an assessment.
|
||||||
|
|||||||
@@ -45,11 +45,6 @@ class LLMChecker:
|
|||||||
text = doc.text or ""
|
text = doc.text or ""
|
||||||
if len(text) < 50:
|
if len(text) < 50:
|
||||||
return CheckResult(present=None, source="llm")
|
return CheckResult(present=None, source="llm")
|
||||||
# decision_method=LLM mit judge='haiku': Sufficiency-Pfad (validiert
|
|
||||||
# P0.89/R0.91). Der Qwen-first-Cascade ist als Sufficiency-Judge
|
|
||||||
# widerlegt -> hier Haiku direkt, kriteriengeführte Subsumtion.
|
|
||||||
if (ctrl.extra or {}).get("judge") == "haiku":
|
|
||||||
return await self._haiku(ctrl, text)
|
|
||||||
secs = _sections(text)
|
secs = _sections(text)
|
||||||
if ctrl.topic_regex:
|
if ctrl.topic_regex:
|
||||||
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
|
rel = [s for s in secs if re.search(ctrl.topic_regex, s, re.I)][:6] or secs[:6]
|
||||||
@@ -76,31 +71,3 @@ class LLMChecker:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
|
logger.info("llm checker fail %s: %s", ctrl.control_id, str(e)[:80])
|
||||||
return CheckResult(present=None, source="error")
|
return CheckResult(present=None, source="error")
|
||||||
|
|
||||||
async def _haiku(self, ctrl: ControlSpec, text: str) -> CheckResult:
    """Judge sufficiency of `text` for `ctrl` with a direct Anthropic call.

    The criteria are taken from ``ctrl.paraphrases`` (falling back to the label
    or the control id) and rendered with the deep_check judge prompt. The call
    is issued at most twice, stopping at the first reply the parser accepts.

    Returns:
        CheckResult with source="haiku" and present=True/False when a reply was
        parsed, present=None (source="haiku") when both replies were unusable,
        and present=None (source="error") when anything raised.
    """
    try:
        from compliance.services.llm_cascade import _call_anthropic
        from compliance.services.specialist_agents.dse.deep_check import (
            _JUDGE_SYS, _build_user, _parse as _parse_judge,
        )

        title = ctrl.label or ctrl.control_id
        criteria = ctrl.paraphrases or [title]
        user_prompt = _build_user(text, title, criteria)

        verdict = None
        attempts_left = 2
        while attempts_left and not verdict:
            attempts_left -= 1
            raw = await _call_anthropic(_JUDGE_SYS, user_prompt, max_tokens=400)
            verdict = _parse_judge(raw)

        if not verdict:
            # No parseable answer: undetermined, not a failure.
            return CheckResult(present=None, source="haiku")

        fulfilled = bool(verdict.get("erfuellt"))
        reason = (verdict.get("begruendung") or "")[:120]
        confidence = float(verdict.get("confidence") or 0.0)
        return CheckResult(
            present=fulfilled,
            evidence=reason,
            confidence=confidence,
            source="haiku",
        )
    except Exception as e:
        logger.info("llm haiku checker fail %s: %s", ctrl.control_id, str(e)[:80])
        return CheckResult(present=None, source="error")
|
|
||||||
|
|||||||
@@ -1,68 +0,0 @@
|
|||||||
"""Prüfer-Router — method-agnostischer Dispatch.
|
|
||||||
|
|
||||||
control → sensor_classification (verification_method + decision_method) → Checker.
|
|
||||||
Ein neues Modul liefert nur ControlSpecs; der Router wählt den Prüfer. Damit wird
|
|
||||||
der „Embedding findet, Claude entscheidet"-Pfad EIN gemeinsamer CONTENT/LLM-Prüfer
|
|
||||||
statt Cookie-Sonderlogik. Nicht-gebaute Prüfer (PLAYWRIGHT/AUDIT/SCANNER/REGEX-
|
|
||||||
FIELD) → present=None (fail-safe: Aufrufer behält sein deterministisches Ergebnis).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
from .base import CheckResult, ControlSpec, DecisionMethod, DocContext
|
|
||||||
from .embedding_checker import EmbeddingChecker
|
|
||||||
from .llm_checker import LLMChecker
|
|
||||||
from .reference_checker import ReferenceChecker
|
|
||||||
|
|
||||||
# One shared instance per checker implementation.
_LLM = LLMChecker()
_EMB = EmbeddingChecker()
_REF = ReferenceChecker()

# decision_method -> checker. Mechanisms that are not built yet are left out on
# purpose, so a lookup for them yields None.
_BY_DECISION: dict[str, Any] = {
    DecisionMethod.LLM: _LLM,
    DecisionMethod.EMBEDDING: _EMB,
    DecisionMethod.LINK_RESOLVER: _REF,
}
|
|
||||||
|
|
||||||
|
|
||||||
async def route_and_check(ctrl: ControlSpec, doc: DocContext) -> CheckResult:
    """Dispatch `ctrl` to the checker registered for its decision_method.

    The method name is upper-cased before the lookup. When no checker is
    registered, an undetermined result (present=None) is returned whose source
    is "no_checker:<decision_method>".
    """
    method = (ctrl.decision_method or "").upper()
    checker = _BY_DECISION.get(method)
    if checker is not None:
        return await checker.check(ctrl, doc)
    return CheckResult(present=None,
                       source=f"no_checker:{ctrl.decision_method}")
|
|
||||||
|
|
||||||
|
|
||||||
def build_spec(
    control_id: str,
    sensor_classification: Optional[dict[str, Any]],
    *,
    label: str = "",
    criteria: Optional[list] = None,
    question: str = "",
    patterns: Optional[list[str]] = None,
    embed_threshold: Optional[float] = None,
) -> ControlSpec:
    """Build a ControlSpec from a stored sensor_classification plus criteria.

    ``verification_method`` and ``decision_method`` are read from the
    classification dict and upper-cased; a missing dict or missing keys give
    empty strings. For the CONTENT/LLM combination ``extra["judge"]`` is set to
    "haiku". Falsy criteria are dropped and the rest are stringified into
    ``paraphrases``.
    """
    classification = sensor_classification or {}
    verification = (classification.get("verification_method") or "").upper()
    decision = (classification.get("decision_method") or "").upper()

    is_content_llm = verification == "CONTENT" and decision == "LLM"
    extra: dict[str, Any] = {"judge": "haiku"} if is_content_llm else {}

    paraphrases = [str(item) for item in (criteria or []) if item]
    return ControlSpec(
        control_id=control_id,
        verification_method=verification,
        decision_method=decision,
        label=label,
        paraphrases=paraphrases,
        question=question,
        patterns=patterns or [],
        embed_threshold=embed_threshold,
        extra=extra,
    )
|
|
||||||
@@ -142,26 +142,19 @@ async def _call_ovh(system: str, user: str, max_tokens: int = 6000) -> str:
|
|||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
if key:
|
if key:
|
||||||
headers["Authorization"] = f"Bearer {key}"
|
headers["Authorization"] = f"Bearer {key}"
|
||||||
# gpt-oss-120b is a REASONING model: it spends output tokens on
|
|
||||||
# chain-of-thought before emitting the answer. A low cap (e.g. deep_check's
|
|
||||||
# max_tokens=400) makes it hit the length limit mid-reasoning and return
|
|
||||||
# content=null — the whole tier then silently yields nothing. Floor the
|
|
||||||
# budget so the reasoning AND the JSON answer fit.
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": model, "temperature": 0.05, "max_tokens": max(max_tokens, 2000),
|
"model": model, "temperature": 0.05, "max_tokens": max_tokens,
|
||||||
"messages": [{"role": "system", "content": system},
|
"messages": [{"role": "system", "content": system},
|
||||||
{"role": "user", "content": user}],
|
{"role": "user", "content": user}],
|
||||||
"response_format": {"type": "json_object"},
|
"response_format": {"type": "json_object"},
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=90.0) as c:
|
async with httpx.AsyncClient(timeout=45.0) as c:
|
||||||
r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
|
r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
|
||||||
json=payload, headers=headers)
|
json=payload, headers=headers)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
msg = (r.json().get("choices") or [{}])[0].get("message") or {}
|
choice = (r.json().get("choices") or [{}])[0]
|
||||||
# Answer is normally in content; if the model was length-capped the
|
return (choice.get("message") or {}).get("content", "") or ""
|
||||||
# JSON can land in reasoning_content instead — fall back to it.
|
|
||||||
return (msg.get("content") or "") or (msg.get("reasoning_content") or "")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("ovh cascade tier 2 failed: %s", e)
|
logger.warning("ovh cascade tier 2 failed: %s", e)
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
-78
@@ -1,78 +0,0 @@
|
|||||||
"""Applicability-Gate fuer den Cookie-Policy-Scan.
|
|
||||||
|
|
||||||
Schliesst Controls aus dem Cookie-Findings-Scan aus, die laut
|
|
||||||
`compliance.control_classification` NICHT gegen eine Cookie-Policy laufen
|
|
||||||
('COOKIE_POLICY' nicht in applicable_artifacts). Diese gehoeren zu einem
|
|
||||||
anderen Artefakt/Pruefer — Banner (BEHAVIOR/Playwright), Security/TOM/Audit
|
|
||||||
(PROCESS) — und erzeugen sonst Unsinn-Findings (z.B. 'TOMs nicht dokumentiert'
|
|
||||||
gegen eine Cookie-Richtlinie). Sie werden NICHT geloescht, sondern als
|
|
||||||
Routing-Liste zurueckgegeben.
|
|
||||||
|
|
||||||
Anders als das DSE-Gate OHNE needs_review-Ausnahme: das Artefakt-Signal ist
|
|
||||||
hier entscheidend und per Inventar (2026-06-21) belegt; die mis-scopeten 11
|
|
||||||
sind geprueft. Fail-safe: fehlt die Tabelle / DB nicht erreichbar -> leeres
|
|
||||||
Dict -> es wird NICHT gefiltert (kein stiller Recall-Verlust).
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
async def load_cookie_gate(db_url: str = "") -> dict[str, dict[str, Any]]:
    """Load the controls that must be excluded from the cookie findings scan.

    Selects the active rows of ``compliance.control_classification`` whose
    ``applicable_artifacts`` do not contain 'COOKIE_POLICY'.

    Args:
        db_url: DSN; when empty, DATABASE_URL and then COMPLIANCE_DATABASE_URL
            are consulted.

    Returns:
        {control_id: {"obligation_type", "check_intent", "applicable_artifacts"}}.
        An empty dict (meaning "do not filter") when no DSN is configured or
        the query fails for any reason.
    """
    dsn = db_url or os.getenv("DATABASE_URL") or os.getenv("COMPLIANCE_DATABASE_URL") or ""
    if not dsn:
        return {}

    try:
        import asyncpg
        conn = await asyncpg.connect(dsn)
        try:
            rows = await conn.fetch(
                """SELECT control_id, obligation_type, check_intent,
                          applicable_artifacts
                   FROM compliance.control_classification
                   WHERE is_active
                     AND NOT ('COOKIE_POLICY' = ANY(applicable_artifacts))""")
        finally:
            await conn.close()
    except Exception as e:  # table missing / DB unreachable -> no filter
        logger.info("cookie classification gate inaktiv: %s", str(e)[:90])
        return {}

    gate: dict[str, dict[str, Any]] = {}
    for row in rows:
        control_id = row["control_id"]
        if not control_id:
            continue
        gate[control_id] = {
            "obligation_type": row["obligation_type"],
            "check_intent": row["check_intent"],
            "applicable_artifacts": list(row["applicable_artifacts"] or []),
        }
    return gate
|
|
||||||
|
|
||||||
|
|
||||||
def apply_gate(
    controls: list[dict[str, Any]],
    gate: dict[str, dict[str, Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split loaded controls into (kept, routed_out).

    kept: controls without a (non-empty) gate entry; returned unchanged and in
        their original order.
    routed_out: one new dict per gated control holding control_id, title and
        the gate metadata (metadata keys win on a name clash).
    """
    kept: list[dict[str, Any]] = []
    routed_out: list[dict[str, Any]] = []
    for control in controls:
        control_id = control.get("control_id")
        classification = gate.get(control_id) if control_id else None
        if not classification:
            kept.append(control)
            continue
        entry = {"control_id": control_id, "title": control.get("title")}
        entry.update(classification)
        routed_out.append(entry)
    return kept, routed_out
|
|
||||||
-63
@@ -1,63 +0,0 @@
|
|||||||
"""Layer-3 Sufficiency-Judge fuer Cookie-Policy.
|
|
||||||
|
|
||||||
Das Embedding/Boost-Auto-Rescue (Layer 0/2) ist BEWUSST optimistisch — es findet
|
|
||||||
das Thema, beweist aber nicht die Erfuellung. Messung (2026-06-22): 159 FN
|
|
||||||
(Over-Rescue) gegen Opus-GT, weil 'Thema erwaehnt' als 'erfuellt' durchgewunken
|
|
||||||
wurde. Diese Schicht prueft GENAU die rescued Controls mit dem validierten
|
|
||||||
Haiku-Judge (Cohort cookie_sufficiency_v1: P0.89/R0.91) — NICHT die Qwen-first-
|
|
||||||
Kaskade (lokal ist als Sufficiency-Judge widerlegt) — und nimmt 'passed' zurueck,
|
|
||||||
wenn die konkrete Pflicht nicht erfuellt ist. 'Embedding findet, Claude entscheidet.'
|
|
||||||
|
|
||||||
Nur fuer den NICHT-skip_llm-Pfad (voller Check); der schnelle/interaktive Pfad
|
|
||||||
behaelt das deterministische Rescue.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_RESCUE_MARKERS = ("+embedding", "+regex_boost")
|
|
||||||
|
|
||||||
|
|
||||||
def _is_rescued(r: dict[str, Any]) -> bool:
|
|
||||||
src = r.get("source") or ""
|
|
||||||
return r.get("passed") and any(m in src for m in _RESCUE_MARKERS)
|
|
||||||
|
|
||||||
|
|
||||||
async def judge_rescued(text: str, results: list[dict[str, Any]]) -> int:
    """Re-check every rescued (embedding/boost) passed result via the checker router.

    Each candidate is turned into a CONTENT/LLM spec and checked against `text`.
    When the check reports ``present is False`` the result dict is mutated in
    place: "passed" becomes False, "+llm_failed" is appended to "source",
    "matched_text" is replaced and "_judge_reason" stores up to 200 characters
    of evidence. Undetermined checks (present None) leave the result untouched.

    Returns:
        The number of results whose pass was taken back.
    """
    from compliance.services.checkers.base import DocContext
    from compliance.services.checkers.router import build_spec, route_and_check

    rescued = [item for item in results if _is_rescued(item)]
    if not rescued:
        return 0

    document = DocContext(text=text)
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    retracted = 0
    for item in rescued:
        display_name = item.get("label") or item.get("hint")
        criteria = item.get("_pass_criteria") or [display_name or ""]
        if not isinstance(criteria, list):
            criteria = [str(criteria)]
        label = display_name or item.get("control_id") or ""
        spec = build_spec(item.get("control_id") or "", classification,
                          label=label, criteria=criteria)
        verdict = await route_and_check(spec, document)
        if verdict.present is not False:
            continue
        item["passed"] = False
        item["source"] = (item.get("source") or "") + "+llm_failed"
        item["matched_text"] = "[layer-3 sufficiency-judge: nicht erfuellt]"
        item["_judge_reason"] = (verdict.evidence or "")[:200]
        retracted += 1

    if retracted:
        logger.info("cookie layer-3 sufficiency-judge: %d/%d rescues zurueckgenommen",
                    retracted, len(rescued))
    return retracted
|
|
||||||
@@ -96,22 +96,6 @@ class CookiePolicyAgent(BaseSpecialistAgent):
|
|||||||
"Branchen-MCs entfernt"
|
"Branchen-MCs entfernt"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Layer 3 — Sufficiency-Judge (Haiku) auf die embedding/boost-rescued
|
|
||||||
# Controls: Embedding findet das Thema, Claude entscheidet ob die Pflicht
|
|
||||||
# konkret erfuellt ist. Nur im vollen Check (nicht skip_llm).
|
|
||||||
skip_llm = bool((agent_input.context or {}).get("skip_llm"))
|
|
||||||
if not skip_llm:
|
|
||||||
try:
|
|
||||||
from ._sufficiency_judge import judge_rescued
|
|
||||||
corrected = await judge_rescued(text, results)
|
|
||||||
if corrected:
|
|
||||||
notes_parts.append(
|
|
||||||
f"layer-3 sufficiency-judge: {corrected} Rescues "
|
|
||||||
"zurückgenommen"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("cookie layer-3 judge skipped: %s", e)
|
|
||||||
|
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
for r in results:
|
for r in results:
|
||||||
mc_id = r.get("control_id") or ""
|
mc_id = r.get("control_id") or ""
|
||||||
|
|||||||
@@ -45,15 +45,6 @@ async def run_v3_pipeline(
|
|||||||
controls = []
|
controls = []
|
||||||
_normalize_criteria(controls)
|
_normalize_criteria(controls)
|
||||||
controls, sector_dropped = _filter_sector(controls, business_scope)
|
controls, sector_dropped = _filter_sector(controls, business_scope)
|
||||||
# Artefakt-Gate: Controls ohne COOKIE_POLICY-Artefakt (Security/TOM/Audit,
|
|
||||||
# Banner) raus — sie gehoeren zu anderem Pruefer/Artefakt und erzeugen sonst
|
|
||||||
# Unsinn-Findings. Siehe _classification_gate.
|
|
||||||
routed_out: list[dict[str, Any]] = []
|
|
||||||
try:
|
|
||||||
from ._classification_gate import apply_gate, load_cookie_gate
|
|
||||||
controls, routed_out = apply_gate(controls, await load_cookie_gate(db_url))
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("cookie classification gate skipped: %s", e)
|
|
||||||
results: list[dict[str, Any]] = []
|
results: list[dict[str, Any]] = []
|
||||||
if controls:
|
if controls:
|
||||||
try:
|
try:
|
||||||
@@ -120,7 +111,6 @@ async def run_v3_pipeline(
|
|||||||
"layer_0_boost_overrides": boost_overrides,
|
"layer_0_boost_overrides": boost_overrides,
|
||||||
"total_mcs": len(results),
|
"total_mcs": len(results),
|
||||||
"sector_dropped": sector_dropped,
|
"sector_dropped": sector_dropped,
|
||||||
"artifact_gated": len(routed_out),
|
|
||||||
}
|
}
|
||||||
return results, telemetry
|
return results, telemetry
|
||||||
|
|
||||||
|
|||||||
@@ -1,183 +0,0 @@
|
|||||||
"""Getierte 3-Status-Auswertung für DSE-Controls mit `tiered_criteria`.
|
|
||||||
|
|
||||||
Pro Kriterium wird nach `decision_method` bewertet:
|
|
||||||
- EMBEDDING (Präsenz): deterministisch (festes Modell), Doc EINMAL pro Scan
|
|
||||||
eingebettet → reproduzierbar, kein LLM. Trägt den GROSSTEIL.
|
|
||||||
- LLM (Sufficiency): Haiku-Judge, GECACHT pro (doc_hash, control_id#idx,
|
|
||||||
PROMPT_VERSION, criterion) → gleicher Scan = gleiches Ergebnis. Löst die
|
|
||||||
empirisch gemessene Judge-Varianz (ein Live-Call ist NICHT reproduzierbar).
|
|
||||||
|
|
||||||
Status NUR aus LEGAL_MINIMUM:
|
|
||||||
ERFÜLLT (alle LM erfüllt ODER kein LM) · FEHLT (kein LM erfüllt) ·
|
|
||||||
TEILWEISE (Teil der LM erfüllt) · UNBESTIMMT (LM nicht bewertbar, z. B.
|
|
||||||
Embedding-Service down → Aufrufer behält sein Legacy-Ergebnis).
|
|
||||||
BEST_PRACTICE/OPTIONAL fließen NIE in den Status, nur in `recommendations`.
|
|
||||||
Siehe docs-src/development/criterion_meta_model.md.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import hashlib
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import sqlite3
|
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Part of every judge cache key (see _ckey).
PROMPT_VERSION = "dse-tier-v1"
# Path of the SQLite file used by _cache_get/_cache_put; overridable via TIERED_JUDGE_CACHE.
_CACHE_DB = os.getenv("TIERED_JUDGE_CACHE", "/data/tiered_judge_cache.db")
# Threshold handed to _embed_present by evaluate_tiered; overridable via
# DSE_CRITERION_EMBED_THRESHOLD.
_EMBED_THR = float(os.getenv("DSE_CRITERION_EMBED_THRESHOLD", "0.62"))
# Tier name whose criteria determine the status in evaluate_tiered.
LM = "LEGAL_MINIMUM"
|
|
||||||
|
|
||||||
|
|
||||||
def _doc_hash(text: str) -> str:
|
|
||||||
return hashlib.sha256(text.encode("utf-8", "ignore")).hexdigest()[:20]
|
|
||||||
|
|
||||||
|
|
||||||
def _ckey(dh: str, cid: str, idx: int, crit: str) -> str:
    """Build the cache key "<doc hash>|<control id>#<index>|<prompt version>|<criterion hash>".

    The criterion hash is the first 12 hex characters of the SHA-256 of the
    UTF-8 encoded criterion text.
    """
    crit_digest = hashlib.sha256(crit.encode("utf-8", "ignore")).hexdigest()[:12]
    return "{}|{}#{}|{}|{}".format(dh, cid, idx, PROMPT_VERSION, crit_digest)
|
|
||||||
|
|
||||||
|
|
||||||
def _cache_get(key: str) -> Optional[bool]:
    """Look up a cached judge verdict.

    Returns:
        The stored verdict as bool, or None when the key is unknown or the
        cache cannot be read (any error is swallowed: a cache miss is always a
        safe answer).
    """
    try:
        conn = sqlite3.connect(_CACHE_DB)
        try:
            # `with conn` only commits/rolls back the transaction; it does not
            # close the connection, hence the explicit close() below.
            with conn:
                conn.execute("create table if not exists judge(k text primary key, met int)")
                row = conn.execute("select met from judge where k=?", (key,)).fetchone()
        finally:
            conn.close()
        return None if row is None else bool(row[0])
    except Exception:
        return None
|
|
||||||
|
|
||||||
|
|
||||||
def _cache_put(key: str, met: bool) -> None:
    """Store (or overwrite) a judge verdict under `key`.

    Failures are logged as warnings and otherwise ignored; caching is best effort.
    """
    try:
        conn = sqlite3.connect(_CACHE_DB)
        try:
            # `with conn` commits on success but leaves the connection open,
            # so it is closed explicitly afterwards.
            with conn:
                conn.execute("create table if not exists judge(k text primary key, met int)")
                conn.execute("insert or replace into judge values(?,?)", (key, int(met)))
        finally:
            conn.close()
    except Exception as e:
        logger.warning("tiered judge cache put: %s", e)
|
|
||||||
|
|
||||||
|
|
||||||
async def prepare_doc(text: str) -> dict[str, Any]:
    """Embed the document once per scan.

    Returns:
        {"hash": <document hash>, "chunk_vecs": <list of chunk vectors or None>}.
        chunk_vecs stays None for empty text, text shorter than 100 characters,
        or when embedding fails or exceeds the 90 s timeout; only vectors of
        length DIM are kept.
    """
    ctx: dict[str, Any] = {"hash": _doc_hash(text or ""), "chunk_vecs": None}
    if text and len(text) >= 100:
        try:
            from compliance.services.mc_embedding_matcher import DIM, _chunk_text, _embed_texts
            chunks = _chunk_text(text)
            vectors = await asyncio.wait_for(_embed_texts(chunks), timeout=90.0)
            ctx["chunk_vecs"] = [vec for vec in vectors if vec and len(vec) == DIM]
        except Exception as e:  # includes asyncio.TimeoutError
            logger.warning("tiered prepare_doc embedding inaktiv: %s", e)
    return ctx
|
|
||||||
|
|
||||||
|
|
||||||
async def _embed_present(crits: list[str], ctx: dict, thr: float) -> dict[str, Optional[bool]]:
|
|
||||||
cvecs = ctx.get("chunk_vecs")
|
|
||||||
if not cvecs:
|
|
||||||
return {c: None for c in crits}
|
|
||||||
try:
|
|
||||||
from compliance.services.mc_embedding_matcher import DIM, _cosine, _embed_texts
|
|
||||||
pv = await _embed_texts(crits)
|
|
||||||
out: dict[str, Optional[bool]] = {}
|
|
||||||
for crit, v in zip(crits, pv):
|
|
||||||
if not v or len(v) != DIM:
|
|
||||||
out[crit] = None
|
|
||||||
else:
|
|
||||||
out[crit] = max((_cosine(v, cv) for cv in cvecs), default=0.0) >= thr
|
|
||||||
return out
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("tiered embed present: %s", e)
|
|
||||||
return {c: None for c in crits}
|
|
||||||
|
|
||||||
|
|
||||||
async def _llm_met(cid: str, idx: int, crit: str, doc, dh: str) -> Optional[bool]:
    """Return the LLM verdict for one criterion, using the judge cache.

    A cached verdict is returned directly. Otherwise the criterion is checked
    through the router as a CONTENT/LLM spec; a definite result is cached and
    returned, an undetermined one (present None) is returned as None and not
    cached.
    """
    cache_key = _ckey(dh, cid, idx, crit)
    hit = _cache_get(cache_key)
    if hit is not None:
        return hit

    from compliance.services.checkers.router import build_spec, route_and_check
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec(cid, classification, label=crit, criteria=[crit])
    outcome = await route_and_check(spec, doc)
    if outcome.present is None:
        return None

    verdict = bool(outcome.present)
    _cache_put(cache_key, verdict)
    return verdict
|
|
||||||
|
|
||||||
|
|
||||||
def _status(lm_vals: list[Optional[bool]]) -> str:
|
|
||||||
if not lm_vals:
|
|
||||||
return "ERFÜLLT" # kein gesetzliches Minimum → nie rot
|
|
||||||
if any(m is None for m in lm_vals):
|
|
||||||
return "UNBESTIMMT" # Aufrufer behält Legacy
|
|
||||||
n = sum(1 for m in lm_vals if m)
|
|
||||||
if n == len(lm_vals):
|
|
||||||
return "ERFÜLLT"
|
|
||||||
return "FEHLT" if n == 0 else "TEILWEISE"
|
|
||||||
|
|
||||||
|
|
||||||
async def evaluate_tiered(control_id: str, tiered_criteria: list[dict],
                          ctx: dict, doc) -> dict[str, Any]:
    """Evaluate all tiered criteria of one control.

    Criteria whose decision_method is "LLM" go through `_llm_met`; all others
    (default "EMBEDDING") are resolved in one `_embed_present` batch. Entries
    without a criterion text are skipped.

    Returns:
        dict with
        - "status": `_status` over the LEGAL_MINIMUM verdicts,
        - "lm_met" / "lm_total": fulfilled / total LEGAL_MINIMUM criteria,
        - "recommendations": non-LEGAL_MINIMUM criteria judged False,
        - "detail": one entry per evaluated criterion.
    """
    entries = tiered_criteria or []
    dh = ctx.get("hash") or _doc_hash(getattr(doc, "text", "") or "")

    def _uses_llm(entry: dict) -> bool:
        return (entry.get("decision_method") or "EMBEDDING").upper() == "LLM"

    # Embedding criteria are resolved up front in a single batch.
    emb_texts = [entry["criterion"] for entry in entries
                 if entry.get("criterion") and not _uses_llm(entry)]
    emb_res = await _embed_present(emb_texts, ctx, _EMBED_THR) if emb_texts else {}

    lm_vals: list[Optional[bool]] = []
    recs: list[dict] = []
    detail: list[dict] = []
    for position, entry in enumerate(entries):
        criterion = entry.get("criterion") or ""
        if not criterion:
            continue
        tier = (entry.get("compliance_tier") or "").upper()
        if _uses_llm(entry):
            # `position` is the index in the unfiltered list; it is part of the cache key.
            met = await _llm_met(control_id, position, criterion, doc, dh)
            origin = "haiku-cache"
        else:
            met = emb_res.get(criterion)
            origin = "embedding"
        detail.append({"criterion": criterion, "tier": tier, "met": met, "source": origin})

        if tier == LM:
            lm_vals.append(met)
        elif met is False:
            # Non-mandatory tiers never affect the status, only the recommendations.
            recs.append({"criterion": criterion, "tier": tier or "OPTIONAL",
                         "legal_basis": entry.get("legal_basis")})

    lm_met = len([verdict for verdict in lm_vals if verdict])
    return {"status": _status(lm_vals), "lm_met": lm_met,
            "lm_total": len(lm_vals), "recommendations": recs, "detail": detail}
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_tiered_criteria(cids: list[str], db_url: str = "") -> dict[str, list]:
    """Load `tiered_criteria` for the given controls from canonical_controls.

    Args:
        cids: control ids; falsy entries are ignored.
        db_url: DSN; when empty, DATABASE_URL and then COMPLIANCE_DATABASE_URL
            are consulted.

    Returns:
        {control_id: tiered_criteria} for controls that have non-empty tiered
        criteria. An empty dict when there is nothing to look up, no DSN is
        configured, or connecting/querying fails (the failure is logged).
    """
    cids = [c for c in cids if c]
    if not cids:
        return {}
    import json
    dsn = db_url or os.getenv("DATABASE_URL") or os.getenv("COMPLIANCE_DATABASE_URL")
    if not dsn:
        return {}
    try:
        import asyncpg
        conn = await asyncpg.connect(dsn)
        try:
            rows = await conn.fetch(
                "select control_id, generation_metadata->'tiered_criteria' tc "
                "from compliance.canonical_controls "
                "where control_id = any($1::text[]) "
                "and generation_metadata ? 'tiered_criteria'", cids)
        finally:
            # Close even when the query raises, so a failing fetch does not
            # leave the connection open.
            await conn.close()
    except Exception as e:
        logger.warning("fetch_tiered_criteria failed: %s", e)
        return {}
    out: dict[str, list] = {}
    for r in rows:
        tc = r["tc"]
        # The JSON value may come back as text; decode it in that case.
        tc = json.loads(tc) if isinstance(tc, str) else tc
        if tc:
            out[r["control_id"]] = tc
    return out
|
|
||||||
@@ -129,41 +129,11 @@ async def run_v3_pipeline(
|
|||||||
r["source"] = (r.get("source") or "") + "+embedding"
|
r["source"] = (r.get("source") or "") + "+embedding"
|
||||||
embedding_passes += 1
|
embedding_passes += 1
|
||||||
|
|
||||||
# Layer 3: getierte 3-Status-Auswertung (nur Controls mit tiered_criteria).
|
|
||||||
# Reproduzierbar: EMBEDDING-Präsenz (deterministisch) + GECACHTER Haiku-Judge
|
|
||||||
# nur für Sufficiency. UNBESTIMMT → Legacy-Pass bleibt. Gated + fail-safe.
|
|
||||||
tiered_evaluated = 0
|
|
||||||
try:
|
|
||||||
from compliance.services.checkers.base import DocContext
|
|
||||||
from ._tiered_eval import (
|
|
||||||
evaluate_tiered, fetch_tiered_criteria, prepare_doc,
|
|
||||||
)
|
|
||||||
result_cids = [r.get("control_id") for r in results if r.get("control_id")]
|
|
||||||
tiered_map = await fetch_tiered_criteria(result_cids, db_url)
|
|
||||||
if tiered_map:
|
|
||||||
ctx = await prepare_doc(text)
|
|
||||||
doc_ctx = DocContext(text=text)
|
|
||||||
for r in results:
|
|
||||||
tc = tiered_map.get(r.get("control_id"))
|
|
||||||
if not tc:
|
|
||||||
continue
|
|
||||||
ev = await evaluate_tiered(r["control_id"], tc, ctx, doc_ctx)
|
|
||||||
if ev["status"] == "UNBESTIMMT":
|
|
||||||
continue
|
|
||||||
r["compliance_status"] = ev["status"]
|
|
||||||
r["recommendations"] = ev["recommendations"]
|
|
||||||
r["tier_lm"] = f"{ev['lm_met']}/{ev['lm_total']}"
|
|
||||||
r["passed"] = ev["status"] == "ERFÜLLT"
|
|
||||||
tiered_evaluated += 1
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("dse tiered eval skipped: %s", e)
|
|
||||||
|
|
||||||
telemetry = {
|
telemetry = {
|
||||||
"layer_0_field_hits": len(boost_field_ids),
|
"layer_0_field_hits": len(boost_field_ids),
|
||||||
"layer_0_field_ids": boost_field_ids,
|
"layer_0_field_ids": boost_field_ids,
|
||||||
"layer_1_pass": layer_1_pass,
|
"layer_1_pass": layer_1_pass,
|
||||||
"embedding_passes": embedding_passes,
|
"embedding_passes": embedding_passes,
|
||||||
"tiered_evaluated": tiered_evaluated,
|
|
||||||
"total_mcs": len(results),
|
"total_mcs": len(results),
|
||||||
"sector_dropped": drop_stats.get("sector_dropped", 0),
|
"sector_dropped": drop_stats.get("sector_dropped", 0),
|
||||||
"offtopic_dropped": drop_stats.get("offtopic_dropped", 0),
|
"offtopic_dropped": drop_stats.get("offtopic_dropped", 0),
|
||||||
|
|||||||
@@ -1,51 +0,0 @@
|
|||||||
"""Prüfer-Router: build_spec aus sensor_classification + method-agnostischer
|
|
||||||
Dispatch. CONTENT/LLM -> Haiku-Sufficiency-Tier (validiert), unbekannte
|
|
||||||
decision_methods -> fail-safe present=None."""
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, patch
|
|
||||||
|
|
||||||
from compliance.services.checkers.base import DocContext
|
|
||||||
from compliance.services.checkers.router import build_spec, route_and_check
|
|
||||||
|
|
||||||
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_spec_content_llm_uses_haiku():
    """CONTENT/LLM classification: spec keeps both methods, names the haiku judge,
    and carries the criteria as paraphrases."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["a", "b"])
    assert spec.verification_method == "CONTENT"
    assert spec.decision_method == "LLM"
    assert spec.extra.get("judge") == "haiku"
    assert spec.paraphrases == ["a", "b"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_spec_embedding_no_haiku():
    """CONTENT/EMBEDDING classification: the spec has no judge entry in ``extra``."""
    classification = {"verification_method": "CONTENT", "decision_method": "EMBEDDING"}
    spec = build_spec("X", classification)
    assert spec.extra.get("judge") is None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_route_unknown_decision_is_failsafe():
    """A BEHAVIOR/PLAYWRIGHT spec routes to no checker: ``present`` is None and
    the source mentions ``no_checker``."""
    classification = {"verification_method": "BEHAVIOR", "decision_method": "PLAYWRIGHT"}
    spec = build_spec("X", classification)
    document = DocContext(text="x" * 200)
    result = await route_and_check(spec, document)
    assert result.present is None
    assert "no_checker" in result.source
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_route_content_llm_haiku_fehlt():
    """A negative judge reply makes the CONTENT/LLM route report ``present is False``
    with source ``haiku``; the patched Anthropic call is used at least once."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification,
                      label="Speicherdauer", criteria=["Höchstdauer pro Kategorie"])
    judge_reply = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    fake = AsyncMock(return_value=judge_reply)
    document = DocContext(text="Wir nutzen Cookies. " * 30)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is False
    assert result.source == "haiku"
    assert fake.call_count >= 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_route_content_llm_haiku_erfuellt():
    """A positive judge reply makes the CONTENT/LLM route report ``present is True``."""
    classification = {"verification_method": "CONTENT", "decision_method": "LLM"}
    spec = build_spec("X", classification, label="L", criteria=["x"])
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.8}')
    document = DocContext(text="text " * 40)
    with patch(_ANTHROPIC, new=fake):
        result = await route_and_check(spec, document)
    assert result.present is True
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
"""Tests for the cookie-policy applicability gate: controls without a
|
|
||||||
COOKIE_POLICY artifact are routed out of the findings scan (not deleted),
|
|
||||||
and the gate is fail-safe (no DSN -> no filter)."""
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from compliance.services.specialist_agents.cookie_policy._classification_gate import (
|
|
||||||
apply_gate, load_cookie_gate,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_apply_gate_splits_kept_and_routed():
    """Controls listed in the gate are routed out; the unlisted one is kept.

    Routed entries must still carry their title and the gate's
    ``applicable_artifacts`` for downstream routing.
    """
    tom_meta = {"obligation_type": "TECHNICAL", "check_intent": "DIRECT_TECHNICAL",
                "applicable_artifacts": ["TOM", "AUDIT"]}
    banner_meta = {"obligation_type": "TECHNICAL", "check_intent": "DIRECT_TECHNICAL",
                   "applicable_artifacts": ["COOKIE_BANNER", "SYSTEMSCAN"]}
    gate = {"TOM-1": tom_meta, "BAN-1": banner_meta}
    controls = [
        {"control_id": "COOK-1", "title": "Kategorien"},
        {"control_id": "TOM-1", "title": "Verschlüsselung"},
        {"control_id": "BAN-1", "title": "Consent vor Setzen"},
    ]

    kept, routed = apply_gate(controls, gate)

    kept_ids = [c["control_id"] for c in kept]
    routed_ids = {c["control_id"] for c in routed}
    assert kept_ids == ["COOK-1"]
    assert routed_ids == {"TOM-1", "BAN-1"}
    # routed entries carry title + classification metadata for downstream routing
    tom = next(c for c in routed if c["control_id"] == "TOM-1")
    assert tom["title"] == "Verschlüsselung"
    assert tom["applicable_artifacts"] == ["TOM", "AUDIT"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_apply_gate_empty_gate_keeps_all():
    """With an empty gate every control is kept and nothing is routed."""
    controls = [{"control_id": cid} for cid in ("A", "B")]
    kept, routed = apply_gate(controls, {})
    assert len(kept) == 2
    assert routed == []
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_load_cookie_gate_no_dsn_is_failsafe(monkeypatch):
    """Without any DSN (argument or environment) the gate loader returns an empty dict."""
    for var in ("DATABASE_URL", "COMPLIANCE_DATABASE_URL"):
        monkeypatch.delenv(var, raising=False)
    gate = await load_cookie_gate("")
    assert gate == {}
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
"""Layer-3 cookie sufficiency-judge: only embedding/boost-RESCUED passes are
|
|
||||||
re-judged by Haiku; keyword passes are untouched; a FEHLT verdict un-passes."""
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, patch
|
|
||||||
|
|
||||||
from compliance.services.specialist_agents.cookie_policy._sufficiency_judge import (
|
|
||||||
judge_rescued,
|
|
||||||
)
|
|
||||||
|
|
||||||
_ANTHROPIC = "compliance.services.llm_cascade._call_anthropic"
|
|
||||||
_DOC = "Volltext der Cookie-Richtlinie mit ausreichend Inhalt. " * 4
|
|
||||||
|
|
||||||
|
|
||||||
def _r(cid, source, passed=True):
|
|
||||||
return {"control_id": cid, "source": source, "passed": passed,
|
|
||||||
"label": cid, "_pass_criteria": ["konkrete Angabe nötig"]}
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_rescued_unpassed_when_judge_fehlt():
    """An embedding-rescued pass is un-passed and tagged ``+llm_failed`` when the
    judge answers negatively; the function reports one change."""
    results = [_r("A", "keyword+embedding")]
    judge_reply = '{"erfuellt": false, "confidence": 0.9, "begruendung": "fehlt"}'
    fake = AsyncMock(return_value=judge_reply)
    with patch(_ANTHROPIC, new=fake):
        changed = await judge_rescued(_DOC, results)
    entry = results[0]
    assert changed == 1
    assert entry["passed"] is False
    assert "+llm_failed" in entry["source"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_rescued_kept_when_judge_erfuellt():
    """An embedding-rescued pass stays passed when the judge answers positively."""
    results = [_r("A", "keyword+embedding")]
    fake = AsyncMock(return_value='{"erfuellt": true, "confidence": 0.9}')
    with patch(_ANTHROPIC, new=fake):
        changed = await judge_rescued(_DOC, results)
    assert changed == 0
    assert results[0]["passed"] is True
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_keyword_pass_not_judged():
    """Controls that passed deterministically (keyword) are NOT sent to the judge."""
    results = [_r("A", "keyword")]
    fake = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=fake):
        changed = await judge_rescued(_DOC, results)
    assert changed == 0
    assert results[0]["passed"] is True
    assert fake.call_count == 0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_boost_rescue_is_judged():
    """A regex-boost-rescued pass is judged too and un-passed on a negative reply."""
    results = [_r("A", "keyword+regex_boost")]
    fake = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=fake):
        changed = await judge_rescued(_DOC, results)
    assert changed == 1
    assert results[0]["passed"] is False
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_failed_controls_ignored():
    """Controls that did not pass are out of scope for this layer: no change, no judge call."""
    results = [_r("A", "keyword+embedding", passed=False)]
    fake = AsyncMock(return_value='{"erfuellt": false}')
    with patch(_ANTHROPIC, new=fake):
        changed = await judge_rescued(_DOC, results)
    assert changed == 0
    assert fake.call_count == 0
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
"""Regression tests for the OVH (gpt-oss-120b) tier of the LLM cascade.
|
|
||||||
|
|
||||||
gpt-oss-120b is a reasoning model: it spends output tokens on chain-of-thought
|
|
||||||
before the answer. Two bugs this pins:
|
|
||||||
1. A small max_tokens (deep_check passed 400) length-caps it mid-reasoning →
|
|
||||||
content=null → the tier silently returns nothing. _call_ovh must floor the
|
|
||||||
budget so reasoning + the JSON answer fit.
|
|
||||||
2. When length-capped, the JSON can land in reasoning_content, not content →
|
|
||||||
_call_ovh must fall back to reasoning_content.
|
|
||||||
"""
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
|
|
||||||
from compliance.services import llm_cascade
|
|
||||||
|
|
||||||
|
|
||||||
def _resp(data):
|
|
||||||
r = MagicMock()
|
|
||||||
r.raise_for_status = MagicMock()
|
|
||||||
r.json = MagicMock(return_value=data)
|
|
||||||
return r
|
|
||||||
|
|
||||||
|
|
||||||
def _client(resp):
|
|
||||||
inst = AsyncMock()
|
|
||||||
inst.post.return_value = resp
|
|
||||||
inst.__aenter__ = AsyncMock(return_value=inst)
|
|
||||||
inst.__aexit__ = AsyncMock(return_value=False)
|
|
||||||
return inst
|
|
||||||
|
|
||||||
|
|
||||||
class TestCallOvhReasoning:
    """Regression tests for ``_call_ovh``: output fallback and token-budget handling."""

    @staticmethod
    def _configure(monkeypatch, key=None):
        """Set the OVH URL/model env vars; set the key only when one is given."""
        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
        if key is not None:
            monkeypatch.setenv("OVH_LLM_KEY", key)

    @pytest.mark.asyncio
    async def test_reasoning_content_used_when_content_null(self, monkeypatch):
        """With ``content`` null, the answer is taken from ``reasoning_content``."""
        self._configure(monkeypatch, key="k")
        message = {
            "content": None,
            "reasoning_content": '{"erfuellt": true, "confidence": 0.9}',
        }
        resp = _resp({"choices": [{"message": message}]})
        with patch("httpx.AsyncClient", return_value=_client(resp)):
            out = await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        assert '"erfuellt": true' in out

    @pytest.mark.asyncio
    async def test_small_budget_is_floored(self, monkeypatch):
        """A ``max_tokens`` of 400 is sent as at least 2000."""
        self._configure(monkeypatch)
        inst = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
        with patch("httpx.AsyncClient", return_value=inst):
            await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        sent = inst.post.call_args.kwargs["json"]
        assert sent["max_tokens"] >= 2000

    @pytest.mark.asyncio
    async def test_large_budget_is_preserved(self, monkeypatch):
        """A ``max_tokens`` of 6000 is sent unchanged."""
        self._configure(monkeypatch)
        inst = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
        with patch("httpx.AsyncClient", return_value=inst):
            await llm_cascade._call_ovh("sys", "user", max_tokens=6000)
        sent = inst.post.call_args.kwargs["json"]
        assert sent["max_tokens"] == 6000

    @pytest.mark.asyncio
    async def test_content_preferred_when_present(self, monkeypatch):
        """When ``content`` is present it is returned as-is, not ``reasoning_content``."""
        self._configure(monkeypatch)
        message = {"content": '{"erfuellt": false}', "reasoning_content": "noise"}
        resp = _resp({"choices": [{"message": message}]})
        with patch("httpx.AsyncClient", return_value=_client(resp)):
            out = await llm_cascade._call_ovh("sys", "user")
        assert out == '{"erfuellt": false}'

    @pytest.mark.asyncio
    async def test_unconfigured_returns_empty(self, monkeypatch):
        """Without URL and model in the environment the call returns an empty string."""
        for var in ("OVH_LLM_URL", "OVH_LLM_MODEL"):
            monkeypatch.delenv(var, raising=False)
        out = await llm_cascade._call_ovh("sys", "user")
        assert out == ""
|
|
||||||
@@ -1,102 +0,0 @@
|
|||||||
"""Unit-Tests für die getierte 3-Status-Auswertung (_tiered_eval).
|
|
||||||
|
|
||||||
Deckt ab: Status-Logik (inkl. kein-LM → ERFÜLLT, UNBESTIMMT bei nicht bewertbar),
|
|
||||||
Empfehlungs-Sammlung, EMBEDDING/LLM-Routing (gemockt) und den Reproduzierbarkeits-
|
|
||||||
Cache. Embedding/LLM werden gemockt — kein Netzwerk."""
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
from compliance.services.specialist_agents.dse import _tiered_eval as te
|
|
||||||
|
|
||||||
|
|
||||||
# ---- reine Status-Logik -------------------------------------------------
|
|
||||||
def test_status_no_lm_is_erfuellt():
    """An empty verdict list yields ERFÜLLT."""
    verdicts = []
    assert te._status(verdicts) == "ERFÜLLT"
|
|
||||||
|
|
||||||
|
|
||||||
def test_status_all_met_erfuellt():
    """All verdicts True yields ERFÜLLT."""
    verdicts = [True, True]
    assert te._status(verdicts) == "ERFÜLLT"
|
|
||||||
|
|
||||||
|
|
||||||
def test_status_none_met_fehlt():
    """All verdicts False yields FEHLT."""
    verdicts = [False, False]
    assert te._status(verdicts) == "FEHLT"
|
|
||||||
|
|
||||||
|
|
||||||
def test_status_partial_teilweise():
    """A mix of True and False verdicts yields TEILWEISE."""
    verdicts = [True, False]
    assert te._status(verdicts) == "TEILWEISE"
|
|
||||||
|
|
||||||
|
|
||||||
def test_status_any_none_unbestimmt():
    """A None verdict next to a True one yields UNBESTIMMT."""
    verdicts = [True, None]
    assert te._status(verdicts) == "UNBESTIMMT"
|
|
||||||
|
|
||||||
|
|
||||||
# ---- evaluate_tiered (Embedding/LLM gemockt) ----------------------------
|
|
||||||
def _crit(text, tier, dm="EMBEDDING"):
|
|
||||||
return {"criterion": text, "compliance_tier": tier,
|
|
||||||
"decision_method": dm, "legal_basis": "x"}
|
|
||||||
|
|
||||||
|
|
||||||
class _Doc:
|
|
||||||
def __init__(self, text):
|
|
||||||
self.text = text
|
|
||||||
|
|
||||||
|
|
||||||
def test_evaluate_partial_with_recommendation(monkeypatch):
    """One of two LEGAL_MINIMUM criteria met gives TEILWEISE (1/2); the unmet
    BEST_PRACTICE criterion shows up as the single recommendation."""
    presence = {"Zwecke genannt": True, "Speicherdauer genannt": False,
                "tabellarisch ausgewiesen": False}

    async def fake_embed(texts, ctx, thr):
        return dict(presence)

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    crits = [
        _crit("Zwecke genannt", "LEGAL_MINIMUM"),
        _crit("Speicherdauer genannt", "LEGAL_MINIMUM"),
        _crit("tabellarisch ausgewiesen", "BEST_PRACTICE"),
    ]
    out = asyncio.run(te.evaluate_tiered("C1", crits, {"hash": "h"}, _Doc("x" * 200)))
    assert out["status"] == "TEILWEISE"
    assert out["lm_met"] == 1
    assert out["lm_total"] == 2
    recs = out["recommendations"]
    assert len(recs) == 1
    assert recs[0]["tier"] == "BEST_PRACTICE"
|
|
||||||
|
|
||||||
|
|
||||||
def test_evaluate_no_lm_is_erfuellt_with_recs(monkeypatch):
    """Only OPTIONAL criteria, none present: status ERFÜLLT, zero LEGAL_MINIMUM
    criteria counted, both criteria returned as recommendations."""
    async def fake_embed(texts, ctx, thr):
        return dict.fromkeys(texts, False)

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    crits = [_crit(name, "OPTIONAL") for name in ("Bildsymbole", "Legende")]
    out = asyncio.run(te.evaluate_tiered("C2", crits, {"hash": "h"}, _Doc("x" * 200)))
    assert out["status"] == "ERFÜLLT"
    assert out["lm_total"] == 0
    assert len(out["recommendations"]) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_evaluate_llm_criterion_routed(monkeypatch):
    """A single LEGAL_MINIMUM criterion with decision method LLM, with ``_llm_met``
    patched to return True, gives ERFÜLLT with one criterion counted."""
    async def fake_llm(cid, idx, crit, doc, dh):
        return True

    monkeypatch.setattr(te, "_llm_met", fake_llm)
    crits = [_crit("Speicherdauer hinreichend nachvollziehbar", "LEGAL_MINIMUM", dm="LLM")]
    out = asyncio.run(te.evaluate_tiered("C3", crits, {"hash": "h"}, _Doc("x" * 200)))
    assert out["status"] == "ERFÜLLT"
    assert out["lm_total"] == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_evaluate_unbestimmt_when_embed_unavailable(monkeypatch):
    """When the embedding check returns None for every text, the status is UNBESTIMMT."""
    async def fake_embed(texts, ctx, thr):
        # Simulates the embedding service being down.
        return dict.fromkeys(texts, None)

    monkeypatch.setattr(te, "_embed_present", fake_embed)
    crits = [_crit("Zwecke genannt", "LEGAL_MINIMUM")]
    out = asyncio.run(te.evaluate_tiered("C4", crits, {"hash": "h"}, _Doc("x" * 200)))
    assert out["status"] == "UNBESTIMMT"
|
|
||||||
|
|
||||||
|
|
||||||
# ---- Reproduzierbarkeits-Cache -----------------------------------------
|
|
||||||
def test_cache_roundtrip(monkeypatch, tmp_path):
    """Unknown keys read as None; stored True/False values read back unchanged."""
    cache_file = tmp_path / "cache.db"
    monkeypatch.setattr(te, "_CACHE_DB", str(cache_file))
    assert te._cache_get("k1") is None
    te._cache_put("k1", True)
    te._cache_put("k2", False)
    assert te._cache_get("k1") is True
    assert te._cache_get("k2") is False
|
|
||||||
@@ -1,155 +0,0 @@
|
|||||||
# Kriterien-Meta-Modell & Compliance-Tier-Architektur
|
|
||||||
|
|
||||||
> **Status: EINGEFROREN 2026-06-22.** Änderungen an diesem Modell sind
|
|
||||||
> Architekturentscheidungen und erfordern eine bewusste Freigabe (DB-Owner /
|
|
||||||
> Produktverantwortung). Verwandt: [`platform_checker_matrix.md`](platform_checker_matrix.md),
|
|
||||||
> [`verification_method.md`](verification_method.md), [`platform_validation_v1.md`](platform_validation_v1.md).
|
|
||||||
|
|
||||||
## 1. Motivation
|
|
||||||
|
|
||||||
Die Kalibrierung der vier Website-Compliance-Module deckte vier **verschiedene**
|
|
||||||
dominante Fehlerursachen auf:
|
|
||||||
|
|
||||||
| Modul | Dominanter Hebel |
|
|
||||||
|-------|------------------|
|
|
||||||
| Cookie-Policy | Sufficiency (Judge) |
|
|
||||||
| Impressum | Scope / Routing |
|
|
||||||
| AGB | Decision-Method / Routing |
|
|
||||||
| DSE | **Überladene Controls + Vermischung „gesetzliches Minimum vs. Best Practice"** |
|
|
||||||
|
|
||||||
Die DSE-Untersuchung (Adjudikation von 13 Judge↔GT-Disagreements) ergab: **85 % der
|
|
||||||
Restfehler sind Katalog-Defekte, 15 % Prüfer.** Der größte Einzeldefekt: ein Control
|
|
||||||
bündelt mehrere Anforderungen **unterschiedlicher Verbindlichkeit** und wird nur dann
|
|
||||||
als ERFÜLLT gewertet, wenn *alle* erfüllt sind. Folge: gesetzlich konforme Dokumente
|
|
||||||
werden als „FEHLT" gemeldet, weil eine Best-Practice-Empfehlung fehlt.
|
|
||||||
|
|
||||||
Dieses Modell behebt das **im Katalog** — ohne den Prüfer zu ändern und ohne Controls
|
|
||||||
physisch aufzuspalten.
|
|
||||||
|
|
||||||
## 2. Datenmodell
|
|
||||||
|
|
||||||
Ein Control bleibt **stabil** (UUID, Citations, GT-Historie, Kalibrierung,
|
|
||||||
Statistiken). Seine `pass_criteria` werden von einer Stringliste zu **atomaren,
|
|
||||||
getypten Kriterien-Objekten**:
|
|
||||||
|
|
||||||
```
|
|
||||||
Control (stabile control_uuid — NICHT splitten)
|
|
||||||
└─ criteria: Criterion[]
|
|
||||||
|
|
||||||
Criterion
|
|
||||||
├─ criterion (Text der Einzelanforderung)
|
|
||||||
├─ legal_basis (z. B. "Art. 13(1)(c) DSGVO")
|
|
||||||
├─ verification_method (Achse 1 — WAS wird geprüft)
|
|
||||||
├─ decision_method (Achse 2 — WIE wird entschieden)
|
|
||||||
├─ compliance_tier (Achse 3 — WIE VERBINDLICH)
|
|
||||||
└─ weight (reserviert für Reifegrad, s. §6 — heute NICHT gating)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Speicherort:** `canonical_controls.generation_metadata->'tiered_criteria'` (jsonb).
|
|
||||||
**Keine Schema-Änderung.** Kein physischer Control-Split (Variante A wurde verworfen:
|
|
||||||
neue UUIDs → Verlust von Benchmarks/Kalibrierung/Citation/GT = Migrationsprojekt).
|
|
||||||
|
|
||||||
## 3. Die drei Achsen
|
|
||||||
|
|
||||||
Jedes Kriterium trägt drei **unabhängige** Klassifikationen:
|
|
||||||
|
|
||||||
1. **`verification_method`** — artefakt-abhängig: CONTENT · FIELD · REFERENCE ·
|
|
||||||
BEHAVIOR · PRESENTATION · PROCESS · TECHNICAL · CONTRACTUAL. Siehe
|
|
||||||
[`verification_method.md`](verification_method.md).
|
|
||||||
2. **`decision_method`** — welcher Prüfer: REGEX · EMBEDDING · LLM · LINK_RESOLVER ·
|
|
||||||
PLAYWRIGHT · AUDIT · SCANNER. Siehe [`platform_checker_matrix.md`](platform_checker_matrix.md).
|
|
||||||
3. **`compliance_tier`** *(neu, dieses Dokument)* — Verbindlichkeit:
|
|
||||||
- **`LEGAL_MINIMUM`** — gesetzlich erforderlich. Beeinflusst den Compliance-Status.
|
|
||||||
- **`BEST_PRACTICE`** — empfehlenswert, gesetzlich nicht erforderlich. Erscheint als
|
|
||||||
Empfehlung. Beeinflusst den Status **nie**.
|
|
||||||
- **`OPTIONAL`** — Komfort/Detailtiefe. Empfehlung. Beeinflusst den Status **nie**.
|
|
||||||
|
|
||||||
Achse 1 + 2 sind primär **per Kriterium** (atomar); ein Control kann Kriterien
|
|
||||||
verschiedener Methoden mischen.
|
|
||||||
|
|
||||||
## 4. Status-Berechnung (3 Zustände) — Gating NUR auf LEGAL_MINIMUM
|
|
||||||
|
|
||||||
Sei `LM` die Menge der `LEGAL_MINIMUM`-Kriterien eines Controls und `met(LM)` die
|
|
||||||
erfüllten darunter:
|
|
||||||
|
|
||||||
```
|
|
||||||
ERFÜLLT := |LM| > 0 und met(LM) == |LM| (alle Pflicht-Kriterien erfüllt)
|
|
||||||
TEILWEISE := 0 < met(LM) < |LM| (mind. eines erfüllt, mind. eines fehlt)
|
|
||||||
FEHLT := |LM| > 0 und met(LM) == 0 (kein Pflicht-Kriterium erfüllt)
|
|
||||||
```
|
|
||||||
|
|
||||||
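`BEST_PRACTICE`/`OPTIONAL`-Kriterien gehen **nicht** in diese Berechnung ein. Sie
werden separat als Empfehlungen ausgewiesen (§5, Ebene 2).

Randfälle (so in den Unit-Tests der Auswertung festgehalten): Ein Control ohne
`LEGAL_MINIMUM`-Kriterium (|LM| = 0) gilt als ERFÜLLT; ist ein
`LEGAL_MINIMUM`-Kriterium nicht bewertbar, ist der Status UNBESTIMMT.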
|
|
||||||
|
|
||||||
> **Invariante:** Ein erfülltes gesetzliches Minimum darf NIE durch fehlende
|
|
||||||
> Best-Practice-/Optional-Kriterien auf FEHLT/Rot gezogen werden.
|
|
||||||
|
|
||||||
## 5. Reporting — drei Ebenen
|
|
||||||
|
|
||||||
| Ebene | Inhalt | Quelle |
|
|
||||||
|-------|--------|--------|
|
|
||||||
| **1 — Compliance-Status (rechtlich)** | ERFÜLLT / TEILWEISE / FEHLT | NUR `LEGAL_MINIMUM` |
|
|
||||||
| **2 — Optimierungspotenzial** | „Empfehlungen: N · Best-Practice-Abdeckung X %" | `BEST_PRACTICE` + `OPTIONAL` |
|
|
||||||
| **3 — Risiko-Reifegrad** *(optional, später)* | „Reifegrad Y %" für CRA/NIS2/ISO 27001/TOM | gewichtet, s. §6 |
|
|
||||||
|
|
||||||
**Anti-Pattern (verboten):** kein „Compliance-Score = 72 %", wenn alle gesetzlichen
|
|
||||||
Anforderungen erfüllt sind. Das erzeugt „welche 28 % fehlen?" → „eigentlich keine
|
|
||||||
Pflicht" → der Score wird wertlos.
|
|
||||||
|
|
||||||
### Farb-Semantik (Bedeutung, nicht Wertung)
|
|
||||||
|
|
||||||
- **Grün** = gesetzliche Anforderungen erfüllt (Pflicht erfüllt)
|
|
||||||
- **Blau** = empfohlene Verbesserungen vorhanden (Optimierung möglich)
|
|
||||||
- **Rot** = gesetzliche Anforderungen fehlen (Pflichtverletzung)
|
|
||||||
|
|
||||||
`TEILWEISE` ist visuell ein eigener Zustand (z. B. Gelb/Amber): Pflicht teilweise
|
|
||||||
erfüllt. Verbindet sich mit der BreakPilot-Tonalität (kein Panik-Rot) und dem
|
|
||||||
3-Tier-Obligation-Modell (Pflicht/Empfehlung/Kann).
|
|
||||||
|
|
||||||
## 6. `weight`
|
|
||||||
|
|
||||||
Wird heute **gespeichert, aber nicht für das Gating verwendet** (bewusste
|
|
||||||
Entscheidung: Gewichte erzeugen sofort „warum 0.3 und nicht 0.4?"-Diskussionen). Es
|
|
||||||
ist die Reserve für **Ebene 3 (Reifegrad)**: später lässt sich daraus ein gewichteter
|
|
||||||
Best-Practice-/Reifegrad-Prozentwert berechnen. Richtwerte: LEGAL_MINIMUM 1.0 ·
|
|
||||||
BEST_PRACTICE ~0.3 · OPTIONAL ~0.1.
|
|
||||||
|
|
||||||
## 7. compliance_tier ist eine PLATTFORM-Achse
|
|
||||||
|
|
||||||
Nicht nur ein DSE-Fix. Dasselbe Muster tritt überall auf — DSE (Minimum vs. BP),
|
|
||||||
Cookie (Offenlegung vs. Transparenz), Impressum (Pflicht- vs. Komfortfelder), AGB
|
|
||||||
(erforderlich vs. empfehlenswert) und perspektivisch CRA/NIS2/Maschinenverordnung.
|
|
||||||
Ein einzelnes Kriterium trägt überall `compliance_tier`; die Plattform wertet
|
|
||||||
**Compliance / Empfehlungen / Reifegrad** regulierungsunabhängig aus.
|
|
||||||
|
|
||||||
## 8. Validierungsnachweis (Pilot, 2026-06-22)
|
|
||||||
|
|
||||||
Geschrieben auf macmini (`generation_metadata.tiered_criteria`, prod-guarded), gemessen
|
|
||||||
gegen Opus-GT (ikea/ob/teamviewer):
|
|
||||||
|
|
||||||
- **5 Pilot-Controls** (SEC-7285-A03, SEC-3257-A01, Portabilitäts-Cluster
|
|
||||||
DATA-1613/DATA-2552/COMP-2087): alle **6 Disagreement-Fälle** (vormals falsch-FEHLT)
|
|
||||||
wandern zu **ERFÜLLT + Empfehlungen**; echte Lücken bleiben korrekt FEHLT — ohne
|
|
||||||
Prüfer-Änderung.
|
|
||||||
- **TEILWEISE-Validierung** (DATA-1445-A02, SEC-4752-A02): der 3. Status tritt real auf
|
|
||||||
(1 ERFÜLLT / 5 TEILWEISE), Splitter durchgängig „Speicherdauer pro Zweck"
|
|
||||||
(Art. 13(2)(a)).
|
|
||||||
- Lehre: selbst Pilot-Kriterien können Minimum + Best-Practice vermischen
|
|
||||||
(„Speicherdauer *pro Zweck*"). Die LM/BP-Linie ist eine **Produktpolitik-Entscheidung
|
|
||||||
(Mensch)**, kein NLP-Problem. Das Modell ist korrekt; die Kriterien-Schärfe ist
|
|
||||||
Kurationsarbeit.
|
|
||||||
|
|
||||||
## 9. Invarianten (nicht verletzen)
|
|
||||||
|
|
||||||
1. Control-UUID bleibt stabil — **kein** physischer Split.
|
|
||||||
2. Status (Grün/Gelb/Rot) hängt **ausschließlich** an `LEGAL_MINIMUM`.
|
|
||||||
3. `BEST_PRACTICE`/`OPTIONAL` erzeugen Empfehlungen, **nie** einen FEHLT-Status.
|
|
||||||
4. Kein Prozent-Compliance-Score, wenn alle gesetzlichen Anforderungen erfüllt sind.
|
|
||||||
5. Speicherung in `generation_metadata` (jsonb) — keine Schema-Migration.
|
|
||||||
|
|
||||||
## 10. Rollout (nach diesem Freeze)
|
|
||||||
|
|
||||||
1. **10–15** der schlimmsten überladenen DSE-Controls tiern (nicht alle 49 auf einmal).
|
|
||||||
2. 3-Status-Logik in die Live-DSE-Engine verdrahten (heute nur Mess-Harness).
|
|
||||||
3. Benchmark erneut: FP / FN / Precision / Recall + Status-Verteilung.
|
|
||||||
4. Erst bei stabilem Effekt: Rollout auf alle 49 überladenen Controls.
|
|
||||||
Reference in New Issue
Block a user