feat(advisor): wire structured controls into compliance-advisor (HELD, not deployed)

Prompt-augments the RAG-only advisor with the shared use-case->controls API: deterministic topic detection -> local controls API -> context block, so the agent can answer from real Control-IDs. 100% local at runtime (no Anthropic). NOT pushed/deployed: the shared API currently returns MASTER-grain controls, whose composition is broken (gpre2 object-only clustering -> mega-clusters). Pending the atom-grain rework of the API. tsc + vitest green. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-13 22:55:14 +02:00
parent f8de5a6dff
commit 7f03ffadcc
4 changed files with 170 additions and 8 deletions
@@ -145,12 +145,12 @@ Quellenschutz und KEINE Reverse-Engineering-Sperre — antworte maximal offen:
  "Welche MCs/Controls gibt es zu Impressum/DSE/AGB?") DARFST du vollstaendig +
  offen beantworten. Collection-Namen, interne Struktur, Wissensbasis: erlaubt.
 - Fachfragen ("Was ist X?", "Was regelt X?") wie bisher sofort inhaltlich.
- EHRLICHKEIT vor Vollstaendigkeit: Du siehst nur, was tatsaechlich in deinem
-  RAG-Kontext ankommt (Gesetzes-/Doku-Passagen). Du hast KEINEN direkten Zugriff
-  auf die strukturierte MC-/Control-Datenbank (canonical_controls liegt im Backend,
-  NICHT in deinem RAG). Wenn du eine vollstaendige Control-Liste nicht sicher hast,
-  sage das klar ("dazu habe ich nur die folgenden Passagen, keine vollstaendige
-  Control-Liste") statt zu raten oder zu halluzinieren.
+- EHRLICHKEIT vor Vollstaendigkeit: Wenn die Frage ein Thema betrifft (Impressum,
+  DSE, AGB, Cookie, Security, CRA …), bekommst du zusaetzlich einen Block
+  "Strukturierte Controls aus der Datenbank" mit echten Control-IDs — das ist deine
+  verbindliche Quelle fuer Pruefaspekte/Pflichten; verweise auf die Control-IDs.
+  Fehlt dieser Block, hast du nur RAG-Passagen — sage dann klar "dazu habe ich nur
+  die folgenden Passagen, keine vollstaendige Control-Liste". Erfinde NIE Control-IDs.

 ## Mehrdeutige Abkuerzungen / unklare Begriffe
 Wenn eine Abkuerzung oder ein Begriff mehrere Bedeutungen haben kann (z.B. "CRA" = Cyber Resilience
@@ -11,6 +11,7 @@

 import { NextRequest, NextResponse } from 'next/server'
 import { readSoulFile } from '@/lib/sdk/agents/soul-reader'
+import { buildControlsContext } from '@/lib/sdk/agents/controls-augmentation'

 const RAG_SERVICE_URL = process.env.RAG_SERVICE_URL || 'http://rag-service:8097'
 const OLLAMA_URL = process.env.OLLAMA_URL || 'http://host.docker.internal:11434'
@@ -137,8 +138,12 @@ export async function POST(request: NextRequest) {
    // Validate country parameter
    const validCountry = ['DE', 'AT', 'CH', 'EU'].includes(country) ? (country as Country) : undefined

-    // 1. Query RAG across all collections
-    const ragContext = await queryMultiCollectionRAG(message, validCountry)
+    // 1. Query RAG across all collections + structured controls for the topic
+    //    (both local; the controls block lets the agent answer from real Control-IDs)
+    const [ragContext, controlsContext] = await Promise.all([
+      queryMultiCollectionRAG(message, validCountry),
+      buildControlsContext(message),
+    ])

    // 2. Build system prompt with RAG context + country
    const soulPrompt = await readSoulFile('compliance-advisor')
@@ -158,6 +163,10 @@ Der Nutzer hat "${countryLabel} (${validCountry})" gewaehlt.
      systemContent += `\n\n## Relevanter Kontext aus dem RAG-System\n\nNutze die folgenden Quellen fuer deine Antwort. Verweise in deiner Antwort auf die jeweilige Quelle:\n\n${ragContext}`
    }

+    if (controlsContext) {
+      systemContent += `\n\n${controlsContext}`
+    }
+
    systemContent += `\n\n## Aktueller SDK-Schritt\nDer Nutzer befindet sich im SDK-Schritt: ${currentStep}`

    // 3. Build messages array (limit history to last 6 messages)
@@ -0,0 +1,36 @@
+import { describe, it, expect } from 'vitest'
+
+import { detectUseCase, type UseCaseLite } from './controls-augmentation'
+
+// Shared fixture: the use-case list handed to detectUseCase in every test below.
+// Covers a single-word key, two short abbreviation keys and a multi-word key.
+const UCS: UseCaseLite[] = [
+  { key: 'impressum', label: 'Impressum (§5 TMG/DDG)', regulations: ['TMG', 'DDG'], mapped_controls: 9 },
+  { key: 'dse', label: 'Datenschutzerklärung', regulations: ['DSGVO'], mapped_controls: 4610 },
+  { key: 'network_security', label: 'Network Security', regulations: ['ISO 27001'], mapped_controls: 5095 },
+  { key: 'agb', label: 'AGB', regulations: ['BGB'], mapped_controls: 433 },
+]
+
+// Unit tests for the deterministic topic detection. Besides the per-topic
+// happy paths they pin the two documented rules that decide borderline cases:
+// the minimum score of 2 and the mapped_controls tie-break.
+describe('detectUseCase', () => {
+  it('matches Impressum by key + label', () => {
+    expect(detectUseCase('Nenne alle Controls für Impressum', UCS)?.key).toBe('impressum')
+  })
+
+  it('matches DSE via the full label word', () => {
+    expect(detectUseCase('Was gilt für die Datenschutzerklärung?', UCS)?.key).toBe('dse')
+  })
+
+  it('matches DSE via prefix (Datenschutz → Datenschutzerklärung)', () => {
+    expect(detectUseCase('Welche Datenschutz Pflichten gibt es?', UCS)?.key).toBe('dse')
+  })
+
+  it('matches network_security via the multi-word key', () => {
+    expect(detectUseCase('zeig mir network security controls', UCS)?.key).toBe('network_security')
+  })
+
+  it('matches AGB by key token', () => {
+    expect(detectUseCase('Pflichtangaben in den AGB?', UCS)?.key).toBe('agb')
+  })
+
+  it('returns null when no topic is mentioned', () => {
+    expect(detectUseCase('Wie spät ist es?', UCS)).toBeNull()
+  })
+
+  it('returns null when only a regulation is mentioned (score 1 is below the threshold)', () => {
+    // "BGB" is a regulation of the AGB use case (+1) but neither key nor label match.
+    expect(detectUseCase('Was regelt das BGB?', UCS)).toBeNull()
+  })
+
+  it('breaks a score tie in favour of the use case with more mapped controls', () => {
+    const small: UseCaseLite = { key: 'cookie_banner', label: 'Cookie Banner', mapped_controls: 5 }
+    const large: UseCaseLite = { key: 'cookie_policy', label: 'Cookie Richtlinie', mapped_controls: 50 }
+    // Both score 2 via the label word "Cookie"; the result must not depend on list order.
+    expect(detectUseCase('Was gilt für Cookies?', [small, large])?.key).toBe('cookie_policy')
+    expect(detectUseCase('Was gilt für Cookies?', [large, small])?.key).toBe('cookie_policy')
+  })
+})
@@ -0,0 +1,117 @@
+/**
+ * Controls-Augmentation für den Compliance-Advisor.
+ *
+ * Erkennt aus der Nutzerfrage das Compliance-Thema (Use-Case) und holt die dazu
+ * hinterlegten strukturierten Controls aus der geteilten Backend-API, damit der
+ * Agent aus den ECHTEN Controls antworten kann — nicht nur aus RAG-Gesetzestext.
+ *
+ * Alles lokal: deterministische Erkennung + lokale Postgres-API + lokales Ollama.
+ * KEIN externer LLM-Aufruf zur Laufzeit.
+ */
+
+// Base URL that getJson prepends to every request path. `||` (not `??`) is used,
+// so an unset OR empty BACKEND_API_URL falls back to the default address.
+const BACKEND_URL = process.env.BACKEND_API_URL || 'http://backend-compliance:8002'
+
+/** Minimal shape of one entry of the backend use-case list, as consumed by this module. */
+export interface UseCaseLite {
+  // Identifier; matched against the message as a phrase and used as URL path segment.
+  key: string
+  // Display name; its words are matched against the message and it is shown in the block heading.
+  label: string
+  // Source regulations; each one found in the message adds to the detection score.
+  regulations?: string[]
+  // Number of controls mapped to this use case; decides score ties in detectUseCase.
+  mapped_controls?: number
+}
+
+/** Minimal shape of one control entry in the controls response. */
+interface ControlLite {
+  // Rendered in square brackets at the start of each list line.
+  master_control_id: string
+  // Rendered right after the ID.
+  title: string
+  // Appended as "Quelle: …" when present and non-empty.
+  primary_regulation?: string | null
+  // Not read anywhere in this module.
+  is_primary?: boolean
+}
+
+/** Minimal shape of the response of the per-use-case controls endpoint. */
+interface ControlsResponse {
+  // Overall number of controls for the use case; the returned list length is used when absent.
+  total?: number
+  // The (possibly truncated) list of controls; the request asks for at most 15.
+  controls?: ControlLite[]
+}
+
+/**
+ * Normalises text for matching: composes it to Unicode NFC, lower-cases it and
+ * replaces every run of characters other than a-z, 0-9, ä, ö, ü, ß and space
+ * with a single space.
+ *
+ * NFC comes first because a decomposed umlaut ("a" + U+0308, as some keyboards
+ * and copy/paste sources produce) would otherwise lose its combining mark to the
+ * replacement and split the word in two ("datenschutzerkla rung").
+ *
+ * @param s - Raw text (user message, label, …).
+ * @returns The normalised text; whitespace runs are NOT collapsed.
+ */
+function norm(s: string): string {
+  return s.normalize('NFC').toLowerCase().replace(/[^a-z0-9äöüß ]+/g, ' ')
+}
+
+/**
+ * Deterministic topic detection (pure, hence unit-testable).
+ *
+ * The message, keys, labels and regulations are all passed through `norm()`
+ * and whitespace-collapsed before comparison, so case, punctuation and
+ * separators such as `_`, `-` or `/` do not prevent a match.
+ *
+ * Score per use case:
+ *  - +3 when the key phrase occurs in the message. Phrases shorter than five
+ *    characters (e.g. "agb") must match a whole word, otherwise they would
+ *    fire inside unrelated words ("Datenübertragbarkeit" contains "agb").
+ *  - +2 for each label word of at least four characters that equals a message
+ *    word, or shares a prefix with one when both have at least five characters.
+ *  - +1 for each regulation of at least three characters found in the message.
+ * The best score wins if it is at least 2; on a tie the use case with the
+ * larger `mapped_controls` wins.
+ *
+ * Entries lacking a string `key` or `label` are skipped instead of throwing,
+ * because the list normally comes from unvalidated backend JSON.
+ *
+ * @param message - Raw user question.
+ * @param useCases - Candidate use cases to score.
+ * @returns The best-scoring use case, or `null` when none reaches the threshold.
+ */
+export function detectUseCase(
+  message: string,
+  useCases: UseCaseLite[],
+): UseCaseLite | null {
+  // norm() turns punctuation into spaces; collapse the resulting runs so that
+  // multi-word phrases compare reliably.
+  const squash = (s: string): string => norm(s).split(/\s+/).filter(Boolean).join(' ')
+
+  const full = squash(message)
+  const padded = ` ${full} `
+  const words = new Set(full.split(' ').filter((w) => w.length >= 3))
+  let best: { uc: UseCaseLite; score: number } | null = null
+
+  for (const uc of useCases) {
+    if (!uc || typeof uc.key !== 'string' || typeof uc.label !== 'string') continue
+
+    let score = 0
+
+    const phrase = squash(uc.key)
+    // An empty phrase is contained in every string, so it must never score.
+    if (phrase) {
+      const found =
+        phrase.length < 5 ? padded.includes(` ${phrase} `) : full.includes(phrase)
+      if (found) score += 3
+    }
+
+    for (const lw of squash(uc.label).split(' ')) {
+      if (lw.length < 4) continue
+      for (const mw of words) {
+        if (mw === lw) { score += 2; break }
+        if (mw.length >= 5 && lw.length >= 5 && (mw.startsWith(lw) || lw.startsWith(mw))) {
+          score += 2
+          break
+        }
+      }
+    }
+
+    const regulations = Array.isArray(uc.regulations) ? uc.regulations : []
+    for (const r of regulations) {
+      if (typeof r !== 'string') continue
+      const reg = squash(r)
+      if (reg.length >= 3 && full.includes(reg)) score += 1
+    }
+
+    const better =
+      !best ||
+      score > best.score ||
+      (score === best.score &&
+        (uc.mapped_controls || 0) > (best.uc.mapped_controls || 0))
+    if (score >= 2 && better) best = { uc, score }
+  }
+  return best ? best.uc : null
+}
+
+/**
+ * Best-effort GET of `BACKEND_URL + path`, aborted after 8000 ms.
+ *
+ * @param path - Request path including any query string.
+ * @returns The parsed JSON body cast to `T` (not validated), or `null` when the
+ *   response status is not OK or when the request, the timeout or the JSON
+ *   parsing throws.
+ */
+async function getJson<T>(path: string): Promise<T | null> {
+  const url = `${BACKEND_URL}${path}`
+  try {
+    const response = await fetch(url, { signal: AbortSignal.timeout(8000) })
+    return response.ok ? ((await response.json()) as T) : null
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Builds the controls context block for the system prompt.
+ *
+ * Fetches the use-case list, detects the topic of `message`, then fetches up to
+ * 15 controls for that topic and renders them as a numbered list under a
+ * heading. The two backend requests run one after the other.
+ *
+ * Never rejects: returns '' when no topic is detected, the backend cannot be
+ * reached or answers with a non-OK status, the payload does not have the
+ * expected shape, or anything throws along the way. The caller can therefore
+ * await it next to other work and simply fall back to RAG-only.
+ *
+ * @param message - Raw user question.
+ * @returns The context block, or '' when there is nothing reliable to add.
+ */
+export async function buildControlsContext(message: string): Promise<string> {
+  // The backend JSON is unvalidated; only entries with a usable ID and title may
+  // reach the prompt, otherwise the agent would be handed "[undefined]" as an ID.
+  const isControl = (c: unknown): c is ControlLite => {
+    if (typeof c !== 'object' || c === null) return false
+    const { master_control_id: id, title } = c as Partial<ControlLite>
+    return typeof id === 'string' && id.trim() !== '' && typeof title === 'string'
+  }
+
+  try {
+    const useCases = await getJson<UseCaseLite[]>('/api/compliance/v1/controls/use-cases')
+    if (!Array.isArray(useCases)) return ''
+
+    const uc = detectUseCase(message, useCases)
+    if (!uc) return ''
+
+    const data = await getJson<ControlsResponse>(
+      `/api/compliance/v1/controls/use-cases/${encodeURIComponent(uc.key)}/controls?limit=15`,
+    )
+    const rawControls: unknown = data?.controls
+    const controls = (Array.isArray(rawControls) ? rawControls : []).filter(isControl)
+    if (!controls.length) return ''
+
+    // Trust the reported total only if it is a number that covers the shown list.
+    const reported: unknown = data?.total
+    const total =
+      typeof reported === 'number' && reported >= controls.length ? reported : controls.length
+
+    const lines = controls.map((c, i) => {
+      const reg = c.primary_regulation ? ` — Quelle: ${c.primary_regulation}` : ''
+      // Collapse whitespace so a multi-line title cannot break the one-line-per-control list.
+      const title = c.title.replace(/\s+/g, ' ').trim()
+      return `${i + 1}. [${c.master_control_id}] ${title}${reg}`
+    })
+
+    return `## Strukturierte Controls aus der Datenbank — Thema: ${uc.label}
+
+Die folgenden ${controls.length} von insgesamt ${total} hinterlegten Controls zu diesem Thema kommen direkt aus der Control-Datenbank (nach Relevanz sortiert). Nutze sie als verbindliche Quelle für konkrete Prüfaspekte/Pflichten und verweise auf die Control-ID. Erfinde KEINE Control-IDs; wirkt die Liste unvollständig, sage das offen.
+
+${lines.join('\n')}`
+  } catch (err: unknown) {
+    // Augmentation is optional: log and degrade instead of failing the request.
+    console.warn('[controls-augmentation] falling back to RAG-only:', err)
+    return ''
+  }
+}