breakpilot-compliance/ai-compliance-sdk/internal/usecase/compiler_llm.go

package usecase

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/breakpilot/ai-compliance-sdk/internal/llm"
)

// LLMQuestionGenerator uses an LLM to create questions from MC metadata
// when no pre-defined questions or doc_check_controls exist (Mode B).
type LLMQuestionGenerator struct {
	// registry is the provider registry the chat request is sent through.
	// GenerateQuestions returns an error when it is nil.
	registry *llm.ProviderRegistry
}

// NewLLMQuestionGenerator returns a generator that sends its chat requests
// through the given provider registry. The registry is stored as-is; a nil
// registry is accepted here and only rejected later by GenerateQuestions.
func NewLLMQuestionGenerator(registry *llm.ProviderRegistry) *LLMQuestionGenerator {
	gen := new(LLMQuestionGenerator)
	gen.registry = registry
	return gen
}

// llmQuestion is the JSON structure we expect from the LLM: one element of the
// JSON array requested by the system prompt.
type llmQuestion struct {
	// MCName is the canonical_name of the Master Control the question belongs to,
	// as echoed back by the model.
	MCName string `json:"mc_name"`
	// Question is the question text; entries with an empty question are discarded
	// during parsing.
	Question string `json:"question"`
	// PassCriteria lists the criteria for "passed"; entries without any pass
	// criteria are discarded during parsing.
	PassCriteria []string `json:"pass_criteria"`
	// FailCriteria lists the criteria for "failed"; it is not validated.
	FailCriteria []string `json:"fail_criteria"`
	// Severity is the raw severity string from the model; it is normalized before use.
	Severity string `json:"severity"`
}

// maxLLMMCs limits how many MCs we send to the LLM in one batch.
// GenerateQuestions truncates its input to this many entries; MCs beyond the
// limit are not sent and get no question.
const maxLLMMCs = 10

// GenerateQuestions generates questions for MCs using a single batched LLM call.
//
// Only the first maxLLMMCs entries of mcs are sent; the rest are silently
// dropped. The call is bounded by a 20 second timeout. Every valid question
// returned by the model yields one Question of type "yes_no"; when the model's
// mc_name can be mapped back to an MC of the batch (exact match first, then
// substring match in batch order) the question is linked to that MC via MCID,
// MCName and Regulation. Questions that cannot be mapped are still returned,
// just without those fields.
//
// It returns an error when no registry is configured, when the LLM call fails,
// or when the response contains no valid question.
func (g *LLMQuestionGenerator) GenerateQuestions(mcs []MCInfo, regulations []string) ([]Question, error) {
	if g.registry == nil {
		return nil, fmt.Errorf("no LLM provider configured")
	}

	// Limit batch size
	batch := mcs
	if len(batch) > maxLLMMCs {
		batch = batch[:maxLLMMCs]
	}

	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
	defer cancel()

	prompt := buildBatchPrompt(batch, regulations)

	resp, err := g.registry.Chat(ctx, &llm.ChatRequest{
		Messages: []llm.Message{
			{Role: "system", Content: systemPrompt},
			{Role: "user", Content: prompt},
		},
		Temperature: 0.3,
		MaxTokens:   2000,
	})
	if err != nil {
		return nil, fmt.Errorf("LLM call failed: %w", err)
	}

	parsed := parseLLMResponse(resp.Message.Content)
	if len(parsed) == 0 {
		return nil, fmt.Errorf("LLM returned no valid questions")
	}

	// Index the batch by canonical name for the exact-match fast path.
	mcByName := make(map[string]MCInfo, len(batch))
	for _, mc := range batch {
		mcByName[mc.CanonicalName] = mc
	}

	questions := make([]Question, 0, len(parsed))
	for _, lq := range parsed {
		mc, ok := mcByName[lq.MCName]
		if !ok {
			// The model sometimes decorates or shortens the name; fall back to a
			// substring match.
			mc, ok = matchMCBySubstring(batch, lq.MCName)
		}

		q := Question{
			Text:         lq.Question,
			QuestionType: "yes_no",
			Severity:     normalizeSeverity(lq.Severity),
			PassCriteria: lq.PassCriteria,
			FailCriteria: lq.FailCriteria,
		}
		if ok {
			q.MCID = mc.MasterControlID
			q.MCName = mc.CanonicalName
			q.Regulation = mc.RegSource
		}
		questions = append(questions, q)
	}

	return questions, nil
}

// matchMCBySubstring returns the first MC in batch whose canonical name
// contains name or is contained in it.
//
// Empty names on either side never match: strings.Contains reports true for an
// empty substring, which would otherwise link a question without mc_name to an
// arbitrary MC. Walking the slice (rather than a map) keeps the result
// deterministic when several MCs match.
func matchMCBySubstring(batch []MCInfo, name string) (MCInfo, bool) {
	if strings.TrimSpace(name) == "" {
		return MCInfo{}, false
	}
	for _, mc := range batch {
		if mc.CanonicalName == "" {
			continue
		}
		if strings.Contains(name, mc.CanonicalName) || strings.Contains(mc.CanonicalName, name) {
			return mc, true
		}
	}
	return MCInfo{}, false
}

// systemPrompt is the system message sent with every batched question request.
// It instructs the model (in German) to answer with nothing but a JSON array
// whose elements carry the fields mc_name, question, pass_criteria,
// fail_criteria and severity — the shape decoded into llmQuestion — and to
// produce one question per Master Control.
const systemPrompt = `Du bist ein Compliance-Experte. Generiere praezise Prueffragen fuer Compliance-Audits.

Antworte NUR mit einem JSON-Array. Jedes Element hat:
- "mc_name": Der canonical_name des Master Controls (exakt wie im Input)
- "question": Eine klare Ja/Nein-Frage auf Deutsch
- "pass_criteria": Array mit 1-2 Kriterien fuer "bestanden"
- "fail_criteria": Array mit 1-2 Kriterien fuer "nicht bestanden"
- "severity": "HIGH", "MEDIUM" oder "LOW"

Generiere 1 Frage pro Master Control. Keine Erklaerungen, nur das JSON-Array.`

// buildBatchPrompt renders the user prompt for a batched question request.
//
// The prompt starts with the comma-separated regulations, followed by one
// numbered line per MC containing its canonical name (Go-quoted via %q so the
// model can echo it back exactly), its control count and its regulation
// source, and ends with the instruction to generate one question per MC.
func buildBatchPrompt(mcs []MCInfo, regulations []string) string {
	var sb strings.Builder
	fmt.Fprintf(&sb, "Regulierungen: %s\n\nMaster Controls:\n", strings.Join(regulations, ", "))

	for i, mc := range mcs {
		fmt.Fprintf(&sb, "%d. mc_name=%q (%d Controls, Quelle: %s)\n",
			i+1, mc.CanonicalName, mc.TotalControls, mc.RegSource)
	}

	sb.WriteString("\nGeneriere je 1 Prueffrage pro Master Control.")
	return sb.String()
}

// buildPrompt renders the user prompt for a single Master Control: its
// canonical name with underscores replaced by spaces, its control count, the
// comma-separated regulations and its regulation source, followed by the
// instruction to generate 1-2 questions.
func buildPrompt(mc MCInfo, regulations []string) string {
	const promptFormat = `Master Control: "%s" (%d Atomic Controls)
Regulierungen: %s
Regulation Source: %s

Generiere 1-2 praezise Prueffragen fuer diesen Master Control.`

	displayName := strings.ReplaceAll(mc.CanonicalName, "_", " ")
	joinedRegs := strings.Join(regulations, ", ")

	return fmt.Sprintf(promptFormat, displayName, mc.TotalControls, joinedRegs, mc.RegSource)
}

// parseLLMResponse extracts the question array from a raw LLM answer.
//
// Models frequently wrap the requested JSON array in prose or markdown fences,
// and that surrounding text may itself contain brackets (e.g. "siehe [1]").
// Instead of slicing from the first '[' to the last ']', every '[' is tried as
// a candidate start and decoded with a json.Decoder, which stops after one
// complete JSON value and therefore ignores trailing text.
//
// It returns the questions of the first candidate that decodes into a question
// array and contains at least one valid entry, where valid means a non-empty
// question and at least one pass criterion. It returns nil when no such array
// is found; parse errors are not reported.
func parseLLMResponse(content string) []llmQuestion {
	content = strings.TrimSpace(content)

	for offset := 0; offset < len(content); {
		idx := strings.IndexByte(content[offset:], '[')
		if idx < 0 {
			break
		}
		start := offset + idx
		offset = start + 1 // the next attempt resumes after this bracket

		var questions []llmQuestion
		dec := json.NewDecoder(strings.NewReader(content[start:]))
		if err := dec.Decode(&questions); err != nil {
			// Not a question array (prose bracket, nested string array, ...).
			continue
		}

		// Validate
		var valid []llmQuestion
		for _, q := range questions {
			if q.Question != "" && len(q.PassCriteria) > 0 {
				valid = append(valid, q)
			}
		}
		if len(valid) > 0 {
			return valid
		}
	}
	return nil
}