// breakpilot-compliance/ai-compliance-sdk/internal/usecase/compiler.go

package usecase

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/text/cases"
	"golang.org/x/text/language"
)

// Compiler turns Master Controls into audit questionnaires.
type Compiler struct {
	// store is queried for Master Controls and their doc_check questions.
	store  *Store
	// llmGen generates questions for Master Controls that have no doc_check
	// questions. It may be nil, in which case the LLM step is skipped.
	llmGen *LLMQuestionGenerator
}

// NewCompiler wires a Compiler to the given store. llmGen is optional: pass
// nil to compile without LLM-generated questions.
func NewCompiler(store *Store, llmGen *LLMQuestionGenerator) *Compiler {
	compiler := new(Compiler)
	compiler.store = store
	compiler.llmGen = llmGen
	return compiler
}

// Compile generates the audit questions for a template.
//
// Flow:
//  1. Fetch the MCs matching the template filters from the store. If the
//     fetch fails or yields nothing, the template's hardcoded questions are
//     returned instead; an error is returned only when the template has none.
//  2. For each MC that has doc_check_controls entries, emit one question per
//     entry (Mode A, deterministic).
//  3. For the remaining MCs, ask the LLM generator if one is configured
//     (Mode B). When it returns questions, all remaining MCs count as handled.
//  4. For MCs still uncovered, derive a question from the MC name (Mode A
//     fallback). No derived question is added once the running question
//     number has passed the cap.
//  5. Append template questions whose keywords are not covered yet.
//
// Question IDs are assigned sequentially ("Q1", "Q2", ...) in emission order.
func (c *Compiler) Compile(tmpl *Template) ([]Question, error) {
	// Highest question number that may be assigned to a name-derived question.
	const maxNumbered = 50

	// 1. Fetch MCs matching the template filters.
	mcs, err := c.store.FetchMCsByFilters(tmpl.MCFilters)
	if err != nil {
		fallback := c.templateFallback(tmpl)
		if len(fallback) == 0 {
			// Nothing to fall back to: surface the failure instead of
			// reporting success with no questions at all.
			return nil, fmt.Errorf("fetching Master Controls for filters %v: %w", tmpl.MCFilters, err)
		}
		log.Printf("usecase: MC fetch failed: %v, falling back to template questions", err)
		return fallback, nil
	}

	if len(mcs) == 0 {
		// No MCs in DB for these filters → use hardcoded template questions.
		if len(tmpl.Questions) > 0 {
			return tmpl.Questions, nil
		}
		return nil, fmt.Errorf("no Master Controls found for filters %v", tmpl.MCFilters)
	}

	// 2. Look up existing doc_check_controls for the fetched MCs.
	mcIDs := make([]string, len(mcs))
	for i, mc := range mcs {
		mcIDs[i] = mc.MasterControlID
	}

	checkQuestions, err := c.store.FetchCheckQuestions(mcIDs)
	if err != nil {
		// Best effort: MCs without usable check questions are still handled
		// by the LLM / name-derived steps below, so only record the failure.
		log.Printf("usecase: doc_check_controls fetch failed: %v, continuing without them", err)
	}

	// 3. Build questions: doc_check → LLM → deterministic.
	var questions []Question
	var mcsWithoutQuestions []MCInfo
	qNum := 1

	for _, mc := range mcs {
		// Mode A: existing doc_check_controls.
		if cqs, ok := checkQuestions[mc.MasterControlID]; ok && len(cqs) > 0 {
			for _, cq := range cqs {
				questions = append(questions, Question{
					ID:           fmt.Sprintf("Q%d", qNum),
					MCID:         mc.MasterControlID,
					MCName:       mc.CanonicalName,
					Text:         cq.Question,
					QuestionType: "yes_no",
					Severity:     normalizeSeverity(cq.Severity),
					Regulation:   mc.RegSource,
					PassCriteria: splitCriteria(cq.PassCriteria),
					FailCriteria: splitCriteria(cq.FailCriteria),
				})
				qNum++
			}
			continue
		}
		mcsWithoutQuestions = append(mcsWithoutQuestions, mc)
	}

	// Mode B: LLM for MCs without doc_check_controls.
	if len(mcsWithoutQuestions) > 0 && c.llmGen != nil {
		llmQuestions, llmErr := c.llmGen.GenerateQuestions(mcsWithoutQuestions, tmpl.Regulations)
		switch {
		case llmErr != nil:
			log.Printf("usecase: LLM generation failed: %v, using deterministic fallback", llmErr)
		case len(llmQuestions) > 0:
			// Renumber so the IDs continue the running sequence.
			for i := range llmQuestions {
				llmQuestions[i].ID = fmt.Sprintf("Q%d", qNum)
				qNum++
			}
			questions = append(questions, llmQuestions...)
			mcsWithoutQuestions = nil // all handled
		}
	}

	// Mode A fallback: deterministic derivation for remaining MCs.
	for _, mc := range mcsWithoutQuestions {
		// Check the cap before appending so no derived question is added
		// when the earlier steps have already used up the numbering budget.
		if qNum > maxNumbered {
			break
		}
		questions = append(questions, Question{
			ID:           fmt.Sprintf("Q%d", qNum),
			MCID:         mc.MasterControlID,
			MCName:       mc.CanonicalName,
			Text:         deriveQuestion(mc.CanonicalName),
			QuestionType: "yes_no",
			Severity:     inferMCSeverity(mc.CanonicalName),
			Regulation:   mc.RegSource,
			PassCriteria: []string{"Anforderung erfuellt und dokumentiert"},
			FailCriteria: []string{"Nicht implementiert oder nicht nachweisbar"},
		})
		qNum++
	}

	// Merge: add template hardcoded questions that cover topics not yet covered.
	if len(tmpl.Questions) > 0 {
		questions = mergeTemplateQuestions(questions, tmpl.Questions, qNum)
	}

	if len(questions) == 0 {
		return c.templateFallback(tmpl), nil
	}

	return questions, nil
}

// templateFallback returns the template's hardcoded questions, or nil when
// the template defines none. It never reports an error itself.
func (c *Compiler) templateFallback(tmpl *Template) []Question {
	if len(tmpl.Questions) == 0 {
		return nil
	}
	return tmpl.Questions
}

// mergeTemplateQuestions appends to compiled every template question that is
// not covered yet and returns the extended slice.
//
// A template question counts as covered when at least one keyword of its text
// (as produced by extractKeywords) equals a keyword from the text of a
// compiled question or the MCName of a compiled question. Appended questions
// get the IDs "Q<nextNum>", "Q<nextNum+1>", ... in template order; they do
// not extend the covered set themselves.
func mergeTemplateQuestions(compiled, template []Question, nextNum int) []Question {
	// Topics already addressed by the MC-compiled questions.
	seen := map[string]bool{}
	for i := range compiled {
		if name := compiled[i].MCName; name != "" {
			seen[name] = true
		}
		for _, kw := range extractKeywords(compiled[i].Text) {
			seen[kw] = true
		}
	}

	// isCovered reports whether any keyword of text is a known topic.
	isCovered := func(text string) bool {
		for _, kw := range extractKeywords(text) {
			if seen[kw] {
				return true
			}
		}
		return false
	}

	for _, candidate := range template {
		if isCovered(candidate.Text) {
			continue
		}
		// candidate is a copy, so the template's own question keeps its ID.
		candidate.ID = fmt.Sprintf("Q%d", nextNum)
		nextNum++
		compiled = append(compiled, candidate)
	}

	return compiled
}

// extractKeywords pulls the significant words out of a question text so that
// questions can be compared for topic overlap.
//
// The text is lowercased and split on whitespace; each word is stripped of
// surrounding punctuation. A word is kept when it is longer than three
// characters and is not a German stopword. The result preserves the order of
// the words in text and is nil when nothing qualifies.
func extractKeywords(text string) []string {
	stopwords := map[string]bool{
		"ist": true, "hat": true, "gibt": true, "es": true, "ein": true,
		"eine": true, "der": true, "die": true, "das": true, "den": true,
		"dem": true, "des": true, "oder": true, "und": true, "fuer": true,
		"nach": true, "mit": true, "von": true, "zu": true, "auf": true,
		"in": true, "an": true, "bei": true, "werden": true, "wird": true,
		"sind": true, "nicht": true, "nur": true, "auch": true,
	}

	var keywords []string
	for _, w := range strings.Fields(strings.ToLower(text)) {
		w = strings.Trim(w, "?.,;:!\"'()")
		// Measure in runes, not bytes: a short word with an umlaut such as
		// "für" is 4 bytes long and would otherwise slip past the length
		// filter and count as a topic keyword.
		if len([]rune(w)) > 3 && !stopwords[w] {
			keywords = append(keywords, w)
		}
	}
	return keywords
}

// deriveQuestion builds the generic yes/no question for an MC name:
// underscores become spaces, the result is title-cased with the German
// language caser, and it is embedded in a fixed German question sentence.
func deriveQuestion(canonicalName string) string {
	titleCaser := cases.Title(language.German)
	spaced := strings.Join(strings.Split(canonicalName, "_"), " ")
	return fmt.Sprintf("Ist '%s' implementiert und dokumentiert?", titleCaser.String(spaced))
}

// splitCriteria splits a pipe-separated criteria string into its entries.
//
// Every entry is trimmed of surrounding whitespace and blank entries are
// dropped. It returns nil when s contains no non-blank entry (empty string,
// only whitespace, or only separators), so blank text never ends up as a
// criterion.
func splitCriteria(s string) []string {
	if s == "" {
		return nil
	}
	parts := strings.Split(s, "|")
	result := make([]string, 0, len(parts))
	for _, p := range parts {
		if p = strings.TrimSpace(p); p != "" {
			result = append(result, p)
		}
	}
	if len(result) == 0 {
		// Only whitespace and/or separators were present.
		return nil
	}
	return result
}

// normalizeSeverity maps a doc_check severity onto one of "HIGH", "MEDIUM"
// or "LOW". Matching ignores case and surrounding whitespace; "CRITICAL" is
// folded into "HIGH" and anything unrecognised becomes "MEDIUM".
func normalizeSeverity(s string) string {
	level := strings.ToUpper(strings.TrimSpace(s))
	if level == "HIGH" || level == "CRITICAL" {
		return "HIGH"
	}
	if level == "LOW" {
		return "LOW"
	}
	// "MEDIUM" itself and every unknown value share the default.
	return "MEDIUM"
}

// inferMCSeverity guesses a severity from the MC topic name: "HIGH" when the
// name contains one of the listed topic fragments (case-sensitive substring
// match), "MEDIUM" otherwise.
func inferMCSeverity(name string) string {
	switch {
	case strings.Contains(name, "encryption"),
		strings.Contains(name, "access_control"),
		strings.Contains(name, "incident"),
		strings.Contains(name, "vulnerability"),
		strings.Contains(name, "authentication"),
		strings.Contains(name, "key_management"),
		strings.Contains(name, "data_breach"),
		strings.Contains(name, "personal_data"),
		strings.Contains(name, "consent"),
		strings.Contains(name, "data_transfer"):
		return "HIGH"
	}
	return "MEDIUM"
}