feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions

View File

@@ -0,0 +1,255 @@
package policy
import (
	"context"
	"encoding/json"
	"time"

	"github.com/google/uuid"
)
// Auditor provides audit logging functionality for the policy system.
// It writes policy change entries and blocked-content entries through the
// Store and assembles audit reports from the stored entries.
type Auditor struct {
// store is the backend all audit entries are written to and read from.
store *Store
}
// NewAuditor returns an Auditor that reads and writes its log entries
// through the given store.
func NewAuditor(store *Store) *Auditor {
	auditor := new(Auditor)
	auditor.store = store
	return auditor
}
// LogChange writes one entry to the policy audit trail.
//
// oldValue and newValue are serialized with toJSON when they are non-nil; a
// nil value leaves the corresponding field of the entry unset. entityID,
// userEmail, ipAddress and userAgent are stored exactly as passed in.
// The error returned by Store.CreateAuditLog is handed back to the caller.
func (a *Auditor) LogChange(ctx context.Context, action AuditAction, entityType AuditEntityType, entityID *uuid.UUID, oldValue, newValue interface{}, userEmail, ipAddress, userAgent *string) error {
	record := PolicyAuditLog{
		Action:     action,
		EntityType: entityType,
		EntityID:   entityID,
		UserEmail:  userEmail,
		IPAddress:  ipAddress,
		UserAgent:  userAgent,
	}

	if oldValue != nil {
		record.OldValue = toJSON(oldValue)
	}
	if newValue != nil {
		record.NewValue = toJSON(newValue)
	}

	return a.store.CreateAuditLog(ctx, &record)
}
// LogBlocked writes one entry to the blocked content log.
//
// url, domain, reason and ruleID are stored as given. details is serialized
// with toJSON when the map is non-nil and omitted otherwise. The error
// returned by Store.CreateBlockedContentLog is handed back to the caller.
func (a *Auditor) LogBlocked(ctx context.Context, url, domain string, reason BlockReason, ruleID *uuid.UUID, details map[string]interface{}) error {
	record := BlockedContentLog{
		URL:           url,
		Domain:        domain,
		BlockReason:   reason,
		MatchedRuleID: ruleID,
	}

	if details != nil {
		record.Details = toJSON(details)
	}

	return a.store.CreateBlockedContentLog(ctx, &record)
}
// =============================================================================
// CONVENIENCE METHODS
// =============================================================================
// LogPolicyCreated records a "create" audit entry for a source policy.
// The policy is stored as the new value; there is no old value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogPolicyCreated(ctx context.Context, policy *SourcePolicy, userEmail *string) error {
	policyID := &policy.ID
	return a.LogChange(ctx, AuditActionCreate, AuditEntitySourcePolicy, policyID, nil, policy, userEmail, nil, nil)
}
// LogPolicyUpdated records an "update" audit entry for a source policy.
// oldPolicy and newPolicy are stored as old and new value; the entry is keyed
// by newPolicy's ID. No IP address or user agent is recorded.
func (a *Auditor) LogPolicyUpdated(ctx context.Context, oldPolicy, newPolicy *SourcePolicy, userEmail *string) error {
	policyID := &newPolicy.ID
	return a.LogChange(ctx, AuditActionUpdate, AuditEntitySourcePolicy, policyID, oldPolicy, newPolicy, userEmail, nil, nil)
}
// LogPolicyDeleted records a "delete" audit entry for a source policy.
// The policy is stored as the old value; there is no new value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogPolicyDeleted(ctx context.Context, policy *SourcePolicy, userEmail *string) error {
	policyID := &policy.ID
	return a.LogChange(ctx, AuditActionDelete, AuditEntitySourcePolicy, policyID, policy, nil, userEmail, nil, nil)
}
// LogPolicyActivated records an "activate" audit entry for a source policy.
// The policy is stored as the new value only; no IP address or user agent is
// recorded.
func (a *Auditor) LogPolicyActivated(ctx context.Context, policy *SourcePolicy, userEmail *string) error {
	policyID := &policy.ID
	return a.LogChange(ctx, AuditActionActivate, AuditEntitySourcePolicy, policyID, nil, policy, userEmail, nil, nil)
}
// LogPolicyDeactivated records a "deactivate" audit entry for a source policy.
// The policy is stored as the old value only; no IP address or user agent is
// recorded.
func (a *Auditor) LogPolicyDeactivated(ctx context.Context, policy *SourcePolicy, userEmail *string) error {
	policyID := &policy.ID
	return a.LogChange(ctx, AuditActionDeactivate, AuditEntitySourcePolicy, policyID, policy, nil, userEmail, nil, nil)
}
// LogSourceCreated records a "create" audit entry for an allowed source.
// The source is stored as the new value; there is no old value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogSourceCreated(ctx context.Context, source *AllowedSource, userEmail *string) error {
	sourceID := &source.ID
	return a.LogChange(ctx, AuditActionCreate, AuditEntityAllowedSource, sourceID, nil, source, userEmail, nil, nil)
}
// LogSourceUpdated records an "update" audit entry for an allowed source.
// oldSource and newSource are stored as old and new value; the entry is keyed
// by newSource's ID. No IP address or user agent is recorded.
func (a *Auditor) LogSourceUpdated(ctx context.Context, oldSource, newSource *AllowedSource, userEmail *string) error {
	sourceID := &newSource.ID
	return a.LogChange(ctx, AuditActionUpdate, AuditEntityAllowedSource, sourceID, oldSource, newSource, userEmail, nil, nil)
}
// LogSourceDeleted records a "delete" audit entry for an allowed source.
// The source is stored as the old value; there is no new value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogSourceDeleted(ctx context.Context, source *AllowedSource, userEmail *string) error {
	sourceID := &source.ID
	return a.LogChange(ctx, AuditActionDelete, AuditEntityAllowedSource, sourceID, source, nil, userEmail, nil, nil)
}
// LogOperationUpdated records an "update" audit entry for an operation
// permission. oldOp and newOp are stored as old and new value; the entry is
// keyed by newOp's ID. No IP address or user agent is recorded.
func (a *Auditor) LogOperationUpdated(ctx context.Context, oldOp, newOp *OperationPermission, userEmail *string) error {
	permissionID := &newOp.ID
	return a.LogChange(ctx, AuditActionUpdate, AuditEntityOperationPermission, permissionID, oldOp, newOp, userEmail, nil, nil)
}
// LogPIIRuleCreated records a "create" audit entry for a PII rule.
// The rule is stored as the new value; there is no old value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogPIIRuleCreated(ctx context.Context, rule *PIIRule, userEmail *string) error {
	ruleID := &rule.ID
	return a.LogChange(ctx, AuditActionCreate, AuditEntityPIIRule, ruleID, nil, rule, userEmail, nil, nil)
}
// LogPIIRuleUpdated records an "update" audit entry for a PII rule.
// oldRule and newRule are stored as old and new value; the entry is keyed by
// newRule's ID. No IP address or user agent is recorded.
func (a *Auditor) LogPIIRuleUpdated(ctx context.Context, oldRule, newRule *PIIRule, userEmail *string) error {
	ruleID := &newRule.ID
	return a.LogChange(ctx, AuditActionUpdate, AuditEntityPIIRule, ruleID, oldRule, newRule, userEmail, nil, nil)
}
// LogPIIRuleDeleted records a "delete" audit entry for a PII rule.
// The rule is stored as the old value; there is no new value, and no IP
// address or user agent is recorded.
func (a *Auditor) LogPIIRuleDeleted(ctx context.Context, rule *PIIRule, userEmail *string) error {
	ruleID := &rule.ID
	return a.LogChange(ctx, AuditActionDelete, AuditEntityPIIRule, ruleID, rule, nil, userEmail, nil, nil)
}
// LogContentBlocked writes a blocked-content entry whose details carry the
// patterns that matched, stored under the "matched_patterns" key.
func (a *Auditor) LogContentBlocked(ctx context.Context, url, domain string, reason BlockReason, matchedPatterns []string, ruleID *uuid.UUID) error {
	return a.LogBlocked(ctx, url, domain, reason, ruleID, map[string]interface{}{
		"matched_patterns": matchedPatterns,
	})
}
// LogPIIBlocked logs content that was blocked because PII was detected.
//
// Every match is recorded in the entry details ("pii_matches") with its rule
// name, severity and the matched text masked through maskPII, together with
// the number of matches ("match_count"). The entry's matched rule ID is the
// rule of the FIRST match, or nil when matches is empty. The block reason is
// always BlockReasonPIIDetected.
func (a *Auditor) LogPIIBlocked(ctx context.Context, url, domain string, matches []PIIMatch) error {
	matchDetails := make([]map[string]interface{}, len(matches))
	for i := range matches {
		m := &matches[i]
		matchDetails[i] = map[string]interface{}{
			"rule_name": m.RuleName,
			"severity":  m.Severity,
			"match":     maskPII(m.Match), // never write the raw PII to the log
		}
	}

	// Copy the first match's rule ID instead of taking the address of a range
	// variable: before Go 1.22 the range variable is shared across iterations,
	// so a pointer to it ends up referring to the LAST match.
	var ruleID *uuid.UUID
	if len(matches) > 0 {
		firstRuleID := matches[0].RuleID
		ruleID = &firstRuleID
	}

	details := map[string]interface{}{
		"pii_matches": matchDetails,
		"match_count": len(matches),
	}

	return a.LogBlocked(ctx, url, domain, BlockReasonPIIDetected, ruleID, details)
}
// =============================================================================
// HELPERS
// =============================================================================
// toJSON marshals v to JSON. It returns nil when v cannot be marshalled; the
// marshalling error itself is discarded.
func toJSON(v interface{}) json.RawMessage {
	if encoded, err := json.Marshal(v); err == nil {
		return encoded
	}
	return nil
}
// maskPII masks PII data for safe logging.
//
// Values of up to four characters are replaced entirely by "****". Longer
// values keep their first two and last two characters with "****" in between.
// Lengths are counted in runes, not bytes, so multi-byte UTF-8 characters
// (umlauts, non-Latin scripts) are never cut in half.
func maskPII(pii string) string {
	runes := []rune(pii)
	if len(runes) <= 4 {
		return "****"
	}
	return string(runes[:2]) + "****" + string(runes[len(runes)-2:])
}
// =============================================================================
// AUDIT REPORT GENERATION
// =============================================================================
// AuditReport represents an audit report for compliance.
// It is the value assembled and returned by Auditor.GenerateAuditReport.
type AuditReport struct {
// GeneratedAt is filled in by GenerateAuditReport when the report is built.
GeneratedAt string `json:"generated_at"`
// PeriodStart and PeriodEnd are the audit filter's FromDate and ToDate
// formatted as "2006-01-02"; each stays empty when the date is not set.
PeriodStart string `json:"period_start"`
PeriodEnd string `json:"period_end"`
// Summary holds the counts derived from PolicyChanges and BlockedContent.
Summary AuditReportSummary `json:"summary"`
// PolicyChanges are the audit log entries returned for the audit filter.
PolicyChanges []PolicyAuditLog `json:"policy_changes"`
// BlockedContent are the blocked-content entries returned for the blocked filter.
BlockedContent []BlockedContentLog `json:"blocked_content"`
// Stats is the value returned by Store.GetStats.
Stats *PolicyStats `json:"stats"`
}
// AuditReportSummary contains summary statistics for the audit report.
type AuditReportSummary struct {
// TotalPolicyChanges is the number of audit log entries included in the report.
TotalPolicyChanges int `json:"total_policy_changes"`
// TotalBlocked is the number of blocked-content entries included in the report.
TotalBlocked int `json:"total_blocked"`
// ChangesByAction counts the included audit entries per AuditAction value.
ChangesByAction map[string]int `json:"changes_by_action"`
// BlocksByReason counts the included blocked entries per BlockReason value.
BlocksByReason map[string]int `json:"blocks_by_reason"`
}
// GenerateAuditReport generates a compliance audit report.
//
// It loads the audit log entries matching filter, the blocked-content entries
// matching blockedFilter and the current store statistics, and returns them
// together with per-action and per-reason counts.
//
// GeneratedAt is the current time in UTC, formatted as RFC 3339. PeriodStart
// and PeriodEnd are taken from filter.FromDate / filter.ToDate (formatted as
// "2006-01-02") and stay empty when filter is nil or the date is not set.
//
// The summary totals count the entries actually returned by the store for the
// given filters. The first store error encountered is returned unchanged.
func (a *Auditor) GenerateAuditReport(ctx context.Context, filter *AuditLogFilter, blockedFilter *BlockedContentFilter) (*AuditReport, error) {
	auditLogs, _, err := a.store.ListAuditLogs(ctx, filter)
	if err != nil {
		return nil, err
	}

	blockedLogs, _, err := a.store.ListBlockedContent(ctx, blockedFilter)
	if err != nil {
		return nil, err
	}

	stats, err := a.store.GetStats(ctx)
	if err != nil {
		return nil, err
	}

	summary := AuditReportSummary{
		TotalPolicyChanges: len(auditLogs),
		TotalBlocked:       len(blockedLogs),
		ChangesByAction:    make(map[string]int),
		BlocksByReason:     make(map[string]int),
	}
	for i := range auditLogs {
		summary.ChangesByAction[string(auditLogs[i].Action)]++
	}
	for i := range blockedLogs {
		summary.BlocksByReason[string(blockedLogs[i].BlockReason)]++
	}

	// The filter is optional: guard against nil before reading its dates.
	var periodStart, periodEnd string
	if filter != nil {
		if filter.FromDate != nil {
			periodStart = filter.FromDate.Format("2006-01-02")
		}
		if filter.ToDate != nil {
			periodEnd = filter.ToDate.Format("2006-01-02")
		}
	}

	return &AuditReport{
		// A real timestamp; the previous value was a truncated random UUID.
		GeneratedAt:    time.Now().UTC().Format(time.RFC3339),
		PeriodStart:    periodStart,
		PeriodEnd:      periodEnd,
		Summary:        summary,
		PolicyChanges:  auditLogs,
		BlockedContent: blockedLogs,
		Stats:          stats,
	}, nil
}

View File

@@ -0,0 +1,281 @@
package policy
import (
"context"
"net/url"
"strings"
"github.com/google/uuid"
)
// Enforcer provides policy enforcement for the crawler and pipeline.
// It combines whitelist lookups, operation permission checks, PII detection
// and audit logging behind one type.
type Enforcer struct {
// store is the backend used for source, operation and matrix lookups.
store *Store
// piiDetector performs the PII scans exposed through DetectPII.
piiDetector *PIIDetector
// auditor receives the LogBlocked and LogChange calls.
auditor *Auditor
}
// NewEnforcer builds an Enforcer on top of store. The PII detector and the
// auditor it uses are created from the same store.
func NewEnforcer(store *Store) *Enforcer {
	enforcer := &Enforcer{store: store}
	enforcer.piiDetector = NewPIIDetector(store)
	enforcer.auditor = NewAuditor(store)
	return enforcer
}
// =============================================================================
// SOURCE CHECKING
// =============================================================================
// CheckSource looks up the whitelist entry for the domain of rawURL.
//
// The domain is derived with extractDomain and resolved through
// Store.GetSourceByDomain for the given bundesland. It returns the matching
// AllowedSource, or nil when the store reports no match. Errors from URL
// parsing or from the store are returned with a nil source.
func (e *Enforcer) CheckSource(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) {
	host, parseErr := extractDomain(rawURL)
	if parseErr != nil {
		return nil, parseErr
	}

	match, lookupErr := e.store.GetSourceByDomain(ctx, host, bundesland)
	if lookupErr != nil {
		return nil, lookupErr
	}
	return match, nil
}
// CheckOperation returns the permission entry of source for operation.
//
// The operations already loaded on source are searched first; when none
// matches, the permissions are fetched with Store.GetOperationsBySourceID and
// searched again. The result is a pointer to a copy of the matching entry, or
// nil (with a nil error) when the operation has no entry at all.
func (e *Enforcer) CheckOperation(ctx context.Context, source *AllowedSource, operation Operation) (*OperationPermission, error) {
	for i := range source.Operations {
		if source.Operations[i].Operation == operation {
			found := source.Operations[i]
			return &found, nil
		}
	}

	// Nothing preloaded on the source: ask the store directly.
	stored, err := e.store.GetOperationsBySourceID(ctx, source.ID)
	if err != nil {
		return nil, err
	}
	for i := range stored {
		if stored[i].Operation == operation {
			found := stored[i]
			return &found, nil
		}
	}

	return nil, nil
}
// CheckCompliance runs the whitelist check and the operation permission check
// for req.URL and req.Operation.
//
// The response is denied with BlockReasonNotWhitelisted when no source
// matches. When a source matches, its license and citation template are
// copied into the response; a missing or disallowed permission then denies
// with BlockReasonTrainingForbidden (for the training operation) or
// BlockReasonLicenseViolation (for every other operation). Otherwise the
// response is allowed and carries the permission's citation requirement.
// Lookup errors are returned with a nil response.
func (e *Enforcer) CheckCompliance(ctx context.Context, req *CheckComplianceRequest) (*CheckComplianceResponse, error) {
	source, err := e.CheckSource(ctx, req.URL, req.Bundesland)
	if err != nil {
		return nil, err
	}

	result := new(CheckComplianceResponse)
	if source == nil {
		notListed := BlockReasonNotWhitelisted
		result.BlockReason = &notListed
		return result, nil
	}

	result.Source = source
	result.License = &source.License
	result.CitationTemplate = source.CitationTemplate

	perm, err := e.CheckOperation(ctx, source, req.Operation)
	if err != nil {
		return nil, err
	}

	if perm != nil && perm.IsAllowed {
		result.IsAllowed = true
		result.RequiresCitation = perm.RequiresCitation
		return result, nil
	}

	denied := BlockReasonLicenseViolation
	if req.Operation == OperationTraining {
		denied = BlockReasonTrainingForbidden
	}
	result.BlockReason = &denied
	return result, nil
}
// =============================================================================
// PII CHECKING
// =============================================================================
// DetectPII hands text to the enforcer's PII detector and returns its result
// and error unchanged.
func (e *Enforcer) DetectPII(ctx context.Context, text string) (*PIITestResponse, error) {
	findings, err := e.piiDetector.Detect(ctx, text)
	return findings, err
}
// ShouldBlockForPII reports the ShouldBlock flag of a PII detection result.
// A nil response never blocks.
func (e *Enforcer) ShouldBlockForPII(response *PIITestResponse) bool {
	return response != nil && response.ShouldBlock
}
// =============================================================================
// LOGGING
// =============================================================================
// LogBlocked writes a blocked-content entry for rawURL through the auditor.
// The domain stored with the entry is derived from rawURL.
func (e *Enforcer) LogBlocked(ctx context.Context, rawURL string, reason BlockReason, ruleID *uuid.UUID, details map[string]interface{}) error {
	// The parse error is dropped on purpose: a URL that cannot be parsed is
	// still logged, just with an empty domain.
	host, _ := extractDomain(rawURL)
	return e.auditor.LogBlocked(ctx, rawURL, host, reason, ruleID, details)
}
// LogChange forwards a policy change to the auditor's audit trail. The entry
// is written without an IP address or user agent.
func (e *Enforcer) LogChange(ctx context.Context, action AuditAction, entityType AuditEntityType, entityID *uuid.UUID, oldValue, newValue interface{}, userEmail *string) error {
	var noIPAddress, noUserAgent *string
	return e.auditor.LogChange(ctx, action, entityType, entityID, oldValue, newValue, userEmail, noIPAddress, noUserAgent)
}
// =============================================================================
// BATCH OPERATIONS
// =============================================================================
// FilterURLs evaluates every URL in urls against the whitelist and the
// permission for operation, and returns one FilteredURL per input, in input
// order.
//
// A lookup failure is reported in the entry's Error field and does not stop
// the batch. Entries without a whitelisted source get
// BlockReasonNotWhitelisted; entries whose permission is missing or not
// allowed get BlockReasonTrainingForbidden for the training operation and
// BlockReasonLicenseViolation otherwise. The returned error is always nil.
func (e *Enforcer) FilterURLs(ctx context.Context, urls []string, bundesland *Bundesland, operation Operation) ([]FilteredURL, error) {
	filtered := make([]FilteredURL, 0, len(urls))

	for _, candidate := range urls {
		entry := FilteredURL{URL: candidate}

		source, err := e.CheckSource(ctx, candidate, bundesland)
		switch {
		case err != nil:
			entry.Error = err.Error()
		case source == nil:
			entry.BlockReason = BlockReasonNotWhitelisted
		default:
			perm, permErr := e.CheckOperation(ctx, source, operation)
			switch {
			case permErr != nil:
				entry.Error = permErr.Error()
			case perm == nil || !perm.IsAllowed:
				if operation == OperationTraining {
					entry.BlockReason = BlockReasonTrainingForbidden
				} else {
					entry.BlockReason = BlockReasonLicenseViolation
				}
			default:
				entry.IsAllowed = true
				entry.Source = source
				entry.RequiresCitation = perm.RequiresCitation
			}
		}

		filtered = append(filtered, entry)
	}

	return filtered, nil
}
// FilteredURL represents the result of filtering a single URL.
// One value is produced per input URL by Enforcer.FilterURLs.
type FilteredURL struct {
// URL is the input URL exactly as it was passed in.
URL string `json:"url"`
// IsAllowed is true only when a whitelisted source permits the operation.
IsAllowed bool `json:"is_allowed"`
// Source is the matching whitelist entry; set only for allowed URLs.
Source *AllowedSource `json:"source,omitempty"`
// BlockReason says why the URL was rejected; empty for allowed URLs and for lookup errors.
BlockReason BlockReason `json:"block_reason,omitempty"`
// RequiresCitation is copied from the operation permission of allowed URLs.
RequiresCitation bool `json:"requires_citation"`
// Error carries the message of a failed source or permission lookup.
Error string `json:"error,omitempty"`
}
// =============================================================================
// HELPERS
// =============================================================================
// extractDomain extracts the normalized domain from a URL.
//
// Accepted inputs are full URLs ("https://www.kmk.org/x"), scheme-relative
// URLs ("//www.kmk.org/x") and bare hosts with an optional port and path
// ("kmk.org:8080/x"). Surrounding whitespace is ignored.
//
// The returned host has no port, is lower-cased (host names are
// case-insensitive, so "WWW.KMK.ORG" and "kmk.org" must compare equal), and
// has a trailing root dot and a leading "www." removed. An error is returned
// only when the URL cannot be parsed; the domain is then empty.
func extractDomain(rawURL string) (string, error) {
	rawURL = strings.TrimSpace(rawURL)

	// Give url.Parse a scheme so that it recognizes the host part.
	switch {
	case strings.HasPrefix(rawURL, "//"):
		rawURL = "https:" + rawURL
	case !strings.Contains(rawURL, "://"):
		rawURL = "https://" + rawURL
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	host := strings.ToLower(parsed.Hostname())
	host = strings.TrimSuffix(host, ".")
	host = strings.TrimPrefix(host, "www.")

	return host, nil
}
// IsTrainingAllowed reports whether any source in the operations matrix has
// the training operation marked as allowed.
//
// This is a safeguard: training is meant never to be allowed, so a true
// result signals a misconfigured permission. Nothing is logged here; acting
// on the result is left to the caller. A store error yields (false, err).
func (e *Enforcer) IsTrainingAllowed(ctx context.Context) (bool, error) {
	matrix, err := e.store.GetOperationsMatrix(ctx)
	if err != nil {
		return false, err
	}

	for i := range matrix {
		perms := matrix[i].Operations
		for j := range perms {
			if perms[j].Operation == OperationTraining && perms[j].IsAllowed {
				return true, nil
			}
		}
	}

	return false, nil
}
// GetSourceByURL is an alias for CheckSource: it returns the whitelist entry
// for the domain of rawURL, or nil when there is none.
func (e *Enforcer) GetSourceByURL(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) {
	source, err := e.CheckSource(ctx, rawURL, bundesland)
	return source, err
}
// GetCitationForURL generates a citation for a URL.
//
// It returns an empty string when the URL has no whitelisted source, or when
// the source lookup fails (the lookup error is returned as well). Sources
// without a citation template get the default format
// "Quelle: <source name>, <title>, <date>".
//
// Otherwise the placeholders {title}, {date}, {url}, {domain} and {source} in
// the source's template are replaced. All placeholders are substituted in a
// single pass over the template, so placeholder-like text inside a
// substituted value (for example a title containing "{url}") is kept
// literally instead of being expanded again.
func (e *Enforcer) GetCitationForURL(ctx context.Context, rawURL string, bundesland *Bundesland, title string, date string) (string, error) {
	source, err := e.CheckSource(ctx, rawURL, bundesland)
	if err != nil || source == nil {
		return "", err
	}

	if source.CitationTemplate == nil || *source.CitationTemplate == "" {
		// Default citation format
		return "Quelle: " + source.Name + ", " + title + ", " + date, nil
	}

	placeholders := strings.NewReplacer(
		"{title}", title,
		"{date}", date,
		"{url}", rawURL,
		"{domain}", source.Domain,
		"{source}", source.Name,
	)
	return placeholders.Replace(*source.CitationTemplate), nil
}

View File

@@ -0,0 +1,255 @@
package policy
import (
"context"
"fmt"
"os"
"gopkg.in/yaml.v3"
)
// Loader handles loading policy configuration from YAML files.
// It can also seed the store with built-in defaults (see LoadDefaults).
type Loader struct {
// store receives the parsed configuration and the default records.
store *Store
}
// NewLoader returns a Loader that writes into the given store.
func NewLoader(store *Store) *Loader {
	loader := new(Loader)
	loader.store = store
	return loader
}
// LoadFromFile reads the YAML file at path, parses it with ParseYAML and
// hands the resulting configuration to Store.LoadFromYAML.
// Read and parse failures are wrapped; the store's error is returned as is.
func (l *Loader) LoadFromFile(ctx context.Context, path string) error {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return fmt.Errorf("failed to read YAML file: %w", readErr)
	}

	parsed, parseErr := ParseYAML(raw)
	if parseErr != nil {
		return fmt.Errorf("failed to parse YAML: %w", parseErr)
	}

	return l.store.LoadFromYAML(ctx, parsed)
}
// ParseYAML parses YAML configuration data into a BundeslaenderConfig.
//
// The document is decoded into a generic map first because the per-state
// sections sit at the top level, next to the fixed keys. Recognized top-level
// keys are "federal", "default_operations", "pii_rules" and the sixteen
// two-letter state codes. Keys that are missing or whose value has an
// unexpected shape are skipped silently; only a YAML syntax error is returned.
func ParseYAML(data []byte) (*BundeslaenderConfig, error) {
	var root map[string]interface{}
	if err := yaml.Unmarshal(data, &root); err != nil {
		return nil, fmt.Errorf("failed to parse YAML: %w", err)
	}

	cfg := &BundeslaenderConfig{
		Bundeslaender: make(map[string]PolicyConfig),
	}

	// Federal policy.
	if section, ok := root["federal"].(map[string]interface{}); ok {
		cfg.Federal = parsePolicyConfig(section)
	}

	// Default operation permissions.
	if section, ok := root["default_operations"].(map[string]interface{}); ok {
		cfg.DefaultOperations = parseOperationsConfig(section)
	}

	// PII rules: a list of maps; non-map items are ignored.
	if items, ok := root["pii_rules"].([]interface{}); ok {
		for _, item := range items {
			fields, isMap := item.(map[string]interface{})
			if !isMap {
				continue
			}
			cfg.PIIRules = append(cfg.PIIRules, parsePIIRuleConfig(fields))
		}
	}

	// Per-state policies, keyed by two-letter code.
	stateCodes := [...]string{
		"BW", "BY", "BE", "BB", "HB", "HH", "HE", "MV",
		"NI", "NW", "RP", "SL", "SN", "ST", "SH", "TH",
	}
	for _, code := range stateCodes {
		if section, ok := root[code].(map[string]interface{}); ok {
			cfg.Bundeslaender[code] = parsePolicyConfig(section)
		}
	}

	return cfg, nil
}
// parsePolicyConfig builds a PolicyConfig from a decoded YAML map.
// It reads the string key "name" and the list key "sources"; list items that
// are not maps, and keys of any other type, are ignored.
func parsePolicyConfig(m map[string]interface{}) PolicyConfig {
	var cfg PolicyConfig

	if name, ok := m["name"].(string); ok {
		cfg.Name = name
	}

	items, _ := m["sources"].([]interface{})
	for _, item := range items {
		fields, isMap := item.(map[string]interface{})
		if !isMap {
			continue
		}
		cfg.Sources = append(cfg.Sources, parseSourceConfig(fields))
	}

	return cfg
}
// parseSourceConfig builds a SourceConfig from a decoded YAML map.
//
// String keys read: "domain", "name", "license", "legal_basis" and
// "citation_template". Keys that are missing or not strings leave the field
// empty.
//
// "trust_boost" defaults to 0.5. It is accepted both as a YAML float
// (trust_boost: 0.9) and as a YAML integer (trust_boost: 1): the YAML decoder
// produces an integer type for the latter, which a plain float64 assertion
// would reject and silently replace with the default.
func parseSourceConfig(m map[string]interface{}) SourceConfig {
	sc := SourceConfig{
		TrustBoost: 0.5, // Default
	}

	if domain, ok := m["domain"].(string); ok {
		sc.Domain = domain
	}
	if name, ok := m["name"].(string); ok {
		sc.Name = name
	}
	if license, ok := m["license"].(string); ok {
		sc.License = license
	}
	if legalBasis, ok := m["legal_basis"].(string); ok {
		sc.LegalBasis = legalBasis
	}
	if citation, ok := m["citation_template"].(string); ok {
		sc.CitationTemplate = citation
	}

	switch boost := m["trust_boost"].(type) {
	case float64:
		sc.TrustBoost = boost
	case int:
		sc.TrustBoost = float64(boost)
	case int64:
		sc.TrustBoost = float64(boost)
	case uint64:
		sc.TrustBoost = float64(boost)
	}

	return sc
}
// parseOperationsConfig builds an OperationsConfig from a decoded YAML map.
// The sub-maps "lookup", "rag", "training" and "export" are each parsed with
// parseOperationConfig; keys that are missing or not maps are left at their
// zero value.
func parseOperationsConfig(m map[string]interface{}) OperationsConfig {
	section := func(key string) (map[string]interface{}, bool) {
		sub, ok := m[key].(map[string]interface{})
		return sub, ok
	}

	var cfg OperationsConfig
	if sub, ok := section("lookup"); ok {
		cfg.Lookup = parseOperationConfig(sub)
	}
	if sub, ok := section("rag"); ok {
		cfg.RAG = parseOperationConfig(sub)
	}
	if sub, ok := section("training"); ok {
		cfg.Training = parseOperationConfig(sub)
	}
	if sub, ok := section("export"); ok {
		cfg.Export = parseOperationConfig(sub)
	}
	return cfg
}
// parseOperationConfig builds an OperationConfig from a decoded YAML map.
// It reads the boolean keys "allowed" and "requires_citation"; a key that is
// missing or not a boolean leaves the field at its zero value.
func parseOperationConfig(m map[string]interface{}) OperationConfig {
	var cfg OperationConfig

	allowed, hasAllowed := m["allowed"].(bool)
	if hasAllowed {
		cfg.Allowed = allowed
	}

	citation, hasCitation := m["requires_citation"].(bool)
	if hasCitation {
		cfg.RequiresCitation = citation
	}

	return cfg
}
// parsePIIRuleConfig builds a PIIRuleConfig from a decoded YAML map.
// It reads the string keys "name", "type", "pattern" and "severity".
// Severity defaults to "block"; the other fields stay empty when their key is
// missing or not a string.
func parsePIIRuleConfig(m map[string]interface{}) PIIRuleConfig {
	text := func(key string) (string, bool) {
		value, ok := m[key].(string)
		return value, ok
	}

	rule := PIIRuleConfig{Severity: "block"}
	if value, ok := text("name"); ok {
		rule.Name = value
	}
	if value, ok := text("type"); ok {
		rule.Type = value
	}
	if value, ok := text("pattern"); ok {
		rule.Pattern = value
	}
	if value, ok := text("severity"); ok {
		rule.Severity = value
	}
	return rule
}
// LoadDefaults seeds the store with a minimal data set, for tests or for
// installations without a YAML file.
//
// It creates, in this order: the federal policy "KMK & Bundesebene", the
// kmk.org source attached to that policy, and one PII rule per entry of
// DefaultPIIRules. The first failing step aborts the seeding and its wrapped
// error is returned; records created before the failure are not removed here.
func (l *Loader) LoadDefaults(ctx context.Context) error {
	federal, err := l.store.CreatePolicy(ctx, &CreateSourcePolicyRequest{
		Name: "KMK & Bundesebene",
	})
	if err != nil {
		return fmt.Errorf("failed to create federal policy: %w", err)
	}

	var (
		kmkTrust    = 0.95
		kmkBasis    = "Amtliche Werke (§5 UrhG)"
		kmkCitation = "Quelle: KMK, {title}, {date}"
	)
	kmkSource := &CreateAllowedSourceRequest{
		PolicyID:         federal.ID,
		Domain:           "kmk.org",
		Name:             "Kultusministerkonferenz",
		License:          LicenseParagraph5,
		LegalBasis:       &kmkBasis,
		CitationTemplate: &kmkCitation,
		TrustBoost:       &kmkTrust,
	}
	if _, err := l.store.CreateSource(ctx, kmkSource); err != nil {
		return fmt.Errorf("failed to create KMK source: %w", err)
	}

	for _, rule := range DefaultPIIRules() {
		request := &CreatePIIRuleRequest{
			Name:     rule.Name,
			RuleType: PIIRuleType(rule.Type),
			Pattern:  rule.Pattern,
			Severity: PIISeverity(rule.Severity),
		}
		if _, err := l.store.CreatePIIRule(ctx, request); err != nil {
			return fmt.Errorf("failed to create PII rule %s: %w", rule.Name, err)
		}
	}

	return nil
}
// HasData reports whether the store already contains at least one policy.
// It asks the store for a single policy (Limit: 1) and checks whether
// anything came back.
func (l *Loader) HasData(ctx context.Context) (bool, error) {
	firstPage, _, err := l.store.ListPolicies(ctx, &PolicyListFilter{Limit: 1})
	if err != nil {
		return false, err
	}
	return len(firstPage) != 0, nil
}
// LoadIfEmpty populates the store only when it holds no policies yet.
//
// With existing data it does nothing. Otherwise it loads the YAML file at
// path, or falls back to LoadDefaults when that file does not exist.
func (l *Loader) LoadIfEmpty(ctx context.Context, path string) error {
	populated, err := l.HasData(ctx)
	switch {
	case err != nil:
		return err
	case populated:
		// Already has data, skip loading.
		return nil
	}

	_, statErr := os.Stat(path)
	if os.IsNotExist(statErr) {
		// No configuration file: seed the built-in defaults instead.
		return l.LoadDefaults(ctx)
	}

	return l.LoadFromFile(ctx, path)
}

View File

@@ -0,0 +1,445 @@
// Package policy provides whitelist-based data source management for the edu-search-service.
// It implements source policies, operation permissions, PII detection, and audit logging
// for compliance with German data protection regulations.
package policy
import (
"encoding/json"
"time"
"github.com/google/uuid"
)
// =============================================================================
// ENUMS AND CONSTANTS
// =============================================================================
// Bundesland is the two-letter code of a German federal state.
type Bundesland string

// Codes of the sixteen German federal states.
const (
	BundeslandBW Bundesland = "BW" // Baden-Wuerttemberg
	BundeslandBY Bundesland = "BY" // Bayern
	BundeslandBE Bundesland = "BE" // Berlin
	BundeslandBB Bundesland = "BB" // Brandenburg
	BundeslandHB Bundesland = "HB" // Bremen
	BundeslandHH Bundesland = "HH" // Hamburg
	BundeslandHE Bundesland = "HE" // Hessen
	BundeslandMV Bundesland = "MV" // Mecklenburg-Vorpommern
	BundeslandNI Bundesland = "NI" // Niedersachsen
	BundeslandNW Bundesland = "NW" // Nordrhein-Westfalen
	BundeslandRP Bundesland = "RP" // Rheinland-Pfalz
	BundeslandSL Bundesland = "SL" // Saarland
	BundeslandSN Bundesland = "SN" // Sachsen
	BundeslandST Bundesland = "ST" // Sachsen-Anhalt
	BundeslandSH Bundesland = "SH" // Schleswig-Holstein
	BundeslandTH Bundesland = "TH" // Thueringen
)

// ValidBundeslaender lists every federal state code, in declaration order.
var ValidBundeslaender = []Bundesland{
	BundeslandBW,
	BundeslandBY,
	BundeslandBE,
	BundeslandBB,
	BundeslandHB,
	BundeslandHH,
	BundeslandHE,
	BundeslandMV,
	BundeslandNI,
	BundeslandNW,
	BundeslandRP,
	BundeslandSL,
	BundeslandSN,
	BundeslandST,
	BundeslandSH,
	BundeslandTH,
}
// License represents allowed license types for data sources.
// The constant values are the license identifiers stored with each source.
type License string
const (
LicenseDLDEBY20 License = "DL-DE-BY-2.0" // Data licence Germany, attribution (Datenlizenz Deutschland - Namensnennung)
LicenseCCBY License = "CC-BY" // Creative Commons Attribution
LicenseCCBYSA License = "CC-BY-SA" // Creative Commons Attribution-ShareAlike
LicenseCCBYNC License = "CC-BY-NC" // Creative Commons Attribution-NonCommercial
LicenseCCBYNCSA License = "CC-BY-NC-SA" // Creative Commons Attribution-NonCommercial-ShareAlike
LicenseCC0 License = "CC0" // Public Domain
LicenseParagraph5 License = "§5 UrhG" // Official works (Amtliche Werke), German Copyright Act
LicenseCustom License = "Custom" // Custom license (requires legal basis)
)
// Operation names a kind of processing that can be applied to data from a
// source. Permissions are granted or denied per source and operation.
type Operation string

// Known operations.
const (
	OperationLookup   Operation = "lookup"   // Display/Search
	OperationRAG      Operation = "rag"      // RAG (Retrieval-Augmented Generation)
	OperationTraining Operation = "training" // Model training (forbidden by default)
	OperationExport   Operation = "export"   // Data Export
)

// ValidOperations lists every known operation, in declaration order.
var ValidOperations = []Operation{OperationLookup, OperationRAG, OperationTraining, OperationExport}
// PIIRuleType represents the type of PII detection rule.
// It says how a rule's Pattern is to be interpreted.
type PIIRuleType string
const (
PIIRuleTypeRegex PIIRuleType = "regex" // Regular expression pattern
PIIRuleTypeKeyword PIIRuleType = "keyword" // Keyword matching
)
// PIISeverity represents the severity level of a PII match.
// "block" is the severity the YAML loader assigns to rules that do not
// specify one.
type PIISeverity string
const (
PIISeverityBlock PIISeverity = "block" // Block content completely
PIISeverityWarn PIISeverity = "warn" // Warn but allow
PIISeverityRedact PIISeverity = "redact" // Redact matched content
)
// AuditAction represents the type of action logged in the audit trail.
// The value is stored in PolicyAuditLog.Action.
type AuditAction string
const (
// Entity lifecycle actions.
AuditActionCreate AuditAction = "create"
AuditActionUpdate AuditAction = "update"
AuditActionDelete AuditAction = "delete"
// Policy state actions.
AuditActionActivate AuditAction = "activate"
AuditActionDeactivate AuditAction = "deactivate"
// NOTE(review): presumably logged when a policy is approved (see
// SourcePolicy.ApprovedBy); no caller is visible in this file.
AuditActionApprove AuditAction = "approve"
)
// AuditEntityType represents the type of entity being audited.
// The value is stored in PolicyAuditLog.EntityType; there is one constant
// per core model.
type AuditEntityType string
const (
AuditEntitySourcePolicy AuditEntityType = "source_policy"
AuditEntityAllowedSource AuditEntityType = "allowed_source"
AuditEntityOperationPermission AuditEntityType = "operation_permission"
AuditEntityPIIRule AuditEntityType = "pii_rule"
)
// BlockReason represents the reason why content was blocked.
// The value is stored in BlockedContentLog.BlockReason.
type BlockReason string
const (
// No whitelisted source matches the URL's domain.
BlockReasonNotWhitelisted BlockReason = "not_whitelisted"
// PII was detected in the content.
BlockReasonPIIDetected BlockReason = "pii_detected"
// The training operation was requested but is not permitted for the source.
BlockReasonTrainingForbidden BlockReason = "training_forbidden"
// An operation other than training is not permitted for the source.
BlockReasonLicenseViolation BlockReason = "license_violation"
// NOTE(review): presumably set for manually blocked content; not used in this file.
BlockReasonManualBlock BlockReason = "manual_block"
)
// =============================================================================
// CORE MODELS
// =============================================================================
// SourcePolicy represents a versioned policy for data source management.
// Policies can be scoped to a specific Bundesland or apply federally (bundesland = nil).
type SourcePolicy struct {
ID uuid.UUID `json:"id" db:"id"`
Version int `json:"version" db:"version"`
Name string `json:"name" db:"name"`
Description *string `json:"description,omitempty" db:"description"`
// Bundesland is nil for a federal policy.
Bundesland *Bundesland `json:"bundesland,omitempty" db:"bundesland"`
IsActive bool `json:"is_active" db:"is_active"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
// ApprovedBy and ApprovedAt are optional approval metadata; both may be nil.
ApprovedBy *uuid.UUID `json:"approved_by,omitempty" db:"approved_by"`
ApprovedAt *time.Time `json:"approved_at,omitempty" db:"approved_at"`
// Joined fields (populated by queries)
Sources []AllowedSource `json:"sources,omitempty"`
}
// AllowedSource represents a whitelisted data source with license information.
// Each source belongs to exactly one SourcePolicy (PolicyID).
type AllowedSource struct {
ID uuid.UUID `json:"id" db:"id"`
PolicyID uuid.UUID `json:"policy_id" db:"policy_id"`
// Domain is the host name that URLs are matched against.
Domain string `json:"domain" db:"domain"`
Name string `json:"name" db:"name"`
Description *string `json:"description,omitempty" db:"description"`
License License `json:"license" db:"license"`
LegalBasis *string `json:"legal_basis,omitempty" db:"legal_basis"`
// CitationTemplate may contain the placeholders {title}, {date}, {url},
// {domain} and {source}; nil or empty means the default citation format.
CitationTemplate *string `json:"citation_template,omitempty" db:"citation_template"`
TrustBoost float64 `json:"trust_boost" db:"trust_boost"`
IsActive bool `json:"is_active" db:"is_active"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
// Joined fields (populated by queries)
Operations []OperationPermission `json:"operations,omitempty"`
PolicyName *string `json:"policy_name,omitempty"`
}
// OperationPermission represents the permission matrix for a specific source.
// One value describes a single (source, operation) pair.
type OperationPermission struct {
ID uuid.UUID `json:"id" db:"id"`
SourceID uuid.UUID `json:"source_id" db:"source_id"`
Operation Operation `json:"operation" db:"operation"`
// IsAllowed decides whether Operation may be performed on the source.
IsAllowed bool `json:"is_allowed" db:"is_allowed"`
// RequiresCitation is passed on to callers when the operation is allowed.
RequiresCitation bool `json:"requires_citation" db:"requires_citation"`
Notes *string `json:"notes,omitempty" db:"notes"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
}
// PIIRule represents a rule for detecting personally identifiable information.
type PIIRule struct {
ID uuid.UUID `json:"id" db:"id"`
Name string `json:"name" db:"name"`
Description *string `json:"description,omitempty" db:"description"`
// RuleType says how Pattern is interpreted (regex or keyword).
RuleType PIIRuleType `json:"rule_type" db:"rule_type"`
Pattern string `json:"pattern" db:"pattern"`
// Severity is the action attached to a match (block, warn or redact).
Severity PIISeverity `json:"severity" db:"severity"`
IsActive bool `json:"is_active" db:"is_active"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
}
// =============================================================================
// AUDIT AND LOGGING MODELS
// =============================================================================
// PolicyAuditLog represents an immutable audit log entry for policy changes.
type PolicyAuditLog struct {
ID uuid.UUID `json:"id" db:"id"`
Action AuditAction `json:"action" db:"action"`
EntityType AuditEntityType `json:"entity_type" db:"entity_type"`
EntityID *uuid.UUID `json:"entity_id,omitempty" db:"entity_id"`
// OldValue and NewValue hold the JSON form of the entity before and after
// the change; either may be empty (e.g. no old value on create).
OldValue json.RawMessage `json:"old_value,omitempty" db:"old_value"`
NewValue json.RawMessage `json:"new_value,omitempty" db:"new_value"`
// The remaining user/request fields are optional and may be nil.
UserID *uuid.UUID `json:"user_id,omitempty" db:"user_id"`
UserEmail *string `json:"user_email,omitempty" db:"user_email"`
IPAddress *string `json:"ip_address,omitempty" db:"ip_address"`
UserAgent *string `json:"user_agent,omitempty" db:"user_agent"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
}
// BlockedContentLog represents a log entry for blocked URLs.
// It stores the URL together with its Domain and the BlockReason.
type BlockedContentLog struct {
ID uuid.UUID `json:"id" db:"id"`
URL string `json:"url" db:"url"`
Domain string `json:"domain" db:"domain"`
BlockReason BlockReason `json:"block_reason" db:"block_reason"`
// MatchedRuleID is optional and omitted from JSON when nil.
// NOTE(review): presumably the rule that triggered the block — confirm
// against the code that writes these entries.
MatchedRuleID *uuid.UUID `json:"matched_rule_id,omitempty" db:"matched_rule_id"`
// Details is free-form, pre-encoded JSON; omitted when empty.
Details json.RawMessage `json:"details,omitempty" db:"details"`
CreatedAt time.Time `json:"created_at" db:"created_at"`
}
// =============================================================================
// REQUEST/RESPONSE MODELS
// =============================================================================
// CreateSourcePolicyRequest represents a request to create a new policy.
// Name carries a binding:"required" tag (presumably enforced by the HTTP
// binding layer — confirm in the handler); the pointer fields are optional.
type CreateSourcePolicyRequest struct {
Name string `json:"name" binding:"required"`
Description *string `json:"description"`
Bundesland *Bundesland `json:"bundesland"`
}
// UpdateSourcePolicyRequest represents a request to update a policy.
// Every field is a pointer, so an absent JSON key decodes to nil and can be
// told apart from an explicit zero value.
// NOTE(review): nil presumably means "leave unchanged" — confirm in the store.
type UpdateSourcePolicyRequest struct {
Name *string `json:"name"`
Description *string `json:"description"`
Bundesland *Bundesland `json:"bundesland"`
IsActive *bool `json:"is_active"`
}
// CreateAllowedSourceRequest represents a request to create a new allowed source.
// PolicyID, Domain, Name and License carry binding:"required" tags (presumably
// enforced by the HTTP binding layer — confirm in the handler); the pointer
// fields are optional and decode to nil when the key is absent.
type CreateAllowedSourceRequest struct {
PolicyID uuid.UUID `json:"policy_id" binding:"required"`
Domain string `json:"domain" binding:"required"`
Name string `json:"name" binding:"required"`
Description *string `json:"description"`
License License `json:"license" binding:"required"`
LegalBasis *string `json:"legal_basis"`
CitationTemplate *string `json:"citation_template"`
TrustBoost *float64 `json:"trust_boost"`
}
// UpdateAllowedSourceRequest represents a request to update an allowed source.
// Every field is a pointer, so an absent JSON key decodes to nil and can be
// told apart from an explicit zero value.
// NOTE(review): nil presumably means "leave unchanged" — confirm in the store.
type UpdateAllowedSourceRequest struct {
Domain *string `json:"domain"`
Name *string `json:"name"`
Description *string `json:"description"`
License *License `json:"license"`
LegalBasis *string `json:"legal_basis"`
CitationTemplate *string `json:"citation_template"`
TrustBoost *float64 `json:"trust_boost"`
IsActive *bool `json:"is_active"`
}
// UpdateOperationPermissionRequest represents a request to update operation permissions.
// All fields are pointers, so a key missing from the JSON body decodes to nil
// rather than to false or the empty string.
type UpdateOperationPermissionRequest struct {
IsAllowed *bool `json:"is_allowed"`
RequiresCitation *bool `json:"requires_citation"`
Notes *string `json:"notes"`
}
// CreatePIIRuleRequest represents a request to create a new PII rule.
// Name, RuleType and Pattern carry binding:"required" tags; Severity does not,
// so it may arrive as the empty PIISeverity.
// NOTE(review): whether a default severity is applied is not visible here.
type CreatePIIRuleRequest struct {
Name string `json:"name" binding:"required"`
Description *string `json:"description"`
RuleType PIIRuleType `json:"rule_type" binding:"required"`
Pattern string `json:"pattern" binding:"required"`
Severity PIISeverity `json:"severity"`
}
// UpdatePIIRuleRequest represents a request to update a PII rule.
// Every field is a pointer, so an absent JSON key decodes to nil and can be
// told apart from an explicit zero value.
// NOTE(review): PIIDetector caches compiled regexes by rule ID, so a Pattern
// change presumably needs PIIDetector.ClearCache afterwards — confirm callers.
type UpdatePIIRuleRequest struct {
Name *string `json:"name"`
Description *string `json:"description"`
RuleType *PIIRuleType `json:"rule_type"`
Pattern *string `json:"pattern"`
Severity *PIISeverity `json:"severity"`
IsActive *bool `json:"is_active"`
}
// CheckComplianceRequest represents a request to check URL compliance.
// URL and Operation carry binding:"required" tags; Bundesland is optional and
// decodes to nil when the key is absent.
type CheckComplianceRequest struct {
URL string `json:"url" binding:"required"`
Operation Operation `json:"operation" binding:"required"`
Bundesland *Bundesland `json:"bundesland"`
}
// CheckComplianceResponse represents the compliance check result.
// IsAllowed and RequiresCitation are always encoded; the pointer fields are
// omitted from the JSON output when nil.
type CheckComplianceResponse struct {
IsAllowed bool `json:"is_allowed"`
Source *AllowedSource `json:"source,omitempty"`
BlockReason *BlockReason `json:"block_reason,omitempty"`
RequiresCitation bool `json:"requires_citation"`
CitationTemplate *string `json:"citation_template,omitempty"`
License *License `json:"license,omitempty"`
}
// PIITestRequest represents a request to test PII detection.
// Text is the content to scan and carries a binding:"required" tag.
type PIITestRequest struct {
Text string `json:"text" binding:"required"`
}
// PIIMatch represents a single PII match in text.
// RuleID, RuleName, RuleType and Severity are copied from the rule that
// produced the match.
type PIIMatch struct {
RuleID uuid.UUID `json:"rule_id"`
RuleName string `json:"rule_name"`
RuleType PIIRuleType `json:"rule_type"`
Severity PIISeverity `json:"severity"`
// Match is the matched substring of the scanned text.
Match string `json:"match"`
// StartIndex and EndIndex are byte offsets into the scanned text, forming
// the half-open range [StartIndex, EndIndex).
StartIndex int `json:"start_index"`
EndIndex int `json:"end_index"`
}
// PIITestResponse represents the result of PII detection test.
// As filled in by PIIDetector.Detect: HasPII is true when at least one rule
// matched, BlockLevel is the highest severity among the matching rules (empty
// when nothing matched) and ShouldBlock is true when that level is
// PIISeverityBlock.
type PIITestResponse struct {
HasPII bool `json:"has_pii"`
// Matches has no omitempty; Detect initialises it to an empty slice.
Matches []PIIMatch `json:"matches"`
BlockLevel PIISeverity `json:"block_level"`
ShouldBlock bool `json:"should_block"`
}
// =============================================================================
// LIST/FILTER MODELS
// =============================================================================
// PolicyListFilter represents filters for listing policies.
// Fields are populated through their form tags; nil pointer fields mean the
// corresponding criterion was not supplied.
// NOTE(review): how a zero Limit is interpreted is decided by the store —
// confirm there.
type PolicyListFilter struct {
Bundesland *Bundesland `form:"bundesland"`
IsActive *bool `form:"is_active"`
Limit int `form:"limit"`
Offset int `form:"offset"`
}
// SourceListFilter represents filters for listing sources.
// Fields are populated through their form tags; nil pointer fields mean the
// corresponding criterion was not supplied. Limit and Offset carry paging
// values.
type SourceListFilter struct {
PolicyID *uuid.UUID `form:"policy_id"`
Domain *string `form:"domain"`
License *License `form:"license"`
IsActive *bool `form:"is_active"`
Limit int `form:"limit"`
Offset int `form:"offset"`
}
// AuditLogFilter represents filters for querying audit logs.
// Fields are populated through their form tags; nil pointer fields mean the
// corresponding criterion was not supplied.
type AuditLogFilter struct {
EntityType *AuditEntityType `form:"entity_type"`
EntityID *uuid.UUID `form:"entity_id"`
Action *AuditAction `form:"action"`
UserEmail *string `form:"user_email"`
// FromDate and ToDate are bound from the "from" and "to" parameters.
// NOTE(review): whether the bounds are inclusive is decided by the store.
FromDate *time.Time `form:"from"`
ToDate *time.Time `form:"to"`
Limit int `form:"limit"`
Offset int `form:"offset"`
}
// BlockedContentFilter represents filters for querying blocked content logs.
// Fields are populated through their form tags; nil pointer fields mean the
// corresponding criterion was not supplied.
type BlockedContentFilter struct {
Domain *string `form:"domain"`
BlockReason *BlockReason `form:"block_reason"`
// FromDate and ToDate are bound from the "from" and "to" parameters.
FromDate *time.Time `form:"from"`
ToDate *time.Time `form:"to"`
Limit int `form:"limit"`
Offset int `form:"offset"`
}
// =============================================================================
// STATISTICS MODELS
// =============================================================================
// PolicyStats represents aggregated statistics for the policy system.
// All fields are always present in the JSON encoding (no omitempty).
type PolicyStats struct {
ActivePolicies int `json:"active_policies"`
TotalSources int `json:"total_sources"`
ActiveSources int `json:"active_sources"`
BlockedToday int `json:"blocked_today"`
BlockedTotal int `json:"blocked_total"`
PIIRulesActive int `json:"pii_rules_active"`
// Counters keyed by string; presumably the License and BlockReason values
// respectively — confirm against the code that fills them. A nil map
// encodes as JSON null.
SourcesByLicense map[string]int `json:"sources_by_license"`
BlocksByReason map[string]int `json:"blocks_by_reason"`
// NOTE(review): scale and formula of ComplianceScore are not visible here.
ComplianceScore float64 `json:"compliance_score"`
}
// =============================================================================
// YAML CONFIGURATION MODELS
// =============================================================================
// BundeslaenderConfig represents the YAML configuration for initial data loading.
// The keys "federal", "default_operations" and "pii_rules" map to the named
// fields; because Bundeslaender is tagged ",inline", every other top-level key
// of the document is collected into that map (e.g. "NI" in the package tests).
type BundeslaenderConfig struct {
Federal PolicyConfig `yaml:"federal"`
Bundeslaender map[string]PolicyConfig `yaml:",inline"`
DefaultOperations OperationsConfig `yaml:"default_operations"`
PIIRules []PIIRuleConfig `yaml:"pii_rules"`
}
// PolicyConfig represents a policy configuration in YAML.
// It is used both for the federal entry and for each inlined Bundesland entry
// of BundeslaenderConfig and pairs a display name with its list of sources.
type PolicyConfig struct {
Name string `yaml:"name"`
Sources []SourceConfig `yaml:"sources"`
}
// SourceConfig represents a source configuration in YAML.
// Unlike the API models, License is a plain string here and the optional
// values are non-pointer fields, so an absent key decodes to the zero value.
type SourceConfig struct {
Domain string `yaml:"domain"`
Name string `yaml:"name"`
License string `yaml:"license"`
// Optional keys; omitempty only affects marshalling back to YAML.
LegalBasis string `yaml:"legal_basis,omitempty"`
CitationTemplate string `yaml:"citation_template,omitempty"`
TrustBoost float64 `yaml:"trust_boost,omitempty"`
}
// OperationsConfig represents default operation permissions in YAML.
// It has one entry per operation key: lookup, rag, training and export.
// A key missing from the document leaves its OperationConfig at the zero
// value, i.e. not allowed and no citation required.
type OperationsConfig struct {
Lookup OperationConfig `yaml:"lookup"`
RAG OperationConfig `yaml:"rag"`
Training OperationConfig `yaml:"training"`
Export OperationConfig `yaml:"export"`
}
// OperationConfig represents a single operation permission in YAML.
// Both flags default to false when their key is absent.
type OperationConfig struct {
Allowed bool `yaml:"allowed"`
RequiresCitation bool `yaml:"requires_citation"`
}
// PIIRuleConfig represents a PII rule configuration in YAML.
// Type and Severity are plain strings here (DefaultPIIRules uses "regex" with
// "block" or "warn").
// NOTE(review): conversion to PIIRuleType / PIISeverity presumably happens in
// the loader — confirm there.
type PIIRuleConfig struct {
Name string `yaml:"name"`
Type string `yaml:"type"`
Pattern string `yaml:"pattern"`
Severity string `yaml:"severity"`
}

View File

@@ -0,0 +1,350 @@
package policy
import (
"context"
"regexp"
"strings"
"sync"
)
// PIIDetector detects personally identifiable information in text.
// It contains a mutex, so it must be used through a pointer and not copied.
type PIIDetector struct {
// store supplies the rules; Detect calls store.ListPIIRules on every run.
store *Store
// compiledRules caches compiled regular expressions, keyed by the rule ID
// rendered as a string.
compiledRules map[string]*regexp.Regexp
// rulesMu guards compiledRules.
rulesMu sync.RWMutex
}
// NewPIIDetector returns a PIIDetector that reads its rules from store and
// starts with an empty compiled-regex cache.
func NewPIIDetector(store *Store) *PIIDetector {
	detector := new(PIIDetector)
	detector.store = store
	detector.compiledRules = map[string]*regexp.Regexp{}
	return detector
}
// Detect scans text with every rule returned by store.ListPIIRules(ctx, true)
// and collects all matches.
//
// The response reports whether anything matched, the matches in rule order,
// the highest severity among the rules that matched (empty if none did) and
// whether that severity is PIISeverityBlock. An error from the store is
// returned unchanged together with a nil response.
func (d *PIIDetector) Detect(ctx context.Context, text string) (*PIITestResponse, error) {
	rules, err := d.store.ListPIIRules(ctx, true)
	if err != nil {
		return nil, err
	}

	// Start from a non-nil slice so that "no matches" encodes as [] in JSON.
	found := []PIIMatch{}
	var worst PIISeverity
	for _, rule := range rules {
		hits := d.findMatches(text, &rule)
		if len(hits) == 0 {
			continue
		}
		found = append(found, hits...)
		if compareSeverity(rule.Severity, worst) > 0 {
			worst = rule.Severity
		}
	}

	return &PIITestResponse{
		HasPII:      len(found) > 0,
		Matches:     found,
		BlockLevel:  worst,
		ShouldBlock: worst == PIISeverityBlock,
	}, nil
}
// findMatches applies a single rule to text, dispatching on the rule type.
// Rule types other than regex and keyword produce no matches (nil).
func (d *PIIDetector) findMatches(text string, rule *PIIRule) []PIIMatch {
	switch rule.RuleType {
	case PIIRuleTypeRegex:
		return d.findRegexMatches(text, rule)
	case PIIRuleTypeKeyword:
		return d.findKeywordMatches(text, rule)
	default:
		return nil
	}
}
// findRegexMatches returns one PIIMatch per non-overlapping match of the
// rule's regular expression in text. Indices are byte offsets into text.
// A pattern that does not compile yields nil, i.e. the rule is skipped
// silently.
func (d *PIIDetector) findRegexMatches(text string, rule *PIIRule) []PIIMatch {
	re := d.getCompiledRegex(rule.ID.String(), rule.Pattern)
	if re == nil {
		return nil
	}

	var found []PIIMatch
	for _, span := range re.FindAllStringIndex(text, -1) {
		begin, end := span[0], span[1]
		found = append(found, PIIMatch{
			RuleID:     rule.ID,
			RuleName:   rule.Name,
			RuleType:   rule.RuleType,
			Severity:   rule.Severity,
			Match:      text[begin:end],
			StartIndex: begin,
			EndIndex:   end,
		})
	}
	return found
}
// findKeywordMatches finds all keyword matches in text (case-insensitive).
//
// rule.Pattern is a list of keywords separated by ',' or '|'; surrounding
// whitespace is trimmed and empty entries are ignored. Matches are reported
// keyword by keyword, each keyword's occurrences from left to right and
// without overlap. Match, StartIndex and EndIndex always refer to the
// original text (byte offsets).
//
// The search runs on a lower-cased copy of text. Lower-casing can change the
// byte length of a string (some runes have a longer or shorter lower-case
// form, and invalid UTF-8 bytes become U+FFFD), so offsets found in the copy
// are translated back before slicing text; using them directly would yield
// wrong spans or an out-of-range panic.
func (d *PIIDetector) findKeywordMatches(text string, rule *PIIRule) []PIIMatch {
	var matches []PIIMatch

	lowerText, offsets := lowerWithByteOffsets(text)
	// toOrig converts a byte offset in lowerText into the matching byte
	// offset in text; a nil table means both strings line up byte for byte.
	toOrig := func(i int) int {
		if offsets == nil {
			return i
		}
		return offsets[i]
	}

	// Split pattern by commas or pipes for multiple keywords
	keywords := strings.FieldsFunc(rule.Pattern, func(r rune) bool {
		return r == ',' || r == '|'
	})

	for _, keyword := range keywords {
		lowerKeyword := strings.ToLower(strings.TrimSpace(keyword))
		if lowerKeyword == "" {
			continue
		}

		searchFrom := 0
		for {
			idx := strings.Index(lowerText[searchFrom:], lowerKeyword)
			if idx == -1 {
				break
			}
			lowStart := searchFrom + idx
			lowEnd := lowStart + len(lowerKeyword)
			start, end := toOrig(lowStart), toOrig(lowEnd)

			matches = append(matches, PIIMatch{
				RuleID:     rule.ID,
				RuleName:   rule.Name,
				RuleType:   rule.RuleType,
				Severity:   rule.Severity,
				Match:      text[start:end],
				StartIndex: start,
				EndIndex:   end,
			})
			searchFrom = lowEnd
		}
	}

	return matches
}

// lowerWithByteOffsets returns strings.ToLower(text) together with a table
// that maps every byte offset of the lowered string (plus one final entry for
// its length) to the byte offset in text of the rune it was produced from.
// For pure-ASCII text the two strings have identical layout and the table is
// nil.
func lowerWithByteOffsets(text string) (string, []int) {
	ascii := true
	for i := 0; i < len(text); i++ {
		if text[i] >= 0x80 {
			ascii = false
			break
		}
	}
	if ascii {
		return strings.ToLower(text), nil
	}

	var lowered strings.Builder
	lowered.Grow(len(text))
	offsets := make([]int, 0, len(text)+1)
	for i, r := range text {
		before := lowered.Len()
		// Lower one rune at a time so each output byte can be attributed to
		// its source rune. An invalid byte is seen as U+FFFD here, which is
		// also what strings.ToLower emits for it on a non-ASCII string.
		lowered.WriteString(strings.ToLower(string(r)))
		for n := lowered.Len() - before; n > 0; n-- {
			offsets = append(offsets, i)
		}
	}
	return lowered.String(), append(offsets, len(text))
}
// getCompiledRegex returns the compiled form of pattern, compiling it on
// first use and caching the result under ruleID.
//
// The cache key is the rule ID only, so a rule whose pattern changed keeps
// its previously compiled expression until ClearCache is called. A pattern
// that fails to compile returns nil and is not cached, so it is retried on
// every call.
func (d *PIIDetector) getCompiledRegex(ruleID, pattern string) *regexp.Regexp {
	// Fast path: look up under the shared lock only.
	d.rulesMu.RLock()
	cached, hit := d.compiledRules[ruleID]
	d.rulesMu.RUnlock()
	if hit {
		return cached
	}

	d.rulesMu.Lock()
	defer d.rulesMu.Unlock()

	// Another goroutine may have stored the entry between the two locks.
	if existing, present := d.compiledRules[ruleID]; present {
		return existing
	}

	fresh, err := regexp.Compile(pattern)
	if err != nil {
		// Invalid regex - don't cache
		return nil
	}
	d.compiledRules[ruleID] = fresh
	return fresh
}
// ClearCache drops every cached compiled regex by swapping in a fresh, empty
// map under the write lock. Call it after rules were changed so that updated
// patterns are recompiled on next use.
func (d *PIIDetector) ClearCache() {
	empty := make(map[string]*regexp.Regexp)

	d.rulesMu.Lock()
	d.compiledRules = empty
	d.rulesMu.Unlock()
}
// RefreshRules clears the compiled regex cache by delegating to ClearCache.
// It does not load any rules itself: Detect fetches the rules from the store
// on every call, so after this only the regexes are recompiled on next use.
func (d *PIIDetector) RefreshRules() {
d.ClearCache()
}
// compareSeverity orders two severity levels as warn < redact < block.
// The empty severity and any unrecognised value rank below warn and compare
// equal to each other.
// Returns: 1 if a > b, -1 if a < b, 0 if equal.
func compareSeverity(a, b PIISeverity) int {
	rank := func(s PIISeverity) int {
		switch s {
		case PIISeverityWarn:
			return 1
		case PIISeverityRedact:
			return 2
		case PIISeverityBlock:
			return 3
		default:
			return 0
		}
	}

	switch ra, rb := rank(a), rank(b); {
	case ra > rb:
		return 1
	case ra < rb:
		return -1
	default:
		return 0
	}
}
// =============================================================================
// PREDEFINED PII PATTERNS (German Context)
// =============================================================================
// DefaultPIIRules returns the built-in PII detection rules for a German
// context. All of them are regex rules; the expressions are not anchored, so
// they match anywhere inside a text.
func DefaultPIIRules() []PIIRuleConfig {
	const (
		block = "block"
		warn  = "warn"
	)
	// regexRule builds one regex-type rule configuration.
	regexRule := func(name, pattern, severity string) PIIRuleConfig {
		return PIIRuleConfig{Name: name, Type: "regex", Pattern: pattern, Severity: severity}
	}

	return []PIIRuleConfig{
		regexRule("Email Addresses", `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, block),
		// Numbers starting with +49 or 0, with optional separators.
		regexRule("German Phone Numbers", `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`, block),
		// Same prefixes followed by a 15x/16x/17x mobile code.
		regexRule("German Mobile Numbers", `(?:\+49|0)1[567]\d[\s.-]?\d{3,}[\s.-]?\d{2,}`, block),
		// DE + 20 digits, optionally grouped in blocks of four.
		regexRule("German IBAN", `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`, block),
		// Sozialversicherungsnummer.
		regexRule("German Social Security Number", `\d{2}[0-3]\d[01]\d{2}[A-Z]\d{3}`, block),
		// Steuer-ID: eleven digits, optionally grouped 2-3-3-3.
		regexRule("German Tax ID", `\d{2}\s?\d{3}\s?\d{3}\s?\d{3}`, block),
		// Sixteen digits in groups of four; no checksum is verified.
		regexRule("Credit Card Numbers", `(?:\d{4}[\s.-]?){3}\d{4}`, block),
		// Five-digit postal code followed by a capitalised word (potential address).
		regexRule("German Address Pattern", `\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+`, warn),
		// A birth-date keyword followed by a date such as DD.MM.YYYY.
		regexRule("Date of Birth", `(?:geboren|geb\.|Geburtsdatum|DoB)[\s:]*\d{1,2}[\./]\d{1,2}[\./]\d{2,4}`, warn),
		// A salutation or title followed by two capitalised words.
		regexRule("Personal Names with Titles", `(?:Herr|Frau|Dr\.|Prof\.)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+`, warn),
		// One capital letter followed by nine digits.
		regexRule("Health Insurance Number", `[A-Z]\d{9}`, block),
		// District code, one or two letters, up to four digits, optional H/E suffix.
		regexRule("German Vehicle Registration", `[A-ZÄÖÜ]{1,3}[\s-]?[A-Z]{1,2}[\s-]?\d{1,4}[HE]?`, warn),
	}
}
// =============================================================================
// REDACTION
// =============================================================================
// RedactText redacts PII from text based on the matches.
//
// Every match whose severity is PIISeverityRedact or PIISeverityBlock has its
// byte range [StartIndex, EndIndex) overwritten with '*', one asterisk per
// byte, so the result always has the same length as text. Matches of any
// other severity are left untouched. Matches may be given in any order and
// may overlap.
//
// Ranges are clamped to the bounds of text and empty or inverted ranges are
// skipped, so stale or malformed matches cannot cause a panic.
func (d *PIIDetector) RedactText(text string, matches []PIIMatch) string {
	if len(matches) == 0 {
		return text
	}

	// Masking never changes the length of the text, so offsets stay valid no
	// matter in which order the matches are applied; no sorting is needed.
	var masked []byte // allocated on the first span that actually needs masking
	for _, match := range matches {
		if match.Severity != PIISeverityRedact && match.Severity != PIISeverityBlock {
			continue
		}

		start, end := match.StartIndex, match.EndIndex
		if start < 0 {
			start = 0
		}
		if end > len(text) {
			end = len(text)
		}
		if start >= end {
			continue
		}

		if masked == nil {
			masked = []byte(text)
		}
		for i := start; i < end; i++ {
			masked[i] = '*'
		}
	}

	if masked == nil {
		return text
	}
	return string(masked)
}
// FilterContent runs PII detection on content and applies the outcome.
//
// It returns the content to use and whether it must be blocked:
//   - detection failed: the original content, false and the error;
//   - nothing matched: the original content unchanged;
//   - a block-severity rule matched: an empty string and true;
//   - otherwise: the content with all redact-severity matches masked.
//     Warn-severity matches are not masked.
func (d *PIIDetector) FilterContent(ctx context.Context, content string) (string, bool, error) {
	result, err := d.Detect(ctx, content)
	switch {
	case err != nil:
		return content, false, err
	case !result.HasPII:
		return content, false, nil
	case result.ShouldBlock:
		return "", true, nil
	default:
		return d.RedactText(content, result.Matches), false, nil
	}
}

View File

@@ -0,0 +1,489 @@
package policy
import (
"regexp"
"testing"
)
// =============================================================================
// MODEL TESTS
// =============================================================================
// TestBundeslandValidation checks that selected Bundesland constants are
// listed in ValidBundeslaender.
func TestBundeslandValidation(t *testing.T) {
	isListed := func(candidate Bundesland) bool {
		for _, known := range ValidBundeslaender {
			if known == candidate {
				return true
			}
		}
		return false
	}

	cases := []struct {
		name     string
		bl       Bundesland
		expected bool
	}{
		{name: "valid NI", bl: BundeslandNI, expected: true},
		{name: "valid BY", bl: BundeslandBY, expected: true},
		{name: "valid BW", bl: BundeslandBW, expected: true},
		{name: "valid NW", bl: BundeslandNW, expected: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := isListed(tc.bl); got != tc.expected {
				t.Errorf("Expected %v to be valid=%v, got valid=%v", tc.bl, tc.expected, got)
			}
		})
	}
}
// TestLicenseValues guards against a License constant being defined as the
// empty string.
func TestLicenseValues(t *testing.T) {
	for _, license := range []License{
		LicenseDLDEBY20,
		LicenseCCBY,
		LicenseCCBYSA,
		LicenseCC0,
		LicenseParagraph5,
	} {
		if license != "" {
			continue
		}
		t.Errorf("License should not be empty")
	}
}
// TestOperationValues checks that ValidOperations holds exactly four entries
// and contains each of the known operations.
func TestOperationValues(t *testing.T) {
	if got := len(ValidOperations); got != 4 {
		t.Errorf("Expected 4 operations, got %d", got)
	}

	listed := func(want Operation) bool {
		for _, op := range ValidOperations {
			if op == want {
				return true
			}
		}
		return false
	}

	for _, want := range []Operation{OperationLookup, OperationRAG, OperationTraining, OperationExport} {
		if !listed(want) {
			t.Errorf("Expected operation %s not found in ValidOperations", want)
		}
	}
}
// =============================================================================
// PII DETECTOR TESTS
// =============================================================================
func TestPIIDetector_EmailDetection(t *testing.T) {
tests := []struct {
name string
text string
hasEmail bool
}{
{"simple email", "Contact: test@example.com", true},
{"email with plus", "Email: user+tag@domain.org", true},
{"no email", "This is plain text", false},
{"partial email", "user@ is not an email", false},
{"multiple emails", "Send to a@b.com and x@y.de", true},
}
// Test using regex pattern directly since we don't have a store
emailPattern := `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Simple test without database
rule := &PIIRule{
Name: "Email",
RuleType: PIIRuleTypeRegex,
Pattern: emailPattern,
Severity: PIISeverityBlock,
}
detector := &PIIDetector{
compiledRules: make(map[string]*regexp.Regexp),
}
matches := detector.findMatches(tt.text, rule)
hasMatch := len(matches) > 0
if hasMatch != tt.hasEmail {
t.Errorf("Expected hasEmail=%v, got %v for text: %s", tt.hasEmail, hasMatch, tt.text)
}
})
}
}
func TestPIIDetector_PhoneDetection(t *testing.T) {
tests := []struct {
name string
text string
hasPhone bool
}{
{"german mobile", "Call +49 170 1234567", true},
{"german landline", "Tel: 030-12345678", true},
{"with spaces", "Phone: 0170 123 4567", true},
{"no phone", "This is just text", false},
{"US format", "Call 555-123-4567", false}, // Should not match German pattern
}
phonePattern := `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
rule := &PIIRule{
Name: "Phone",
RuleType: PIIRuleTypeRegex,
Pattern: phonePattern,
Severity: PIISeverityBlock,
}
detector := &PIIDetector{
compiledRules: make(map[string]*regexp.Regexp),
}
matches := detector.findMatches(tt.text, rule)
hasMatch := len(matches) > 0
if hasMatch != tt.hasPhone {
t.Errorf("Expected hasPhone=%v, got %v for text: %s", tt.hasPhone, hasMatch, tt.text)
}
})
}
}
func TestPIIDetector_IBANDetection(t *testing.T) {
tests := []struct {
name string
text string
hasIBAN bool
}{
{"valid IBAN", "IBAN: DE89 3704 0044 0532 0130 00", true},
{"compact IBAN", "DE89370400440532013000", true},
{"no IBAN", "Just a number: 12345678", false},
{"partial", "DE12 is not complete", false},
}
ibanPattern := `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
rule := &PIIRule{
Name: "IBAN",
RuleType: PIIRuleTypeRegex,
Pattern: ibanPattern,
Severity: PIISeverityBlock,
}
detector := &PIIDetector{
compiledRules: make(map[string]*regexp.Regexp),
}
matches := detector.findMatches(tt.text, rule)
hasMatch := len(matches) > 0
if hasMatch != tt.hasIBAN {
t.Errorf("Expected hasIBAN=%v, got %v for text: %s", tt.hasIBAN, hasMatch, tt.text)
}
})
}
}
func TestPIIDetector_KeywordMatching(t *testing.T) {
tests := []struct {
name string
text string
keywords string
expected int
}{
{"single keyword", "The password is secret", "password", 1},
{"multiple keywords", "Password and secret", "password,secret", 2},
{"case insensitive", "PASSWORD and Secret", "password,secret", 2},
{"no match", "This is safe text", "password,secret", 0},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
rule := &PIIRule{
Name: "Keywords",
RuleType: PIIRuleTypeKeyword,
Pattern: tt.keywords,
Severity: PIISeverityWarn,
}
detector := &PIIDetector{
compiledRules: make(map[string]*regexp.Regexp),
}
matches := detector.findKeywordMatches(tt.text, rule)
if len(matches) != tt.expected {
t.Errorf("Expected %d matches, got %d for text: %s", tt.expected, len(matches), tt.text)
}
})
}
}
func TestPIIDetector_Redaction(t *testing.T) {
detector := &PIIDetector{
compiledRules: make(map[string]*regexp.Regexp),
}
tests := []struct {
name string
text string
matches []PIIMatch
expected string
}{
{
"single redaction",
"Email: test@example.com",
[]PIIMatch{{StartIndex: 7, EndIndex: 23, Severity: PIISeverityBlock}},
"Email: ****************",
},
{
"no matches",
"Plain text",
[]PIIMatch{},
"Plain text",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := detector.RedactText(tt.text, tt.matches)
if result != tt.expected {
t.Errorf("Expected '%s', got '%s'", tt.expected, result)
}
})
}
}
// TestCompareSeverity checks the ordering warn < redact < block as reported
// by compareSeverity.
func TestCompareSeverity(t *testing.T) {
	cases := []struct {
		a, b     PIISeverity
		expected int
	}{
		{a: PIISeverityBlock, b: PIISeverityWarn, expected: 1},
		{a: PIISeverityWarn, b: PIISeverityBlock, expected: -1},
		{a: PIISeverityBlock, b: PIISeverityBlock, expected: 0},
		{a: PIISeverityRedact, b: PIISeverityWarn, expected: 1},
		{a: PIISeverityRedact, b: PIISeverityBlock, expected: -1},
	}

	for _, tc := range cases {
		label := string(tc.a) + "_vs_" + string(tc.b)
		t.Run(label, func(t *testing.T) {
			if got := compareSeverity(tc.a, tc.b); got != tc.expected {
				t.Errorf("Expected %d, got %d for %s vs %s", tc.expected, got, tc.a, tc.b)
			}
		})
	}
}
// =============================================================================
// ENFORCER TESTS
// =============================================================================
// TestExtractDomain checks that extractDomain returns the host of a URL
// without scheme, port, path or a leading "www.".
func TestExtractDomain(t *testing.T) {
	cases := []struct {
		name     string
		url      string
		expected string
		hasError bool
	}{
		{name: "full URL", url: "https://www.example.com/path", expected: "example.com"},
		{name: "with port", url: "http://example.com:8080/path", expected: "example.com"},
		{name: "subdomain", url: "https://sub.domain.example.com", expected: "sub.domain.example.com"},
		{name: "no scheme", url: "example.com/path", expected: "example.com"},
		{name: "www prefix", url: "https://www.test.de", expected: "test.de"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := extractDomain(tc.url)

			switch {
			case tc.hasError && err == nil:
				t.Error("Expected error, got nil")
			case !tc.hasError && err != nil:
				t.Errorf("Expected no error, got %v", err)
			}

			if got != tc.expected {
				t.Errorf("Expected '%s', got '%s'", tc.expected, got)
			}
		})
	}
}
// =============================================================================
// YAML LOADER TESTS
// =============================================================================
// TestParseYAML parses a small configuration document and checks the federal
// policy, the inlined Bundesland entry, the default operations and the PII
// rules.
//
// Length checks that are followed by indexing use t.Fatalf so that an
// unexpected length fails the test cleanly instead of panicking on [0].
//
// NOTE(review): the YAML literal is reproduced exactly as it appears in this
// view, where its nesting indentation looks lost — confirm against the
// checked-in file before applying.
func TestParseYAML(t *testing.T) {
	yamlData := `
federal:
name: "Test Federal"
sources:
- domain: "test.gov"
name: "Test Source"
license: "§5 UrhG"
trust_boost: 0.9
NI:
name: "Niedersachsen"
sources:
- domain: "ni.gov"
name: "NI Source"
license: "DL-DE-BY-2.0"
default_operations:
lookup:
allowed: true
requires_citation: true
training:
allowed: false
requires_citation: false
pii_rules:
- name: "Test Rule"
type: "regex"
pattern: "test.*pattern"
severity: "block"
`
	config, err := ParseYAML([]byte(yamlData))
	if err != nil {
		t.Fatalf("Failed to parse YAML: %v", err)
	}

	// Test federal
	if config.Federal.Name != "Test Federal" {
		t.Errorf("Expected federal name 'Test Federal', got '%s'", config.Federal.Name)
	}
	// Fatal: the next checks index Sources[0].
	if len(config.Federal.Sources) != 1 {
		t.Fatalf("Expected 1 federal source, got %d", len(config.Federal.Sources))
	}
	if config.Federal.Sources[0].Domain != "test.gov" {
		t.Errorf("Expected domain 'test.gov', got '%s'", config.Federal.Sources[0].Domain)
	}
	if config.Federal.Sources[0].TrustBoost != 0.9 {
		t.Errorf("Expected trust_boost 0.9, got %f", config.Federal.Sources[0].TrustBoost)
	}

	// Test Bundesland
	if len(config.Bundeslaender) != 1 {
		t.Errorf("Expected 1 Bundesland, got %d", len(config.Bundeslaender))
	}
	ni, ok := config.Bundeslaender["NI"]
	if !ok {
		t.Error("Expected NI in Bundeslaender")
	}
	if ni.Name != "Niedersachsen" {
		t.Errorf("Expected name 'Niedersachsen', got '%s'", ni.Name)
	}

	// Test operations
	if !config.DefaultOperations.Lookup.Allowed {
		t.Error("Expected lookup to be allowed")
	}
	if config.DefaultOperations.Training.Allowed {
		t.Error("Expected training to be NOT allowed")
	}

	// Test PII rules
	// Fatal: the next check indexes PIIRules[0].
	if len(config.PIIRules) != 1 {
		t.Fatalf("Expected 1 PII rule, got %d", len(config.PIIRules))
	}
	if config.PIIRules[0].Name != "Test Rule" {
		t.Errorf("Expected rule name 'Test Rule', got '%s'", config.PIIRules[0].Name)
	}
}
// =============================================================================
// AUDIT TESTS
// =============================================================================
// TestMaskPII checks maskPII for a very short input, which is expected to be
// masked completely, and for longer inputs, which keep their first and last
// two characters around a fixed "****".
func TestMaskPII(t *testing.T) {
	cases := []struct {
		name     string
		input    string
		expected string
	}{
		{name: "short", input: "ab", expected: "****"},
		{name: "medium", input: "test@email.com", expected: "te****om"},
		{name: "long", input: "very-long-email@example.com", expected: "ve****om"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := maskPII(tc.input); got != tc.expected {
				t.Errorf("Expected '%s', got '%s'", tc.expected, got)
			}
		})
	}
}
// =============================================================================
// DEFAULT PII RULES TEST
// =============================================================================
// TestDefaultPIIRules checks that the default rule set is non-empty, that
// every rule has a name, type and pattern, and that the e-mail rule is part
// of it.
func TestDefaultPIIRules(t *testing.T) {
	rules := DefaultPIIRules()
	if len(rules) == 0 {
		t.Error("Expected default PII rules, got none")
	}

	sawEmailRule := false
	for _, rule := range rules {
		// Check that each rule has required fields
		if rule.Name == "" {
			t.Error("Rule name should not be empty")
		}
		if rule.Type == "" {
			t.Error("Rule type should not be empty")
		}
		if rule.Pattern == "" {
			t.Error("Rule pattern should not be empty")
		}

		// Check for email rule
		sawEmailRule = sawEmailRule || rule.Name == "Email Addresses"
	}

	if !sawEmailRule {
		t.Error("Expected email addresses rule in defaults")
	}
}
// =============================================================================
// INTEGRATION TEST HELPERS
// =============================================================================
// TestFilteredURL checks that the URL, IsAllowed and RequiresCitation fields
// of a FilteredURL value hold what was assigned to them.
func TestFilteredURL(t *testing.T) {
	var fu FilteredURL
	fu.URL = "https://example.com"
	fu.IsAllowed = true
	fu.RequiresCitation = true

	if got := fu.URL; got != "https://example.com" {
		t.Error("URL not set correctly")
	}
	if allowed := fu.IsAllowed; !allowed {
		t.Error("IsAllowed should be true")
	}
	if cited := fu.RequiresCitation; !cited {
		t.Error("RequiresCitation should be true")
	}
}

File diff suppressed because it is too large Load Diff