package policy import ( "context" "net/url" "strings" "github.com/google/uuid" ) // Enforcer provides policy enforcement for the crawler and pipeline. type Enforcer struct { store *Store piiDetector *PIIDetector auditor *Auditor } // NewEnforcer creates a new Enforcer instance. func NewEnforcer(store *Store) *Enforcer { return &Enforcer{ store: store, piiDetector: NewPIIDetector(store), auditor: NewAuditor(store), } } // ============================================================================= // SOURCE CHECKING // ============================================================================= // CheckSource verifies if a URL is allowed based on the whitelist. // Returns the AllowedSource if found, nil if not whitelisted. func (e *Enforcer) CheckSource(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) { domain, err := extractDomain(rawURL) if err != nil { return nil, err } source, err := e.store.GetSourceByDomain(ctx, domain, bundesland) if err != nil { return nil, err } return source, nil } // CheckOperation verifies if a specific operation is allowed for a source. func (e *Enforcer) CheckOperation(ctx context.Context, source *AllowedSource, operation Operation) (*OperationPermission, error) { for _, op := range source.Operations { if op.Operation == operation { return &op, nil } } // If not found in loaded operations, query directly ops, err := e.store.GetOperationsBySourceID(ctx, source.ID) if err != nil { return nil, err } for _, op := range ops { if op.Operation == operation { return &op, nil } } return nil, nil } // CheckCompliance performs a full compliance check for a URL and operation. func (e *Enforcer) CheckCompliance(ctx context.Context, req *CheckComplianceRequest) (*CheckComplianceResponse, error) { response := &CheckComplianceResponse{ IsAllowed: false, RequiresCitation: false, } // Check if source is whitelisted source, err := e.CheckSource(ctx, req.URL, req.Bundesland) if err != nil { return nil, err } if source == nil { reason := BlockReasonNotWhitelisted response.BlockReason = &reason return response, nil } response.Source = source response.License = &source.License response.CitationTemplate = source.CitationTemplate // Check operation permission opPerm, err := e.CheckOperation(ctx, source, req.Operation) if err != nil { return nil, err } if opPerm == nil || !opPerm.IsAllowed { var reason BlockReason if req.Operation == OperationTraining { reason = BlockReasonTrainingForbidden } else { reason = BlockReasonLicenseViolation } response.BlockReason = &reason return response, nil } response.IsAllowed = true response.RequiresCitation = opPerm.RequiresCitation return response, nil } // ============================================================================= // PII CHECKING // ============================================================================= // DetectPII scans text for PII patterns and returns matches. func (e *Enforcer) DetectPII(ctx context.Context, text string) (*PIITestResponse, error) { return e.piiDetector.Detect(ctx, text) } // ShouldBlockForPII determines if content should be blocked based on PII matches. func (e *Enforcer) ShouldBlockForPII(response *PIITestResponse) bool { if response == nil { return false } return response.ShouldBlock } // ============================================================================= // LOGGING // ============================================================================= // LogBlocked logs a blocked URL to the blocked content log. func (e *Enforcer) LogBlocked(ctx context.Context, rawURL string, reason BlockReason, ruleID *uuid.UUID, details map[string]interface{}) error { domain, _ := extractDomain(rawURL) return e.auditor.LogBlocked(ctx, rawURL, domain, reason, ruleID, details) } // LogChange logs a policy change to the audit log. func (e *Enforcer) LogChange(ctx context.Context, action AuditAction, entityType AuditEntityType, entityID *uuid.UUID, oldValue, newValue interface{}, userEmail *string) error { return e.auditor.LogChange(ctx, action, entityType, entityID, oldValue, newValue, userEmail, nil, nil) } // ============================================================================= // BATCH OPERATIONS // ============================================================================= // FilterURLs filters a list of URLs, returning only whitelisted ones. func (e *Enforcer) FilterURLs(ctx context.Context, urls []string, bundesland *Bundesland, operation Operation) ([]FilteredURL, error) { results := make([]FilteredURL, 0, len(urls)) for _, u := range urls { result := FilteredURL{ URL: u, IsAllowed: false, } source, err := e.CheckSource(ctx, u, bundesland) if err != nil { result.Error = err.Error() results = append(results, result) continue } if source == nil { result.BlockReason = BlockReasonNotWhitelisted results = append(results, result) continue } opPerm, err := e.CheckOperation(ctx, source, operation) if err != nil { result.Error = err.Error() results = append(results, result) continue } if opPerm == nil || !opPerm.IsAllowed { if operation == OperationTraining { result.BlockReason = BlockReasonTrainingForbidden } else { result.BlockReason = BlockReasonLicenseViolation } results = append(results, result) continue } result.IsAllowed = true result.Source = source result.RequiresCitation = opPerm.RequiresCitation results = append(results, result) } return results, nil } // FilteredURL represents the result of filtering a single URL. type FilteredURL struct { URL string `json:"url"` IsAllowed bool `json:"is_allowed"` Source *AllowedSource `json:"source,omitempty"` BlockReason BlockReason `json:"block_reason,omitempty"` RequiresCitation bool `json:"requires_citation"` Error string `json:"error,omitempty"` } // ============================================================================= // HELPERS // ============================================================================= // extractDomain extracts the domain from a URL. func extractDomain(rawURL string) (string, error) { // Handle URLs without scheme if !strings.Contains(rawURL, "://") { rawURL = "https://" + rawURL } parsed, err := url.Parse(rawURL) if err != nil { return "", err } host := parsed.Hostname() // Remove www. prefix host = strings.TrimPrefix(host, "www.") return host, nil } // IsTrainingAllowed checks if training is allowed for any source (should always be false). func (e *Enforcer) IsTrainingAllowed(ctx context.Context) (bool, error) { // Training should NEVER be allowed - this is a safeguard matrix, err := e.store.GetOperationsMatrix(ctx) if err != nil { return false, err } for _, source := range matrix { for _, op := range source.Operations { if op.Operation == OperationTraining && op.IsAllowed { // This should never happen - log a warning return true, nil } } } return false, nil } // GetSourceByURL is a convenience method to get a source by URL. func (e *Enforcer) GetSourceByURL(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) { return e.CheckSource(ctx, rawURL, bundesland) } // GetCitationForURL generates a citation for a URL if required. func (e *Enforcer) GetCitationForURL(ctx context.Context, rawURL string, bundesland *Bundesland, title string, date string) (string, error) { source, err := e.CheckSource(ctx, rawURL, bundesland) if err != nil || source == nil { return "", err } if source.CitationTemplate == nil || *source.CitationTemplate == "" { // Default citation format return "Quelle: " + source.Name + ", " + title + ", " + date, nil } // Replace placeholders in template citation := *source.CitationTemplate citation = strings.ReplaceAll(citation, "{title}", title) citation = strings.ReplaceAll(citation, "{date}", date) citation = strings.ReplaceAll(citation, "{url}", rawURL) citation = strings.ReplaceAll(citation, "{domain}", source.Domain) citation = strings.ReplaceAll(citation, "{source}", source.Name) return citation, nil }