All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
282 lines
8.1 KiB
Go
282 lines
8.1 KiB
Go
package policy
|
|
|
|
import (
|
|
"context"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/google/uuid"
|
|
)
|
|
|
|
// Enforcer provides policy enforcement for the crawler and pipeline.
|
|
type Enforcer struct {
|
|
store *Store
|
|
piiDetector *PIIDetector
|
|
auditor *Auditor
|
|
}
|
|
|
|
// NewEnforcer creates a new Enforcer instance.
|
|
func NewEnforcer(store *Store) *Enforcer {
|
|
return &Enforcer{
|
|
store: store,
|
|
piiDetector: NewPIIDetector(store),
|
|
auditor: NewAuditor(store),
|
|
}
|
|
}
|
|
|
|
// =============================================================================
|
|
// SOURCE CHECKING
|
|
// =============================================================================
|
|
|
|
// CheckSource verifies if a URL is allowed based on the whitelist.
|
|
// Returns the AllowedSource if found, nil if not whitelisted.
|
|
func (e *Enforcer) CheckSource(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) {
|
|
domain, err := extractDomain(rawURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
source, err := e.store.GetSourceByDomain(ctx, domain, bundesland)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return source, nil
|
|
}
|
|
|
|
// CheckOperation verifies if a specific operation is allowed for a source.
|
|
func (e *Enforcer) CheckOperation(ctx context.Context, source *AllowedSource, operation Operation) (*OperationPermission, error) {
|
|
for _, op := range source.Operations {
|
|
if op.Operation == operation {
|
|
return &op, nil
|
|
}
|
|
}
|
|
|
|
// If not found in loaded operations, query directly
|
|
ops, err := e.store.GetOperationsBySourceID(ctx, source.ID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, op := range ops {
|
|
if op.Operation == operation {
|
|
return &op, nil
|
|
}
|
|
}
|
|
|
|
return nil, nil
|
|
}
|
|
|
|
// CheckCompliance performs a full compliance check for a URL and operation.
|
|
func (e *Enforcer) CheckCompliance(ctx context.Context, req *CheckComplianceRequest) (*CheckComplianceResponse, error) {
|
|
response := &CheckComplianceResponse{
|
|
IsAllowed: false,
|
|
RequiresCitation: false,
|
|
}
|
|
|
|
// Check if source is whitelisted
|
|
source, err := e.CheckSource(ctx, req.URL, req.Bundesland)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if source == nil {
|
|
reason := BlockReasonNotWhitelisted
|
|
response.BlockReason = &reason
|
|
return response, nil
|
|
}
|
|
|
|
response.Source = source
|
|
response.License = &source.License
|
|
response.CitationTemplate = source.CitationTemplate
|
|
|
|
// Check operation permission
|
|
opPerm, err := e.CheckOperation(ctx, source, req.Operation)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if opPerm == nil || !opPerm.IsAllowed {
|
|
var reason BlockReason
|
|
if req.Operation == OperationTraining {
|
|
reason = BlockReasonTrainingForbidden
|
|
} else {
|
|
reason = BlockReasonLicenseViolation
|
|
}
|
|
response.BlockReason = &reason
|
|
return response, nil
|
|
}
|
|
|
|
response.IsAllowed = true
|
|
response.RequiresCitation = opPerm.RequiresCitation
|
|
|
|
return response, nil
|
|
}
|
|
|
|
// =============================================================================
|
|
// PII CHECKING
|
|
// =============================================================================
|
|
|
|
// DetectPII scans text for PII patterns and returns matches.
|
|
func (e *Enforcer) DetectPII(ctx context.Context, text string) (*PIITestResponse, error) {
|
|
return e.piiDetector.Detect(ctx, text)
|
|
}
|
|
|
|
// ShouldBlockForPII determines if content should be blocked based on PII matches.
|
|
func (e *Enforcer) ShouldBlockForPII(response *PIITestResponse) bool {
|
|
if response == nil {
|
|
return false
|
|
}
|
|
return response.ShouldBlock
|
|
}
|
|
|
|
// =============================================================================
|
|
// LOGGING
|
|
// =============================================================================
|
|
|
|
// LogBlocked logs a blocked URL to the blocked content log.
|
|
func (e *Enforcer) LogBlocked(ctx context.Context, rawURL string, reason BlockReason, ruleID *uuid.UUID, details map[string]interface{}) error {
|
|
domain, _ := extractDomain(rawURL)
|
|
return e.auditor.LogBlocked(ctx, rawURL, domain, reason, ruleID, details)
|
|
}
|
|
|
|
// LogChange logs a policy change to the audit log.
|
|
func (e *Enforcer) LogChange(ctx context.Context, action AuditAction, entityType AuditEntityType, entityID *uuid.UUID, oldValue, newValue interface{}, userEmail *string) error {
|
|
return e.auditor.LogChange(ctx, action, entityType, entityID, oldValue, newValue, userEmail, nil, nil)
|
|
}
|
|
|
|
// =============================================================================
|
|
// BATCH OPERATIONS
|
|
// =============================================================================
|
|
|
|
// FilterURLs filters a list of URLs, returning only whitelisted ones.
|
|
func (e *Enforcer) FilterURLs(ctx context.Context, urls []string, bundesland *Bundesland, operation Operation) ([]FilteredURL, error) {
|
|
results := make([]FilteredURL, 0, len(urls))
|
|
|
|
for _, u := range urls {
|
|
result := FilteredURL{
|
|
URL: u,
|
|
IsAllowed: false,
|
|
}
|
|
|
|
source, err := e.CheckSource(ctx, u, bundesland)
|
|
if err != nil {
|
|
result.Error = err.Error()
|
|
results = append(results, result)
|
|
continue
|
|
}
|
|
|
|
if source == nil {
|
|
result.BlockReason = BlockReasonNotWhitelisted
|
|
results = append(results, result)
|
|
continue
|
|
}
|
|
|
|
opPerm, err := e.CheckOperation(ctx, source, operation)
|
|
if err != nil {
|
|
result.Error = err.Error()
|
|
results = append(results, result)
|
|
continue
|
|
}
|
|
|
|
if opPerm == nil || !opPerm.IsAllowed {
|
|
if operation == OperationTraining {
|
|
result.BlockReason = BlockReasonTrainingForbidden
|
|
} else {
|
|
result.BlockReason = BlockReasonLicenseViolation
|
|
}
|
|
results = append(results, result)
|
|
continue
|
|
}
|
|
|
|
result.IsAllowed = true
|
|
result.Source = source
|
|
result.RequiresCitation = opPerm.RequiresCitation
|
|
results = append(results, result)
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// FilteredURL represents the result of filtering a single URL.
|
|
type FilteredURL struct {
|
|
URL string `json:"url"`
|
|
IsAllowed bool `json:"is_allowed"`
|
|
Source *AllowedSource `json:"source,omitempty"`
|
|
BlockReason BlockReason `json:"block_reason,omitempty"`
|
|
RequiresCitation bool `json:"requires_citation"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// =============================================================================
|
|
// HELPERS
|
|
// =============================================================================
|
|
|
|
// extractDomain returns the normalised host name of rawURL.
//
// Normalisation steps:
//   - surrounding whitespace is trimmed
//   - "https://" is prepended when rawURL has no scheme of its own
//   - the port, if any, is dropped (url.URL.Hostname)
//   - the host is lowercased, because host names are case-insensitive
//   - a leading "www." is removed
//
// An error is returned only when url.Parse rejects the (possibly prefixed)
// URL. An input without any host yields an empty string and a nil error.
func extractDomain(rawURL string) (string, error) {
	rawURL = strings.TrimSpace(rawURL)

	// "://" only marks a scheme when everything in front of it is a valid
	// scheme name (a letter followed by letters, digits, '+', '-' or '.').
	// Otherwise it belongs to a later part of the URL, e.g. a query value as
	// in "example.com/r?u=http://other", and the input is still scheme-less.
	schemeEnd := strings.Index(rawURL, "://")
	needsScheme := schemeEnd < 0
	for i := 0; i < schemeEnd && !needsScheme; i++ {
		c := rawURL[i]
		isAlpha := ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
		isOther := ('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.'
		if !isAlpha && !(i > 0 && isOther) {
			needsScheme = true
		}
	}
	if needsScheme {
		rawURL = "https://" + rawURL
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	// Lowercase before trimming so that "WWW." is stripped as well.
	host := strings.ToLower(parsed.Hostname())
	return strings.TrimPrefix(host, "www."), nil
}
|
|
|
|
// IsTrainingAllowed checks if training is allowed for any source (should always be false).
|
|
func (e *Enforcer) IsTrainingAllowed(ctx context.Context) (bool, error) {
|
|
// Training should NEVER be allowed - this is a safeguard
|
|
matrix, err := e.store.GetOperationsMatrix(ctx)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
for _, source := range matrix {
|
|
for _, op := range source.Operations {
|
|
if op.Operation == OperationTraining && op.IsAllowed {
|
|
// This should never happen - log a warning
|
|
return true, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// GetSourceByURL is a convenience method to get a source by URL.
|
|
func (e *Enforcer) GetSourceByURL(ctx context.Context, rawURL string, bundesland *Bundesland) (*AllowedSource, error) {
|
|
return e.CheckSource(ctx, rawURL, bundesland)
|
|
}
|
|
|
|
// GetCitationForURL generates a citation for a URL if required.
|
|
func (e *Enforcer) GetCitationForURL(ctx context.Context, rawURL string, bundesland *Bundesland, title string, date string) (string, error) {
|
|
source, err := e.CheckSource(ctx, rawURL, bundesland)
|
|
if err != nil || source == nil {
|
|
return "", err
|
|
}
|
|
|
|
if source.CitationTemplate == nil || *source.CitationTemplate == "" {
|
|
// Default citation format
|
|
return "Quelle: " + source.Name + ", " + title + ", " + date, nil
|
|
}
|
|
|
|
// Replace placeholders in template
|
|
citation := *source.CitationTemplate
|
|
citation = strings.ReplaceAll(citation, "{title}", title)
|
|
citation = strings.ReplaceAll(citation, "{date}", date)
|
|
citation = strings.ReplaceAll(citation, "{url}", rawURL)
|
|
citation = strings.ReplaceAll(citation, "{domain}", source.Domain)
|
|
citation = strings.ReplaceAll(citation, "{source}", source.Name)
|
|
|
|
return citation, nil
|
|
}
|