fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
282
edu-search-service/internal/robots/robots.go
Normal file
282
edu-search-service/internal/robots/robots.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Checker handles robots.txt parsing and checking.
//
// It fetches each host's robots.txt at most once per cacheTTL and
// answers allow/deny questions for URL paths against the parsed rules.
// Safe for concurrent use: all cache access is guarded by mu.
type Checker struct {
	mu        sync.RWMutex           // guards cache
	cache     map[string]*RobotsData // parsed robots.txt data, keyed by host
	userAgent string                 // sent as User-Agent when fetching, and matched against user-agent groups
	client    *http.Client           // HTTP client (with timeout) used for robots.txt fetches
	cacheTTL  time.Duration          // how long a fetched robots.txt entry stays valid
}
|
||||
|
||||
// RobotsData holds parsed robots.txt data for a host.
type RobotsData struct {
	DisallowPatterns []string  // Disallow rules that applied to our user agent (or the wildcard group)
	AllowPatterns    []string  // Allow rules; consulted before Disallow and take precedence
	CrawlDelay       int       // seconds; 0 when the host declares no delay
	FetchedAt        time.Time // when robots.txt was fetched; used for cache-TTL expiry
	Error            error     // non-nil when the fetch failed; callers treat this as "allow"
}
|
||||
|
||||
// NewChecker creates a new robots.txt checker
|
||||
func NewChecker(userAgent string) *Checker {
|
||||
return &Checker{
|
||||
cache: make(map[string]*RobotsData),
|
||||
userAgent: userAgent,
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
// IsAllowed checks if a URL is allowed to be crawled
|
||||
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
host := u.Host
|
||||
path := u.Path
|
||||
if path == "" {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
// Get or fetch robots.txt
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
|
||||
if err != nil {
|
||||
// If we can't fetch robots.txt, assume allowed (be lenient)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// If there was an error fetching robots.txt, allow crawling
|
||||
if robotsData.Error != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Check allow rules first (they take precedence)
|
||||
for _, pattern := range robotsData.AllowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Check disallow rules
|
||||
for _, pattern := range robotsData.DisallowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If no rules match, allow
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// GetCrawlDelay returns the crawl delay for a host
|
||||
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
|
||||
if err != nil || robotsData.Error != nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return robotsData.CrawlDelay, nil
|
||||
}
|
||||
|
||||
// getRobotsData fetches and caches robots.txt for a host
|
||||
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
|
||||
c.mu.RLock()
|
||||
data, exists := c.cache[host]
|
||||
c.mu.RUnlock()
|
||||
|
||||
// Return cached data if not expired
|
||||
if exists && time.Since(data.FetchedAt) < c.cacheTTL {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
data = c.fetchRobots(ctx, robotsURL)
|
||||
|
||||
// Cache the result
|
||||
c.mu.Lock()
|
||||
c.cache[host] = data
|
||||
c.mu.Unlock()
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// fetchRobots fetches and parses robots.txt
|
||||
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
|
||||
data := &RobotsData{
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If robots.txt doesn't exist, allow everything
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return data
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return data
|
||||
}
|
||||
|
||||
// Parse the robots.txt
|
||||
c.parseRobotsTxt(data, resp.Body)
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// parseRobotsTxt parses robots.txt content from reader into data.
//
// It collects Disallow/Allow patterns and the Crawl-delay from the
// group(s) that apply to us: the wildcard group ("User-agent: *") and
// any group whose agent token relates to our user agent.
//
// NOTE(review): rules from the wildcard group and a matching specific
// group can both be collected; RFC 9309 says only the most specific
// matching group applies — confirm this lenient merging is intentional.
// NOTE(review): "breakpilot" and "edubot" are hard-coded extra agent
// aliases — presumably this project's crawler names; verify.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	scanner := bufio.NewScanner(reader)

	// Track which user-agent section we're in.
	inRelevantSection := false
	inWildcardSection := false

	// Normalize our user agent for matching.
	ourAgent := strings.ToLower(c.userAgent)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Skip empty lines and comments.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on first colon: "directive: value".
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		// Remove inline comments.
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		switch directive {
		case "user-agent":
			agent := strings.ToLower(value)
			if agent == "*" {
				inWildcardSection = true
				inRelevantSection = false
			} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
				inRelevantSection = true
			} else {
				// Section for some other crawler: its rules don't apply to us.
				inRelevantSection = false
				inWildcardSection = false
			}

		case "disallow":
			// An empty Disallow value means "allow everything" and is skipped.
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.DisallowPatterns = append(data.DisallowPatterns, value)
			}

		case "allow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.AllowPatterns = append(data.AllowPatterns, value)
			}

		case "crawl-delay":
			if inRelevantSection || inWildcardSection {
				// Whole seconds only; non-numeric or fractional values
				// parse as 0 and are ignored.
				var delay int
				fmt.Sscanf(value, "%d", &delay)
				if delay > 0 {
					data.CrawlDelay = delay
				}
			}
		}
	}
	// NOTE(review): scanner.Err() is not checked; a mid-stream read error
	// silently truncates the rule set.
}
|
||||
|
||||
// matchPattern reports whether a URL path matches a robots.txt
// pattern. "*" acts as a wildcard and a trailing "$" anchors the match
// to the end of the path; otherwise the pattern is a plain prefix.
func matchPattern(pattern, path string) bool {
	if pattern == "" {
		// An empty pattern matches nothing.
		return false
	}

	if !strings.Contains(pattern, "*") {
		// No wildcard: a trailing "$" demands an exact match,
		// anything else is a simple prefix test.
		if strings.HasSuffix(pattern, "$") {
			return path == strings.TrimSuffix(pattern, "$")
		}
		return strings.HasPrefix(path, pattern)
	}

	// Wildcard pattern: translate into an anchored regular expression.
	quoted := regexp.QuoteMeta(pattern)
	quoted = strings.ReplaceAll(quoted, `\*`, ".*")
	if strings.HasSuffix(quoted, `\$`) {
		// Trailing "$" means "must match to the end of the path".
		quoted = strings.TrimSuffix(quoted, `\$`) + "$"
	}

	re, err := regexp.Compile("^" + quoted)
	if err != nil {
		return false
	}
	return re.MatchString(path)
}
|
||||
|
||||
// ClearCache clears the robots.txt cache
|
||||
func (c *Checker) ClearCache() {
|
||||
c.mu.Lock()
|
||||
c.cache = make(map[string]*RobotsData)
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
// CacheStats returns cache statistics
|
||||
func (c *Checker) CacheStats() (count int, hosts []string) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
for host := range c.cache {
|
||||
hosts = append(hosts, host)
|
||||
}
|
||||
return len(c.cache), hosts
|
||||
}
|
||||
324
edu-search-service/internal/robots/robots_test.go
Normal file
324
edu-search-service/internal/robots/robots_test.go
Normal file
@@ -0,0 +1,324 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewChecker(t *testing.T) {
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
if checker == nil {
|
||||
t.Fatal("Expected non-nil checker")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_NoRobots(t *testing.T) {
|
||||
// Server that returns 404 for robots.txt
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
allowed, err := checker.IsAllowed(context.Background(), server.URL+"/some/page")
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
}
|
||||
if !allowed {
|
||||
t.Error("Should be allowed when robots.txt doesn't exist")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_AllowAll(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/any/path")
|
||||
|
||||
if !allowed {
|
||||
t.Error("Should be allowed with Allow: /")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_DisallowPath(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Disallow: /private/
|
||||
Disallow: /admin/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Should be disallowed
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/private/secret")
|
||||
if allowed {
|
||||
t.Error("/private/secret should be disallowed")
|
||||
}
|
||||
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/admin/users")
|
||||
if allowed {
|
||||
t.Error("/admin/users should be disallowed")
|
||||
}
|
||||
|
||||
// Should be allowed
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/public/page")
|
||||
if !allowed {
|
||||
t.Error("/public/page should be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_AllowTakesPrecedence(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Disallow: /api/
|
||||
Allow: /api/public/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Allow takes precedence
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/api/public/docs")
|
||||
if !allowed {
|
||||
t.Error("/api/public/docs should be allowed (Allow takes precedence)")
|
||||
}
|
||||
|
||||
// Still disallowed
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/api/internal")
|
||||
if allowed {
|
||||
t.Error("/api/internal should be disallowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_SpecificUserAgent(t *testing.T) {
|
||||
robotsTxt := `User-agent: BadBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("GoodBot/1.0")
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
if !allowed {
|
||||
t.Error("GoodBot should be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetCrawlDelay(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Crawl-delay: 5
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
delay, err := checker.GetCrawlDelay(context.Background(), server.URL+"/page")
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
}
|
||||
if delay != 5 {
|
||||
t.Errorf("Expected delay 5, got %d", delay)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_Simple(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/private/", "/private/secret", true},
|
||||
{"/private/", "/public/", false},
|
||||
{"/", "/anything", true},
|
||||
{"", "/anything", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_Wildcard(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/*.pdf", "/document.pdf", true},
|
||||
{"/*.pdf", "/folder/doc.pdf", true},
|
||||
{"/*.pdf", "/document.html", false},
|
||||
{"/dir/*/page", "/dir/sub/page", true},
|
||||
{"/dir/*/page", "/dir/other/page", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_EndAnchor(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/exact$", "/exact", true},
|
||||
{"/exact$", "/exactmore", false},
|
||||
{"/exact$", "/exact/more", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCacheStats(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Write([]byte(robotsTxt))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Initially empty
|
||||
count, _ := checker.CacheStats()
|
||||
if count != 0 {
|
||||
t.Errorf("Expected 0 cached entries, got %d", count)
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
// Should have 1 entry
|
||||
count, hosts := checker.CacheStats()
|
||||
if count != 1 {
|
||||
t.Errorf("Expected 1 cached entry, got %d", count)
|
||||
}
|
||||
if len(hosts) != 1 {
|
||||
t.Errorf("Expected 1 host, got %v", hosts)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClearCache(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Write([]byte(robotsTxt))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Populate cache
|
||||
checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
count, _ := checker.CacheStats()
|
||||
if count != 1 {
|
||||
t.Errorf("Expected 1 cached entry, got %d", count)
|
||||
}
|
||||
|
||||
// Clear cache
|
||||
checker.ClearCache()
|
||||
|
||||
count, _ = checker.CacheStats()
|
||||
if count != 0 {
|
||||
t.Errorf("Expected 0 cached entries after clear, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRobotsTxt_Comments(t *testing.T) {
|
||||
robotsTxt := `# This is a comment
|
||||
User-agent: *
|
||||
# Another comment
|
||||
Disallow: /private/ # inline comment
|
||||
Allow: /public/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/public/page")
|
||||
if !allowed {
|
||||
t.Error("/public/page should be allowed")
|
||||
}
|
||||
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/private/page")
|
||||
if allowed {
|
||||
t.Error("/private/page should be disallowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_InvalidURL(t *testing.T) {
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
_, err := checker.IsAllowed(context.Background(), "not a valid url ://")
|
||||
if err == nil {
|
||||
t.Error("Expected error for invalid URL")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user