fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,282 @@
package robots
import (
"bufio"
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync"
"time"
)
// Checker fetches, parses, and caches robots.txt files and answers whether
// a given URL may be crawled. It is safe for concurrent use.
type Checker struct {
mu sync.RWMutex // guards cache
cache map[string]*RobotsData // parsed robots.txt per host, keyed by host[:port]
userAgent string // sent as User-Agent when fetching robots.txt and matched against User-agent groups
client *http.Client // HTTP client used for robots.txt fetches (10s timeout, see NewChecker)
cacheTTL time.Duration // how long a cached entry stays fresh (24h by default)
}
// RobotsData holds the parsed robots.txt rules for a single host, plus fetch
// metadata used for cache expiry and lenient error handling.
type RobotsData struct {
DisallowPatterns []string // Disallow rules collected from applicable User-agent sections
AllowPatterns []string // Allow rules; checked before Disallow rules (see Checker.IsAllowed)
CrawlDelay int // seconds; 0 when no Crawl-delay directive applied
FetchedAt time.Time // when this entry was fetched; compared against Checker.cacheTTL
Error error // non-nil when the fetch failed; callers treat this leniently (allow crawling)
}
// NewChecker returns a Checker that identifies itself as userAgent when
// fetching robots.txt. Fetched files are cached per host for 24 hours, and
// each HTTP request times out after 10 seconds.
func NewChecker(userAgent string) *Checker {
	checker := &Checker{
		cache:     make(map[string]*RobotsData),
		userAgent: userAgent,
		// robots.txt rarely changes; refresh once a day.
		cacheTTL: 24 * time.Hour,
	}
	checker.client = &http.Client{Timeout: 10 * time.Second}
	return checker
}
// IsAllowed reports whether urlStr may be crawled according to its host's
// robots.txt. The checker is deliberately lenient: any failure to obtain
// robots.txt results in (true, nil). A matching Allow rule wins over any
// Disallow rule. The error is non-nil only for an unparseable URL.
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return false, fmt.Errorf("invalid URL: %w", err)
	}

	requestPath := parsed.Path
	if requestPath == "" {
		requestPath = "/"
	}

	robotsData, err := c.getRobotsData(ctx, parsed.Scheme, parsed.Host)
	if err != nil {
		// Could not fetch robots.txt at all — be lenient and allow.
		return true, nil
	}
	if robotsData.Error != nil {
		// The fetch itself recorded an error — likewise allow.
		return true, nil
	}

	// Allow rules take precedence over Disallow rules.
	for _, allow := range robotsData.AllowPatterns {
		if matchPattern(allow, requestPath) {
			return true, nil
		}
	}
	for _, deny := range robotsData.DisallowPatterns {
		if matchPattern(deny, requestPath) {
			return false, nil
		}
	}

	// No rule matched: allowed.
	return true, nil
}
// GetCrawlDelay returns the Crawl-delay (in seconds) declared for us by the
// host of urlStr, or 0 when none is declared or robots.txt could not be
// fetched. The error is non-nil only for an unparseable URL.
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return 0, err
	}
	data, fetchErr := c.getRobotsData(ctx, parsed.Scheme, parsed.Host)
	if fetchErr != nil || data.Error != nil {
		// Lenient: no delay when robots.txt is unavailable.
		return 0, nil
	}
	return data.CrawlDelay, nil
}
// getRobotsData returns the parsed robots.txt for host, fetching it when the
// cache holds no fresh entry. Failed fetches are cached too (with Error set),
// so a broken host is not re-fetched on every call. The returned error is
// always nil in the current implementation.
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
	c.mu.RLock()
	cached, ok := c.cache[host]
	c.mu.RUnlock()

	if ok && time.Since(cached.FetchedAt) < c.cacheTTL {
		return cached, nil
	}

	// Cache miss or stale entry: fetch a fresh copy. Concurrent callers may
	// fetch the same host simultaneously; the last writer wins, which is
	// harmless here.
	fresh := c.fetchRobots(ctx, fmt.Sprintf("%s://%s/robots.txt", scheme, host))

	c.mu.Lock()
	c.cache[host] = fresh
	c.mu.Unlock()

	return fresh, nil
}
// fetchRobots downloads and parses robots.txt from robotsURL. It never
// returns nil; failures are reported via the Error field of the result so
// callers can decide how lenient to be. A 404 yields an empty (allow-all)
// RobotsData with no error; any other non-200 status sets Error.
//
// The response body is capped at 512 KiB — the minimum size RFC 9309
// requires parsers to process — so a misbehaving or hostile server cannot
// make us buffer an unbounded response.
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
	data := &RobotsData{
		FetchedAt: time.Now(),
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		data.Error = err
		return data
	}
	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		data.Error = err
		return data
	}
	defer resp.Body.Close()

	// No robots.txt at all means everything is allowed.
	if resp.StatusCode == http.StatusNotFound {
		return data
	}
	if resp.StatusCode != http.StatusOK {
		data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
		return data
	}

	// Cap the body size to bound memory use while parsing.
	const maxRobotsSize = 512 * 1024
	c.parseRobotsTxt(data, io.LimitReader(resp.Body, maxRobotsSize))
	return data
}
// parseRobotsTxt parses robots.txt content line by line into data.
//
// Matching model (simplified relative to RFC 9309):
//   - "User-agent: *" opens the wildcard section; a specific User-agent line
//     opens a "relevant" section when it matches our own user agent. Rules
//     from BOTH sections are collected, rather than using only the most
//     specific group as the RFC prescribes.
//   - Agent matching is case-insensitive substring containment, plus
//     hard-coded matches for "breakpilot" and "edubot" — presumably this
//     project's own crawler names; NOTE(review): confirm these are current.
//
// Scanner errors (e.g. a line longer than bufio's default 64 KiB token
// limit) are silently ignored; parsing simply stops early.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
scanner := bufio.NewScanner(reader)
// Track which user-agent section we're currently inside.
inRelevantSection := false
inWildcardSection := false
// Normalize our user agent once for case-insensitive matching.
ourAgent := strings.ToLower(c.userAgent)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Skip empty lines and full-line comments.
if line == "" || strings.HasPrefix(line, "#") {
continue
}
// Directives have the form "name: value"; split on the first colon only,
// since values (URL paths) may themselves contain colons.
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
continue
}
directive := strings.ToLower(strings.TrimSpace(parts[0]))
value := strings.TrimSpace(parts[1])
// Strip inline comments, e.g. "Disallow: /x  # note".
if idx := strings.Index(value, "#"); idx >= 0 {
value = strings.TrimSpace(value[:idx])
}
switch directive {
case "user-agent":
agent := strings.ToLower(value)
if agent == "*" {
// Entering the wildcard group; leave any specific group.
inWildcardSection = true
inRelevantSection = false
} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
// A group that applies to us specifically.
inRelevantSection = true
} else {
// A foreign bot's group: ignore its rules entirely.
inRelevantSection = false
inWildcardSection = false
}
case "disallow":
// An empty Disallow value means "allow everything" and is skipped.
if value != "" && (inRelevantSection || inWildcardSection) {
data.DisallowPatterns = append(data.DisallowPatterns, value)
}
case "allow":
if value != "" && (inRelevantSection || inWildcardSection) {
data.AllowPatterns = append(data.AllowPatterns, value)
}
case "crawl-delay":
if inRelevantSection || inWildcardSection {
var delay int
// Best-effort integer parse; a failed scan leaves delay at 0,
// which the guard below discards.
fmt.Sscanf(value, "%d", &delay)
if delay > 0 {
data.CrawlDelay = delay
}
}
}
}
}
// matchPattern reports whether a URL path matches a robots.txt rule pattern.
// Supported syntax: "*" matches any character sequence, a trailing "$"
// anchors the match at the end of the path, and anything else is a plain
// prefix match. An empty pattern never matches.
func matchPattern(pattern, path string) bool {
	switch {
	case pattern == "":
		return false

	case strings.Contains(pattern, "*"):
		// Translate the robots pattern into an anchored regular expression:
		// escape everything, then turn each escaped "*" back into ".*".
		expr := strings.ReplaceAll(regexp.QuoteMeta(pattern), `\*`, ".*")
		// A trailing "$" means "match exactly to the end of the path".
		if strings.HasSuffix(expr, `\$`) {
			expr = strings.TrimSuffix(expr, `\$`) + "$"
		}
		re, err := regexp.Compile("^" + expr)
		if err != nil {
			// Malformed pattern: treat as non-matching.
			return false
		}
		return re.MatchString(path)

	case strings.HasSuffix(pattern, "$"):
		// No wildcard but end-anchored: the path must equal the pattern body.
		return path == strings.TrimSuffix(pattern, "$")

	default:
		return strings.HasPrefix(path, pattern)
	}
}
// ClearCache drops every cached robots.txt entry, forcing a fresh fetch on
// the next lookup for each host.
func (c *Checker) ClearCache() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.cache = make(map[string]*RobotsData)
}
// CacheStats reports how many robots.txt entries are currently cached and
// the hosts they belong to. Host order is unspecified (map iteration order);
// hosts is nil when the cache is empty.
func (c *Checker) CacheStats() (count int, hosts []string) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	for cachedHost := range c.cache {
		hosts = append(hosts, cachedHost)
	}
	return len(c.cache), hosts
}

View File

@@ -0,0 +1,324 @@
package robots
import (
"context"
"net/http"
"net/http/httptest"
"testing"
)
// TestNewChecker verifies the constructor returns a usable, non-nil Checker.
func TestNewChecker(t *testing.T) {
	if got := NewChecker("TestBot/1.0"); got == nil {
		t.Fatal("Expected non-nil checker")
	}
}
// TestIsAllowed_NoRobots checks the lenient fallback: a host whose
// robots.txt returns 404 allows every path.
func TestIsAllowed_NoRobots(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	allowed, err := checker.IsAllowed(context.Background(), server.URL+"/some/page")
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if !allowed {
		t.Error("Should be allowed when robots.txt doesn't exist")
	}
}
// TestIsAllowed_AllowAll checks that a blanket "Allow: /" permits any path.
func TestIsAllowed_AllowAll(t *testing.T) {
	const robotsTxt = "User-agent: *\nAllow: /\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	if allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/any/path"); !allowed {
		t.Error("Should be allowed with Allow: /")
	}
}
// TestIsAllowed_DisallowPath checks that Disallow rules block matching paths
// while unrelated paths remain crawlable.
func TestIsAllowed_DisallowPath(t *testing.T) {
	const robotsTxt = "User-agent: *\nDisallow: /private/\nDisallow: /admin/\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	ctx := context.Background()

	for _, tc := range []struct {
		path    string
		allowed bool
		msg     string
	}{
		{"/private/secret", false, "/private/secret should be disallowed"},
		{"/admin/users", false, "/admin/users should be disallowed"},
		{"/public/page", true, "/public/page should be allowed"},
	} {
		if got, _ := checker.IsAllowed(ctx, server.URL+tc.path); got != tc.allowed {
			t.Error(tc.msg)
		}
	}
}
// TestIsAllowed_AllowTakesPrecedence checks that an Allow rule overrides an
// overlapping Disallow rule.
func TestIsAllowed_AllowTakesPrecedence(t *testing.T) {
	const robotsTxt = "User-agent: *\nDisallow: /api/\nAllow: /api/public/\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	ctx := context.Background()

	if allowed, _ := checker.IsAllowed(ctx, server.URL+"/api/public/docs"); !allowed {
		t.Error("/api/public/docs should be allowed (Allow takes precedence)")
	}
	if allowed, _ := checker.IsAllowed(ctx, server.URL+"/api/internal"); allowed {
		t.Error("/api/internal should be disallowed")
	}
}
// TestIsAllowed_SpecificUserAgent checks that rules scoped to a different
// bot's User-agent group do not apply to us.
func TestIsAllowed_SpecificUserAgent(t *testing.T) {
	const robotsTxt = "User-agent: BadBot\nDisallow: /\nUser-agent: *\nAllow: /\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("GoodBot/1.0")
	if allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/page"); !allowed {
		t.Error("GoodBot should be allowed")
	}
}
// TestGetCrawlDelay checks that a wildcard Crawl-delay directive is parsed
// and reported.
func TestGetCrawlDelay(t *testing.T) {
	const robotsTxt = "User-agent: *\nCrawl-delay: 5\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	delay, err := checker.GetCrawlDelay(context.Background(), server.URL+"/page")
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if delay != 5 {
		t.Errorf("Expected delay 5, got %d", delay)
	}
}
// TestMatchPattern_Simple exercises plain prefix matching and the empty
// pattern.
func TestMatchPattern_Simple(t *testing.T) {
	type testCase struct {
		pattern string
		path    string
		match   bool
	}
	for _, tc := range []testCase{
		{"/private/", "/private/secret", true},
		{"/private/", "/public/", false},
		{"/", "/anything", true},
		{"", "/anything", false},
	} {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				tc.pattern, tc.path, tc.match, got)
		}
	}
}
// TestMatchPattern_Wildcard exercises "*" handling in patterns.
func TestMatchPattern_Wildcard(t *testing.T) {
	type testCase struct {
		pattern string
		path    string
		match   bool
	}
	for _, tc := range []testCase{
		{"/*.pdf", "/document.pdf", true},
		{"/*.pdf", "/folder/doc.pdf", true},
		{"/*.pdf", "/document.html", false},
		{"/dir/*/page", "/dir/sub/page", true},
		{"/dir/*/page", "/dir/other/page", true},
	} {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				tc.pattern, tc.path, tc.match, got)
		}
	}
}
// TestMatchPattern_EndAnchor exercises the trailing "$" exact-end anchor.
func TestMatchPattern_EndAnchor(t *testing.T) {
	type testCase struct {
		pattern string
		path    string
		match   bool
	}
	for _, tc := range []testCase{
		{"/exact$", "/exact", true},
		{"/exact$", "/exactmore", false},
		{"/exact$", "/exact/more", false},
	} {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				tc.pattern, tc.path, tc.match, got)
		}
	}
}
// TestCacheStats checks that the cache starts empty and that a single lookup
// populates exactly one host entry.
func TestCacheStats(t *testing.T) {
	const robotsTxt = "User-agent: *\nAllow: /\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// Cache starts empty.
	if count, _ := checker.CacheStats(); count != 0 {
		t.Errorf("Expected 0 cached entries, got %d", count)
	}

	// One lookup caches one host.
	checker.IsAllowed(context.Background(), server.URL+"/page")
	count, hosts := checker.CacheStats()
	if count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}
	if len(hosts) != 1 {
		t.Errorf("Expected 1 host, got %v", hosts)
	}
}
// TestClearCache checks that ClearCache empties a populated cache.
func TestClearCache(t *testing.T) {
	const robotsTxt = "User-agent: *\nAllow: /\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// Populate the cache with one lookup.
	checker.IsAllowed(context.Background(), server.URL+"/page")
	if count, _ := checker.CacheStats(); count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}

	checker.ClearCache()
	if count, _ := checker.CacheStats(); count != 0 {
		t.Errorf("Expected 0 cached entries after clear, got %d", count)
	}
}
// TestParseRobotsTxt_Comments checks that full-line and inline comments are
// stripped without disturbing the surrounding rules.
func TestParseRobotsTxt_Comments(t *testing.T) {
	const robotsTxt = "# This is a comment\nUser-agent: *\n# Another comment\n" +
		"Disallow: /private/ # inline comment\nAllow: /public/\n"
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/robots.txt" {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")
	ctx := context.Background()

	if allowed, _ := checker.IsAllowed(ctx, server.URL+"/public/page"); !allowed {
		t.Error("/public/page should be allowed")
	}
	if allowed, _ := checker.IsAllowed(ctx, server.URL+"/private/page"); allowed {
		t.Error("/private/page should be disallowed")
	}
}
// TestIsAllowed_InvalidURL checks that an unparseable URL surfaces an error.
func TestIsAllowed_InvalidURL(t *testing.T) {
	checker := NewChecker("TestBot/1.0")
	if _, err := checker.IsAllowed(context.Background(), "not a valid url ://"); err == nil {
		t.Error("Expected error for invalid URL")
	}
}