fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
282
edu-search-service/internal/robots/robots.go
Normal file
282
edu-search-service/internal/robots/robots.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Checker handles robots.txt parsing and checking.
//
// It fetches each host's robots.txt at most once per cacheTTL and
// answers allow/deny questions for URL paths against the parsed rules.
// Safe for concurrent use: all cache access is guarded by mu.
type Checker struct {
	mu        sync.RWMutex           // guards cache
	cache     map[string]*RobotsData // parsed robots.txt data, keyed by host
	userAgent string                 // sent as User-Agent when fetching, and matched against user-agent groups
	client    *http.Client           // HTTP client (with timeout) used for robots.txt fetches
	cacheTTL  time.Duration          // how long a fetched robots.txt entry stays valid
}
|
||||
|
||||
// RobotsData holds parsed robots.txt data for a host.
type RobotsData struct {
	DisallowPatterns []string  // Disallow rules that applied to our user agent (or the wildcard group)
	AllowPatterns    []string  // Allow rules; consulted before Disallow and take precedence
	CrawlDelay       int       // seconds; 0 when the host declares no delay
	FetchedAt        time.Time // when robots.txt was fetched; used for cache-TTL expiry
	Error            error     // non-nil when the fetch failed; callers treat this as "allow"
}
|
||||
|
||||
// NewChecker creates a new robots.txt checker
|
||||
func NewChecker(userAgent string) *Checker {
|
||||
return &Checker{
|
||||
cache: make(map[string]*RobotsData),
|
||||
userAgent: userAgent,
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
// IsAllowed checks if a URL is allowed to be crawled
|
||||
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
host := u.Host
|
||||
path := u.Path
|
||||
if path == "" {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
// Get or fetch robots.txt
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
|
||||
if err != nil {
|
||||
// If we can't fetch robots.txt, assume allowed (be lenient)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// If there was an error fetching robots.txt, allow crawling
|
||||
if robotsData.Error != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Check allow rules first (they take precedence)
|
||||
for _, pattern := range robotsData.AllowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Check disallow rules
|
||||
for _, pattern := range robotsData.DisallowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If no rules match, allow
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// GetCrawlDelay returns the crawl delay for a host
|
||||
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
|
||||
if err != nil || robotsData.Error != nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return robotsData.CrawlDelay, nil
|
||||
}
|
||||
|
||||
// getRobotsData fetches and caches robots.txt for a host
|
||||
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
|
||||
c.mu.RLock()
|
||||
data, exists := c.cache[host]
|
||||
c.mu.RUnlock()
|
||||
|
||||
// Return cached data if not expired
|
||||
if exists && time.Since(data.FetchedAt) < c.cacheTTL {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
data = c.fetchRobots(ctx, robotsURL)
|
||||
|
||||
// Cache the result
|
||||
c.mu.Lock()
|
||||
c.cache[host] = data
|
||||
c.mu.Unlock()
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// fetchRobots fetches and parses robots.txt
|
||||
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
|
||||
data := &RobotsData{
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If robots.txt doesn't exist, allow everything
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return data
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return data
|
||||
}
|
||||
|
||||
// Parse the robots.txt
|
||||
c.parseRobotsTxt(data, resp.Body)
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// parseRobotsTxt parses robots.txt content from reader into data.
//
// It collects Disallow/Allow patterns and the Crawl-delay from the
// group(s) that apply to us: the wildcard group ("User-agent: *") and
// any group whose agent token relates to our user agent.
//
// NOTE(review): rules from the wildcard group and a matching specific
// group can both be collected; RFC 9309 says only the most specific
// matching group applies — confirm this lenient merging is intentional.
// NOTE(review): "breakpilot" and "edubot" are hard-coded extra agent
// aliases — presumably this project's crawler names; verify.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	scanner := bufio.NewScanner(reader)

	// Track which user-agent section we're in.
	inRelevantSection := false
	inWildcardSection := false

	// Normalize our user agent for matching.
	ourAgent := strings.ToLower(c.userAgent)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Skip empty lines and comments.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on first colon: "directive: value".
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		// Remove inline comments.
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		switch directive {
		case "user-agent":
			agent := strings.ToLower(value)
			if agent == "*" {
				inWildcardSection = true
				inRelevantSection = false
			} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
				inRelevantSection = true
			} else {
				// Section for some other crawler: its rules don't apply to us.
				inRelevantSection = false
				inWildcardSection = false
			}

		case "disallow":
			// An empty Disallow value means "allow everything" and is skipped.
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.DisallowPatterns = append(data.DisallowPatterns, value)
			}

		case "allow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.AllowPatterns = append(data.AllowPatterns, value)
			}

		case "crawl-delay":
			if inRelevantSection || inWildcardSection {
				// Whole seconds only; non-numeric or fractional values
				// parse as 0 and are ignored.
				var delay int
				fmt.Sscanf(value, "%d", &delay)
				if delay > 0 {
					data.CrawlDelay = delay
				}
			}
		}
	}
	// NOTE(review): scanner.Err() is not checked; a mid-stream read error
	// silently truncates the rule set.
}
|
||||
|
||||
// matchPattern reports whether a URL path matches a robots.txt
// pattern. "*" acts as a wildcard and a trailing "$" anchors the match
// to the end of the path; otherwise the pattern is a plain prefix.
func matchPattern(pattern, path string) bool {
	if pattern == "" {
		// An empty pattern matches nothing.
		return false
	}

	if !strings.Contains(pattern, "*") {
		// No wildcard: a trailing "$" demands an exact match,
		// anything else is a simple prefix test.
		if strings.HasSuffix(pattern, "$") {
			return path == strings.TrimSuffix(pattern, "$")
		}
		return strings.HasPrefix(path, pattern)
	}

	// Wildcard pattern: translate into an anchored regular expression.
	quoted := regexp.QuoteMeta(pattern)
	quoted = strings.ReplaceAll(quoted, `\*`, ".*")
	if strings.HasSuffix(quoted, `\$`) {
		// Trailing "$" means "must match to the end of the path".
		quoted = strings.TrimSuffix(quoted, `\$`) + "$"
	}

	re, err := regexp.Compile("^" + quoted)
	if err != nil {
		return false
	}
	return re.MatchString(path)
}
|
||||
|
||||
// ClearCache clears the robots.txt cache
|
||||
func (c *Checker) ClearCache() {
|
||||
c.mu.Lock()
|
||||
c.cache = make(map[string]*RobotsData)
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
// CacheStats returns cache statistics
|
||||
func (c *Checker) CacheStats() (count int, hosts []string) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
for host := range c.cache {
|
||||
hosts = append(hosts, host)
|
||||
}
|
||||
return len(c.cache), hosts
|
||||
}
|
||||
324
edu-search-service/internal/robots/robots_test.go
Normal file
324
edu-search-service/internal/robots/robots_test.go
Normal file
@@ -0,0 +1,324 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewChecker(t *testing.T) {
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
if checker == nil {
|
||||
t.Fatal("Expected non-nil checker")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_NoRobots(t *testing.T) {
|
||||
// Server that returns 404 for robots.txt
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
allowed, err := checker.IsAllowed(context.Background(), server.URL+"/some/page")
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
}
|
||||
if !allowed {
|
||||
t.Error("Should be allowed when robots.txt doesn't exist")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_AllowAll(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/any/path")
|
||||
|
||||
if !allowed {
|
||||
t.Error("Should be allowed with Allow: /")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_DisallowPath(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Disallow: /private/
|
||||
Disallow: /admin/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Should be disallowed
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/private/secret")
|
||||
if allowed {
|
||||
t.Error("/private/secret should be disallowed")
|
||||
}
|
||||
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/admin/users")
|
||||
if allowed {
|
||||
t.Error("/admin/users should be disallowed")
|
||||
}
|
||||
|
||||
// Should be allowed
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/public/page")
|
||||
if !allowed {
|
||||
t.Error("/public/page should be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_AllowTakesPrecedence(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Disallow: /api/
|
||||
Allow: /api/public/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Allow takes precedence
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/api/public/docs")
|
||||
if !allowed {
|
||||
t.Error("/api/public/docs should be allowed (Allow takes precedence)")
|
||||
}
|
||||
|
||||
// Still disallowed
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/api/internal")
|
||||
if allowed {
|
||||
t.Error("/api/internal should be disallowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_SpecificUserAgent(t *testing.T) {
|
||||
robotsTxt := `User-agent: BadBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("GoodBot/1.0")
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
if !allowed {
|
||||
t.Error("GoodBot should be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetCrawlDelay(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Crawl-delay: 5
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
delay, err := checker.GetCrawlDelay(context.Background(), server.URL+"/page")
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %v", err)
|
||||
}
|
||||
if delay != 5 {
|
||||
t.Errorf("Expected delay 5, got %d", delay)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_Simple(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/private/", "/private/secret", true},
|
||||
{"/private/", "/public/", false},
|
||||
{"/", "/anything", true},
|
||||
{"", "/anything", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_Wildcard(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/*.pdf", "/document.pdf", true},
|
||||
{"/*.pdf", "/folder/doc.pdf", true},
|
||||
{"/*.pdf", "/document.html", false},
|
||||
{"/dir/*/page", "/dir/sub/page", true},
|
||||
{"/dir/*/page", "/dir/other/page", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchPattern_EndAnchor(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern string
|
||||
path string
|
||||
match bool
|
||||
}{
|
||||
{"/exact$", "/exact", true},
|
||||
{"/exact$", "/exactmore", false},
|
||||
{"/exact$", "/exact/more", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := matchPattern(tt.pattern, tt.path)
|
||||
if result != tt.match {
|
||||
t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
|
||||
tt.pattern, tt.path, tt.match, result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCacheStats(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Write([]byte(robotsTxt))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Initially empty
|
||||
count, _ := checker.CacheStats()
|
||||
if count != 0 {
|
||||
t.Errorf("Expected 0 cached entries, got %d", count)
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
// Should have 1 entry
|
||||
count, hosts := checker.CacheStats()
|
||||
if count != 1 {
|
||||
t.Errorf("Expected 1 cached entry, got %d", count)
|
||||
}
|
||||
if len(hosts) != 1 {
|
||||
t.Errorf("Expected 1 host, got %v", hosts)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClearCache(t *testing.T) {
|
||||
robotsTxt := `User-agent: *
|
||||
Allow: /
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Write([]byte(robotsTxt))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
// Populate cache
|
||||
checker.IsAllowed(context.Background(), server.URL+"/page")
|
||||
|
||||
count, _ := checker.CacheStats()
|
||||
if count != 1 {
|
||||
t.Errorf("Expected 1 cached entry, got %d", count)
|
||||
}
|
||||
|
||||
// Clear cache
|
||||
checker.ClearCache()
|
||||
|
||||
count, _ = checker.CacheStats()
|
||||
if count != 0 {
|
||||
t.Errorf("Expected 0 cached entries after clear, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRobotsTxt_Comments(t *testing.T) {
|
||||
robotsTxt := `# This is a comment
|
||||
User-agent: *
|
||||
# Another comment
|
||||
Disallow: /private/ # inline comment
|
||||
Allow: /public/
|
||||
`
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/robots.txt" {
|
||||
w.Write([]byte(robotsTxt))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
allowed, _ := checker.IsAllowed(context.Background(), server.URL+"/public/page")
|
||||
if !allowed {
|
||||
t.Error("/public/page should be allowed")
|
||||
}
|
||||
|
||||
allowed, _ = checker.IsAllowed(context.Background(), server.URL+"/private/page")
|
||||
if allowed {
|
||||
t.Error("/private/page should be disallowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsAllowed_InvalidURL(t *testing.T) {
|
||||
checker := NewChecker("TestBot/1.0")
|
||||
|
||||
_, err := checker.IsAllowed(context.Background(), "not a valid url ://")
|
||||
if err == nil {
|
||||
t.Error("Expected error for invalid URL")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user