Files
breakpilot-lehrer/edu-search-service/internal/robots/robots_test.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

325 lines
7.5 KiB
Go

package robots
import (
"context"
"net/http"
"net/http/httptest"
"testing"
)
// TestNewChecker verifies that NewChecker returns a non-nil checker for a
// given user-agent string.
func TestNewChecker(t *testing.T) {
	if c := NewChecker("TestBot/1.0"); c == nil {
		t.Fatal("Expected non-nil checker")
	}
}
// TestIsAllowed_NoRobots checks that a URL is reported as allowed, with no
// error, when the host answers every request (robots.txt included) with 404.
func TestIsAllowed_NoRobots(t *testing.T) {
	// The handler never serves a robots.txt: every path yields 404.
	notFound := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	}
	srv := httptest.NewServer(http.HandlerFunc(notFound))
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	target := srv.URL + "/some/page"

	ok, err := c.IsAllowed(context.Background(), target)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if !ok {
		t.Error("Should be allowed when robots.txt doesn't exist")
	}
}
// TestIsAllowed_AllowAll checks that an arbitrary path is allowed when the
// served robots.txt contains "Allow: /" for all user agents.
func TestIsAllowed_AllowAll(t *testing.T) {
	robotsTxt := `User-agent: *
Allow: /
`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(robotsTxt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	allowed, err := checker.IsAllowed(context.Background(), server.URL+"/any/path")
	// The boolean carries no meaning when the check itself failed, so stop
	// here instead of reporting a misleading "not allowed" failure.
	if err != nil {
		t.Fatalf("IsAllowed returned unexpected error: %v", err)
	}
	if !allowed {
		t.Error("Should be allowed with Allow: /")
	}
}
// TestIsAllowed_DisallowPath checks that paths under the Disallow prefixes
// of the served robots.txt are rejected while an unrelated path stays
// allowed.
func TestIsAllowed_DisallowPath(t *testing.T) {
	robotsTxt := `User-agent: *
Disallow: /private/
Disallow: /admin/
`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(robotsTxt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// isAllowed fails the test on any error. Without this, a failing check
	// that returns false would make the "should be disallowed" assertions
	// below pass for the wrong reason.
	isAllowed := func(path string) bool {
		t.Helper()
		allowed, err := checker.IsAllowed(context.Background(), server.URL+path)
		if err != nil {
			t.Fatalf("IsAllowed(%q) returned unexpected error: %v", path, err)
		}
		return allowed
	}

	// Should be disallowed
	if isAllowed("/private/secret") {
		t.Error("/private/secret should be disallowed")
	}
	if isAllowed("/admin/users") {
		t.Error("/admin/users should be disallowed")
	}

	// Should be allowed
	if !isAllowed("/public/page") {
		t.Error("/public/page should be allowed")
	}
}
// TestIsAllowed_AllowTakesPrecedence serves a robots.txt that disallows
// /api/ but allows /api/public/, and expects the Allow rule to win for
// paths under /api/public/ while other /api/ paths stay disallowed.
func TestIsAllowed_AllowTakesPrecedence(t *testing.T) {
	robotsTxt := `User-agent: *
Disallow: /api/
Allow: /api/public/
`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(robotsTxt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// isAllowed fails the test on any error so that a failed check cannot
	// masquerade as a legitimate "disallowed" result.
	isAllowed := func(path string) bool {
		t.Helper()
		allowed, err := checker.IsAllowed(context.Background(), server.URL+path)
		if err != nil {
			t.Fatalf("IsAllowed(%q) returned unexpected error: %v", path, err)
		}
		return allowed
	}

	// Allow takes precedence
	if !isAllowed("/api/public/docs") {
		t.Error("/api/public/docs should be allowed (Allow takes precedence)")
	}

	// Still disallowed
	if isAllowed("/api/internal") {
		t.Error("/api/internal should be disallowed")
	}
}
// TestIsAllowed_SpecificUserAgent serves a robots.txt that blocks only
// "BadBot" and allows everyone else, and expects a checker identifying as
// "GoodBot/1.0" to be allowed.
func TestIsAllowed_SpecificUserAgent(t *testing.T) {
	robotsTxt := `User-agent: BadBot
Disallow: /
User-agent: *
Allow: /
`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(robotsTxt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	checker := NewChecker("GoodBot/1.0")

	allowed, err := checker.IsAllowed(context.Background(), server.URL+"/page")
	// Report a failed check as such rather than as a rule-matching failure.
	if err != nil {
		t.Fatalf("IsAllowed returned unexpected error: %v", err)
	}
	if !allowed {
		t.Error("GoodBot should be allowed")
	}
}
// TestGetCrawlDelay serves a robots.txt with "Crawl-delay: 5" for all user
// agents and expects GetCrawlDelay to return 5 without an error.
func TestGetCrawlDelay(t *testing.T) {
	robotsTxt := `User-agent: *
Crawl-delay: 5
`
	// Serve the rules on /robots.txt and an empty 200 on everything else.
	handler := func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/robots.txt":
			w.Write([]byte(robotsTxt))
		default:
			w.WriteHeader(http.StatusOK)
		}
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	pageURL := srv.URL + "/page"

	got, err := c.GetCrawlDelay(context.Background(), pageURL)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if got != 5 {
		t.Errorf("Expected delay 5, got %d", got)
	}
}
// TestMatchPattern_Simple exercises matchPattern with plain patterns (no
// wildcard, no end anchor). The cases expect a pattern to match paths that
// start with it, and the empty pattern to match nothing.
func TestMatchPattern_Simple(t *testing.T) {
	type matchCase struct {
		pattern string
		path    string
		want    bool
	}
	cases := []matchCase{
		{pattern: "/private/", path: "/private/secret", want: true},
		{pattern: "/private/", path: "/public/", want: false},
		{pattern: "/", path: "/anything", want: true},
		{pattern: "", path: "/anything", want: false},
	}

	for i := range cases {
		c := cases[i]
		if got := matchPattern(c.pattern, c.path); got != c.want {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				c.pattern, c.path, c.want, got)
		}
	}
}
// TestMatchPattern_Wildcard exercises matchPattern with "*" in the pattern.
// The cases expect "*" to match within a single segment as well as across
// "/" (e.g. "/*.pdf" matching "/folder/doc.pdf").
func TestMatchPattern_Wildcard(t *testing.T) {
	type matchCase struct {
		pattern string
		path    string
		want    bool
	}
	cases := []matchCase{
		{pattern: "/*.pdf", path: "/document.pdf", want: true},
		{pattern: "/*.pdf", path: "/folder/doc.pdf", want: true},
		{pattern: "/*.pdf", path: "/document.html", want: false},
		{pattern: "/dir/*/page", path: "/dir/sub/page", want: true},
		{pattern: "/dir/*/page", path: "/dir/other/page", want: true},
	}

	for i := range cases {
		c := cases[i]
		if got := matchPattern(c.pattern, c.path); got != c.want {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				c.pattern, c.path, c.want, got)
		}
	}
}
// TestMatchPattern_EndAnchor exercises matchPattern with a trailing "$".
// The cases expect the anchored pattern to match only the exact path and to
// reject any longer path that merely starts with it.
func TestMatchPattern_EndAnchor(t *testing.T) {
	type matchCase struct {
		pattern string
		path    string
		want    bool
	}
	cases := []matchCase{
		{pattern: "/exact$", path: "/exact", want: true},
		{pattern: "/exact$", path: "/exactmore", want: false},
		{pattern: "/exact$", path: "/exact/more", want: false},
	}

	for i := range cases {
		c := cases[i]
		if got := matchPattern(c.pattern, c.path); got != c.want {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v",
				c.pattern, c.path, c.want, got)
		}
	}
}
// TestCacheStats checks that CacheStats reports zero entries on a fresh
// checker and exactly one entry (and one host) after a single IsAllowed
// call against one test server.
func TestCacheStats(t *testing.T) {
	robotsTxt := `User-agent: *
Allow: /
`
	// The handler returns the robots.txt body for every path.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// Initially empty
	count, _ := checker.CacheStats()
	if count != 0 {
		t.Errorf("Expected 0 cached entries, got %d", count)
	}

	// Fetch robots.txt. A failure here would otherwise only show up as a
	// confusing count mismatch below, so report it directly.
	if _, err := checker.IsAllowed(context.Background(), server.URL+"/page"); err != nil {
		t.Fatalf("IsAllowed returned unexpected error: %v", err)
	}

	// Should have 1 entry
	count, hosts := checker.CacheStats()
	if count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}
	if len(hosts) != 1 {
		t.Errorf("Expected 1 host, got %v", hosts)
	}
}
// TestClearCache populates the checker's cache with one IsAllowed call and
// checks that ClearCache brings the entry count reported by CacheStats back
// to zero.
func TestClearCache(t *testing.T) {
	robotsTxt := `User-agent: *
Allow: /
`
	// The handler returns the robots.txt body for every path.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(robotsTxt))
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// Populate cache. Fail fast if the call errors, so the count assertion
	// below reflects caching behaviour rather than a failed fetch.
	if _, err := checker.IsAllowed(context.Background(), server.URL+"/page"); err != nil {
		t.Fatalf("IsAllowed returned unexpected error: %v", err)
	}

	count, _ := checker.CacheStats()
	if count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}

	// Clear cache
	checker.ClearCache()

	count, _ = checker.CacheStats()
	if count != 0 {
		t.Errorf("Expected 0 cached entries after clear, got %d", count)
	}
}
// TestParseRobotsTxt_Comments serves a robots.txt containing full-line and
// inline "#" comments and expects the Allow/Disallow rules around them to
// take effect as if the comments were absent.
func TestParseRobotsTxt_Comments(t *testing.T) {
	robotsTxt := `# This is a comment
User-agent: *
# Another comment
Disallow: /private/ # inline comment
Allow: /public/
`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(robotsTxt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	checker := NewChecker("TestBot/1.0")

	// isAllowed fails the test on any error so that a failed check cannot
	// satisfy the "should be disallowed" assertion by accident.
	isAllowed := func(path string) bool {
		t.Helper()
		allowed, err := checker.IsAllowed(context.Background(), server.URL+path)
		if err != nil {
			t.Fatalf("IsAllowed(%q) returned unexpected error: %v", path, err)
		}
		return allowed
	}

	if !isAllowed("/public/page") {
		t.Error("/public/page should be allowed")
	}
	if isAllowed("/private/page") {
		t.Error("/private/page should be disallowed")
	}
}
// TestIsAllowed_InvalidURL checks that IsAllowed returns an error when
// given a string that is not a valid URL.
func TestIsAllowed_InvalidURL(t *testing.T) {
	c := NewChecker("TestBot/1.0")

	if _, err := c.IsAllowed(context.Background(), "not a valid url ://"); err == nil {
		t.Error("Expected error for invalid URL")
	}
}