package robots

import (
	"context"
	"net/http"
	"net/http/httptest"
	"testing"
)

// serveRobots starts a test server that returns txt for /robots.txt and an
// empty 200 response for every other path. Callers must Close() it.
func serveRobots(txt string) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			w.Write([]byte(txt))
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
}

// TestNewChecker verifies the constructor returns a usable instance.
func TestNewChecker(t *testing.T) {
	c := NewChecker("TestBot/1.0")
	if c == nil {
		t.Fatal("Expected non-nil checker")
	}
}

// TestIsAllowed_NoRobots: a missing robots.txt (404) must not block crawling.
func TestIsAllowed_NoRobots(t *testing.T) {
	// Server that returns 404 for robots.txt
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	}))
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	allowed, err := c.IsAllowed(context.Background(), srv.URL+"/some/page")
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if !allowed {
		t.Error("Should be allowed when robots.txt doesn't exist")
	}
}

// TestIsAllowed_AllowAll: a blanket "Allow: /" permits every path.
func TestIsAllowed_AllowAll(t *testing.T) {
	srv := serveRobots(`User-agent: *
Allow: /
`)
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	allowed, _ := c.IsAllowed(context.Background(), srv.URL+"/any/path")
	if !allowed {
		t.Error("Should be allowed with Allow: /")
	}
}

// TestIsAllowed_DisallowPath checks that Disallow prefixes block matching
// paths while unrelated paths stay crawlable.
func TestIsAllowed_DisallowPath(t *testing.T) {
	srv := serveRobots(`User-agent: *
Disallow: /private/
Disallow: /admin/
`)
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	ctx := context.Background()

	// Both disallowed prefixes must be rejected.
	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/private/secret"); allowed {
		t.Error("/private/secret should be disallowed")
	}
	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/admin/users"); allowed {
		t.Error("/admin/users should be disallowed")
	}

	// A path outside every Disallow rule remains allowed.
	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/public/page"); !allowed {
		t.Error("/public/page should be allowed")
	}
}

// TestIsAllowed_AllowTakesPrecedence: a more specific Allow rule overrides a
// broader Disallow for the same subtree.
func TestIsAllowed_AllowTakesPrecedence(t *testing.T) {
	srv := serveRobots(`User-agent: *
Disallow: /api/
Allow: /api/public/
`)
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	ctx := context.Background()

	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/api/public/docs"); !allowed {
		t.Error("/api/public/docs should be allowed (Allow takes precedence)")
	}
	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/api/internal"); allowed {
		t.Error("/api/internal should be disallowed")
	}
}

// TestIsAllowed_SpecificUserAgent: rules aimed at a different agent must not
// apply; the wildcard group governs everyone else.
func TestIsAllowed_SpecificUserAgent(t *testing.T) {
	srv := serveRobots(`User-agent: BadBot
Disallow: /
User-agent: *
Allow: /
`)
	defer srv.Close()

	c := NewChecker("GoodBot/1.0")
	allowed, _ := c.IsAllowed(context.Background(), srv.URL+"/page")
	if !allowed {
		t.Error("GoodBot should be allowed")
	}
}

// TestGetCrawlDelay parses the Crawl-delay directive for the wildcard group.
func TestGetCrawlDelay(t *testing.T) {
	srv := serveRobots(`User-agent: *
Crawl-delay: 5
`)
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	delay, err := c.GetCrawlDelay(context.Background(), srv.URL+"/page")
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if delay != 5 {
		t.Errorf("Expected delay 5, got %d", delay)
	}
}

// TestMatchPattern_Simple covers plain prefix patterns and the empty pattern.
func TestMatchPattern_Simple(t *testing.T) {
	cases := []struct {
		pattern string
		path    string
		match   bool
	}{
		{"/private/", "/private/secret", true},
		{"/private/", "/public/", false},
		{"/", "/anything", true},
		{"", "/anything", false},
	}
	for _, tc := range cases {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v", tc.pattern, tc.path, tc.match, got)
		}
	}
}

// TestMatchPattern_Wildcard covers '*' matching any run of characters.
func TestMatchPattern_Wildcard(t *testing.T) {
	cases := []struct {
		pattern string
		path    string
		match   bool
	}{
		{"/*.pdf", "/document.pdf", true},
		{"/*.pdf", "/folder/doc.pdf", true},
		{"/*.pdf", "/document.html", false},
		{"/dir/*/page", "/dir/sub/page", true},
		{"/dir/*/page", "/dir/other/page", true},
	}
	for _, tc := range cases {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v", tc.pattern, tc.path, tc.match, got)
		}
	}
}

// TestMatchPattern_EndAnchor covers the '$' end-of-path anchor.
func TestMatchPattern_EndAnchor(t *testing.T) {
	cases := []struct {
		pattern string
		path    string
		match   bool
	}{
		{"/exact$", "/exact", true},
		{"/exact$", "/exactmore", false},
		{"/exact$", "/exact/more", false},
	}
	for _, tc := range cases {
		if got := matchPattern(tc.pattern, tc.path); got != tc.match {
			t.Errorf("Pattern '%s' vs Path '%s': expected %v, got %v", tc.pattern, tc.path, tc.match, got)
		}
	}
}

// TestCacheStats verifies the per-host robots.txt cache grows after a fetch.
func TestCacheStats(t *testing.T) {
	body := `User-agent: *
Allow: /
`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte(body))
	}))
	defer srv.Close()

	c := NewChecker("TestBot/1.0")

	// Cache starts empty.
	count, _ := c.CacheStats()
	if count != 0 {
		t.Errorf("Expected 0 cached entries, got %d", count)
	}

	// First check triggers a robots.txt fetch and caches it.
	c.IsAllowed(context.Background(), srv.URL+"/page")

	count, hosts := c.CacheStats()
	if count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}
	if len(hosts) != 1 {
		t.Errorf("Expected 1 host, got %v", hosts)
	}
}

// TestClearCache verifies ClearCache drops all cached robots.txt entries.
func TestClearCache(t *testing.T) {
	body := `User-agent: *
Allow: /
`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte(body))
	}))
	defer srv.Close()

	c := NewChecker("TestBot/1.0")

	// Populate the cache with one host.
	c.IsAllowed(context.Background(), srv.URL+"/page")
	count, _ := c.CacheStats()
	if count != 1 {
		t.Errorf("Expected 1 cached entry, got %d", count)
	}

	c.ClearCache()
	count, _ = c.CacheStats()
	if count != 0 {
		t.Errorf("Expected 0 cached entries after clear, got %d", count)
	}
}

// TestParseRobotsTxt_Comments: '#' comments (whole-line and inline) must be
// stripped without disturbing the surrounding directives.
func TestParseRobotsTxt_Comments(t *testing.T) {
	srv := serveRobots(`# This is a comment
User-agent: *
# Another comment
Disallow: /private/ # inline comment
Allow: /public/
`)
	defer srv.Close()

	c := NewChecker("TestBot/1.0")
	ctx := context.Background()

	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/public/page"); !allowed {
		t.Error("/public/page should be allowed")
	}
	if allowed, _ := c.IsAllowed(ctx, srv.URL+"/private/page"); allowed {
		t.Error("/private/page should be disallowed")
	}
}

// TestIsAllowed_InvalidURL: an unparseable URL must surface an error.
func TestIsAllowed_InvalidURL(t *testing.T) {
	c := NewChecker("TestBot/1.0")
	if _, err := c.IsAllowed(context.Background(), "not a valid url ://"); err == nil {
		t.Error("Expected error for invalid URL")
	}
}