All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
283 lines
6.3 KiB
Go
283 lines
6.3 KiB
Go
package robots
|
|
|
|
import (
	"bufio"
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
)
|
|
|
|
// Checker handles robots.txt parsing and checking.
// It fetches robots.txt on demand, one entry per host, and caches the
// parsed result. Safe for concurrent use: all cache access is guarded by mu.
type Checker struct {
	mu sync.RWMutex // guards cache
	cache map[string]*RobotsData // parsed robots.txt results keyed by host
	userAgent string // User-Agent header value sent when fetching robots.txt
	client *http.Client // HTTP client used for robots.txt requests (10s timeout set in NewChecker)
	cacheTTL time.Duration // how long a cached entry stays valid before refetch
}
|
|
|
|
// RobotsData holds parsed robots.txt data for a host.
type RobotsData struct {
	DisallowPatterns []string // values of "Disallow:" rules that apply to our agent or the wildcard group
	AllowPatterns []string // values of "Allow:" rules that apply to our agent or the wildcard group
	CrawlDelay int // seconds; 0 when no Crawl-delay directive was seen
	FetchedAt time.Time // when this entry was fetched; used for cache-TTL expiry
	Error error // non-nil when fetching robots.txt failed; callers treat this as "allow everything"
}
|
|
|
|
// NewChecker creates a new robots.txt checker
|
|
func NewChecker(userAgent string) *Checker {
|
|
return &Checker{
|
|
cache: make(map[string]*RobotsData),
|
|
userAgent: userAgent,
|
|
client: &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
},
|
|
cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
|
|
}
|
|
}
|
|
|
|
// IsAllowed checks if a URL is allowed to be crawled
|
|
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
|
|
u, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return false, fmt.Errorf("invalid URL: %w", err)
|
|
}
|
|
|
|
host := u.Host
|
|
path := u.Path
|
|
if path == "" {
|
|
path = "/"
|
|
}
|
|
|
|
// Get or fetch robots.txt
|
|
robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
|
|
if err != nil {
|
|
// If we can't fetch robots.txt, assume allowed (be lenient)
|
|
return true, nil
|
|
}
|
|
|
|
// If there was an error fetching robots.txt, allow crawling
|
|
if robotsData.Error != nil {
|
|
return true, nil
|
|
}
|
|
|
|
// Check allow rules first (they take precedence)
|
|
for _, pattern := range robotsData.AllowPatterns {
|
|
if matchPattern(pattern, path) {
|
|
return true, nil
|
|
}
|
|
}
|
|
|
|
// Check disallow rules
|
|
for _, pattern := range robotsData.DisallowPatterns {
|
|
if matchPattern(pattern, path) {
|
|
return false, nil
|
|
}
|
|
}
|
|
|
|
// If no rules match, allow
|
|
return true, nil
|
|
}
|
|
|
|
// GetCrawlDelay returns the crawl delay for a host
|
|
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
|
|
u, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
|
|
if err != nil || robotsData.Error != nil {
|
|
return 0, nil
|
|
}
|
|
|
|
return robotsData.CrawlDelay, nil
|
|
}
|
|
|
|
// getRobotsData fetches and caches robots.txt for a host
|
|
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
|
|
c.mu.RLock()
|
|
data, exists := c.cache[host]
|
|
c.mu.RUnlock()
|
|
|
|
// Return cached data if not expired
|
|
if exists && time.Since(data.FetchedAt) < c.cacheTTL {
|
|
return data, nil
|
|
}
|
|
|
|
// Fetch robots.txt
|
|
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
|
data = c.fetchRobots(ctx, robotsURL)
|
|
|
|
// Cache the result
|
|
c.mu.Lock()
|
|
c.cache[host] = data
|
|
c.mu.Unlock()
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// fetchRobots fetches and parses robots.txt
|
|
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
|
|
data := &RobotsData{
|
|
FetchedAt: time.Now(),
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
|
|
if err != nil {
|
|
data.Error = err
|
|
return data
|
|
}
|
|
|
|
req.Header.Set("User-Agent", c.userAgent)
|
|
|
|
resp, err := c.client.Do(req)
|
|
if err != nil {
|
|
data.Error = err
|
|
return data
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// If robots.txt doesn't exist, allow everything
|
|
if resp.StatusCode == http.StatusNotFound {
|
|
return data
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
return data
|
|
}
|
|
|
|
// Parse the robots.txt
|
|
c.parseRobotsTxt(data, resp.Body)
|
|
|
|
return data
|
|
}
|
|
|
|
// parseRobotsTxt parses robots.txt content
|
|
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
|
|
scanner := bufio.NewScanner(reader)
|
|
|
|
// Track which user-agent section we're in
|
|
inRelevantSection := false
|
|
inWildcardSection := false
|
|
|
|
// Normalize our user agent for matching
|
|
ourAgent := strings.ToLower(c.userAgent)
|
|
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
|
|
// Skip empty lines and comments
|
|
if line == "" || strings.HasPrefix(line, "#") {
|
|
continue
|
|
}
|
|
|
|
// Split on first colon
|
|
parts := strings.SplitN(line, ":", 2)
|
|
if len(parts) != 2 {
|
|
continue
|
|
}
|
|
|
|
directive := strings.ToLower(strings.TrimSpace(parts[0]))
|
|
value := strings.TrimSpace(parts[1])
|
|
|
|
// Remove inline comments
|
|
if idx := strings.Index(value, "#"); idx >= 0 {
|
|
value = strings.TrimSpace(value[:idx])
|
|
}
|
|
|
|
switch directive {
|
|
case "user-agent":
|
|
agent := strings.ToLower(value)
|
|
if agent == "*" {
|
|
inWildcardSection = true
|
|
inRelevantSection = false
|
|
} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
|
|
inRelevantSection = true
|
|
} else {
|
|
inRelevantSection = false
|
|
inWildcardSection = false
|
|
}
|
|
|
|
case "disallow":
|
|
if value != "" && (inRelevantSection || inWildcardSection) {
|
|
data.DisallowPatterns = append(data.DisallowPatterns, value)
|
|
}
|
|
|
|
case "allow":
|
|
if value != "" && (inRelevantSection || inWildcardSection) {
|
|
data.AllowPatterns = append(data.AllowPatterns, value)
|
|
}
|
|
|
|
case "crawl-delay":
|
|
if inRelevantSection || inWildcardSection {
|
|
var delay int
|
|
fmt.Sscanf(value, "%d", &delay)
|
|
if delay > 0 {
|
|
data.CrawlDelay = delay
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// matchPattern reports whether a URL path matches a robots.txt rule
// pattern. Supported syntax: plain prefix match, "*" as a multi-character
// wildcard, and a trailing "$" anchoring the match to the end of the path.
func matchPattern(pattern, path string) bool {
	// An empty rule value matches no path at all.
	if len(pattern) == 0 {
		return false
	}

	anchored := strings.HasSuffix(pattern, "$")

	// Without wildcards the rule is either an exact match (anchored) or a
	// simple prefix match.
	if !strings.Contains(pattern, "*") {
		if anchored {
			return path == pattern[:len(pattern)-1]
		}
		return strings.HasPrefix(path, pattern)
	}

	// Wildcard rules are translated into an anchored regular expression:
	// escape everything, then turn each escaped "*" back into ".*".
	expr := strings.ReplaceAll(regexp.QuoteMeta(pattern), `\*`, ".*")
	if anchored {
		// A trailing "$" pins the pattern to the end of the path.
		expr = strings.TrimSuffix(expr, `\$`) + "$"
	}

	re, err := regexp.Compile("^" + expr)
	if err != nil {
		// A pattern we cannot compile is treated as matching nothing.
		return false
	}
	return re.MatchString(path)
}
|
|
|
|
// ClearCache clears the robots.txt cache
|
|
func (c *Checker) ClearCache() {
|
|
c.mu.Lock()
|
|
c.cache = make(map[string]*RobotsData)
|
|
c.mu.Unlock()
|
|
}
|
|
|
|
// CacheStats returns cache statistics
|
|
func (c *Checker) CacheStats() (count int, hosts []string) {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
|
|
for host := range c.cache {
|
|
hosts = append(hosts, host)
|
|
}
|
|
return len(c.cache), hosts
|
|
}
|