breakpilot-lehrer/edu-search-service/internal/robots/robots.go

package robots

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"
)

// Checker handles robots.txt parsing and checking
type Checker struct {
	mu        sync.RWMutex
	cache     map[string]*RobotsData
	userAgent string
	client    *http.Client
	cacheTTL  time.Duration
}

// RobotsData holds parsed robots.txt data for a host
type RobotsData struct {
	DisallowPatterns []string
	AllowPatterns    []string
	CrawlDelay       int // seconds
	FetchedAt        time.Time
	Error            error
}

// NewChecker creates a new robots.txt checker
func NewChecker(userAgent string) *Checker {
	return &Checker{
		cache:     make(map[string]*RobotsData),
		userAgent: userAgent,
		client: &http.Client{
			Timeout: 10 * time.Second,
		},
		cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
	}
}

// IsAllowed checks if a URL is allowed to be crawled
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false, fmt.Errorf("invalid URL: %w", err)
	}

	host := u.Host
	path := u.Path
	if path == "" {
		path = "/"
	}

	// Get or fetch robots.txt
	robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
	if err != nil {
		// If we can't fetch robots.txt, assume allowed (be lenient)
		return true, nil
	}

	// If there was an error fetching robots.txt, allow crawling
	if robotsData.Error != nil {
		return true, nil
	}

	// Check allow rules first (they take precedence)
	for _, pattern := range robotsData.AllowPatterns {
		if matchPattern(pattern, path) {
			return true, nil
		}
	}

	// Check disallow rules
	for _, pattern := range robotsData.DisallowPatterns {
		if matchPattern(pattern, path) {
			return false, nil
		}
	}

	// If no rules match, allow
	return true, nil
}

// GetCrawlDelay returns the crawl delay for a host
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return 0, err
	}

	robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
	if err != nil || robotsData.Error != nil {
		return 0, nil
	}

	return robotsData.CrawlDelay, nil
}

// getRobotsData fetches and caches robots.txt for a host
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
	c.mu.RLock()
	data, exists := c.cache[host]
	c.mu.RUnlock()

	// Return cached data if not expired
	if exists && time.Since(data.FetchedAt) < c.cacheTTL {
		return data, nil
	}

	// Fetch robots.txt
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
	data = c.fetchRobots(ctx, robotsURL)

	// Cache the result
	c.mu.Lock()
	c.cache[host] = data
	c.mu.Unlock()

	return data, nil
}

// fetchRobots fetches and parses robots.txt
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
	data := &RobotsData{
		FetchedAt: time.Now(),
	}

	req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
	if err != nil {
		data.Error = err
		return data
	}

	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		data.Error = err
		return data
	}
	defer resp.Body.Close()

	// If robots.txt doesn't exist, allow everything
	if resp.StatusCode == http.StatusNotFound {
		return data
	}

	if resp.StatusCode != http.StatusOK {
		data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
		return data
	}

	// Parse the robots.txt
	c.parseRobotsTxt(data, resp.Body)

	return data
}

// parseRobotsTxt parses robots.txt content
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	scanner := bufio.NewScanner(reader)

	// Track which user-agent section we're in
	inRelevantSection := false
	inWildcardSection := false

	// Normalize our user agent for matching
	ourAgent := strings.ToLower(c.userAgent)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Skip empty lines and comments
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on first colon
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		// Remove inline comments
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		switch directive {
		case "user-agent":
			agent := strings.ToLower(value)
			if agent == "*" {
				inWildcardSection = true
				inRelevantSection = false
			} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
				inRelevantSection = true
			} else {
				inRelevantSection = false
				inWildcardSection = false
			}

		case "disallow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.DisallowPatterns = append(data.DisallowPatterns, value)
			}

		case "allow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.AllowPatterns = append(data.AllowPatterns, value)
			}

		case "crawl-delay":
			if inRelevantSection || inWildcardSection {
				var delay int
				fmt.Sscanf(value, "%d", &delay)
				if delay > 0 {
					data.CrawlDelay = delay
				}
			}
		}
	}
}

// matchPattern matches a URL path against a robots.txt pattern
func matchPattern(pattern, path string) bool {
	// Empty pattern matches nothing
	if pattern == "" {
		return false
	}

	// Handle wildcards
	if strings.Contains(pattern, "*") {
		// Convert to regex
		regexPattern := regexp.QuoteMeta(pattern)
		regexPattern = strings.ReplaceAll(regexPattern, `\*`, ".*")

		// Handle $ at end (exact match)
		if strings.HasSuffix(regexPattern, `\$`) {
			regexPattern = strings.TrimSuffix(regexPattern, `\$`) + "$"
		}

		re, err := regexp.Compile("^" + regexPattern)
		if err != nil {
			return false
		}
		return re.MatchString(path)
	}

	// Handle $ (exact end match)
	if strings.HasSuffix(pattern, "$") {
		return path == strings.TrimSuffix(pattern, "$")
	}

	// Simple prefix match
	return strings.HasPrefix(path, pattern)
}

// ClearCache clears the robots.txt cache
func (c *Checker) ClearCache() {
	c.mu.Lock()
	c.cache = make(map[string]*RobotsData)
	c.mu.Unlock()
}

// CacheStats returns cache statistics
func (c *Checker) CacheStats() (count int, hosts []string) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	for host := range c.cache {
		hosts = append(hosts, host)
	}
	return len(c.cache), hosts
}