// Package robots fetches, caches, and evaluates robots.txt rules for a
// web crawler, following the Robots Exclusion Protocol (RFC 9309).
package robots

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"
)

// maxRobotsBody caps how many bytes of a robots.txt response are parsed.
// RFC 9309 requires crawlers to process at least 500 KiB; anything beyond
// the cap is ignored, which errs on the permissive side.
const maxRobotsBody = 512 * 1024

// Checker handles robots.txt fetching, caching, and rule checking.
type Checker struct {
	mu        sync.RWMutex           // guards cache
	cache     map[string]*RobotsData // parsed robots.txt keyed by host
	userAgent string                 // sent with fetches and matched against rule groups
	client    *http.Client
	cacheTTL  time.Duration // how long a cached entry stays valid
}

// RobotsData holds parsed robots.txt data for a single host.
type RobotsData struct {
	DisallowPatterns []string
	AllowPatterns    []string
	CrawlDelay       int // seconds; 0 means no delay requested
	FetchedAt        time.Time
	Error            error // non-nil when the fetch failed; callers treat this as "allow"
}

// NewChecker creates a new robots.txt checker that identifies itself with
// the given user agent. Results are cached per host for 24 hours.
func NewChecker(userAgent string) *Checker {
	return &Checker{
		cache:     make(map[string]*RobotsData),
		userAgent: userAgent,
		client: &http.Client{
			Timeout: 10 * time.Second,
		},
		cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
	}
}

// IsAllowed reports whether the given URL may be crawled according to the
// host's robots.txt. Fetch or parse failures are treated leniently: the
// URL is considered allowed.
//
// NOTE: Allow rules are checked before Disallow rules. This is a
// simplification of RFC 9309, which prefers the longest matching rule; the
// two only differ when a Disallow is more specific than an overlapping
// Allow, and the simplification errs on the permissive side.
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false, fmt.Errorf("invalid URL: %w", err)
	}

	path := u.Path
	if path == "" {
		path = "/"
	}

	// Get or fetch robots.txt for the URL's host.
	robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
	if err != nil {
		// If we can't fetch robots.txt, assume allowed (be lenient).
		return true, nil
	}
	if robotsData.Error != nil {
		// The fetch failed earlier; allow crawling.
		return true, nil
	}

	// Allow rules take precedence (see note above).
	for _, pattern := range robotsData.AllowPatterns {
		if matchPattern(pattern, path) {
			return true, nil
		}
	}
	for _, pattern := range robotsData.DisallowPatterns {
		if matchPattern(pattern, path) {
			return false, nil
		}
	}

	// No rule matched: allowed.
	return true, nil
}

// GetCrawlDelay returns the Crawl-delay (in seconds) requested by the
// host's robots.txt for our user agent, or 0 if none was specified or the
// file could not be fetched.
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return 0, err
	}

	robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
	if err != nil || robotsData.Error != nil {
		return 0, nil
	}
	return robotsData.CrawlDelay, nil
}

// getRobotsData returns the cached robots.txt data for host, fetching and
// re-caching it when missing or older than cacheTTL. Failed fetches are
// cached too (with Error set) so a broken host is not re-fetched on every
// request. Concurrent callers may fetch the same host twice; the last
// result wins, which is harmless.
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
	c.mu.RLock()
	data, exists := c.cache[host]
	c.mu.RUnlock()

	// Return cached data if not expired.
	if exists && time.Since(data.FetchedAt) < c.cacheTTL {
		return data, nil
	}

	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
	data = c.fetchRobots(ctx, robotsURL)

	c.mu.Lock()
	c.cache[host] = data
	c.mu.Unlock()

	return data, nil
}

// fetchRobots fetches and parses robots.txt from robotsURL. It never
// returns nil; network or HTTP errors are recorded in the result's Error
// field. A 404 is not an error — it simply yields no rules (allow all).
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
	data := &RobotsData{
		FetchedAt: time.Now(),
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		data.Error = err
		return data
	}
	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		data.Error = err
		return data
	}
	defer resp.Body.Close()

	// If robots.txt doesn't exist, allow everything.
	if resp.StatusCode == http.StatusNotFound {
		return data
	}
	if resp.StatusCode != http.StatusOK {
		data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
		return data
	}

	// Parse at most maxRobotsBody bytes (RFC 9309 size limit).
	c.parseRobotsTxt(data, io.LimitReader(resp.Body, maxRobotsBody))
	return data
}

// ruleGroup is a single user-agent group from a robots.txt file: one or
// more consecutive User-agent lines followed by their rules.
type ruleGroup struct {
	agents   []string // lowercased User-agent values naming this group
	disallow []string
	allow    []string
	delay    int // last positive Crawl-delay in the group; 0 if none
}

// parseRobotsTxt parses robots.txt content into data, keeping only the
// rule groups that apply to our user agent. Per RFC 9309 a group naming
// our agent specifically takes precedence over the wildcard ("*") group;
// wildcard rules apply only when no specific group matches.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	groups := parseGroups(reader)
	for _, g := range selectGroups(groups, strings.ToLower(c.userAgent)) {
		data.DisallowPatterns = append(data.DisallowPatterns, g.disallow...)
		data.AllowPatterns = append(data.AllowPatterns, g.allow...)
		if g.delay > 0 {
			data.CrawlDelay = g.delay
		}
	}
}

// parseGroups splits robots.txt content into its user-agent groups.
// Rules that appear before any User-agent line are ignored.
func parseGroups(reader io.Reader) []ruleGroup {
	var (
		groups     []ruleGroup
		cur        *ruleGroup
		inAgentRun bool // true while consecutive User-agent lines extend the same group
	)

	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// Skip empty lines and comments.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on the first colon.
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])
		// Remove inline comments.
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		if directive == "user-agent" {
			if !inAgentRun {
				// A User-agent line after rules (or at the start) opens a new group.
				groups = append(groups, ruleGroup{})
				cur = &groups[len(groups)-1]
				inAgentRun = true
			}
			cur.agents = append(cur.agents, strings.ToLower(value))
			continue
		}
		inAgentRun = false
		if cur == nil {
			continue // rule outside any group
		}

		switch directive {
		case "disallow":
			if value != "" {
				cur.disallow = append(cur.disallow, value)
			}
		case "allow":
			if value != "" {
				cur.allow = append(cur.allow, value)
			}
		case "crawl-delay":
			var delay int
			// Best-effort parse; non-numeric values leave delay at 0.
			fmt.Sscanf(value, "%d", &delay)
			if delay > 0 {
				cur.delay = delay
			}
		}
	}
	// A scanner error (e.g. an oversized line) silently truncates parsing;
	// missing rules err on the permissive side, matching the lenient policy.
	return groups
}

// selectGroups returns the groups that apply to ourAgent: every group
// naming a specifically matching agent, or every wildcard group when no
// specific group matches.
func selectGroups(groups []ruleGroup, ourAgent string) []ruleGroup {
	var specific, wildcard []ruleGroup
	for _, g := range groups {
		isWildcard, isSpecific := false, false
		for _, agent := range g.agents {
			switch {
			case agent == "*":
				isWildcard = true
			case agentMatches(ourAgent, agent):
				isSpecific = true
			}
		}
		switch {
		case isSpecific:
			specific = append(specific, g)
		case isWildcard:
			wildcard = append(wildcard, g)
		}
	}
	if len(specific) > 0 {
		return specific
	}
	return wildcard
}

// agentMatches reports whether a robots.txt User-agent value applies to
// us. Besides a substring match against our own user agent, the product's
// bot names ("breakpilot", "edubot") are always honored so renamed
// deployments still obey rules addressed to them.
func agentMatches(ourAgent, agent string) bool {
	return strings.Contains(ourAgent, agent) ||
		strings.Contains(agent, "breakpilot") ||
		strings.Contains(agent, "edubot")
}

// matchPattern matches a URL path against a robots.txt pattern. Patterns
// support "*" (any sequence of characters) and a trailing "$" (end of
// path); anything else is a simple prefix match.
func matchPattern(pattern, path string) bool {
	// An empty pattern matches nothing.
	if pattern == "" {
		return false
	}

	if strings.Contains(pattern, "*") {
		// Translate the robots pattern into an anchored regexp.
		regexPattern := regexp.QuoteMeta(pattern)
		regexPattern = strings.ReplaceAll(regexPattern, `\*`, ".*")
		// A trailing "$" means the pattern must match to the end of the path.
		if strings.HasSuffix(regexPattern, `\$`) {
			regexPattern = strings.TrimSuffix(regexPattern, `\$`) + "$"
		}
		re, err := regexp.Compile("^" + regexPattern)
		if err != nil {
			return false
		}
		return re.MatchString(path)
	}

	// Trailing "$" without wildcards: exact match.
	if strings.HasSuffix(pattern, "$") {
		return path == strings.TrimSuffix(pattern, "$")
	}

	// Plain pattern: prefix match.
	return strings.HasPrefix(path, pattern)
}

// ClearCache drops all cached robots.txt entries.
func (c *Checker) ClearCache() {
	c.mu.Lock()
	c.cache = make(map[string]*RobotsData)
	c.mu.Unlock()
}

// CacheStats returns the number of cached hosts and their names.
func (c *Checker) CacheStats() (count int, hosts []string) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	hosts = make([]string, 0, len(c.cache))
	for host := range c.cache {
		hosts = append(hosts, host)
	}
	return len(c.cache), hosts
}