feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
282
edu-search-service/internal/robots/robots.go
Normal file
282
edu-search-service/internal/robots/robots.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Checker handles robots.txt parsing and checking.
//
// It fetches robots.txt at most once per host within cacheTTL, caches the
// parsed result, and answers allow/deny and crawl-delay queries against the
// cached data. Safe for concurrent use: all cache access is guarded by mu.
type Checker struct {
	mu        sync.RWMutex            // guards cache
	cache     map[string]*RobotsData  // parsed robots.txt results keyed by host
	userAgent string                  // sent as User-Agent when fetching and matched against User-agent groups
	client    *http.Client            // HTTP client for robots.txt fetches (timeout set in NewChecker)
	cacheTTL  time.Duration           // how long a cached RobotsData entry stays fresh
}
|
||||
|
||||
// RobotsData holds parsed robots.txt data for a host.
//
// A non-nil Error means the fetch or HTTP exchange failed; callers treat
// such entries leniently (crawling allowed, no crawl delay).
type RobotsData struct {
	DisallowPatterns []string  // Disallow rules collected from matching User-agent groups
	AllowPatterns    []string  // Allow rules collected from matching User-agent groups
	CrawlDelay       int       // seconds; 0 when the host specifies no Crawl-delay
	FetchedAt        time.Time // when this entry was fetched; used for TTL expiry
	Error            error     // non-nil if robots.txt could not be retrieved (404 is NOT an error)
}
|
||||
|
||||
// NewChecker creates a new robots.txt checker
|
||||
func NewChecker(userAgent string) *Checker {
|
||||
return &Checker{
|
||||
cache: make(map[string]*RobotsData),
|
||||
userAgent: userAgent,
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
// IsAllowed checks if a URL is allowed to be crawled
|
||||
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
host := u.Host
|
||||
path := u.Path
|
||||
if path == "" {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
// Get or fetch robots.txt
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
|
||||
if err != nil {
|
||||
// If we can't fetch robots.txt, assume allowed (be lenient)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// If there was an error fetching robots.txt, allow crawling
|
||||
if robotsData.Error != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Check allow rules first (they take precedence)
|
||||
for _, pattern := range robotsData.AllowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Check disallow rules
|
||||
for _, pattern := range robotsData.DisallowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If no rules match, allow
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// GetCrawlDelay returns the crawl delay for a host
|
||||
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
|
||||
if err != nil || robotsData.Error != nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return robotsData.CrawlDelay, nil
|
||||
}
|
||||
|
||||
// getRobotsData fetches and caches robots.txt for a host
|
||||
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
|
||||
c.mu.RLock()
|
||||
data, exists := c.cache[host]
|
||||
c.mu.RUnlock()
|
||||
|
||||
// Return cached data if not expired
|
||||
if exists && time.Since(data.FetchedAt) < c.cacheTTL {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
data = c.fetchRobots(ctx, robotsURL)
|
||||
|
||||
// Cache the result
|
||||
c.mu.Lock()
|
||||
c.cache[host] = data
|
||||
c.mu.Unlock()
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// fetchRobots fetches and parses robots.txt
|
||||
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
|
||||
data := &RobotsData{
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If robots.txt doesn't exist, allow everything
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return data
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return data
|
||||
}
|
||||
|
||||
// Parse the robots.txt
|
||||
c.parseRobotsTxt(data, resp.Body)
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// parseRobotsTxt parses robots.txt content from reader into data.
//
// It runs a small state machine over the lines: a "User-agent" line opens a
// section, and subsequent Disallow/Allow/Crawl-delay lines are collected
// while the section matches either our user agent or the "*" wildcard.
//
// NOTE(review): rules from a matching specific section and from "*" sections
// are merged into the same pattern lists. RFC 9309 says a matching specific
// group should make "*" groups irrelevant — this parser is stricter (it may
// disallow more than required), which is a safe direction; confirm intended.
// NOTE(review): scanner.Err() is not checked, so a read error silently
// truncates the rule set.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	scanner := bufio.NewScanner(reader)

	// Track which user-agent section we're in
	inRelevantSection := false
	inWildcardSection := false

	// Normalize our user agent for matching
	ourAgent := strings.ToLower(c.userAgent)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Skip empty lines and comments
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on first colon; lines without a colon are ignored.
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		// Directive names are case-insensitive per convention.
		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		// Remove inline comments
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		switch directive {
		case "user-agent":
			agent := strings.ToLower(value)
			if agent == "*" {
				inWildcardSection = true
				inRelevantSection = false
			} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
				// Section targets us (substring match against our UA, or a
				// token containing our known bot names).
				// NOTE(review): inWildcardSection is deliberately NOT reset
				// here, so an earlier "*" section stays active — confirm.
				inRelevantSection = true
			} else {
				// Section for some other crawler: stop collecting.
				inRelevantSection = false
				inWildcardSection = false
			}

		case "disallow":
			// An empty Disallow value means "allow all" and is skipped.
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.DisallowPatterns = append(data.DisallowPatterns, value)
			}

		case "allow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.AllowPatterns = append(data.AllowPatterns, value)
			}

		case "crawl-delay":
			if inRelevantSection || inWildcardSection {
				var delay int
				// Sscanf error intentionally ignored; delay stays 0 on a
				// non-numeric value and the guard below filters it out.
				fmt.Sscanf(value, "%d", &delay)
				if delay > 0 {
					data.CrawlDelay = delay
				}
			}
		}
	}
}
|
||||
|
||||
// matchPattern matches a URL path against a robots.txt pattern.
//
// Supported syntax: "*" matches any character sequence, a trailing "$"
// anchors the pattern to the end of the path, and anything else is a
// plain prefix match. An empty pattern matches nothing, and a pattern
// that produces an invalid regex is treated as non-matching.
func matchPattern(pattern, path string) bool {
	// Empty pattern matches nothing.
	if pattern == "" {
		return false
	}

	// Wildcard patterns are compiled to an anchored-prefix regex.
	if strings.Contains(pattern, "*") {
		expr := strings.ReplaceAll(regexp.QuoteMeta(pattern), `\*`, ".*")

		// A trailing "$" anchors the match to the end of the path.
		if strings.HasSuffix(expr, `\$`) {
			expr = strings.TrimSuffix(expr, `\$`) + "$"
		}

		re, err := regexp.Compile("^" + expr)
		if err != nil {
			return false
		}
		return re.MatchString(path)
	}

	// Without wildcards, a trailing "$" demands an exact path match.
	if strings.HasSuffix(pattern, "$") {
		return path == strings.TrimSuffix(pattern, "$")
	}

	// Plain prefix match.
	return strings.HasPrefix(path, pattern)
}
|
||||
|
||||
// ClearCache clears the robots.txt cache
|
||||
func (c *Checker) ClearCache() {
|
||||
c.mu.Lock()
|
||||
c.cache = make(map[string]*RobotsData)
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
// CacheStats returns cache statistics
|
||||
func (c *Checker) CacheStats() (count int, hosts []string) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
for host := range c.cache {
|
||||
hosts = append(hosts, host)
|
||||
}
|
||||
return len(c.cache), hosts
|
||||
}
|
||||
Reference in New Issue
Block a user