feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
282
edu-search-service/internal/robots/robots.go
Normal file
282
edu-search-service/internal/robots/robots.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package robots
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Checker handles robots.txt parsing and checking.
//
// It fetches robots.txt on demand, caches the parsed result per host
// for cacheTTL, and answers allow/disallow queries for one configured
// user agent. Safe for concurrent use via mu.
type Checker struct {
	mu        sync.RWMutex           // guards cache
	cache     map[string]*RobotsData // parsed robots.txt keyed by host
	userAgent string                 // sent on fetches and matched against user-agent sections
	client    *http.Client           // HTTP client used to fetch robots.txt (see NewChecker for timeout)
	cacheTTL  time.Duration          // how long a cached RobotsData entry stays valid
}
|
||||
|
||||
// RobotsData holds parsed robots.txt data for a host.
type RobotsData struct {
	DisallowPatterns []string  // Disallow rules applicable to our user agent (or the * section)
	AllowPatterns    []string  // Allow rules applicable to our user agent (or the * section)
	CrawlDelay       int       // seconds; 0 when no crawl-delay directive was seen
	FetchedAt        time.Time // when this entry was fetched; used for cache TTL expiry
	Error            error     // non-nil when the fetch failed; callers treat this as "allow all"
}
|
||||
|
||||
// NewChecker creates a new robots.txt checker
|
||||
func NewChecker(userAgent string) *Checker {
|
||||
return &Checker{
|
||||
cache: make(map[string]*RobotsData),
|
||||
userAgent: userAgent,
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
cacheTTL: 24 * time.Hour, // Cache robots.txt for 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
// IsAllowed checks if a URL is allowed to be crawled
|
||||
func (c *Checker) IsAllowed(ctx context.Context, urlStr string) (bool, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
host := u.Host
|
||||
path := u.Path
|
||||
if path == "" {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
// Get or fetch robots.txt
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, host)
|
||||
if err != nil {
|
||||
// If we can't fetch robots.txt, assume allowed (be lenient)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// If there was an error fetching robots.txt, allow crawling
|
||||
if robotsData.Error != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Check allow rules first (they take precedence)
|
||||
for _, pattern := range robotsData.AllowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Check disallow rules
|
||||
for _, pattern := range robotsData.DisallowPatterns {
|
||||
if matchPattern(pattern, path) {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If no rules match, allow
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// GetCrawlDelay returns the crawl delay for a host
|
||||
func (c *Checker) GetCrawlDelay(ctx context.Context, urlStr string) (int, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
robotsData, err := c.getRobotsData(ctx, u.Scheme, u.Host)
|
||||
if err != nil || robotsData.Error != nil {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return robotsData.CrawlDelay, nil
|
||||
}
|
||||
|
||||
// getRobotsData fetches and caches robots.txt for a host
|
||||
func (c *Checker) getRobotsData(ctx context.Context, scheme, host string) (*RobotsData, error) {
|
||||
c.mu.RLock()
|
||||
data, exists := c.cache[host]
|
||||
c.mu.RUnlock()
|
||||
|
||||
// Return cached data if not expired
|
||||
if exists && time.Since(data.FetchedAt) < c.cacheTTL {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Fetch robots.txt
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
data = c.fetchRobots(ctx, robotsURL)
|
||||
|
||||
// Cache the result
|
||||
c.mu.Lock()
|
||||
c.cache[host] = data
|
||||
c.mu.Unlock()
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// fetchRobots fetches and parses robots.txt
|
||||
func (c *Checker) fetchRobots(ctx context.Context, robotsURL string) *RobotsData {
|
||||
data := &RobotsData{
|
||||
FetchedAt: time.Now(),
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
data.Error = err
|
||||
return data
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If robots.txt doesn't exist, allow everything
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return data
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
data.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return data
|
||||
}
|
||||
|
||||
// Parse the robots.txt
|
||||
c.parseRobotsTxt(data, resp.Body)
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// parseRobotsTxt parses robots.txt content from reader into data.
//
// It walks the file line by line, tracking whether the current
// user-agent section applies to this crawler: either the wildcard
// section ("User-agent: *") or a section whose agent token appears in
// our user agent string, or whose token contains "breakpilot" or
// "edubot" (this crawler's agent names). Disallow/Allow/Crawl-delay
// directives are recorded only while inside an applicable section.
//
// NOTE(review): when a specific matching section starts,
// inWildcardSection is NOT cleared, so earlier wildcard rules keep
// being merged with the specific section's rules — confirm this is
// intended (RFC 9309 uses only the most specific matching group).
// NOTE(review): scanner.Err() is never checked, so a read error
// silently truncates the rule set.
func (c *Checker) parseRobotsTxt(data *RobotsData, reader io.Reader) {
	scanner := bufio.NewScanner(reader)

	// Track which user-agent section we're in.
	inRelevantSection := false
	inWildcardSection := false

	// Normalize our user agent for matching.
	ourAgent := strings.ToLower(c.userAgent)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Skip empty lines and comments.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split on first colon: "directive: value".
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		directive := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		// Remove inline comments ("Disallow: /x # note").
		if idx := strings.Index(value, "#"); idx >= 0 {
			value = strings.TrimSpace(value[:idx])
		}

		switch directive {
		case "user-agent":
			agent := strings.ToLower(value)
			if agent == "*" {
				inWildcardSection = true
				inRelevantSection = false
			} else if strings.Contains(ourAgent, agent) || strings.Contains(agent, "breakpilot") || strings.Contains(agent, "edubot") {
				inRelevantSection = true
			} else {
				// Section for some other crawler: stop recording.
				inRelevantSection = false
				inWildcardSection = false
			}

		case "disallow":
			// An empty Disallow value means "allow everything"; skip it.
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.DisallowPatterns = append(data.DisallowPatterns, value)
			}

		case "allow":
			if value != "" && (inRelevantSection || inWildcardSection) {
				data.AllowPatterns = append(data.AllowPatterns, value)
			}

		case "crawl-delay":
			if inRelevantSection || inWildcardSection {
				var delay int
				// Sscanf error deliberately ignored: delay stays 0 on
				// unparsable values and is then not recorded below.
				fmt.Sscanf(value, "%d", &delay)
				if delay > 0 {
					data.CrawlDelay = delay
				}
			}
		}
	}
}
|
||||
|
||||
// matchPattern reports whether a URL path matches a robots.txt rule
// pattern. Plain patterns are prefix matches; "*" matches any run of
// characters; a trailing "$" anchors the match to the end of the
// path. An empty pattern never matches anything.
func matchPattern(pattern, path string) bool {
	if len(pattern) == 0 {
		return false
	}

	// Wildcard patterns are translated into a start-anchored regexp.
	if strings.ContainsRune(pattern, '*') {
		expr := strings.ReplaceAll(regexp.QuoteMeta(pattern), `\*`, ".*")

		// A trailing "$" in the rule becomes an end anchor.
		if strings.HasSuffix(expr, `\$`) {
			expr = strings.TrimSuffix(expr, `\$`) + "$"
		}

		re, err := regexp.Compile("^" + expr)
		if err != nil {
			return false
		}
		return re.MatchString(path)
	}

	// No wildcards but a trailing "$": the path must equal the rule.
	if strings.HasSuffix(pattern, "$") {
		return path == strings.TrimSuffix(pattern, "$")
	}

	// Otherwise a simple prefix match.
	return strings.HasPrefix(path, pattern)
}
|
||||
|
||||
// ClearCache clears the robots.txt cache
|
||||
func (c *Checker) ClearCache() {
|
||||
c.mu.Lock()
|
||||
c.cache = make(map[string]*RobotsData)
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
// CacheStats returns cache statistics
|
||||
func (c *Checker) CacheStats() (count int, hosts []string) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
for host := range c.cache {
|
||||
hosts = append(hosts, host)
|
||||
}
|
||||
return len(c.cache), hosts
|
||||
}
|
||||
Reference in New Issue
Block a user