package crawler import ( "bufio" "context" "crypto/sha256" "encoding/hex" "fmt" "io" "log" "net/http" "net/url" "os" "path/filepath" "strings" "sync" "time" "github.com/google/uuid" ) // Note: API client is in the same package (api_client.go) // FetchResult contains the result of fetching a URL type FetchResult struct { URL string CanonicalURL string ContentType string StatusCode int Body []byte ContentHash string FetchTime time.Time Error error } // Seed represents a URL to crawl with metadata type Seed struct { URL string TrustBoost float64 Source string // GOV, EDU, UNI, etc. Scope string // FEDERAL, STATE, etc. State string // BW, BY, etc. (optional) MaxDepth int // Custom crawl depth for this seed Category string // Category name } // Crawler handles URL fetching with rate limiting and robots.txt respect type Crawler struct { userAgent string rateLimitPerSec float64 maxDepth int timeout time.Duration client *http.Client denylist map[string]bool lastFetch map[string]time.Time mu sync.Mutex apiClient *APIClient // API client for fetching seeds from Backend } // NewCrawler creates a new crawler instance func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler { return &Crawler{ userAgent: userAgent, rateLimitPerSec: rateLimitPerSec, maxDepth: maxDepth, timeout: 30 * time.Second, client: &http.Client{ Timeout: 30 * time.Second, CheckRedirect: func(req *http.Request, via []*http.Request) error { if len(via) >= 5 { return fmt.Errorf("too many redirects") } return nil }, }, denylist: make(map[string]bool), lastFetch: make(map[string]time.Time), } } // SetAPIClient sets the API client for fetching seeds from Backend func (c *Crawler) SetAPIClient(backendURL string) { c.apiClient = NewAPIClient(backendURL) } // LoadSeedsFromAPI fetches seeds from the Backend API func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) { if c.apiClient == nil { return nil, fmt.Errorf("API client not initialized - call SetAPIClient first") } response, err := c.apiClient.FetchSeeds(ctx) if err != nil { return nil, fmt.Errorf("failed to fetch seeds from API: %w", err) } seeds := make([]Seed, 0, len(response.Seeds)) for _, apiSeed := range response.Seeds { seed := Seed{ URL: apiSeed.URL, TrustBoost: apiSeed.Trust, Source: apiSeed.Source, Scope: apiSeed.Scope, State: apiSeed.State, MaxDepth: apiSeed.Depth, Category: apiSeed.Category, } // Use default depth if not specified if seed.MaxDepth <= 0 { seed.MaxDepth = c.maxDepth } seeds = append(seeds, seed) } log.Printf("Loaded %d seeds from API (exported at: %s)", len(seeds), response.ExportedAt) return seeds, nil } // LoadSeeds loads seed URLs from files in a directory (legacy method) func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) { var seeds []string files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt")) if err != nil { return nil, err } for _, file := range files { if strings.Contains(file, "denylist") { // Load denylist if err := c.loadDenylist(file); err != nil { log.Printf("Warning: Could not load denylist %s: %v", file, err) } continue } fileSeeds, err := c.loadSeedFile(file) if err != nil { log.Printf("Warning: Could not load seed file %s: %v", file, err) continue } seeds = append(seeds, fileSeeds...) } log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist)) return seeds, nil } // LoadSeedsWithMetadata loads seeds from files and converts to Seed struct // This provides backward compatibility while allowing metadata func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) { urlList, err := c.LoadSeeds(seedsDir) if err != nil { return nil, err } seeds := make([]Seed, 0, len(urlList)) for _, url := range urlList { seeds = append(seeds, Seed{ URL: url, TrustBoost: 0.5, // Default trust boost MaxDepth: c.maxDepth, }) } return seeds, nil } func (c *Crawler) loadSeedFile(filename string) ([]string, error) { file, err := os.Open(filename) if err != nil { return nil, err } defer file.Close() var seeds []string scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) // Skip comments and empty lines if line == "" || strings.HasPrefix(line, "#") { continue } // Extract URL (ignore comments after URL) parts := strings.SplitN(line, " ", 2) urlStr := strings.TrimSpace(parts[0]) if urlStr != "" { seeds = append(seeds, urlStr) } } return seeds, scanner.Err() } func (c *Crawler) loadDenylist(filename string) error { file, err := os.Open(filename) if err != nil { return err } defer file.Close() scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line == "" || strings.HasPrefix(line, "#") { continue } c.denylist[strings.ToLower(line)] = true } return scanner.Err() } // IsDenied checks if a domain is in the denylist func (c *Crawler) IsDenied(urlStr string) bool { u, err := url.Parse(urlStr) if err != nil { return true } host := strings.ToLower(u.Host) // Check exact match if c.denylist[host] { return true } // Check parent domains parts := strings.Split(host, ".") for i := 1; i < len(parts)-1; i++ { parent := strings.Join(parts[i:], ".") if c.denylist[parent] { return true } } return false } // Fetch fetches a single URL with rate limiting func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) { result := &FetchResult{ URL: urlStr, FetchTime: time.Now(), } // Check denylist if c.IsDenied(urlStr) { result.Error = fmt.Errorf("domain denied") return result, result.Error } // Parse URL u, err := url.Parse(urlStr) if err != nil { result.Error = err return result, err } // Rate limiting per domain c.waitForRateLimit(u.Host) // Create request req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) if err != nil { result.Error = err return result, err } req.Header.Set("User-Agent", c.userAgent) req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml") req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8") // Execute request resp, err := c.client.Do(req) if err != nil { result.Error = err return result, err } defer resp.Body.Close() result.StatusCode = resp.StatusCode result.ContentType = resp.Header.Get("Content-Type") result.CanonicalURL = resp.Request.URL.String() if resp.StatusCode != http.StatusOK { result.Error = fmt.Errorf("HTTP %d", resp.StatusCode) return result, result.Error } // Read body (limit to 20MB) limitedReader := io.LimitReader(resp.Body, 20*1024*1024) body, err := io.ReadAll(limitedReader) if err != nil { result.Error = err return result, err } result.Body = body // Calculate content hash hash := sha256.Sum256(body) result.ContentHash = hex.EncodeToString(hash[:]) return result, nil } func (c *Crawler) waitForRateLimit(host string) { c.mu.Lock() defer c.mu.Unlock() minInterval := time.Duration(float64(time.Second) / c.rateLimitPerSec) if last, ok := c.lastFetch[host]; ok { elapsed := time.Since(last) if elapsed < minInterval { time.Sleep(minInterval - elapsed) } } c.lastFetch[host] = time.Now() } // ExtractDomain extracts the domain from a URL func ExtractDomain(urlStr string) string { u, err := url.Parse(urlStr) if err != nil { return "" } return u.Host } // GenerateDocID generates a unique document ID func GenerateDocID() string { return uuid.New().String() } // NormalizeURL normalizes a URL for deduplication func NormalizeURL(urlStr string) string { u, err := url.Parse(urlStr) if err != nil { return urlStr } // Remove trailing slashes u.Path = strings.TrimSuffix(u.Path, "/") // Remove common tracking parameters q := u.Query() for key := range q { lowerKey := strings.ToLower(key) if strings.HasPrefix(lowerKey, "utm_") || lowerKey == "ref" || lowerKey == "source" || lowerKey == "fbclid" || lowerKey == "gclid" { q.Del(key) } } u.RawQuery = q.Encode() // Lowercase host u.Host = strings.ToLower(u.Host) return u.String() }