breakpilot-lehrer/edu-search-service/internal/crawler/crawler.go

package crawler

import (
	"bufio"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
)

// Note: API client is in the same package (api_client.go)

// FetchResult contains the result of fetching a URL.
//
// Fetch returns a FetchResult even on failure; in that case Error is set and
// only the fields filled in before the failure are meaningful.
//
// Fields:
//   - URL: the URL exactly as it was passed to Fetch.
//   - CanonicalURL: the URL of the final request after any redirects.
//   - ContentType: the raw Content-Type response header.
//   - StatusCode: the HTTP status code of the final response.
//   - Body: the response body, read through a 20MB limit.
//   - ContentHash: hex-encoded SHA-256 of Body.
//   - FetchTime: the time Fetch was entered, not the time the response arrived.
//   - Error: the same error Fetch returns, or nil on success.
type FetchResult struct {
	URL          string
	CanonicalURL string
	ContentType  string
	StatusCode   int
	Body         []byte
	ContentHash  string
	FetchTime    time.Time
	Error        error
}

// Seed represents a URL to crawl together with its metadata.
//
// Seeds come either from the Backend API (LoadSeedsFromAPI, which copies every
// field from the API response) or from plain text files
// (LoadSeedsWithMetadata, which fills only URL, TrustBoost and MaxDepth).
// In both loaders a missing depth falls back to the crawler's default depth;
// file-based seeds get a TrustBoost of 0.5.
type Seed struct {
	URL        string
	TrustBoost float64
	Source     string // GOV, EDU, UNI, etc.
	Scope      string // FEDERAL, STATE, etc.
	State      string // BW, BY, etc. (optional)
	MaxDepth   int    // Custom crawl depth for this seed
	Category   string // Category name
}

// Crawler fetches URLs over HTTP with per-host rate limiting and a domain
// denylist.
//
// NOTE(review): the original summary also promised "robots.txt respect", but no
// robots.txt handling is visible in this part of the file — confirm where (or
// whether) it is implemented.
//
// Fields:
//   - userAgent: sent as the User-Agent header on every request made by Fetch.
//   - rateLimitPerSec: requests per second allowed per host; converted to a
//     minimum interval in waitForRateLimit.
//   - maxDepth: default crawl depth given to seeds that do not specify one.
//   - timeout: set to 30s by NewCrawler; the HTTP client carries its own copy
//     of the same value.
//   - client: shared HTTP client used by Fetch (30s timeout, redirect cap).
//   - denylist: set of lowercased entries filled by loadDenylist and consulted
//     by IsDenied.
//   - lastFetch: time of the most recent fetch per host, used for rate limiting.
//   - mu: held by waitForRateLimit while it reads and writes lastFetch.
//   - apiClient: nil until SetAPIClient is called; required by LoadSeedsFromAPI.
type Crawler struct {
	userAgent       string
	rateLimitPerSec float64
	maxDepth        int
	timeout         time.Duration
	client          *http.Client
	denylist        map[string]bool
	lastFetch       map[string]time.Time
	mu              sync.Mutex
	apiClient       *APIClient // API client for fetching seeds from Backend
}

// NewCrawler builds a Crawler that identifies itself with userAgent, allows
// rateLimitPerSec requests per second per host, and uses maxDepth as the
// default crawl depth for seeds without one.
//
// The embedded HTTP client times out after 30 seconds and gives up once a
// request has already been redirected five times. The denylist and the
// per-host fetch timestamps start out empty.
func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler {
	const (
		requestTimeout = 30 * time.Second
		maxRedirects   = 5
	)

	// limitRedirects is the client's redirect policy: follow until the chain
	// of previous requests reaches maxRedirects, then abort.
	limitRedirects := func(_ *http.Request, via []*http.Request) error {
		if len(via) < maxRedirects {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	httpClient := &http.Client{
		Timeout:       requestTimeout,
		CheckRedirect: limitRedirects,
	}

	return &Crawler{
		userAgent:       userAgent,
		rateLimitPerSec: rateLimitPerSec,
		maxDepth:        maxDepth,
		timeout:         requestTimeout,
		client:          httpClient,
		denylist:        make(map[string]bool),
		lastFetch:       make(map[string]time.Time),
	}
}

// SetAPIClient creates an API client for backendURL and installs it on the
// crawler. It must be called before LoadSeedsFromAPI; calling it again
// replaces the previous client.
func (c *Crawler) SetAPIClient(backendURL string) {
	client := NewAPIClient(backendURL)
	c.apiClient = client
}

// LoadSeedsFromAPI retrieves the seed list from the Backend API and converts
// each entry into a Seed.
//
// Entries whose depth is zero or negative receive the crawler's default
// maxDepth. An error is returned when no API client has been configured
// (see SetAPIClient) or when the API request itself fails; the latter wraps
// the underlying error.
func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) {
	if c.apiClient == nil {
		return nil, fmt.Errorf("API client not initialized - call SetAPIClient first")
	}

	resp, fetchErr := c.apiClient.FetchSeeds(ctx)
	if fetchErr != nil {
		return nil, fmt.Errorf("failed to fetch seeds from API: %w", fetchErr)
	}

	result := make([]Seed, 0, len(resp.Seeds))
	for _, item := range resp.Seeds {
		// Resolve the effective depth first so the Seed is built in one step.
		depth := item.Depth
		if depth <= 0 {
			depth = c.maxDepth
		}

		result = append(result, Seed{
			URL:        item.URL,
			TrustBoost: item.Trust,
			Source:     item.Source,
			Scope:      item.Scope,
			State:      item.State,
			MaxDepth:   depth,
			Category:   item.Category,
		})
	}

	log.Printf("Loaded %d seeds from API (exported at: %s)", len(result), resp.ExportedAt)
	return result, nil
}

// LoadSeeds loads seed URLs from the *.txt files in seedsDir (legacy method).
//
// A file whose name contains "denylist" is loaded into the crawler's denylist
// instead of being read as seeds. Files that cannot be read are logged and
// skipped, so the only error returned is a malformed glob pattern.
func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) {
	files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt"))
	if err != nil {
		return nil, err
	}

	var seeds []string
	for _, file := range files {
		// Look at the file name only: matching on the full path would treat
		// every seed file as a denylist whenever the directory path itself
		// happens to contain "denylist".
		if strings.Contains(filepath.Base(file), "denylist") {
			if err := c.loadDenylist(file); err != nil {
				log.Printf("Warning: Could not load denylist %s: %v", file, err)
			}
			continue
		}

		fileSeeds, err := c.loadSeedFile(file)
		if err != nil {
			log.Printf("Warning: Could not load seed file %s: %v", file, err)
			continue
		}
		seeds = append(seeds, fileSeeds...)
	}

	log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist))
	return seeds, nil
}

// LoadSeedsWithMetadata reads seeds from the files in seedsDir via LoadSeeds
// and wraps each URL in a Seed, so file-based seeds can flow through the same
// code paths as API-provided ones.
//
// Every resulting Seed gets a TrustBoost of 0.5 and the crawler's default
// maxDepth; the remaining metadata fields stay empty.
func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) {
	const defaultTrustBoost = 0.5

	rawURLs, err := c.LoadSeeds(seedsDir)
	if err != nil {
		return nil, err
	}

	seeds := make([]Seed, len(rawURLs))
	for i, rawURL := range rawURLs {
		seeds[i] = Seed{
			URL:        rawURL,
			TrustBoost: defaultTrustBoost,
			MaxDepth:   c.maxDepth,
		}
	}
	return seeds, nil
}

// loadSeedFile reads seed URLs from filename, one per line.
//
// Blank lines and lines starting with "#" are skipped. On every other line the
// first whitespace-separated token is taken as the URL and the rest of the line
// is ignored, so a trailing comment may be separated by spaces or tabs.
//
// If scanning fails part-way through, the URLs collected so far are returned
// together with the scanner error.
func (c *Crawler) loadSeedFile(filename string) ([]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var seeds []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// Fields splits on any run of whitespace; splitting on a single space
		// would leave a tab-separated comment glued to the URL.
		fields := strings.Fields(scanner.Text())
		if len(fields) == 0 || strings.HasPrefix(fields[0], "#") {
			continue
		}
		seeds = append(seeds, fields[0])
	}
	return seeds, scanner.Err()
}

// loadDenylist adds every entry found in filename to the crawler's denylist.
//
// Each line is trimmed; blank lines and lines beginning with "#" are ignored.
// The remaining lines are stored lowercased. Entries already present in the
// denylist are kept. The returned error is either the open failure or the
// scanner's error.
func (c *Crawler) loadDenylist(filename string) error {
	f, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer f.Close()

	lines := bufio.NewScanner(f)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		isComment := strings.HasPrefix(entry, "#")
		if entry != "" && !isComment {
			c.denylist[strings.ToLower(entry)] = true
		}
	}
	return lines.Err()
}

// IsDenied reports whether urlStr points at a denylisted domain.
//
// A URL is denied when its host, or any parent domain of its host down to the
// registrable two-label level (the bare top-level label is never checked), is
// in the denylist. Matching is case-insensitive and ignores the port, so
// "blocked.example:8443" is covered by a "blocked.example" entry. URLs that
// cannot be parsed are treated as denied.
func (c *Crawler) IsDenied(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return true
	}

	// Exact host[:port] lookup keeps port-scoped denylist entries working.
	if c.denylist[strings.ToLower(u.Host)] {
		return true
	}

	// Hostname strips the port (and IPv6 brackets); matching on u.Host alone
	// would let any non-default port bypass the denylist.
	host := strings.ToLower(u.Hostname())
	if c.denylist[host] {
		return true
	}

	// Walk up the parent domains: a.b.example.com -> b.example.com -> example.com.
	labels := strings.Split(host, ".")
	for i := 1; i < len(labels)-1; i++ {
		if c.denylist[strings.Join(labels[i:], ".")] {
			return true
		}
	}

	return false
}

// Fetch downloads a single URL, honoring the denylist and the per-host rate
// limit.
//
// ctx bounds the HTTP request. urlStr is the URL to fetch.
//
// A FetchResult is always returned. On failure its Error field holds the same
// error that is returned as the second value, and only the fields populated
// before the failure are set. Failures include: an unparsable URL, a
// denylisted domain (either the requested one or the one reached through
// redirects), a transport error, any status other than 200, and a body read
// error.
//
// At most 20MB of the body are read; larger bodies are truncated without an
// error and ContentHash covers only the bytes that were read.
func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) {
	const maxBodyBytes = 20 * 1024 * 1024

	result := &FetchResult{
		URL:       urlStr,
		FetchTime: time.Now(),
	}

	// fail records err on the result and returns both, as callers expect.
	fail := func(err error) (*FetchResult, error) {
		result.Error = err
		return result, err
	}

	// Parse before consulting the denylist: IsDenied reports unparsable URLs
	// as denied, which would otherwise hide the real parse error behind
	// "domain denied".
	u, err := url.Parse(urlStr)
	if err != nil {
		return fail(err)
	}

	if c.IsDenied(urlStr) {
		return fail(fmt.Errorf("domain denied"))
	}

	// Rate limiting per domain
	c.waitForRateLimit(u.Host)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return fail(err)
	}

	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml")
	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")

	resp, err := c.client.Do(req)
	if err != nil {
		return fail(err)
	}
	defer resp.Body.Close()

	result.StatusCode = resp.StatusCode
	result.ContentType = resp.Header.Get("Content-Type")
	// resp.Request is the last request issued, i.e. the redirect target.
	result.CanonicalURL = resp.Request.URL.String()

	// The client follows redirects on its own, so the content may come from a
	// different host than the one checked above; enforce the denylist on the
	// final location as well.
	if result.CanonicalURL != urlStr && c.IsDenied(result.CanonicalURL) {
		return fail(fmt.Errorf("domain denied"))
	}

	if resp.StatusCode != http.StatusOK {
		return fail(fmt.Errorf("HTTP %d", resp.StatusCode))
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return fail(err)
	}
	result.Body = body

	// Content hash is used to detect identical documents.
	sum := sha256.Sum256(body)
	result.ContentHash = hex.EncodeToString(sum[:])

	return result, nil
}

// waitForRateLimit blocks until the next request to host is allowed under the
// configured per-host rate (rateLimitPerSec requests per second).
//
// The caller's time slot is reserved while holding the mutex, but the actual
// wait happens after the mutex is released. Sleeping with the lock held would
// stall fetches to every other host as well, turning the per-host limit into a
// global one. Concurrent callers for the same host receive consecutive slots
// spaced one interval apart.
//
// A rate of zero or less means "no limit": the call records the fetch time and
// returns immediately instead of dividing by zero.
func (c *Crawler) waitForRateLimit(host string) {
	var minInterval time.Duration
	if c.rateLimitPerSec > 0 {
		minInterval = time.Duration(float64(time.Second) / c.rateLimitPerSec)
	}

	c.mu.Lock()
	now := time.Now()
	slot := now
	if last, ok := c.lastFetch[host]; ok {
		if earliest := last.Add(minInterval); earliest.After(now) {
			slot = earliest
		}
	}
	// Store the reserved slot, not the current time, so the next caller for
	// this host queues up behind it.
	c.lastFetch[host] = slot
	c.mu.Unlock()

	if wait := slot.Sub(now); wait > 0 {
		time.Sleep(wait)
	}
}

// ExtractDomain returns the host component of urlStr exactly as it appears in
// the URL (original letter case, including a port if one is given).
// It returns "" when urlStr cannot be parsed or has no host.
func ExtractDomain(urlStr string) string {
	if parsed, err := url.Parse(urlStr); err == nil {
		return parsed.Host
	}
	return ""
}

// GenerateDocID returns a fresh document ID: the string form of a newly
// generated UUID from github.com/google/uuid.
func GenerateDocID() string {
	id := uuid.New()
	return id.String()
}

// NormalizeURL returns a canonical form of urlStr for deduplication.
//
// The following is applied:
//   - one trailing slash is removed from the path;
//   - the fragment ("#...") is dropped, since it is never sent to the server
//     and therefore cannot change the fetched document;
//   - common tracking parameters (utm_*, ref, source, fbclid, gclid, matched
//     case-insensitively) are removed;
//   - the remaining query parameters are re-encoded sorted by key;
//   - the host is lowercased.
//
// If urlStr cannot be parsed it is returned unchanged.
func NormalizeURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// Trim the decoded and the encoded path together. If only Path changes,
	// RawPath no longer matches it and String() re-encodes from Path, which
	// would silently turn an escaped "%2F" into a real path separator.
	u.Path = strings.TrimSuffix(u.Path, "/")
	u.RawPath = strings.TrimSuffix(u.RawPath, "/")

	u.Fragment = ""
	u.RawFragment = ""

	// Deleting entries while ranging over a map is safe in Go.
	q := u.Query()
	for key := range q {
		switch lowerKey := strings.ToLower(key); {
		case strings.HasPrefix(lowerKey, "utm_"),
			lowerKey == "ref",
			lowerKey == "source",
			lowerKey == "fbclid",
			lowerKey == "gclid":
			q.Del(key)
		}
	}
	u.RawQuery = q.Encode()

	u.Host = strings.ToLower(u.Host)

	return u.String()
}