feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
364
edu-search-service/internal/crawler/crawler.go
Normal file
364
edu-search-service/internal/crawler/crawler.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// Note: API client is in the same package (api_client.go)
|
||||
|
||||
// FetchResult contains the result of fetching a URL.
type FetchResult struct {
	URL          string    // the URL as originally requested
	CanonicalURL string    // final URL after following redirects
	ContentType  string    // Content-Type response header
	StatusCode   int       // HTTP status code of the response
	Body         []byte    // response body (capped at 20MB by Fetch)
	ContentHash  string    // hex-encoded SHA-256 of Body, used for deduplication
	FetchTime    time.Time // time at which the fetch was started
	Error        error     // non-nil if the fetch failed at any stage
}
|
||||
|
||||
// Seed represents a URL to crawl with metadata.
type Seed struct {
	URL        string  // seed URL to start crawling from
	TrustBoost float64 // trust weighting for this seed (file-based seeds default to 0.5)
	Source     string  // GOV, EDU, UNI, etc.
	Scope      string  // FEDERAL, STATE, etc.
	State      string  // BW, BY, etc. (optional)
	MaxDepth   int     // Custom crawl depth for this seed (0 means "use crawler default")
	Category   string  // Category name
}
|
||||
|
||||
// Crawler handles URL fetching with per-domain rate limiting and a
// domain denylist.
//
// NOTE(review): the original comment also claimed "robots.txt respect",
// but no robots.txt handling is visible in this file — confirm whether
// that happens elsewhere or is simply missing.
type Crawler struct {
	userAgent       string               // User-Agent header sent with every request
	rateLimitPerSec float64              // maximum requests per second per host
	maxDepth        int                  // default crawl depth when a seed has none
	timeout         time.Duration        // HTTP timeout (mirrors client.Timeout)
	client          *http.Client         // shared HTTP client (30s timeout, max 5 redirects)
	denylist        map[string]bool      // lowercased domains that must not be crawled
	lastFetch       map[string]time.Time // last fetch time per host, for rate limiting
	mu              sync.Mutex           // guards lastFetch
	apiClient       *APIClient           // API client for fetching seeds from Backend
}
|
||||
|
||||
// NewCrawler creates a new crawler instance
|
||||
func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler {
|
||||
return &Crawler{
|
||||
userAgent: userAgent,
|
||||
rateLimitPerSec: rateLimitPerSec,
|
||||
maxDepth: maxDepth,
|
||||
timeout: 30 * time.Second,
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 5 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
denylist: make(map[string]bool),
|
||||
lastFetch: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// SetAPIClient sets the API client used by LoadSeedsFromAPI to fetch
// seeds from the Backend. backendURL is the base URL of the Backend API.
func (c *Crawler) SetAPIClient(backendURL string) {
	c.apiClient = NewAPIClient(backendURL)
}
|
||||
|
||||
// LoadSeedsFromAPI fetches seeds from the Backend API
|
||||
func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) {
|
||||
if c.apiClient == nil {
|
||||
return nil, fmt.Errorf("API client not initialized - call SetAPIClient first")
|
||||
}
|
||||
|
||||
response, err := c.apiClient.FetchSeeds(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch seeds from API: %w", err)
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(response.Seeds))
|
||||
for _, apiSeed := range response.Seeds {
|
||||
seed := Seed{
|
||||
URL: apiSeed.URL,
|
||||
TrustBoost: apiSeed.Trust,
|
||||
Source: apiSeed.Source,
|
||||
Scope: apiSeed.Scope,
|
||||
State: apiSeed.State,
|
||||
MaxDepth: apiSeed.Depth,
|
||||
Category: apiSeed.Category,
|
||||
}
|
||||
// Use default depth if not specified
|
||||
if seed.MaxDepth <= 0 {
|
||||
seed.MaxDepth = c.maxDepth
|
||||
}
|
||||
seeds = append(seeds, seed)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from API (exported at: %s)", len(seeds), response.ExportedAt)
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeeds loads seed URLs from files in a directory (legacy method)
|
||||
func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) {
|
||||
var seeds []string
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if strings.Contains(file, "denylist") {
|
||||
// Load denylist
|
||||
if err := c.loadDenylist(file); err != nil {
|
||||
log.Printf("Warning: Could not load denylist %s: %v", file, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
fileSeeds, err := c.loadSeedFile(file)
|
||||
if err != nil {
|
||||
log.Printf("Warning: Could not load seed file %s: %v", file, err)
|
||||
continue
|
||||
}
|
||||
seeds = append(seeds, fileSeeds...)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist))
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeedsWithMetadata loads seeds from files and converts to Seed struct
|
||||
// This provides backward compatibility while allowing metadata
|
||||
func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) {
|
||||
urlList, err := c.LoadSeeds(seedsDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(urlList))
|
||||
for _, url := range urlList {
|
||||
seeds = append(seeds, Seed{
|
||||
URL: url,
|
||||
TrustBoost: 0.5, // Default trust boost
|
||||
MaxDepth: c.maxDepth,
|
||||
})
|
||||
}
|
||||
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) loadSeedFile(filename string) ([]string, error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var seeds []string
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
// Skip comments and empty lines
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
// Extract URL (ignore comments after URL)
|
||||
parts := strings.SplitN(line, " ", 2)
|
||||
urlStr := strings.TrimSpace(parts[0])
|
||||
if urlStr != "" {
|
||||
seeds = append(seeds, urlStr)
|
||||
}
|
||||
}
|
||||
return seeds, scanner.Err()
|
||||
}
|
||||
|
||||
func (c *Crawler) loadDenylist(filename string) error {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
c.denylist[strings.ToLower(line)] = true
|
||||
}
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// IsDenied checks if a domain is in the denylist
|
||||
func (c *Crawler) IsDenied(urlStr string) bool {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
host := strings.ToLower(u.Host)
|
||||
|
||||
// Check exact match
|
||||
if c.denylist[host] {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check parent domains
|
||||
parts := strings.Split(host, ".")
|
||||
for i := 1; i < len(parts)-1; i++ {
|
||||
parent := strings.Join(parts[i:], ".")
|
||||
if c.denylist[parent] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// Fetch fetches a single URL with rate limiting
|
||||
func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) {
|
||||
result := &FetchResult{
|
||||
URL: urlStr,
|
||||
FetchTime: time.Now(),
|
||||
}
|
||||
|
||||
// Check denylist
|
||||
if c.IsDenied(urlStr) {
|
||||
result.Error = fmt.Errorf("domain denied")
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Parse URL
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
// Rate limiting per domain
|
||||
c.waitForRateLimit(u.Host)
|
||||
|
||||
// Create request
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml")
|
||||
req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")
|
||||
|
||||
// Execute request
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
result.StatusCode = resp.StatusCode
|
||||
result.ContentType = resp.Header.Get("Content-Type")
|
||||
result.CanonicalURL = resp.Request.URL.String()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
result.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Read body (limit to 20MB)
|
||||
limitedReader := io.LimitReader(resp.Body, 20*1024*1024)
|
||||
body, err := io.ReadAll(limitedReader)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
result.Body = body
|
||||
|
||||
// Calculate content hash
|
||||
hash := sha256.Sum256(body)
|
||||
result.ContentHash = hex.EncodeToString(hash[:])
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) waitForRateLimit(host string) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
minInterval := time.Duration(float64(time.Second) / c.rateLimitPerSec)
|
||||
|
||||
if last, ok := c.lastFetch[host]; ok {
|
||||
elapsed := time.Since(last)
|
||||
if elapsed < minInterval {
|
||||
time.Sleep(minInterval - elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
c.lastFetch[host] = time.Now()
|
||||
}
|
||||
|
||||
// ExtractDomain returns the host portion (including any port) of urlStr,
// or the empty string if the URL cannot be parsed.
func ExtractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	return parsed.Host
}
|
||||
|
||||
// GenerateDocID generates a unique document ID: a freshly generated UUID
// string (via github.com/google/uuid).
func GenerateDocID() string {
	return uuid.New().String()
}
|
||||
|
||||
// NormalizeURL normalizes a URL for deduplication: trailing path slashes
// are trimmed, common tracking parameters (utm_*, ref, source, fbclid,
// gclid) are removed, the fragment is dropped, and the host is lowercased.
// Unparseable URLs are returned unchanged.
func NormalizeURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// Remove trailing slashes.
	u.Path = strings.TrimSuffix(u.Path, "/")

	// Remove common tracking parameters. Deleting while ranging over a
	// map is safe in Go.
	q := u.Query()
	for key := range q {
		lowerKey := strings.ToLower(key)
		if strings.HasPrefix(lowerKey, "utm_") ||
			lowerKey == "ref" ||
			lowerKey == "source" ||
			lowerKey == "fbclid" ||
			lowerKey == "gclid" {
			q.Del(key)
		}
	}
	u.RawQuery = q.Encode()

	// Fragments never reach the server, so two URLs differing only in
	// fragment name the same document — drop it for deduplication.
	// (The original kept the fragment, producing distinct keys.)
	u.Fragment = ""

	// Lowercase host.
	u.Host = strings.ToLower(u.Host)

	return u.String()
}
|
||||
Reference in New Issue
Block a user