feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
364
edu-search-service/internal/crawler/crawler.go
Normal file
364
edu-search-service/internal/crawler/crawler.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// Note: API client is in the same package (api_client.go)
|
||||
|
||||
// FetchResult contains the result of fetching a URL.
type FetchResult struct {
	URL          string    // the URL as originally requested
	CanonicalURL string    // final URL after following redirects
	ContentType  string    // Content-Type response header
	StatusCode   int       // HTTP status code of the response
	Body         []byte    // response body (capped at 20MB by Fetch)
	ContentHash  string    // hex-encoded SHA-256 of Body, used for deduplication
	FetchTime    time.Time // time at which the fetch was started
	Error        error     // non-nil if the fetch failed at any stage
}
|
||||
|
||||
// Seed represents a URL to crawl with metadata.
type Seed struct {
	URL        string  // seed URL to start crawling from
	TrustBoost float64 // trust weighting for this seed (file-based seeds default to 0.5)
	Source     string  // GOV, EDU, UNI, etc.
	Scope      string  // FEDERAL, STATE, etc.
	State      string  // BW, BY, etc. (optional)
	MaxDepth   int     // Custom crawl depth for this seed (0 means "use crawler default")
	Category   string  // Category name
}
|
||||
|
||||
// Crawler handles URL fetching with per-domain rate limiting and a
// domain denylist.
//
// NOTE(review): the original comment also claimed "robots.txt respect",
// but no robots.txt handling is visible in this file — confirm whether
// that happens elsewhere or is simply missing.
type Crawler struct {
	userAgent       string               // User-Agent header sent with every request
	rateLimitPerSec float64              // maximum requests per second per host
	maxDepth        int                  // default crawl depth when a seed has none
	timeout         time.Duration        // HTTP timeout (mirrors client.Timeout)
	client          *http.Client         // shared HTTP client (30s timeout, max 5 redirects)
	denylist        map[string]bool      // lowercased domains that must not be crawled
	lastFetch       map[string]time.Time // last fetch time per host, for rate limiting
	mu              sync.Mutex           // guards lastFetch
	apiClient       *APIClient           // API client for fetching seeds from Backend
}
|
||||
|
||||
// NewCrawler creates a new crawler instance
|
||||
func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler {
|
||||
return &Crawler{
|
||||
userAgent: userAgent,
|
||||
rateLimitPerSec: rateLimitPerSec,
|
||||
maxDepth: maxDepth,
|
||||
timeout: 30 * time.Second,
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 5 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
denylist: make(map[string]bool),
|
||||
lastFetch: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// SetAPIClient sets the API client used by LoadSeedsFromAPI to fetch
// seeds from the Backend. backendURL is the base URL of the Backend API.
func (c *Crawler) SetAPIClient(backendURL string) {
	c.apiClient = NewAPIClient(backendURL)
}
|
||||
|
||||
// LoadSeedsFromAPI fetches seeds from the Backend API
|
||||
func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) {
|
||||
if c.apiClient == nil {
|
||||
return nil, fmt.Errorf("API client not initialized - call SetAPIClient first")
|
||||
}
|
||||
|
||||
response, err := c.apiClient.FetchSeeds(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch seeds from API: %w", err)
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(response.Seeds))
|
||||
for _, apiSeed := range response.Seeds {
|
||||
seed := Seed{
|
||||
URL: apiSeed.URL,
|
||||
TrustBoost: apiSeed.Trust,
|
||||
Source: apiSeed.Source,
|
||||
Scope: apiSeed.Scope,
|
||||
State: apiSeed.State,
|
||||
MaxDepth: apiSeed.Depth,
|
||||
Category: apiSeed.Category,
|
||||
}
|
||||
// Use default depth if not specified
|
||||
if seed.MaxDepth <= 0 {
|
||||
seed.MaxDepth = c.maxDepth
|
||||
}
|
||||
seeds = append(seeds, seed)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from API (exported at: %s)", len(seeds), response.ExportedAt)
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeeds loads seed URLs from files in a directory (legacy method)
|
||||
func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) {
|
||||
var seeds []string
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if strings.Contains(file, "denylist") {
|
||||
// Load denylist
|
||||
if err := c.loadDenylist(file); err != nil {
|
||||
log.Printf("Warning: Could not load denylist %s: %v", file, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
fileSeeds, err := c.loadSeedFile(file)
|
||||
if err != nil {
|
||||
log.Printf("Warning: Could not load seed file %s: %v", file, err)
|
||||
continue
|
||||
}
|
||||
seeds = append(seeds, fileSeeds...)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist))
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeedsWithMetadata loads seeds from files and converts to Seed struct
|
||||
// This provides backward compatibility while allowing metadata
|
||||
func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) {
|
||||
urlList, err := c.LoadSeeds(seedsDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(urlList))
|
||||
for _, url := range urlList {
|
||||
seeds = append(seeds, Seed{
|
||||
URL: url,
|
||||
TrustBoost: 0.5, // Default trust boost
|
||||
MaxDepth: c.maxDepth,
|
||||
})
|
||||
}
|
||||
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) loadSeedFile(filename string) ([]string, error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var seeds []string
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
// Skip comments and empty lines
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
// Extract URL (ignore comments after URL)
|
||||
parts := strings.SplitN(line, " ", 2)
|
||||
urlStr := strings.TrimSpace(parts[0])
|
||||
if urlStr != "" {
|
||||
seeds = append(seeds, urlStr)
|
||||
}
|
||||
}
|
||||
return seeds, scanner.Err()
|
||||
}
|
||||
|
||||
func (c *Crawler) loadDenylist(filename string) error {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
c.denylist[strings.ToLower(line)] = true
|
||||
}
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// IsDenied checks if a domain is in the denylist
|
||||
func (c *Crawler) IsDenied(urlStr string) bool {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
host := strings.ToLower(u.Host)
|
||||
|
||||
// Check exact match
|
||||
if c.denylist[host] {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check parent domains
|
||||
parts := strings.Split(host, ".")
|
||||
for i := 1; i < len(parts)-1; i++ {
|
||||
parent := strings.Join(parts[i:], ".")
|
||||
if c.denylist[parent] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// Fetch fetches a single URL with rate limiting
|
||||
func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) {
|
||||
result := &FetchResult{
|
||||
URL: urlStr,
|
||||
FetchTime: time.Now(),
|
||||
}
|
||||
|
||||
// Check denylist
|
||||
if c.IsDenied(urlStr) {
|
||||
result.Error = fmt.Errorf("domain denied")
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Parse URL
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
// Rate limiting per domain
|
||||
c.waitForRateLimit(u.Host)
|
||||
|
||||
// Create request
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml")
|
||||
req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")
|
||||
|
||||
// Execute request
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
result.StatusCode = resp.StatusCode
|
||||
result.ContentType = resp.Header.Get("Content-Type")
|
||||
result.CanonicalURL = resp.Request.URL.String()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
result.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Read body (limit to 20MB)
|
||||
limitedReader := io.LimitReader(resp.Body, 20*1024*1024)
|
||||
body, err := io.ReadAll(limitedReader)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
result.Body = body
|
||||
|
||||
// Calculate content hash
|
||||
hash := sha256.Sum256(body)
|
||||
result.ContentHash = hex.EncodeToString(hash[:])
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) waitForRateLimit(host string) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
minInterval := time.Duration(float64(time.Second) / c.rateLimitPerSec)
|
||||
|
||||
if last, ok := c.lastFetch[host]; ok {
|
||||
elapsed := time.Since(last)
|
||||
if elapsed < minInterval {
|
||||
time.Sleep(minInterval - elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
c.lastFetch[host] = time.Now()
|
||||
}
|
||||
|
||||
// ExtractDomain returns the host portion (including any port) of urlStr,
// or the empty string if the URL cannot be parsed.
func ExtractDomain(urlStr string) string {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	return parsed.Host
}
|
||||
|
||||
// GenerateDocID generates a unique document ID: a freshly generated UUID
// string (via github.com/google/uuid).
func GenerateDocID() string {
	return uuid.New().String()
}
|
||||
|
||||
// NormalizeURL normalizes a URL for deduplication: trailing path slashes
// are trimmed, common tracking parameters (utm_*, ref, source, fbclid,
// gclid) are removed, the fragment is dropped, and the host is lowercased.
// Unparseable URLs are returned unchanged.
func NormalizeURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// Remove trailing slashes.
	u.Path = strings.TrimSuffix(u.Path, "/")

	// Remove common tracking parameters. Deleting while ranging over a
	// map is safe in Go.
	q := u.Query()
	for key := range q {
		lowerKey := strings.ToLower(key)
		if strings.HasPrefix(lowerKey, "utm_") ||
			lowerKey == "ref" ||
			lowerKey == "source" ||
			lowerKey == "fbclid" ||
			lowerKey == "gclid" {
			q.Del(key)
		}
	}
	u.RawQuery = q.Encode()

	// Fragments never reach the server, so two URLs differing only in
	// fragment name the same document — drop it for deduplication.
	// (The original kept the fragment, producing distinct keys.)
	u.Fragment = ""

	// Lowercase host.
	u.Host = strings.ToLower(u.Host)

	return u.String()
}
|
||||
Reference in New Issue
Block a user