feat: edu-search-service migriert, voice-service/geo-service entfernt

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions
@@ -0,0 +1,183 @@
+package crawler
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+)
+
+// SeedFromAPI represents a seed URL from the Backend API.
+//
+// It is the element type of SeedsExportResponse.Seeds. The JSON tags define
+// the field names expected in the export payload. Crawler.LoadSeedsFromAPI
+// copies these fields into a Seed (Trust becomes TrustBoost, Depth becomes
+// MaxDepth; a Depth <= 0 is replaced by the crawler's default depth).
+type SeedFromAPI struct {
+	URL      string  `json:"url"`
+	Trust    float64 `json:"trust"`
+	Source   string  `json:"source"`   // GOV, EDU, UNI, etc.
+	Scope    string  `json:"scope"`    // FEDERAL, STATE, etc.
+	State    string  `json:"state"`    // BW, BY, etc. (optional)
+	Depth    int     `json:"depth"`    // Crawl depth for this seed
+	Category string  `json:"category"` // Category name
+}
+
+// SeedsExportResponse represents the API response from /seeds/export/for-crawler.
+//
+// Seeds holds the exported entries, Total is the count reported by the
+// Backend, and ExportedAt is the export timestamp exactly as sent (kept as a
+// string; the visible code only logs it).
+type SeedsExportResponse struct {
+	Seeds      []SeedFromAPI `json:"seeds"`
+	Total      int           `json:"total"`
+	ExportedAt string        `json:"exported_at"`
+}
+
+// APIClient handles communication with the Python Backend.
+//
+// baseURL is prepended verbatim to every endpoint path, and httpClient is the
+// HTTP client used for all requests. Use NewAPIClient to construct one.
+type APIClient struct {
+	baseURL    string
+	httpClient *http.Client
+}
+
+// NewAPIClient builds an APIClient that talks to the Backend at backendURL.
+// The embedded HTTP client gives up on any single request after 30 seconds.
+func NewAPIClient(backendURL string) *APIClient {
+	transport := &http.Client{Timeout: 30 * time.Second}
+
+	client := new(APIClient)
+	client.baseURL = backendURL
+	client.httpClient = transport
+	return client
+}
+
+// FetchSeeds retrieves the seed export from the Backend API.
+//
+// It sends a GET request to <baseURL>/v1/edu-search/seeds/export/for-crawler,
+// bound to ctx, and decodes the JSON body into a SeedsExportResponse.
+//
+// An error is returned when the request cannot be built or sent, when the
+// Backend answers with anything other than 200 OK, or when the body cannot be
+// read or parsed. The returned response is nil in all of those cases.
+func (c *APIClient) FetchSeeds(ctx context.Context) (*SeedsExportResponse, error) {
+	// Cap on how much of an error body is copied into the returned error, so
+	// a misbehaving backend cannot produce arbitrarily large error strings.
+	const maxErrorBody = 4 << 10
+
+	url := fmt.Sprintf("%s/v1/edu-search/seeds/export/for-crawler", c.baseURL)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Accept", "application/json")
+	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch seeds: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: the status code is the primary signal, a read failure
+		// here merely shortens the diagnostic text.
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
+		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(body))
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response: %w", err)
+	}
+
+	var result SeedsExportResponse
+	if err := json.Unmarshal(body, &result); err != nil {
+		return nil, fmt.Errorf("failed to parse response: %w", err)
+	}
+
+	return &result, nil
+}
+
+// CrawlStatusReport represents a crawl status to report to the Backend.
+//
+// It is the request body of ReportStatus and the element type of the
+// "updates" array sent by ReportStatusBulk. ErrorMessage is omitted from the
+// JSON when empty; CrawlDuration is serialized as "crawl_duration_seconds".
+type CrawlStatusReport struct {
+	SeedURL          string  `json:"seed_url"`
+	Status           string  `json:"status"` // "success", "error", "partial"
+	DocumentsCrawled int     `json:"documents_crawled"`
+	ErrorMessage     string  `json:"error_message,omitempty"`
+	CrawlDuration    float64 `json:"crawl_duration_seconds"`
+}
+
+// CrawlStatusResponse represents the response from crawl status endpoint.
+//
+// ReportStatus does not decode the response body; within the visible code
+// this type is only used by the tests to build mock replies.
+type CrawlStatusResponse struct {
+	Success bool   `json:"success"`
+	SeedURL string `json:"seed_url"`
+	Message string `json:"message"`
+}
+
+// BulkCrawlStatusResponse represents the response from bulk crawl status endpoint.
+//
+// It is what ReportStatusBulk decodes and returns: the number of updated and
+// failed entries as reported by the Backend, plus the error messages it sent.
+type BulkCrawlStatusResponse struct {
+	Updated int      `json:"updated"`
+	Failed  int      `json:"failed"`
+	Errors  []string `json:"errors"`
+}
+
+// ReportStatus sends crawl status for a single seed to the Backend
+func (c *APIClient) ReportStatus(ctx context.Context, report *CrawlStatusReport) error {
+	url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status", c.baseURL)
+
+	body, err := json.Marshal(report)
+	if err != nil {
+		return fmt.Errorf("failed to marshal report: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
+	if err != nil {
+		return fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("failed to report status: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
+	}
+
+	return nil
+}
+
+// ReportStatusBulk sends crawl status for multiple seeds in one request.
+//
+// The reports are wrapped as {"updates": [...]} and POSTed to
+// <baseURL>/v1/edu-search/seeds/crawl-status/bulk, bound to ctx. A nil slice
+// is sent as an empty JSON array rather than null. On 200 OK the body is
+// decoded into a BulkCrawlStatusResponse; any other status, a transport
+// failure or an undecodable body yields an error and a nil response.
+func (c *APIClient) ReportStatusBulk(ctx context.Context, reports []*CrawlStatusReport) (*BulkCrawlStatusResponse, error) {
+	// Cap on how much of an error body is copied into the returned error.
+	const maxErrorBody = 4 << 10
+
+	url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status/bulk", c.baseURL)
+
+	// encoding/json renders a nil slice as null; normalise so the Backend
+	// always receives an array.
+	if reports == nil {
+		reports = []*CrawlStatusReport{}
+	}
+
+	payload := struct {
+		Updates []*CrawlStatusReport `json:"updates"`
+	}{
+		Updates: reports,
+	}
+
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal reports: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to report status: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// Check the status first so an error reply is only read up to the cap.
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: a read failure only shortens the diagnostic text.
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
+		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(errBody))
+	}
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response: %w", err)
+	}
+
+	var result BulkCrawlStatusResponse
+	if err := json.Unmarshal(respBody, &result); err != nil {
+		return nil, fmt.Errorf("failed to parse response: %w", err)
+	}
+
+	return &result, nil
+}
@@ -0,0 +1,428 @@
+package crawler
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+func TestNewAPIClient(t *testing.T) {
+	client := NewAPIClient("http://backend:8000")
+
+	if client == nil {
+		t.Fatal("Expected non-nil client")
+	}
+
+	if client.baseURL != "http://backend:8000" {
+		t.Errorf("Expected baseURL 'http://backend:8000', got '%s'", client.baseURL)
+	}
+
+	if client.httpClient == nil {
+		t.Fatal("Expected non-nil httpClient")
+	}
+}
+
+// TestFetchSeeds_Success serves a two-seed export from a mock backend and
+// verifies the request path, the Accept header and the decoded seeds.
+func TestFetchSeeds_Success(t *testing.T) {
+	federal := SeedFromAPI{
+		URL:      "https://www.kmk.org",
+		Trust:    0.8,
+		Source:   "GOV",
+		Scope:    "FEDERAL",
+		State:    "",
+		Depth:    3,
+		Category: "federal",
+	}
+	state := SeedFromAPI{
+		URL:      "https://www.km-bw.de",
+		Trust:    0.7,
+		Source:   "GOV",
+		Scope:    "STATE",
+		State:    "BW",
+		Depth:    2,
+		Category: "states",
+	}
+	export := SeedsExportResponse{
+		Seeds:      []SeedFromAPI{federal, state},
+		Total:      2,
+		ExportedAt: "2025-01-17T10:00:00Z",
+	}
+
+	// Mock backend: validates the incoming request, then returns the export.
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		if got := r.URL.Path; got != "/v1/edu-search/seeds/export/for-crawler" {
+			t.Errorf("Expected path '/v1/edu-search/seeds/export/for-crawler', got '%s'", got)
+		}
+
+		if got := r.Header.Get("Accept"); got != "application/json" {
+			t.Errorf("Expected Accept header 'application/json', got '%s'", got)
+		}
+
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(export)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	result, err := NewAPIClient(server.URL).FetchSeeds(ctx)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if result.Total != 2 {
+		t.Errorf("Expected 2 seeds, got %d", result.Total)
+	}
+
+	if n := len(result.Seeds); n != 2 {
+		t.Fatalf("Expected 2 seeds in array, got %d", n)
+	}
+
+	// First seed: federal entry.
+	first := result.Seeds[0]
+	if first.URL != "https://www.kmk.org" {
+		t.Errorf("Expected URL 'https://www.kmk.org', got '%s'", first.URL)
+	}
+
+	if first.Trust != 0.8 {
+		t.Errorf("Expected Trust 0.8, got %f", first.Trust)
+	}
+
+	if first.Source != "GOV" {
+		t.Errorf("Expected Source 'GOV', got '%s'", first.Source)
+	}
+
+	// Second seed: carries a state code.
+	if second := result.Seeds[1]; second.State != "BW" {
+		t.Errorf("Expected State 'BW', got '%s'", second.State)
+	}
+}
+
+// TestFetchSeeds_ServerError expects FetchSeeds to fail when the backend
+// answers with HTTP 500.
+func TestFetchSeeds_ServerError(t *testing.T) {
+	failing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		w.Write([]byte("Internal server error"))
+	})
+	server := httptest.NewServer(failing)
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	if _, err := NewAPIClient(server.URL).FetchSeeds(ctx); err == nil {
+		t.Fatal("Expected error for server error response")
+	}
+}
+
+// TestFetchSeeds_InvalidJSON expects FetchSeeds to fail when a 200 response
+// carries a body that is not JSON.
+func TestFetchSeeds_InvalidJSON(t *testing.T) {
+	garbage := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte("not valid json"))
+	})
+	server := httptest.NewServer(garbage)
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	if _, err := NewAPIClient(server.URL).FetchSeeds(ctx); err == nil {
+		t.Fatal("Expected error for invalid JSON response")
+	}
+}
+
+// TestFetchSeeds_Timeout expects FetchSeeds to fail when the request context
+// expires before the backend answers.
+func TestFetchSeeds_Timeout(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Simulate a slow response, but stop waiting as soon as the client
+		// has gone away so that server.Close() does not hold the test up
+		// for the full delay.
+		select {
+		case <-r.Context().Done():
+			return
+		case <-time.After(2 * time.Second):
+		}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	client := NewAPIClient(server.URL)
+	// Very short timeout
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
+
+	_, err := client.FetchSeeds(ctx)
+
+	if err == nil {
+		t.Fatal("Expected timeout error")
+	}
+}
+
+// TestFetchSeeds_EmptyResponse verifies that an export without seeds is
+// decoded without error and yields zero entries.
+func TestFetchSeeds_EmptyResponse(t *testing.T) {
+	empty := SeedsExportResponse{
+		Seeds:      []SeedFromAPI{},
+		Total:      0,
+		ExportedAt: "2025-01-17T10:00:00Z",
+	}
+
+	handler := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(empty)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	result, err := NewAPIClient(server.URL).FetchSeeds(ctx)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if result.Total != 0 {
+		t.Errorf("Expected 0 seeds, got %d", result.Total)
+	}
+
+	if n := len(result.Seeds); n != 0 {
+		t.Errorf("Expected empty seeds array, got %d", n)
+	}
+}
+
+// Tests for Crawl Status Reporting
+
+// TestReportStatus_Success posts one report to a mock backend and verifies
+// method, path, Content-Type and the fields the backend received.
+func TestReportStatus_Success(t *testing.T) {
+	var got CrawlStatusReport
+
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		// Request line and headers.
+		if r.Method != "POST" {
+			t.Errorf("Expected POST method, got %s", r.Method)
+		}
+		if r.URL.Path != "/v1/edu-search/seeds/crawl-status" {
+			t.Errorf("Expected path '/v1/edu-search/seeds/crawl-status', got '%s'", r.URL.Path)
+		}
+		if ct := r.Header.Get("Content-Type"); ct != "application/json" {
+			t.Errorf("Expected Content-Type 'application/json', got '%s'", ct)
+		}
+
+		// Capture the submitted report for the assertions below.
+		json.NewDecoder(r.Body).Decode(&got)
+
+		reply := CrawlStatusResponse{
+			Success: true,
+			SeedURL: got.SeedURL,
+			Message: "Status updated",
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(reply)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	sent := &CrawlStatusReport{
+		SeedURL:          "https://www.kmk.org",
+		Status:           "success",
+		DocumentsCrawled: 42,
+		CrawlDuration:    15.5,
+	}
+
+	if err := NewAPIClient(server.URL).ReportStatus(ctx, sent); err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	// The backend must have seen exactly what was sent.
+	if got.SeedURL != "https://www.kmk.org" {
+		t.Errorf("Expected SeedURL 'https://www.kmk.org', got '%s'", got.SeedURL)
+	}
+	if got.Status != "success" {
+		t.Errorf("Expected Status 'success', got '%s'", got.Status)
+	}
+	if got.DocumentsCrawled != 42 {
+		t.Errorf("Expected DocumentsCrawled 42, got %d", got.DocumentsCrawled)
+	}
+}
+
+// TestReportStatus_ServerError expects ReportStatus to fail when the backend
+// answers with HTTP 500.
+func TestReportStatus_ServerError(t *testing.T) {
+	failing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		w.Write([]byte("Internal server error"))
+	})
+	server := httptest.NewServer(failing)
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	sent := &CrawlStatusReport{
+		SeedURL: "https://www.kmk.org",
+		Status:  "success",
+	}
+
+	if err := NewAPIClient(server.URL).ReportStatus(ctx, sent); err == nil {
+		t.Fatal("Expected error for server error response")
+	}
+}
+
+// TestReportStatus_NotFound expects ReportStatus to fail when the backend
+// answers with HTTP 404 for an unknown seed.
+func TestReportStatus_NotFound(t *testing.T) {
+	missing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+		w.Write([]byte(`{"detail": "Seed nicht gefunden"}`))
+	})
+	server := httptest.NewServer(missing)
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	sent := &CrawlStatusReport{
+		SeedURL: "https://unknown.example.com",
+		Status:  "error",
+	}
+
+	if err := NewAPIClient(server.URL).ReportStatus(ctx, sent); err == nil {
+		t.Fatal("Expected error for 404 response")
+	}
+}
+
+// TestReportStatusBulk_Success posts two reports to a mock backend that
+// echoes the number of received updates, and checks the decoded counters.
+func TestReportStatusBulk_Success(t *testing.T) {
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		// Request line.
+		if r.Method != "POST" {
+			t.Errorf("Expected POST method, got %s", r.Method)
+		}
+		if r.URL.Path != "/v1/edu-search/seeds/crawl-status/bulk" {
+			t.Errorf("Expected path '/v1/edu-search/seeds/crawl-status/bulk', got '%s'", r.URL.Path)
+		}
+
+		// Decode the envelope to learn how many updates were submitted.
+		var envelope struct {
+			Updates []*CrawlStatusReport `json:"updates"`
+		}
+		json.NewDecoder(r.Body).Decode(&envelope)
+
+		reply := BulkCrawlStatusResponse{
+			Updated: len(envelope.Updates),
+			Failed:  0,
+			Errors:  []string{},
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(reply)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	batch := []*CrawlStatusReport{
+		{
+			SeedURL:          "https://www.kmk.org",
+			Status:           "success",
+			DocumentsCrawled: 42,
+		},
+		{
+			SeedURL:          "https://www.km-bw.de",
+			Status:           "partial",
+			DocumentsCrawled: 15,
+		},
+	}
+
+	result, err := NewAPIClient(server.URL).ReportStatusBulk(ctx, batch)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if result.Updated != 2 {
+		t.Errorf("Expected 2 updated, got %d", result.Updated)
+	}
+	if result.Failed != 0 {
+		t.Errorf("Expected 0 failed, got %d", result.Failed)
+	}
+}
+
+// TestReportStatusBulk_PartialFailure verifies that a 200 reply reporting one
+// failed entry is returned as a result (not an error) with all counters set.
+func TestReportStatusBulk_PartialFailure(t *testing.T) {
+	reply := BulkCrawlStatusResponse{
+		Updated: 1,
+		Failed:  1,
+		Errors:  []string{"Seed nicht gefunden: https://unknown.example.com"},
+	}
+
+	handler := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(reply)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	batch := []*CrawlStatusReport{
+		{SeedURL: "https://www.kmk.org", Status: "success"},
+		{SeedURL: "https://unknown.example.com", Status: "error"},
+	}
+
+	result, err := NewAPIClient(server.URL).ReportStatusBulk(ctx, batch)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if result.Updated != 1 {
+		t.Errorf("Expected 1 updated, got %d", result.Updated)
+	}
+	if result.Failed != 1 {
+		t.Errorf("Expected 1 failed, got %d", result.Failed)
+	}
+	if n := len(result.Errors); n != 1 {
+		t.Errorf("Expected 1 error, got %d", n)
+	}
+}
+
+// TestCrawlStatusReport_Struct round-trips a report through JSON and checks
+// that the populated fields survive unchanged.
+func TestCrawlStatusReport_Struct(t *testing.T) {
+	original := CrawlStatusReport{
+		SeedURL:          "https://www.example.com",
+		Status:           "success",
+		DocumentsCrawled: 100,
+		ErrorMessage:     "",
+		CrawlDuration:    25.5,
+	}
+
+	// Encode ...
+	raw, err := json.Marshal(original)
+	if err != nil {
+		t.Fatalf("Failed to marshal: %v", err)
+	}
+
+	// ... and decode again.
+	var roundTripped CrawlStatusReport
+	err = json.Unmarshal(raw, &roundTripped)
+	if err != nil {
+		t.Fatalf("Failed to unmarshal: %v", err)
+	}
+
+	if roundTripped.SeedURL != original.SeedURL {
+		t.Errorf("SeedURL mismatch")
+	}
+	if roundTripped.Status != original.Status {
+		t.Errorf("Status mismatch")
+	}
+	if roundTripped.DocumentsCrawled != original.DocumentsCrawled {
+		t.Errorf("DocumentsCrawled mismatch")
+	}
+	if roundTripped.CrawlDuration != original.CrawlDuration {
+		t.Errorf("CrawlDuration mismatch")
+	}
+}
@@ -0,0 +1,364 @@
+package crawler
+
+import (
+	"bufio"
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+// Note: API client is in the same package (api_client.go)
+
+// FetchResult contains the result of fetching a URL.
+//
+// Crawler.Fetch fills it incrementally: URL and FetchTime are set up front;
+// StatusCode, ContentType and CanonicalURL (the URL of the final request
+// after redirects) once a response arrives; Body and ContentHash (hex-encoded
+// SHA-256 of Body) only for a 200 response whose body was read. Error holds
+// the same error Fetch returns, if any.
+type FetchResult struct {
+	URL          string
+	CanonicalURL string
+	ContentType  string
+	StatusCode   int
+	Body         []byte
+	ContentHash  string
+	FetchTime    time.Time
+	Error        error
+}
+
+// Seed represents a URL to crawl with metadata.
+//
+// Seeds are produced either by LoadSeedsFromAPI (all fields taken from the
+// Backend export) or by LoadSeedsWithMetadata (URL from a seed file, a default
+// TrustBoost of 0.5, MaxDepth set to the crawler default, other fields empty).
+type Seed struct {
+	URL        string
+	TrustBoost float64
+	Source     string // GOV, EDU, UNI, etc.
+	Scope      string // FEDERAL, STATE, etc.
+	State      string // BW, BY, etc. (optional)
+	MaxDepth   int    // Custom crawl depth for this seed
+	Category   string // Category name
+}
+
+// Crawler handles URL fetching with rate limiting and robots.txt respect.
+// NOTE(review): no robots.txt handling is visible in this part of the file —
+// confirm where (or whether) it is implemented.
+//
+// denylist holds lower-cased entries loaded by loadDenylist and is consulted
+// by IsDenied. lastFetch records the last fetch time per host and is guarded
+// by mu (see waitForRateLimit). timeout is set by NewCrawler but is not read
+// anywhere in the visible code; the effective timeout is client.Timeout.
+type Crawler struct {
+	userAgent       string
+	rateLimitPerSec float64
+	maxDepth        int
+	timeout         time.Duration
+	client          *http.Client
+	denylist        map[string]bool
+	lastFetch       map[string]time.Time
+	mu              sync.Mutex
+	apiClient       *APIClient // API client for fetching seeds from Backend
+}
+
+// NewCrawler creates a new crawler instance
+func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler {
+	return &Crawler{
+		userAgent:       userAgent,
+		rateLimitPerSec: rateLimitPerSec,
+		maxDepth:        maxDepth,
+		timeout:         30 * time.Second,
+		client: &http.Client{
+			Timeout: 30 * time.Second,
+			CheckRedirect: func(req *http.Request, via []*http.Request) error {
+				if len(via) >= 5 {
+					return fmt.Errorf("too many redirects")
+				}
+				return nil
+			},
+		},
+		denylist:  make(map[string]bool),
+		lastFetch: make(map[string]time.Time),
+	}
+}
+
+// SetAPIClient sets the API client for fetching seeds from Backend.
+// It creates a fresh APIClient for backendURL via NewAPIClient and replaces
+// any previously configured one. It must be called before LoadSeedsFromAPI.
+func (c *Crawler) SetAPIClient(backendURL string) {
+	c.apiClient = NewAPIClient(backendURL)
+}
+
+// LoadSeedsFromAPI downloads the seed export from the Backend and converts
+// every entry into a Seed. Entries whose depth is zero or negative receive
+// the crawler's default depth. It fails if SetAPIClient has not been called
+// or if the export cannot be fetched.
+func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) {
+	if c.apiClient == nil {
+		return nil, fmt.Errorf("API client not initialized - call SetAPIClient first")
+	}
+
+	export, err := c.apiClient.FetchSeeds(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch seeds from API: %w", err)
+	}
+
+	converted := make([]Seed, len(export.Seeds))
+	for i := range export.Seeds {
+		src := &export.Seeds[i]
+
+		// Fall back to the crawler-wide depth when none was specified.
+		depth := src.Depth
+		if depth <= 0 {
+			depth = c.maxDepth
+		}
+
+		converted[i] = Seed{
+			URL:        src.URL,
+			TrustBoost: src.Trust,
+			Source:     src.Source,
+			Scope:      src.Scope,
+			State:      src.State,
+			MaxDepth:   depth,
+			Category:   src.Category,
+		}
+	}
+
+	log.Printf("Loaded %d seeds from API (exported at: %s)", len(converted), export.ExportedAt)
+	return converted, nil
+}
+
+// LoadSeeds loads seed URLs from files in a directory (legacy method).
+//
+// Every *.txt file directly inside seedsDir is read. Files whose file name
+// contains "denylist" are loaded into the crawler's denylist instead of being
+// treated as seeds. A file that cannot be read is logged and skipped; only a
+// malformed glob pattern makes the call fail.
+//
+// It returns the seed URLs of all readable seed files in glob order.
+func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) {
+	var seeds []string
+
+	files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt"))
+	if err != nil {
+		return nil, err
+	}
+
+	for _, file := range files {
+		// Classify by file name only: matching the full path would turn every
+		// seed file into a denylist whenever a parent directory happens to
+		// contain "denylist" in its name.
+		if strings.Contains(filepath.Base(file), "denylist") {
+			if err := c.loadDenylist(file); err != nil {
+				log.Printf("Warning: Could not load denylist %s: %v", file, err)
+			}
+			continue
+		}
+
+		fileSeeds, err := c.loadSeedFile(file)
+		if err != nil {
+			log.Printf("Warning: Could not load seed file %s: %v", file, err)
+			continue
+		}
+		seeds = append(seeds, fileSeeds...)
+	}
+
+	log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist))
+	return seeds, nil
+}
+
+// LoadSeedsWithMetadata loads seeds from files and converts to Seed struct
+// This provides backward compatibility while allowing metadata
+func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) {
+	urlList, err := c.LoadSeeds(seedsDir)
+	if err != nil {
+		return nil, err
+	}
+
+	seeds := make([]Seed, 0, len(urlList))
+	for _, url := range urlList {
+		seeds = append(seeds, Seed{
+			URL:        url,
+			TrustBoost: 0.5, // Default trust boost
+			MaxDepth:   c.maxDepth,
+		})
+	}
+
+	return seeds, nil
+}
+
+// loadSeedFile reads seed URLs from filename, one per line.
+//
+// Blank lines and lines starting with '#' are skipped. On every other line
+// the first whitespace-delimited token is taken as the URL, so anything after
+// it (such as an inline "# comment", whether separated by spaces or tabs) is
+// ignored. It returns the URLs read so far together with any scanner error.
+func (c *Crawler) loadSeedFile(filename string) ([]string, error) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	var seeds []string
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		// Skip comments and empty lines
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		// The trimmed line is non-empty, so Fields yields at least one token.
+		// Splitting on any whitespace (not just a single space) keeps
+		// tab-separated inline comments out of the URL.
+		seeds = append(seeds, strings.Fields(line)[0])
+	}
+	return seeds, scanner.Err()
+}
+
+// loadDenylist adds the domains listed in filename to the crawler's denylist.
+//
+// Blank lines and lines starting with '#' are skipped. As in seed files, only
+// the first whitespace-delimited token of a line is used, so a trailing
+// inline comment does not become part of the entry. Entries are stored
+// lower-cased. It returns the open error or any scanner error.
+func (c *Crawler) loadDenylist(filename string) error {
+	file, err := os.Open(filename)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		// The trimmed line is non-empty, so Fields yields at least one token.
+		domain := strings.Fields(line)[0]
+		c.denylist[strings.ToLower(domain)] = true
+	}
+	return scanner.Err()
+}
+
+// IsDenied checks if a domain is in the denylist.
+//
+// urlStr is parsed and its host compared case-insensitively against the
+// denylist: first the host exactly as written (including any port), then the
+// bare hostname without port and trailing dot, then each parent domain of the
+// hostname down to (but excluding) the top-level label. A URL that cannot be
+// parsed is reported as denied.
+func (c *Crawler) IsDenied(urlStr string) bool {
+	u, err := url.Parse(urlStr)
+	if err != nil {
+		return true
+	}
+
+	// Exact match on the authority as written, for entries that carry a port.
+	if c.denylist[strings.ToLower(u.Host)] {
+		return true
+	}
+
+	// Compare on the bare hostname so that an explicit port ("host:443") or a
+	// fully-qualified trailing dot ("host.") cannot bypass the denylist.
+	host := strings.TrimSuffix(strings.ToLower(u.Hostname()), ".")
+	if c.denylist[host] {
+		return true
+	}
+
+	// Check parent domains
+	parts := strings.Split(host, ".")
+	for i := 1; i < len(parts)-1; i++ {
+		parent := strings.Join(parts[i:], ".")
+		if c.denylist[parent] {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Fetch fetches a single URL with rate limiting
+func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) {
+	result := &FetchResult{
+		URL:       urlStr,
+		FetchTime: time.Now(),
+	}
+
+	// Check denylist
+	if c.IsDenied(urlStr) {
+		result.Error = fmt.Errorf("domain denied")
+		return result, result.Error
+	}
+
+	// Parse URL
+	u, err := url.Parse(urlStr)
+	if err != nil {
+		result.Error = err
+		return result, err
+	}
+
+	// Rate limiting per domain
+	c.waitForRateLimit(u.Host)
+
+	// Create request
+	req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
+	if err != nil {
+		result.Error = err
+		return result, err
+	}
+
+	req.Header.Set("User-Agent", c.userAgent)
+	req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml")
+	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")
+
+	// Execute request
+	resp, err := c.client.Do(req)
+	if err != nil {
+		result.Error = err
+		return result, err
+	}
+	defer resp.Body.Close()
+
+	result.StatusCode = resp.StatusCode
+	result.ContentType = resp.Header.Get("Content-Type")
+	result.CanonicalURL = resp.Request.URL.String()
+
+	if resp.StatusCode != http.StatusOK {
+		result.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
+		return result, result.Error
+	}
+
+	// Read body (limit to 20MB)
+	limitedReader := io.LimitReader(resp.Body, 20*1024*1024)
+	body, err := io.ReadAll(limitedReader)
+	if err != nil {
+		result.Error = err
+		return result, err
+	}
+
+	result.Body = body
+
+	// Calculate content hash
+	hash := sha256.Sum256(body)
+	result.ContentHash = hex.EncodeToString(hash[:])
+
+	return result, nil
+}
+
+// waitForRateLimit blocks until the next request to host is allowed under
+// the configured per-host rate (rateLimitPerSec requests per second).
+//
+// The next free time slot for host is reserved while holding c.mu, but the
+// actual sleep happens after the lock is released, so waiting on one host
+// does not stall requests to other hosts. Concurrent callers for the same
+// host receive consecutive slots spaced by the minimum interval.
+//
+// A non-positive rateLimitPerSec disables rate limiting.
+func (c *Crawler) waitForRateLimit(host string) {
+	// Guard against division by zero: converting the resulting +Inf (or a
+	// negative value) to a Duration gives a platform-dependent result.
+	if c.rateLimitPerSec <= 0 {
+		return
+	}
+	minInterval := time.Duration(float64(time.Second) / c.rateLimitPerSec)
+
+	c.mu.Lock()
+	now := time.Now()
+	slot := now
+	if last, ok := c.lastFetch[host]; ok {
+		if earliest := last.Add(minInterval); earliest.After(now) {
+			slot = earliest
+		}
+	}
+	// Record the reserved slot (possibly in the future) so the next caller
+	// queues up behind it.
+	c.lastFetch[host] = slot
+	c.mu.Unlock()
+
+	if wait := slot.Sub(now); wait > 0 {
+		time.Sleep(wait)
+	}
+}
+
+// ExtractDomain returns the host component of urlStr (including a port, if
+// one is present), or the empty string when urlStr cannot be parsed.
+func ExtractDomain(urlStr string) string {
+	if parsed, err := url.Parse(urlStr); err == nil {
+		return parsed.Host
+	}
+	return ""
+}
+
+// GenerateDocID generates a unique document ID.
+// The ID is the string form of a freshly created UUID (uuid.New).
+func GenerateDocID() string {
+	return uuid.New().String()
+}
+
+// NormalizeURL brings urlStr into a canonical form for deduplication:
+// one trailing slash is removed from the path, common tracking parameters
+// (utm_*, ref, source, fbclid, gclid; matched case-insensitively) are
+// dropped, the remaining query is re-encoded (which sorts it by key) and the
+// host is lower-cased. Input that cannot be parsed is returned unchanged.
+func NormalizeURL(urlStr string) string {
+	parsed, err := url.Parse(urlStr)
+	if err != nil {
+		return urlStr
+	}
+
+	parsed.Host = strings.ToLower(parsed.Host)
+	parsed.Path = strings.TrimSuffix(parsed.Path, "/")
+
+	// Strip tracking parameters, keep everything else.
+	params := parsed.Query()
+	for name := range params {
+		switch lowered := strings.ToLower(name); {
+		case strings.HasPrefix(lowered, "utm_"),
+			lowered == "ref",
+			lowered == "source",
+			lowered == "fbclid",
+			lowered == "gclid":
+			params.Del(name)
+		}
+	}
+	parsed.RawQuery = params.Encode()
+
+	return parsed.String()
+}
@@ -0,0 +1,639 @@
+package crawler
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+func TestNewCrawler(t *testing.T) {
+	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
+
+	if crawler == nil {
+		t.Fatal("Expected non-nil crawler")
+	}
+	if crawler.userAgent != "TestBot/1.0" {
+		t.Errorf("Expected userAgent 'TestBot/1.0', got %q", crawler.userAgent)
+	}
+	if crawler.rateLimitPerSec != 1.0 {
+		t.Errorf("Expected rateLimitPerSec 1.0, got %f", crawler.rateLimitPerSec)
+	}
+	if crawler.maxDepth != 3 {
+		t.Errorf("Expected maxDepth 3, got %d", crawler.maxDepth)
+	}
+	if crawler.client == nil {
+		t.Error("Expected non-nil HTTP client")
+	}
+}
+
+func TestCrawler_LoadSeeds(t *testing.T) {
+	// Create temp directory with seed files
+	dir := t.TempDir()
+
+	// Create a seed file
+	seedContent := `# Federal education sources
+https://www.kmk.org
+https://www.bildungsserver.de
+
+# Comment line
+https://www.bpb.de # with inline comment
+`
+	if err := os.WriteFile(filepath.Join(dir, "federal.txt"), []byte(seedContent), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Create another seed file
+	stateContent := `https://www.km.bayern.de
+https://www.schulministerium.nrw.de
+`
+	if err := os.WriteFile(filepath.Join(dir, "states.txt"), []byte(stateContent), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Create denylist
+	denylistContent := `# Denylist
+facebook.com
+twitter.com
+instagram.com
+`
+	if err := os.WriteFile(filepath.Join(dir, "denylist.txt"), []byte(denylistContent), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
+	seeds, err := crawler.LoadSeeds(dir)
+	if err != nil {
+		t.Fatalf("LoadSeeds failed: %v", err)
+	}
+
+	// Check seeds loaded
+	if len(seeds) != 5 {
+		t.Errorf("Expected 5 seeds, got %d", len(seeds))
+	}
+
+	// Check expected URLs
+	expectedURLs := []string{
+		"https://www.kmk.org",
+		"https://www.bildungsserver.de",
+		"https://www.bpb.de",
+		"https://www.km.bayern.de",
+		"https://www.schulministerium.nrw.de",
+	}
+
+	for _, expected := range expectedURLs {
+		found := false
+		for _, seed := range seeds {
+			if seed == expected {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Errorf("Expected seed %q not found", expected)
+		}
+	}
+
+	// Check denylist loaded
+	if len(crawler.denylist) != 3 {
+		t.Errorf("Expected 3 denylist entries, got %d", len(crawler.denylist))
+	}
+}
+
+func TestCrawler_IsDenied(t *testing.T) {
+	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
+	crawler.denylist = map[string]bool{
+		"facebook.com":    true,
+		"twitter.com":     true,
+		"ads.example.com": true,
+	}
+
+	tests := []struct {
+		name     string
+		url      string
+		expected bool
+	}{
+		{
+			name:     "Exact domain match",
+			url:      "https://facebook.com/page",
+			expected: true,
+		},
+		{
+			name:     "Subdomain of denied domain",
+			url:      "https://www.facebook.com/page",
+			expected: true,
+		},
+		{
+			name:     "Allowed domain",
+			url:      "https://www.kmk.org/bildung",
+			expected: false,
+		},
+		{
+			name:     "Denied subdomain",
+			url:      "https://ads.example.com/banner",
+			expected: true,
+		},
+		{
+			name:     "Parent domain allowed",
+			url:      "https://example.com/page",
+			expected: false,
+		},
+		{
+			name:     "Invalid URL scheme",
+			url:      "://invalid",
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := crawler.IsDenied(tt.url)
+			if result != tt.expected {
+				t.Errorf("IsDenied(%q) = %v, expected %v", tt.url, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestCrawler_Fetch_Success(t *testing.T) {
+	// Create test server
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Check user agent
+		if r.Header.Get("User-Agent") != "TestBot/1.0" {
+			t.Errorf("Expected User-Agent 'TestBot/1.0', got %q", r.Header.Get("User-Agent"))
+		}
+
+		w.Header().Set("Content-Type", "text/html; charset=utf-8")
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("<html><body>Test content</body></html>"))
+	}))
+	defer server.Close()
+
+	crawler := NewCrawler("TestBot/1.0", 100.0, 3) // High rate limit for testing
+	ctx := context.Background()
+
+	result, err := crawler.Fetch(ctx, server.URL+"/page")
+	if err != nil {
+		t.Fatalf("Fetch failed: %v", err)
+	}
+
+	if result.StatusCode != 200 {
+		t.Errorf("Expected status 200, got %d", result.StatusCode)
+	}
+	if result.Error != nil {
+		t.Errorf("Expected no error, got %v", result.Error)
+	}
+	if !strings.Contains(result.ContentType, "text/html") {
+		t.Errorf("Expected Content-Type to contain 'text/html', got %q", result.ContentType)
+	}
+	if len(result.Body) == 0 {
+		t.Error("Expected non-empty body")
+	}
+	if result.ContentHash == "" {
+		t.Error("Expected non-empty content hash")
+	}
+	if result.FetchTime.IsZero() {
+		t.Error("Expected non-zero fetch time")
+	}
+}
+
+// TestCrawler_Fetch_DeniedDomain checks that fetching a URL whose domain is
+// on the denylist returns an error, and that the same failure is recorded in
+// the result with a message mentioning "denied".
+func TestCrawler_Fetch_DeniedDomain(t *testing.T) {
+	crawler := NewCrawler("TestBot/1.0", 100.0, 3)
+	crawler.denylist = map[string]bool{
+		"denied.com": true,
+	}
+
+	ctx := context.Background()
+	result, err := crawler.Fetch(ctx, "https://denied.com/page")
+
+	if err == nil {
+		t.Error("Expected error for denied domain")
+	}
+	// Stop here when no error was recorded: the message check below calls
+	// result.Error.Error() and would panic on a nil error instead of
+	// reporting a clean test failure.
+	if result.Error == nil {
+		t.Fatal("Expected error in result")
+	}
+	if !strings.Contains(result.Error.Error(), "denied") {
+		t.Errorf("Expected 'denied' in error message, got %v", result.Error)
+	}
+}
+
+// TestCrawler_Fetch_HTTPError checks that a 404 response makes Fetch return
+// an error while still reporting the status code in the result.
+func TestCrawler_Fetch_HTTPError(t *testing.T) {
+	notFound := func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(notFound))
+	defer srv.Close()
+
+	c := NewCrawler("TestBot/1.0", 100.0, 3)
+
+	res, err := c.Fetch(context.Background(), srv.URL+"/notfound")
+	if err == nil {
+		t.Error("Expected error for 404 response")
+	}
+	if got := res.StatusCode; got != http.StatusNotFound {
+		t.Errorf("Expected status 404, got %d", got)
+	}
+}
+
+// TestCrawler_Fetch_Redirect checks that Fetch follows a redirect and reports
+// the final URL, not the requested one, as the result's CanonicalURL.
+func TestCrawler_Fetch_Redirect(t *testing.T) {
+	// No redirect counter is kept: it would be written from the server's
+	// handler goroutine without synchronization and nothing reads it. The
+	// CanonicalURL assertion below already proves the redirect was followed.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/redirect" {
+			http.Redirect(w, r, "/final", http.StatusFound)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("Final content"))
+	}))
+	defer server.Close()
+
+	crawler := NewCrawler("TestBot/1.0", 100.0, 3)
+	ctx := context.Background()
+
+	result, err := crawler.Fetch(ctx, server.URL+"/redirect")
+	if err != nil {
+		t.Fatalf("Fetch failed: %v", err)
+	}
+
+	// CanonicalURL should be the final URL after redirect
+	if !strings.HasSuffix(result.CanonicalURL, "/final") {
+		t.Errorf("Expected canonical URL to end with '/final', got %q", result.CanonicalURL)
+	}
+}
+
+// TestCrawler_Fetch_Timeout checks that Fetch returns an error when the
+// server does not answer before the request context's 100ms deadline.
+func TestCrawler_Fetch_Timeout(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Delay the response well past the client deadline, but give up as
+		// soon as the request context is cancelled. An unconditional sleep
+		// would keep the handler running for the full 2s, and server.Close
+		// blocks until outstanding requests finish, stalling the test.
+		select {
+		case <-time.After(2 * time.Second):
+			w.WriteHeader(http.StatusOK)
+		case <-r.Context().Done():
+		}
+	}))
+	defer server.Close()
+
+	crawler := NewCrawler("TestBot/1.0", 100.0, 3)
+	crawler.timeout = 100 * time.Millisecond // Very short timeout
+
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
+
+	_, err := crawler.Fetch(ctx, server.URL+"/slow")
+	if err == nil {
+		t.Error("Expected timeout error")
+	}
+}
+
+// TestExtractDomain checks ExtractDomain on URLs with and without a port and
+// on a string without a host, for which the empty string is expected.
+func TestExtractDomain(t *testing.T) {
+	cases := []struct {
+		in   string
+		want string
+	}{
+		{"https://www.example.com/page", "www.example.com"},
+		{"https://example.com:8080/path", "example.com:8080"},
+		{"http://subdomain.example.com", "subdomain.example.com"},
+		{"invalid-url", ""},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.in, func(t *testing.T) {
+			if got := ExtractDomain(tc.in); got != tc.want {
+				t.Errorf("ExtractDomain(%q) = %q, expected %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestGenerateDocID checks that two generated IDs are non-empty, differ from
+// each other and have the 36-character length of a textual UUID.
+func TestGenerateDocID(t *testing.T) {
+	first, second := GenerateDocID(), GenerateDocID()
+
+	if first == "" {
+		t.Error("Expected non-empty ID")
+	}
+	if first == second {
+		t.Error("Expected unique IDs")
+	}
+	// Basic UUID format check: only the length is verified.
+	if n := len(first); n != 36 {
+		t.Errorf("Expected UUID length 36, got %d", n)
+	}
+}
+
+// TestNormalizeURL checks NormalizeURL's trailing-slash removal, tracking
+// parameter removal, query re-encoding (sorted by key), host lowercasing and
+// pass-through of a string without scheme or host.
+func TestNormalizeURL(t *testing.T) {
+	cases := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"Remove trailing slash", "https://example.com/page/", "https://example.com/page"},
+		{"Remove UTM parameters", "https://example.com/page?utm_source=google&utm_medium=cpc", "https://example.com/page"},
+		{"Remove multiple tracking params", "https://example.com/page?id=123&utm_campaign=test&fbclid=abc", "https://example.com/page?id=123"},
+		{"Keep non-tracking params", "https://example.com/search?q=test&page=2", "https://example.com/search?page=2&q=test"},
+		{"Lowercase host", "https://EXAMPLE.COM/Page", "https://example.com/Page"},
+		{"Invalid URL returns as-is", "not-a-url", "not-a-url"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := NormalizeURL(tc.in); got != tc.want {
+				t.Errorf("NormalizeURL(%q) = %q, expected %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestCrawler_RateLimit checks that three consecutive fetches against the
+// same host at 2 requests per second take at least 800ms in total.
+func TestCrawler_RateLimit(t *testing.T) {
+	// No request counter is kept: it would be written from the server's
+	// handler goroutine without synchronization and nothing reads it.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("OK"))
+	}))
+	defer server.Close()
+
+	// 2 requests per second = 500ms between requests
+	crawler := NewCrawler("TestBot/1.0", 2.0, 3)
+	ctx := context.Background()
+
+	start := time.Now()
+
+	// Make 3 requests. A failed fetch is fatal: the timing assertion below
+	// is only meaningful when every request was actually served.
+	for i := 0; i < 3; i++ {
+		if _, err := crawler.Fetch(ctx, server.URL+"/page"); err != nil {
+			t.Fatalf("Fetch %d failed: %v", i+1, err)
+		}
+	}
+
+	elapsed := time.Since(start)
+
+	// With 2 req/sec, 3 requests span 2 intervals (about 1 second); the
+	// 800ms threshold leaves slack for timer jitter.
+	if elapsed < 800*time.Millisecond {
+		t.Errorf("Rate limiting not working: 3 requests took only %v", elapsed)
+	}
+}
+
+func TestLoadSeedFile_EmptyLines(t *testing.T) {
+	dir := t.TempDir()
+
+	content := `
+
+https://example.com
+
+# comment
+
+https://example.org
+
+`
+	if err := os.WriteFile(filepath.Join(dir, "seeds.txt"), []byte(content), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
+	seeds, err := crawler.LoadSeeds(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(seeds) != 2 {
+		t.Errorf("Expected 2 seeds (ignoring empty lines and comments), got %d", len(seeds))
+	}
+}
+
+// TestCrawler_Fetch_LargeBody checks that a 1MB response body is returned
+// by Fetch at its full length.
+func TestCrawler_Fetch_LargeBody(t *testing.T) {
+	// 1MB payload
+	payload := []byte(strings.Repeat("A", 1024*1024))
+
+	serve := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		w.WriteHeader(http.StatusOK)
+		w.Write(payload)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(serve))
+	defer srv.Close()
+
+	c := NewCrawler("TestBot/1.0", 100.0, 3)
+
+	res, err := c.Fetch(context.Background(), srv.URL+"/large")
+	if err != nil {
+		t.Fatalf("Fetch failed: %v", err)
+	}
+
+	if want, got := len(payload), len(res.Body); got != want {
+		t.Errorf("Expected body length %d, got %d", want, got)
+	}
+}
+
+// Tests for API Integration (new functionality)
+
+// TestCrawler_SetAPIClient checks that a new crawler has no API client and
+// that SetAPIClient installs one.
+func TestCrawler_SetAPIClient(t *testing.T) {
+	c := NewCrawler("TestBot/1.0", 1.0, 3)
+	if c.apiClient != nil {
+		t.Error("Expected nil apiClient initially")
+	}
+
+	c.SetAPIClient("http://backend:8000")
+	if c.apiClient == nil {
+		t.Error("Expected non-nil apiClient after SetAPIClient")
+	}
+}
+
+// TestCrawler_LoadSeedsFromAPI_NotInitialized checks that LoadSeedsFromAPI
+// fails when SetAPIClient was never called.
+func TestCrawler_LoadSeedsFromAPI_NotInitialized(t *testing.T) {
+	c := NewCrawler("TestBot/1.0", 1.0, 3)
+
+	if _, err := c.LoadSeedsFromAPI(context.Background()); err == nil {
+		t.Error("Expected error when API client not initialized")
+	}
+}
+
+func TestCrawler_LoadSeedsFromAPI_Success(t *testing.T) {
+	// Create mock server
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(`{
+			"seeds": [
+				{"url": "https://www.kmk.org", "trust": 0.8, "source": "GOV", "scope": "FEDERAL", "state": "", "depth": 3, "category": "federal"},
+				{"url": "https://www.km-bw.de", "trust": 0.7, "source": "GOV", "scope": "STATE", "state": "BW", "depth": 2, "category": "states"}
+			],
+			"total": 2,
+			"exported_at": "2025-01-17T10:00:00Z"
+		}`))
+	}))
+	defer server.Close()
+
+	crawler := NewCrawler("TestBot/1.0", 1.0, 4)
+	crawler.SetAPIClient(server.URL)
+	ctx := context.Background()
+
+	seeds, err := crawler.LoadSeedsFromAPI(ctx)
+
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if len(seeds) != 2 {
+		t.Fatalf("Expected 2 seeds, got %d", len(seeds))
+	}
+
+	// Check first seed
+	if seeds[0].URL != "https://www.kmk.org" {
+		t.Errorf("Expected URL 'https://www.kmk.org', got '%s'", seeds[0].URL)
+	}
+	if seeds[0].TrustBoost != 0.8 {
+		t.Errorf("Expected TrustBoost 0.8, got %f", seeds[0].TrustBoost)
+	}
+	if seeds[0].Source != "GOV" {
+		t.Errorf("Expected Source 'GOV', got '%s'", seeds[0].Source)
+	}
+	if seeds[0].MaxDepth != 3 {
+		t.Errorf("Expected MaxDepth 3, got %d", seeds[0].MaxDepth)
+	}
+
+	// Check second seed with state
+	if seeds[1].State != "BW" {
+		t.Errorf("Expected State 'BW', got '%s'", seeds[1].State)
+	}
+	if seeds[1].Category != "states" {
+		t.Errorf("Expected Category 'states', got '%s'", seeds[1].Category)
+	}
+}
+
+// TestCrawler_LoadSeedsFromAPI_DefaultDepth checks that a seed exported with
+// depth 0 receives the crawler's own default depth as its MaxDepth.
+func TestCrawler_LoadSeedsFromAPI_DefaultDepth(t *testing.T) {
+	// Create mock server with seed that has no depth
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(`{
+			"seeds": [
+				{"url": "https://www.example.com", "trust": 0.5, "source": "EDU", "scope": "FEDERAL", "state": "", "depth": 0, "category": "edu"}
+			],
+			"total": 1,
+			"exported_at": "2025-01-17T10:00:00Z"
+		}`))
+	}))
+	defer server.Close()
+
+	defaultDepth := 5
+	crawler := NewCrawler("TestBot/1.0", 1.0, defaultDepth)
+	crawler.SetAPIClient(server.URL)
+	ctx := context.Background()
+
+	seeds, err := crawler.LoadSeedsFromAPI(ctx)
+
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	// Guard the index below: an empty result should be reported as a test
+	// failure, not crash the test binary with an out-of-range panic.
+	if len(seeds) != 1 {
+		t.Fatalf("Expected 1 seed, got %d", len(seeds))
+	}
+
+	// When depth is 0 or not specified, it should use crawler's default
+	if seeds[0].MaxDepth != defaultDepth {
+		t.Errorf("Expected default MaxDepth %d, got %d", defaultDepth, seeds[0].MaxDepth)
+	}
+}
+
+func TestCrawler_LoadSeedsWithMetadata(t *testing.T) {
+	dir := t.TempDir()
+
+	seedContent := `https://www.kmk.org
+https://www.bildungsserver.de`
+
+	if err := os.WriteFile(filepath.Join(dir, "seeds.txt"), []byte(seedContent), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	defaultDepth := 4
+	crawler := NewCrawler("TestBot/1.0", 1.0, defaultDepth)
+	seeds, err := crawler.LoadSeedsWithMetadata(dir)
+
+	if err != nil {
+		t.Fatalf("LoadSeedsWithMetadata failed: %v", err)
+	}
+
+	if len(seeds) != 2 {
+		t.Fatalf("Expected 2 seeds, got %d", len(seeds))
+	}
+
+	// Check default values
+	for _, seed := range seeds {
+		if seed.TrustBoost != 0.5 {
+			t.Errorf("Expected default TrustBoost 0.5, got %f", seed.TrustBoost)
+		}
+		if seed.MaxDepth != defaultDepth {
+			t.Errorf("Expected default MaxDepth %d, got %d", defaultDepth, seed.MaxDepth)
+		}
+	}
+}
+
+// TestSeed_Struct builds a Seed literal and checks that every field reads
+// back the value it was given.
+func TestSeed_Struct(t *testing.T) {
+	seed := Seed{
+		URL:        "https://www.example.com",
+		TrustBoost: 0.75,
+		Source:     "GOV",
+		Scope:      "STATE",
+		State:      "BY",
+		MaxDepth:   3,
+		Category:   "states",
+	}
+
+	// One entry per field: whether it differs, and the message to report.
+	checks := []struct {
+		mismatch bool
+		msg      string
+	}{
+		{seed.URL != "https://www.example.com", "URL mismatch"},
+		{seed.TrustBoost != 0.75, "TrustBoost mismatch"},
+		{seed.Source != "GOV", "Source mismatch"},
+		{seed.Scope != "STATE", "Scope mismatch"},
+		{seed.State != "BY", "State mismatch"},
+		{seed.MaxDepth != 3, "MaxDepth mismatch"},
+		{seed.Category != "states", "Category mismatch"},
+	}
+	for _, chk := range checks {
+		if chk.mismatch {
+			t.Error(chk.msg)
+		}
+	}
+}