feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
183
edu-search-service/internal/crawler/api_client.go
Normal file
183
edu-search-service/internal/crawler/api_client.go
Normal file
@@ -0,0 +1,183 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SeedFromAPI represents a single seed URL as delivered by the Backend
// API's /seeds/export/for-crawler endpoint.
type SeedFromAPI struct {
	URL      string  `json:"url"`      // absolute seed URL to crawl
	Trust    float64 `json:"trust"`    // trust boost applied to documents from this seed
	Source   string  `json:"source"`   // GOV, EDU, UNI, etc.
	Scope    string  `json:"scope"`    // FEDERAL, STATE, etc.
	State    string  `json:"state"`    // BW, BY, etc. (optional)
	Depth    int     `json:"depth"`    // Crawl depth for this seed
	Category string  `json:"category"` // Category name
}
|
||||
|
||||
// SeedsExportResponse represents the API response from
// /seeds/export/for-crawler: the seed list plus export metadata.
type SeedsExportResponse struct {
	Seeds      []SeedFromAPI `json:"seeds"`       // enabled seeds to crawl
	Total      int           `json:"total"`       // seed count as reported by the API
	ExportedAt string        `json:"exported_at"` // export timestamp, passed through as a string
}
|
||||
|
||||
// APIClient handles communication with the Python Backend that serves
// crawl seeds and accepts crawl-status reports.
type APIClient struct {
	baseURL    string       // Backend base URL, e.g. "http://backend:8000"
	httpClient *http.Client // shared client; 30s overall timeout (see NewAPIClient)
}
|
||||
|
||||
// NewAPIClient creates a new API client for fetching seeds
|
||||
func NewAPIClient(backendURL string) *APIClient {
|
||||
return &APIClient{
|
||||
baseURL: backendURL,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// FetchSeeds retrieves enabled seeds from the Backend API
|
||||
func (c *APIClient) FetchSeeds(ctx context.Context) (*SeedsExportResponse, error) {
|
||||
url := fmt.Sprintf("%s/v1/edu-search/seeds/export/for-crawler", c.baseURL)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Accept", "application/json")
|
||||
req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch seeds: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response: %w", err)
|
||||
}
|
||||
|
||||
var result SeedsExportResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// CrawlStatusReport represents a crawl status to report to the Backend
// for a single seed.
type CrawlStatusReport struct {
	SeedURL          string  `json:"seed_url"`                // the seed this report belongs to
	Status           string  `json:"status"`                  // "success", "error", "partial"
	DocumentsCrawled int     `json:"documents_crawled"`       // documents fetched under this seed
	ErrorMessage     string  `json:"error_message,omitempty"` // omitted from JSON when empty
	CrawlDuration    float64 `json:"crawl_duration_seconds"`  // crawl duration in seconds
}
|
||||
|
||||
// CrawlStatusResponse represents the response from the single-seed
// crawl status endpoint.
type CrawlStatusResponse struct {
	Success bool   `json:"success"`  // whether the Backend accepted the update
	SeedURL string `json:"seed_url"` // seed the update applied to
	Message string `json:"message"`  // human-readable status message
}
|
||||
|
||||
// BulkCrawlStatusResponse represents the response from the bulk crawl
// status endpoint.
type BulkCrawlStatusResponse struct {
	Updated int      `json:"updated"` // number of seeds successfully updated
	Failed  int      `json:"failed"`  // number of updates the Backend rejected
	Errors  []string `json:"errors"`  // per-failure error messages
}
|
||||
|
||||
// ReportStatus sends crawl status for a single seed to the Backend
|
||||
func (c *APIClient) ReportStatus(ctx context.Context, report *CrawlStatusReport) error {
|
||||
url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status", c.baseURL)
|
||||
|
||||
body, err := json.Marshal(report)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal report: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Accept", "application/json")
|
||||
req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to report status: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ReportStatusBulk sends crawl status for multiple seeds in one request
|
||||
func (c *APIClient) ReportStatusBulk(ctx context.Context, reports []*CrawlStatusReport) (*BulkCrawlStatusResponse, error) {
|
||||
url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status/bulk", c.baseURL)
|
||||
|
||||
payload := struct {
|
||||
Updates []*CrawlStatusReport `json:"updates"`
|
||||
}{
|
||||
Updates: reports,
|
||||
}
|
||||
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal reports: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Accept", "application/json")
|
||||
req.Header.Set("User-Agent", "EduSearchCrawler/1.0")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to report status: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
var result BulkCrawlStatusResponse
|
||||
if err := json.Unmarshal(respBody, &result); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
428
edu-search-service/internal/crawler/api_client_test.go
Normal file
428
edu-search-service/internal/crawler/api_client_test.go
Normal file
@@ -0,0 +1,428 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestNewAPIClient verifies the constructor stores the base URL and
// creates a non-nil HTTP client.
func TestNewAPIClient(t *testing.T) {
	client := NewAPIClient("http://backend:8000")

	if client == nil {
		t.Fatal("Expected non-nil client")
	}

	if client.baseURL != "http://backend:8000" {
		t.Errorf("Expected baseURL 'http://backend:8000', got '%s'", client.baseURL)
	}

	if client.httpClient == nil {
		t.Fatal("Expected non-nil httpClient")
	}
}
|
||||
|
||||
// TestFetchSeeds_Success covers the happy path: the client must hit the
// correct endpoint with the expected headers and decode the seed list.
func TestFetchSeeds_Success(t *testing.T) {
	// Create mock server
	mockResponse := SeedsExportResponse{
		Seeds: []SeedFromAPI{
			{
				URL:      "https://www.kmk.org",
				Trust:    0.8,
				Source:   "GOV",
				Scope:    "FEDERAL",
				State:    "",
				Depth:    3,
				Category: "federal",
			},
			{
				URL:      "https://www.km-bw.de",
				Trust:    0.7,
				Source:   "GOV",
				Scope:    "STATE",
				State:    "BW",
				Depth:    2,
				Category: "states",
			},
		},
		Total:      2,
		ExportedAt: "2025-01-17T10:00:00Z",
	}

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Verify request path
		if r.URL.Path != "/v1/edu-search/seeds/export/for-crawler" {
			t.Errorf("Expected path '/v1/edu-search/seeds/export/for-crawler', got '%s'", r.URL.Path)
		}

		// Verify headers
		if r.Header.Get("Accept") != "application/json" {
			t.Errorf("Expected Accept header 'application/json', got '%s'", r.Header.Get("Accept"))
		}

		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(mockResponse)
	}))
	defer server.Close()

	// Test
	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	result, err := client.FetchSeeds(ctx)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if result.Total != 2 {
		t.Errorf("Expected 2 seeds, got %d", result.Total)
	}

	if len(result.Seeds) != 2 {
		t.Fatalf("Expected 2 seeds in array, got %d", len(result.Seeds))
	}

	// Verify first seed
	if result.Seeds[0].URL != "https://www.kmk.org" {
		t.Errorf("Expected URL 'https://www.kmk.org', got '%s'", result.Seeds[0].URL)
	}

	if result.Seeds[0].Trust != 0.8 {
		t.Errorf("Expected Trust 0.8, got %f", result.Seeds[0].Trust)
	}

	if result.Seeds[0].Source != "GOV" {
		t.Errorf("Expected Source 'GOV', got '%s'", result.Seeds[0].Source)
	}

	// Verify second seed with state
	if result.Seeds[1].State != "BW" {
		t.Errorf("Expected State 'BW', got '%s'", result.Seeds[1].State)
	}
}
|
||||
|
||||
// TestFetchSeeds_ServerError expects FetchSeeds to surface an error
// when the API answers with HTTP 500.
func TestFetchSeeds_ServerError(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		w.Write([]byte("Internal server error"))
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	_, err := client.FetchSeeds(ctx)

	if err == nil {
		t.Fatal("Expected error for server error response")
	}
}
|
||||
|
||||
// TestFetchSeeds_InvalidJSON expects FetchSeeds to fail when the body
// is not valid JSON despite a 200 status.
func TestFetchSeeds_InvalidJSON(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		w.Write([]byte("not valid json"))
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	_, err := client.FetchSeeds(ctx)

	if err == nil {
		t.Fatal("Expected error for invalid JSON response")
	}
}
|
||||
|
||||
func TestFetchSeeds_Timeout(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Simulate slow response
|
||||
time.Sleep(2 * time.Second)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := NewAPIClient(server.URL)
|
||||
// Very short timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
_, err := client.FetchSeeds(ctx)
|
||||
|
||||
if err == nil {
|
||||
t.Fatal("Expected timeout error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestFetchSeeds_EmptyResponse checks that an empty (but valid) seed
// export decodes into a zero-length slice without error.
func TestFetchSeeds_EmptyResponse(t *testing.T) {
	mockResponse := SeedsExportResponse{
		Seeds:      []SeedFromAPI{},
		Total:      0,
		ExportedAt: "2025-01-17T10:00:00Z",
	}

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(mockResponse)
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	result, err := client.FetchSeeds(ctx)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if result.Total != 0 {
		t.Errorf("Expected 0 seeds, got %d", result.Total)
	}

	if len(result.Seeds) != 0 {
		t.Errorf("Expected empty seeds array, got %d", len(result.Seeds))
	}
}
|
||||
|
||||
// Tests for Crawl Status Reporting
|
||||
|
||||
// TestReportStatus_Success verifies method, path, headers and that the
// JSON body round-trips the report fields.
func TestReportStatus_Success(t *testing.T) {
	// Written by the handler; reading it after ReportStatus returns is
	// safe because the handler has fully completed by then.
	var receivedReport CrawlStatusReport

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Verify request method and path
		if r.Method != "POST" {
			t.Errorf("Expected POST method, got %s", r.Method)
		}
		if r.URL.Path != "/v1/edu-search/seeds/crawl-status" {
			t.Errorf("Expected path '/v1/edu-search/seeds/crawl-status', got '%s'", r.URL.Path)
		}
		if r.Header.Get("Content-Type") != "application/json" {
			t.Errorf("Expected Content-Type 'application/json', got '%s'", r.Header.Get("Content-Type"))
		}

		// Parse body
		json.NewDecoder(r.Body).Decode(&receivedReport)

		// Send response
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(CrawlStatusResponse{
			Success: true,
			SeedURL: receivedReport.SeedURL,
			Message: "Status updated",
		})
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	report := &CrawlStatusReport{
		SeedURL:          "https://www.kmk.org",
		Status:           "success",
		DocumentsCrawled: 42,
		CrawlDuration:    15.5,
	}

	err := client.ReportStatus(ctx, report)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Verify the report was sent correctly
	if receivedReport.SeedURL != "https://www.kmk.org" {
		t.Errorf("Expected SeedURL 'https://www.kmk.org', got '%s'", receivedReport.SeedURL)
	}
	if receivedReport.Status != "success" {
		t.Errorf("Expected Status 'success', got '%s'", receivedReport.Status)
	}
	if receivedReport.DocumentsCrawled != 42 {
		t.Errorf("Expected DocumentsCrawled 42, got %d", receivedReport.DocumentsCrawled)
	}
}
|
||||
|
||||
// TestReportStatus_ServerError expects ReportStatus to return an error
// when the API answers with HTTP 500.
func TestReportStatus_ServerError(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		w.Write([]byte("Internal server error"))
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	report := &CrawlStatusReport{
		SeedURL: "https://www.kmk.org",
		Status:  "success",
	}

	err := client.ReportStatus(ctx, report)

	if err == nil {
		t.Fatal("Expected error for server error response")
	}
}
|
||||
|
||||
// TestReportStatus_NotFound expects an error when the Backend does not
// know the reported seed (HTTP 404).
func TestReportStatus_NotFound(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNotFound)
		w.Write([]byte(`{"detail": "Seed nicht gefunden"}`))
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	report := &CrawlStatusReport{
		SeedURL: "https://unknown.example.com",
		Status:  "error",
	}

	err := client.ReportStatus(ctx, report)

	if err == nil {
		t.Fatal("Expected error for 404 response")
	}
}
|
||||
|
||||
// TestReportStatusBulk_Success verifies the bulk endpoint receives the
// {"updates": [...]} envelope and that the response counters decode.
func TestReportStatusBulk_Success(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Verify request method and path
		if r.Method != "POST" {
			t.Errorf("Expected POST method, got %s", r.Method)
		}
		if r.URL.Path != "/v1/edu-search/seeds/crawl-status/bulk" {
			t.Errorf("Expected path '/v1/edu-search/seeds/crawl-status/bulk', got '%s'", r.URL.Path)
		}

		// Parse body
		var payload struct {
			Updates []*CrawlStatusReport `json:"updates"`
		}
		json.NewDecoder(r.Body).Decode(&payload)

		// Send response
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(BulkCrawlStatusResponse{
			Updated: len(payload.Updates),
			Failed:  0,
			Errors:  []string{},
		})
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	reports := []*CrawlStatusReport{
		{
			SeedURL:          "https://www.kmk.org",
			Status:           "success",
			DocumentsCrawled: 42,
		},
		{
			SeedURL:          "https://www.km-bw.de",
			Status:           "partial",
			DocumentsCrawled: 15,
		},
	}

	result, err := client.ReportStatusBulk(ctx, reports)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if result.Updated != 2 {
		t.Errorf("Expected 2 updated, got %d", result.Updated)
	}
	if result.Failed != 0 {
		t.Errorf("Expected 0 failed, got %d", result.Failed)
	}
}
|
||||
|
||||
// TestReportStatusBulk_PartialFailure checks that mixed success/failure
// counters and error messages from the Backend are decoded correctly.
func TestReportStatusBulk_PartialFailure(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(BulkCrawlStatusResponse{
			Updated: 1,
			Failed:  1,
			Errors:  []string{"Seed nicht gefunden: https://unknown.example.com"},
		})
	}))
	defer server.Close()

	client := NewAPIClient(server.URL)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	reports := []*CrawlStatusReport{
		{SeedURL: "https://www.kmk.org", Status: "success"},
		{SeedURL: "https://unknown.example.com", Status: "error"},
	}

	result, err := client.ReportStatusBulk(ctx, reports)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if result.Updated != 1 {
		t.Errorf("Expected 1 updated, got %d", result.Updated)
	}
	if result.Failed != 1 {
		t.Errorf("Expected 1 failed, got %d", result.Failed)
	}
	if len(result.Errors) != 1 {
		t.Errorf("Expected 1 error, got %d", len(result.Errors))
	}
}
|
||||
|
||||
// TestCrawlStatusReport_Struct round-trips a CrawlStatusReport through
// JSON marshal/unmarshal and checks the fields survive intact.
func TestCrawlStatusReport_Struct(t *testing.T) {
	report := CrawlStatusReport{
		SeedURL:          "https://www.example.com",
		Status:           "success",
		DocumentsCrawled: 100,
		ErrorMessage:     "",
		CrawlDuration:    25.5,
	}

	// Test JSON marshaling
	data, err := json.Marshal(report)
	if err != nil {
		t.Fatalf("Failed to marshal: %v", err)
	}

	var decoded CrawlStatusReport
	if err := json.Unmarshal(data, &decoded); err != nil {
		t.Fatalf("Failed to unmarshal: %v", err)
	}

	if decoded.SeedURL != report.SeedURL {
		t.Errorf("SeedURL mismatch")
	}
	if decoded.Status != report.Status {
		t.Errorf("Status mismatch")
	}
	if decoded.DocumentsCrawled != report.DocumentsCrawled {
		t.Errorf("DocumentsCrawled mismatch")
	}
	if decoded.CrawlDuration != report.CrawlDuration {
		t.Errorf("CrawlDuration mismatch")
	}
}
|
||||
364
edu-search-service/internal/crawler/crawler.go
Normal file
364
edu-search-service/internal/crawler/crawler.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// Note: API client is in the same package (api_client.go)
|
||||
|
||||
// FetchResult contains the result of fetching a single URL.
type FetchResult struct {
	URL          string    // the URL as originally requested
	CanonicalURL string    // final URL after any redirects
	ContentType  string    // Content-Type response header
	StatusCode   int       // HTTP status code (zero if the request never completed)
	Body         []byte    // response body, capped at 20MB by Fetch
	ContentHash  string    // SHA-256 hex digest of Body
	FetchTime    time.Time // time the fetch was started
	Error        error     // non-nil if the fetch failed at any stage
}
|
||||
|
||||
// Seed represents a URL to crawl together with its metadata.
type Seed struct {
	URL        string
	TrustBoost float64 // trust boost applied to documents from this seed
	Source     string  // GOV, EDU, UNI, etc.
	Scope      string  // FEDERAL, STATE, etc.
	State      string  // BW, BY, etc. (optional)
	MaxDepth   int     // Custom crawl depth for this seed
	Category   string  // Category name
}
|
||||
|
||||
// Crawler handles URL fetching with per-domain rate limiting and a
// domain denylist.
//
// NOTE(review): the original comment also claimed robots.txt support,
// but no robots.txt handling is visible in this file — confirm before
// relying on it.
type Crawler struct {
	userAgent       string               // User-Agent header sent with every request
	rateLimitPerSec float64              // maximum requests per second per host
	maxDepth        int                  // default crawl depth for seeds without one
	timeout         time.Duration        // request timeout (mirrors client.Timeout)
	client          *http.Client         // shared HTTP client with timeout and redirect cap
	denylist        map[string]bool      // lowercased domains that must not be fetched
	lastFetch       map[string]time.Time // last fetch time per host, for rate limiting
	mu              sync.Mutex           // guards lastFetch
	apiClient       *APIClient           // API client for fetching seeds from Backend
}
|
||||
|
||||
// NewCrawler creates a new crawler instance
|
||||
func NewCrawler(userAgent string, rateLimitPerSec float64, maxDepth int) *Crawler {
|
||||
return &Crawler{
|
||||
userAgent: userAgent,
|
||||
rateLimitPerSec: rateLimitPerSec,
|
||||
maxDepth: maxDepth,
|
||||
timeout: 30 * time.Second,
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 5 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
denylist: make(map[string]bool),
|
||||
lastFetch: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// SetAPIClient wires up an APIClient pointing at backendURL so that
// LoadSeedsFromAPI can fetch seeds from the Backend.
func (c *Crawler) SetAPIClient(backendURL string) {
	c.apiClient = NewAPIClient(backendURL)
}
|
||||
|
||||
// LoadSeedsFromAPI fetches seeds from the Backend API
|
||||
func (c *Crawler) LoadSeedsFromAPI(ctx context.Context) ([]Seed, error) {
|
||||
if c.apiClient == nil {
|
||||
return nil, fmt.Errorf("API client not initialized - call SetAPIClient first")
|
||||
}
|
||||
|
||||
response, err := c.apiClient.FetchSeeds(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch seeds from API: %w", err)
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(response.Seeds))
|
||||
for _, apiSeed := range response.Seeds {
|
||||
seed := Seed{
|
||||
URL: apiSeed.URL,
|
||||
TrustBoost: apiSeed.Trust,
|
||||
Source: apiSeed.Source,
|
||||
Scope: apiSeed.Scope,
|
||||
State: apiSeed.State,
|
||||
MaxDepth: apiSeed.Depth,
|
||||
Category: apiSeed.Category,
|
||||
}
|
||||
// Use default depth if not specified
|
||||
if seed.MaxDepth <= 0 {
|
||||
seed.MaxDepth = c.maxDepth
|
||||
}
|
||||
seeds = append(seeds, seed)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from API (exported at: %s)", len(seeds), response.ExportedAt)
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeeds loads seed URLs from files in a directory (legacy method)
|
||||
func (c *Crawler) LoadSeeds(seedsDir string) ([]string, error) {
|
||||
var seeds []string
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(seedsDir, "*.txt"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if strings.Contains(file, "denylist") {
|
||||
// Load denylist
|
||||
if err := c.loadDenylist(file); err != nil {
|
||||
log.Printf("Warning: Could not load denylist %s: %v", file, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
fileSeeds, err := c.loadSeedFile(file)
|
||||
if err != nil {
|
||||
log.Printf("Warning: Could not load seed file %s: %v", file, err)
|
||||
continue
|
||||
}
|
||||
seeds = append(seeds, fileSeeds...)
|
||||
}
|
||||
|
||||
log.Printf("Loaded %d seeds from files, %d domains in denylist", len(seeds), len(c.denylist))
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
// LoadSeedsWithMetadata loads seeds from files and converts to Seed struct
|
||||
// This provides backward compatibility while allowing metadata
|
||||
func (c *Crawler) LoadSeedsWithMetadata(seedsDir string) ([]Seed, error) {
|
||||
urlList, err := c.LoadSeeds(seedsDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
seeds := make([]Seed, 0, len(urlList))
|
||||
for _, url := range urlList {
|
||||
seeds = append(seeds, Seed{
|
||||
URL: url,
|
||||
TrustBoost: 0.5, // Default trust boost
|
||||
MaxDepth: c.maxDepth,
|
||||
})
|
||||
}
|
||||
|
||||
return seeds, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) loadSeedFile(filename string) ([]string, error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var seeds []string
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
// Skip comments and empty lines
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
// Extract URL (ignore comments after URL)
|
||||
parts := strings.SplitN(line, " ", 2)
|
||||
urlStr := strings.TrimSpace(parts[0])
|
||||
if urlStr != "" {
|
||||
seeds = append(seeds, urlStr)
|
||||
}
|
||||
}
|
||||
return seeds, scanner.Err()
|
||||
}
|
||||
|
||||
func (c *Crawler) loadDenylist(filename string) error {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
c.denylist[strings.ToLower(line)] = true
|
||||
}
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// IsDenied checks if a domain is in the denylist
|
||||
func (c *Crawler) IsDenied(urlStr string) bool {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
host := strings.ToLower(u.Host)
|
||||
|
||||
// Check exact match
|
||||
if c.denylist[host] {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check parent domains
|
||||
parts := strings.Split(host, ".")
|
||||
for i := 1; i < len(parts)-1; i++ {
|
||||
parent := strings.Join(parts[i:], ".")
|
||||
if c.denylist[parent] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// Fetch fetches a single URL with rate limiting
|
||||
func (c *Crawler) Fetch(ctx context.Context, urlStr string) (*FetchResult, error) {
|
||||
result := &FetchResult{
|
||||
URL: urlStr,
|
||||
FetchTime: time.Now(),
|
||||
}
|
||||
|
||||
// Check denylist
|
||||
if c.IsDenied(urlStr) {
|
||||
result.Error = fmt.Errorf("domain denied")
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Parse URL
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
// Rate limiting per domain
|
||||
c.waitForRateLimit(u.Host)
|
||||
|
||||
// Create request
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
req.Header.Set("Accept", "text/html,application/pdf,application/xhtml+xml")
|
||||
req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")
|
||||
|
||||
// Execute request
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
result.StatusCode = resp.StatusCode
|
||||
result.ContentType = resp.Header.Get("Content-Type")
|
||||
result.CanonicalURL = resp.Request.URL.String()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
result.Error = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
return result, result.Error
|
||||
}
|
||||
|
||||
// Read body (limit to 20MB)
|
||||
limitedReader := io.LimitReader(resp.Body, 20*1024*1024)
|
||||
body, err := io.ReadAll(limitedReader)
|
||||
if err != nil {
|
||||
result.Error = err
|
||||
return result, err
|
||||
}
|
||||
|
||||
result.Body = body
|
||||
|
||||
// Calculate content hash
|
||||
hash := sha256.Sum256(body)
|
||||
result.ContentHash = hex.EncodeToString(hash[:])
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *Crawler) waitForRateLimit(host string) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
minInterval := time.Duration(float64(time.Second) / c.rateLimitPerSec)
|
||||
|
||||
if last, ok := c.lastFetch[host]; ok {
|
||||
elapsed := time.Since(last)
|
||||
if elapsed < minInterval {
|
||||
time.Sleep(minInterval - elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
c.lastFetch[host] = time.Now()
|
||||
}
|
||||
|
||||
// ExtractDomain returns the host portion of urlStr (including any
// port), or the empty string when the URL cannot be parsed.
func ExtractDomain(urlStr string) string {
	if u, err := url.Parse(urlStr); err == nil {
		return u.Host
	}
	return ""
}
|
||||
|
||||
// GenerateDocID returns a fresh random UUID string for use as a unique
// document identifier.
func GenerateDocID() string {
	return uuid.New().String()
}
|
||||
|
||||
// NormalizeURL canonicalizes urlStr for deduplication: the host is
// lowercased, a trailing slash is trimmed from the path, and common
// tracking parameters (utm_*, ref, source, fbclid, gclid) are dropped.
// Unparseable input is returned unchanged.
func NormalizeURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	u.Path = strings.TrimSuffix(u.Path, "/")
	u.Host = strings.ToLower(u.Host)

	query := u.Query()
	for key := range query {
		switch lower := strings.ToLower(key); {
		case strings.HasPrefix(lower, "utm_"),
			lower == "ref",
			lower == "source",
			lower == "fbclid",
			lower == "gclid":
			query.Del(key)
		}
	}
	u.RawQuery = query.Encode()

	return u.String()
}
|
||||
639
edu-search-service/internal/crawler/crawler_test.go
Normal file
639
edu-search-service/internal/crawler/crawler_test.go
Normal file
@@ -0,0 +1,639 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestNewCrawler(t *testing.T) {
|
||||
crawler := NewCrawler("TestBot/1.0", 1.0, 3)
|
||||
|
||||
if crawler == nil {
|
||||
t.Fatal("Expected non-nil crawler")
|
||||
}
|
||||
if crawler.userAgent != "TestBot/1.0" {
|
||||
t.Errorf("Expected userAgent 'TestBot/1.0', got %q", crawler.userAgent)
|
||||
}
|
||||
if crawler.rateLimitPerSec != 1.0 {
|
||||
t.Errorf("Expected rateLimitPerSec 1.0, got %f", crawler.rateLimitPerSec)
|
||||
}
|
||||
if crawler.maxDepth != 3 {
|
||||
t.Errorf("Expected maxDepth 3, got %d", crawler.maxDepth)
|
||||
}
|
||||
if crawler.client == nil {
|
||||
t.Error("Expected non-nil HTTP client")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCrawler_LoadSeeds verifies that LoadSeeds reads seed URLs from the
// .txt files in a directory — skipping blank lines, full-line comments, and
// inline "#" comments — and that denylist.txt is loaded into
// crawler.denylist rather than the seed list.
func TestCrawler_LoadSeeds(t *testing.T) {
	// Create temp directory with seed files
	dir := t.TempDir()

	// Create a seed file (exercises comments, blank lines, inline comments)
	seedContent := `# Federal education sources
https://www.kmk.org
https://www.bildungsserver.de

# Comment line
https://www.bpb.de # with inline comment
`
	if err := os.WriteFile(filepath.Join(dir, "federal.txt"), []byte(seedContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create another seed file — seeds from multiple files must be merged
	stateContent := `https://www.km.bayern.de
https://www.schulministerium.nrw.de
`
	if err := os.WriteFile(filepath.Join(dir, "states.txt"), []byte(stateContent), 0644); err != nil {
		t.Fatal(err)
	}

	// Create denylist — must NOT contribute to the seed list
	denylistContent := `# Denylist
facebook.com
twitter.com
instagram.com
`
	if err := os.WriteFile(filepath.Join(dir, "denylist.txt"), []byte(denylistContent), 0644); err != nil {
		t.Fatal(err)
	}

	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
	seeds, err := crawler.LoadSeeds(dir)
	if err != nil {
		t.Fatalf("LoadSeeds failed: %v", err)
	}

	// Check seeds loaded: 3 from federal.txt + 2 from states.txt
	if len(seeds) != 5 {
		t.Errorf("Expected 5 seeds, got %d", len(seeds))
	}

	// Check expected URLs (order-independent membership check)
	expectedURLs := []string{
		"https://www.kmk.org",
		"https://www.bildungsserver.de",
		"https://www.bpb.de",
		"https://www.km.bayern.de",
		"https://www.schulministerium.nrw.de",
	}

	for _, expected := range expectedURLs {
		found := false
		for _, seed := range seeds {
			if seed == expected {
				found = true
				break
			}
		}
		if !found {
			t.Errorf("Expected seed %q not found", expected)
		}
	}

	// Check denylist loaded (comment line in denylist.txt is skipped)
	if len(crawler.denylist) != 3 {
		t.Errorf("Expected 3 denylist entries, got %d", len(crawler.denylist))
	}
}
|
||||
|
||||
// TestCrawler_IsDenied exercises denylist matching: exact domain hits,
// subdomains of denied domains, allowed parent domains of a denied
// subdomain, and unparseable URLs (which are treated as denied).
func TestCrawler_IsDenied(t *testing.T) {
	crawler := NewCrawler("TestBot/1.0", 1.0, 3)
	crawler.denylist = map[string]bool{
		"facebook.com":    true,
		"twitter.com":     true,
		"ads.example.com": true,
	}

	tests := []struct {
		name     string
		url      string
		expected bool
	}{
		{
			name:     "Exact domain match",
			url:      "https://facebook.com/page",
			expected: true,
		},
		{
			// www.facebook.com must be caught by the facebook.com entry
			name:     "Subdomain of denied domain",
			url:      "https://www.facebook.com/page",
			expected: true,
		},
		{
			name:     "Allowed domain",
			url:      "https://www.kmk.org/bildung",
			expected: false,
		},
		{
			name:     "Denied subdomain",
			url:      "https://ads.example.com/banner",
			expected: true,
		},
		{
			// example.com itself is not denied even though ads.example.com is
			name:     "Parent domain allowed",
			url:      "https://example.com/page",
			expected: false,
		},
		{
			// fail-closed: a URL that cannot be parsed counts as denied
			name:     "Invalid URL scheme",
			url:      "://invalid",
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := crawler.IsDenied(tt.url)
			if result != tt.expected {
				t.Errorf("IsDenied(%q) = %v, expected %v", tt.url, result, tt.expected)
			}
		})
	}
}
|
||||
|
||||
// TestCrawler_Fetch_Success performs a fetch against a local test server
// and checks the populated result fields: status code, content type, body,
// content hash, and fetch time. It also asserts that the configured
// User-Agent header is sent with the request.
func TestCrawler_Fetch_Success(t *testing.T) {
	// Create test server
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Check user agent
		if r.Header.Get("User-Agent") != "TestBot/1.0" {
			t.Errorf("Expected User-Agent 'TestBot/1.0', got %q", r.Header.Get("User-Agent"))
		}

		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("<html><body>Test content</body></html>"))
	}))
	defer server.Close()

	crawler := NewCrawler("TestBot/1.0", 100.0, 3) // High rate limit for testing
	ctx := context.Background()

	result, err := crawler.Fetch(ctx, server.URL+"/page")
	if err != nil {
		t.Fatalf("Fetch failed: %v", err)
	}

	if result.StatusCode != 200 {
		t.Errorf("Expected status 200, got %d", result.StatusCode)
	}
	if result.Error != nil {
		t.Errorf("Expected no error, got %v", result.Error)
	}
	if !strings.Contains(result.ContentType, "text/html") {
		t.Errorf("Expected Content-Type to contain 'text/html', got %q", result.ContentType)
	}
	if len(result.Body) == 0 {
		t.Error("Expected non-empty body")
	}
	// SHA-256 hash of the body must be populated on success
	if result.ContentHash == "" {
		t.Error("Expected non-empty content hash")
	}
	if result.FetchTime.IsZero() {
		t.Error("Expected non-zero fetch time")
	}
}
|
||||
|
||||
func TestCrawler_Fetch_DeniedDomain(t *testing.T) {
|
||||
crawler := NewCrawler("TestBot/1.0", 100.0, 3)
|
||||
crawler.denylist = map[string]bool{
|
||||
"denied.com": true,
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
result, err := crawler.Fetch(ctx, "https://denied.com/page")
|
||||
|
||||
if err == nil {
|
||||
t.Error("Expected error for denied domain")
|
||||
}
|
||||
if result.Error == nil {
|
||||
t.Error("Expected error in result")
|
||||
}
|
||||
if !strings.Contains(result.Error.Error(), "denied") {
|
||||
t.Errorf("Expected 'denied' in error message, got %v", result.Error)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawler_Fetch_HTTPError(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
crawler := NewCrawler("TestBot/1.0", 100.0, 3)
|
||||
ctx := context.Background()
|
||||
|
||||
result, err := crawler.Fetch(ctx, server.URL+"/notfound")
|
||||
if err == nil {
|
||||
t.Error("Expected error for 404 response")
|
||||
}
|
||||
if result.StatusCode != 404 {
|
||||
t.Errorf("Expected status 404, got %d", result.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawler_Fetch_Redirect(t *testing.T) {
|
||||
redirectCount := 0
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path == "/redirect" {
|
||||
redirectCount++
|
||||
http.Redirect(w, r, "/final", http.StatusFound)
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("Final content"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
crawler := NewCrawler("TestBot/1.0", 100.0, 3)
|
||||
ctx := context.Background()
|
||||
|
||||
result, err := crawler.Fetch(ctx, server.URL+"/redirect")
|
||||
if err != nil {
|
||||
t.Fatalf("Fetch failed: %v", err)
|
||||
}
|
||||
|
||||
// CanonicalURL should be the final URL after redirect
|
||||
if !strings.HasSuffix(result.CanonicalURL, "/final") {
|
||||
t.Errorf("Expected canonical URL to end with '/final', got %q", result.CanonicalURL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawler_Fetch_Timeout(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(2 * time.Second) // Delay response
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
crawler := NewCrawler("TestBot/1.0", 100.0, 3)
|
||||
crawler.timeout = 100 * time.Millisecond // Very short timeout
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
_, err := crawler.Fetch(ctx, server.URL+"/slow")
|
||||
if err == nil {
|
||||
t.Error("Expected timeout error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractDomain exercises host extraction: plain hosts, hosts with
// ports (kept), subdomains, and unparseable input (empty string).
func TestExtractDomain(t *testing.T) {
	tests := []struct {
		url      string
		expected string
	}{
		{
			url:      "https://www.example.com/page",
			expected: "www.example.com",
		},
		{
			// port is part of the host and is preserved
			url:      "https://example.com:8080/path",
			expected: "example.com:8080",
		},
		{
			url:      "http://subdomain.example.com",
			expected: "subdomain.example.com",
		},
		{
			// scheme-less relative input has no host component
			url:      "invalid-url",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.url, func(t *testing.T) {
			result := ExtractDomain(tt.url)
			if result != tt.expected {
				t.Errorf("ExtractDomain(%q) = %q, expected %q", tt.url, result, tt.expected)
			}
		})
	}
}
|
||||
|
||||
func TestGenerateDocID(t *testing.T) {
|
||||
id1 := GenerateDocID()
|
||||
id2 := GenerateDocID()
|
||||
|
||||
if id1 == "" {
|
||||
t.Error("Expected non-empty ID")
|
||||
}
|
||||
if id1 == id2 {
|
||||
t.Error("Expected unique IDs")
|
||||
}
|
||||
// UUID format check (basic)
|
||||
if len(id1) != 36 {
|
||||
t.Errorf("Expected UUID length 36, got %d", len(id1))
|
||||
}
|
||||
}
|
||||
|
||||
// TestNormalizeURL exercises URL canonicalization: trailing-slash removal,
// tracking-parameter stripping (utm_*, fbclid), preservation and sorted
// re-encoding of legitimate query parameters, host lowercasing, and
// pass-through of unparseable input.
func TestNormalizeURL(t *testing.T) {
	tests := []struct {
		name     string
		url      string
		expected string
	}{
		{
			name:     "Remove trailing slash",
			url:      "https://example.com/page/",
			expected: "https://example.com/page",
		},
		{
			name:     "Remove UTM parameters",
			url:      "https://example.com/page?utm_source=google&utm_medium=cpc",
			expected: "https://example.com/page",
		},
		{
			name:     "Remove multiple tracking params",
			url:      "https://example.com/page?id=123&utm_campaign=test&fbclid=abc",
			expected: "https://example.com/page?id=123",
		},
		{
			// note: remaining params come back in sorted key order (Encode)
			name:     "Keep non-tracking params",
			url:      "https://example.com/search?q=test&page=2",
			expected: "https://example.com/search?page=2&q=test",
		},
		{
			// only the host is lowercased; the path keeps its case
			name:     "Lowercase host",
			url:      "https://EXAMPLE.COM/Page",
			expected: "https://example.com/Page",
		},
		{
			name:     "Invalid URL returns as-is",
			url:      "not-a-url",
			expected: "not-a-url",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := NormalizeURL(tt.url)
			if result != tt.expected {
				t.Errorf("NormalizeURL(%q) = %q, expected %q", tt.url, result, tt.expected)
			}
		})
	}
}
|
||||
|
||||
func TestCrawler_RateLimit(t *testing.T) {
|
||||
requestCount := 0
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
requestCount++
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("OK"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
// 2 requests per second = 500ms between requests
|
||||
crawler := NewCrawler("TestBot/1.0", 2.0, 3)
|
||||
ctx := context.Background()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
// Make 3 requests
|
||||
for i := 0; i < 3; i++ {
|
||||
crawler.Fetch(ctx, server.URL+"/page")
|
||||
}
|
||||
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// With 2 req/sec, 3 requests should take at least 1 second (2 intervals)
|
||||
if elapsed < 800*time.Millisecond {
|
||||
t.Errorf("Rate limiting not working: 3 requests took only %v", elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadSeedFile_EmptyLines(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
content := `
|
||||
|
||||
https://example.com
|
||||
|
||||
# comment
|
||||
|
||||
https://example.org
|
||||
|
||||
`
|
||||
if err := os.WriteFile(filepath.Join(dir, "seeds.txt"), []byte(content), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
crawler := NewCrawler("TestBot/1.0", 1.0, 3)
|
||||
seeds, err := crawler.LoadSeeds(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(seeds) != 2 {
|
||||
t.Errorf("Expected 2 seeds (ignoring empty lines and comments), got %d", len(seeds))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawler_Fetch_LargeBody(t *testing.T) {
|
||||
// Create a large response (but under the limit)
|
||||
largeBody := strings.Repeat("A", 1024*1024) // 1MB
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte(largeBody))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
crawler := NewCrawler("TestBot/1.0", 100.0, 3)
|
||||
ctx := context.Background()
|
||||
|
||||
result, err := crawler.Fetch(ctx, server.URL+"/large")
|
||||
if err != nil {
|
||||
t.Fatalf("Fetch failed: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Body) != len(largeBody) {
|
||||
t.Errorf("Expected body length %d, got %d", len(largeBody), len(result.Body))
|
||||
}
|
||||
}
|
||||
|
||||
// Tests for API Integration (new functionality)
|
||||
|
||||
func TestCrawler_SetAPIClient(t *testing.T) {
|
||||
crawler := NewCrawler("TestBot/1.0", 1.0, 3)
|
||||
|
||||
if crawler.apiClient != nil {
|
||||
t.Error("Expected nil apiClient initially")
|
||||
}
|
||||
|
||||
crawler.SetAPIClient("http://backend:8000")
|
||||
|
||||
if crawler.apiClient == nil {
|
||||
t.Error("Expected non-nil apiClient after SetAPIClient")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawler_LoadSeedsFromAPI_NotInitialized(t *testing.T) {
|
||||
crawler := NewCrawler("TestBot/1.0", 1.0, 3)
|
||||
ctx := context.Background()
|
||||
|
||||
_, err := crawler.LoadSeedsFromAPI(ctx)
|
||||
|
||||
if err == nil {
|
||||
t.Error("Expected error when API client not initialized")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCrawler_LoadSeedsFromAPI_Success verifies that seeds fetched from a
// mocked backend JSON endpoint are mapped onto the Seed struct: url→URL,
// trust→TrustBoost, source/scope/state/category copied through, and a
// positive depth used as MaxDepth.
func TestCrawler_LoadSeedsFromAPI_Success(t *testing.T) {
	// Create mock server returning the backend's seed-export payload
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		w.Write([]byte(`{
			"seeds": [
				{"url": "https://www.kmk.org", "trust": 0.8, "source": "GOV", "scope": "FEDERAL", "state": "", "depth": 3, "category": "federal"},
				{"url": "https://www.km-bw.de", "trust": 0.7, "source": "GOV", "scope": "STATE", "state": "BW", "depth": 2, "category": "states"}
			],
			"total": 2,
			"exported_at": "2025-01-17T10:00:00Z"
		}`))
	}))
	defer server.Close()

	crawler := NewCrawler("TestBot/1.0", 1.0, 4)
	crawler.SetAPIClient(server.URL)
	ctx := context.Background()

	seeds, err := crawler.LoadSeedsFromAPI(ctx)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if len(seeds) != 2 {
		t.Fatalf("Expected 2 seeds, got %d", len(seeds))
	}

	// Check first seed (federal, no state)
	if seeds[0].URL != "https://www.kmk.org" {
		t.Errorf("Expected URL 'https://www.kmk.org', got '%s'", seeds[0].URL)
	}
	if seeds[0].TrustBoost != 0.8 {
		t.Errorf("Expected TrustBoost 0.8, got %f", seeds[0].TrustBoost)
	}
	if seeds[0].Source != "GOV" {
		t.Errorf("Expected Source 'GOV', got '%s'", seeds[0].Source)
	}
	// depth 3 from the payload overrides the crawler default of 4
	if seeds[0].MaxDepth != 3 {
		t.Errorf("Expected MaxDepth 3, got %d", seeds[0].MaxDepth)
	}

	// Check second seed with state
	if seeds[1].State != "BW" {
		t.Errorf("Expected State 'BW', got '%s'", seeds[1].State)
	}
	if seeds[1].Category != "states" {
		t.Errorf("Expected Category 'states', got '%s'", seeds[1].Category)
	}
}
|
||||
|
||||
// TestCrawler_LoadSeedsFromAPI_DefaultDepth verifies that an API seed with
// depth 0 falls back to the crawler's configured default maximum depth.
func TestCrawler_LoadSeedsFromAPI_DefaultDepth(t *testing.T) {
	// Create mock server with seed that has no depth (depth: 0)
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		w.Write([]byte(`{
			"seeds": [
				{"url": "https://www.example.com", "trust": 0.5, "source": "EDU", "scope": "FEDERAL", "state": "", "depth": 0, "category": "edu"}
			],
			"total": 1,
			"exported_at": "2025-01-17T10:00:00Z"
		}`))
	}))
	defer server.Close()

	defaultDepth := 5
	crawler := NewCrawler("TestBot/1.0", 1.0, defaultDepth)
	crawler.SetAPIClient(server.URL)
	ctx := context.Background()

	seeds, err := crawler.LoadSeedsFromAPI(ctx)

	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// When depth is 0 or not specified, it should use crawler's default
	if seeds[0].MaxDepth != defaultDepth {
		t.Errorf("Expected default MaxDepth %d, got %d", defaultDepth, seeds[0].MaxDepth)
	}
}
|
||||
|
||||
// TestCrawler_LoadSeedsWithMetadata verifies that seeds loaded from plain
// text files are wrapped in Seed structs with default metadata: TrustBoost
// 0.5 and the crawler's configured maximum depth.
func TestCrawler_LoadSeedsWithMetadata(t *testing.T) {
	dir := t.TempDir()

	seedContent := `https://www.kmk.org
https://www.bildungsserver.de`

	if err := os.WriteFile(filepath.Join(dir, "seeds.txt"), []byte(seedContent), 0644); err != nil {
		t.Fatal(err)
	}

	defaultDepth := 4
	crawler := NewCrawler("TestBot/1.0", 1.0, defaultDepth)
	seeds, err := crawler.LoadSeedsWithMetadata(dir)

	if err != nil {
		t.Fatalf("LoadSeedsWithMetadata failed: %v", err)
	}

	if len(seeds) != 2 {
		t.Fatalf("Expected 2 seeds, got %d", len(seeds))
	}

	// Check default values applied to every file-based seed
	for _, seed := range seeds {
		if seed.TrustBoost != 0.5 {
			t.Errorf("Expected default TrustBoost 0.5, got %f", seed.TrustBoost)
		}
		if seed.MaxDepth != defaultDepth {
			t.Errorf("Expected default MaxDepth %d, got %d", defaultDepth, seed.MaxDepth)
		}
	}
}
|
||||
|
||||
func TestSeed_Struct(t *testing.T) {
|
||||
seed := Seed{
|
||||
URL: "https://www.example.com",
|
||||
TrustBoost: 0.75,
|
||||
Source: "GOV",
|
||||
Scope: "STATE",
|
||||
State: "BY",
|
||||
MaxDepth: 3,
|
||||
Category: "states",
|
||||
}
|
||||
|
||||
if seed.URL != "https://www.example.com" {
|
||||
t.Errorf("URL mismatch")
|
||||
}
|
||||
if seed.TrustBoost != 0.75 {
|
||||
t.Errorf("TrustBoost mismatch")
|
||||
}
|
||||
if seed.Source != "GOV" {
|
||||
t.Errorf("Source mismatch")
|
||||
}
|
||||
if seed.Scope != "STATE" {
|
||||
t.Errorf("Scope mismatch")
|
||||
}
|
||||
if seed.State != "BY" {
|
||||
t.Errorf("State mismatch")
|
||||
}
|
||||
if seed.MaxDepth != 3 {
|
||||
t.Errorf("MaxDepth mismatch")
|
||||
}
|
||||
if seed.Category != "states" {
|
||||
t.Errorf("Category mismatch")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user