Files
breakpilot-lehrer/edu-search-service/internal/crawler/api_client.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

184 lines
5.3 KiB
Go

package crawler
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
)
// SeedFromAPI is one seed entry as delivered by the Backend API.
// Each field is bound to the JSON key named in its struct tag.
type SeedFromAPI struct {
	// URL is the seed address.
	URL string `json:"url"`
	// Trust is the numeric trust value attached to the seed.
	Trust float64 `json:"trust"`
	// Source is the source kind, e.g. GOV, EDU, UNI.
	Source string `json:"source"`
	// Scope is the coverage level, e.g. FEDERAL, STATE.
	Scope string `json:"scope"`
	// State is an optional state code, e.g. BW, BY.
	State string `json:"state"`
	// Depth is the crawl depth to use for this seed.
	Depth int `json:"depth"`
	// Category is the category name.
	Category string `json:"category"`
}
// SeedsExportResponse mirrors the JSON document returned by the
// /seeds/export/for-crawler endpoint.
type SeedsExportResponse struct {
	// Seeds holds the exported seed entries.
	Seeds []SeedFromAPI `json:"seeds"`
	// Total is the count reported by the backend under "total".
	Total int `json:"total"`
	// ExportedAt is the backend's "exported_at" value, kept as a raw string.
	ExportedAt string `json:"exported_at"`
}
// APIClient talks to the Python Backend over HTTP.
type APIClient struct {
	// baseURL is the prefix that endpoint paths are appended to.
	baseURL string
	// httpClient executes the outgoing requests.
	httpClient *http.Client
}
// NewAPIClient builds an APIClient that targets backendURL.
//
// backendURL is stored exactly as given. The returned client carries its own
// http.Client with an overall request timeout of 30 seconds.
func NewAPIClient(backendURL string) *APIClient {
	hc := &http.Client{Timeout: 30 * time.Second}

	client := new(APIClient)
	client.baseURL = backendURL
	client.httpClient = hc
	return client
}
// FetchSeeds retrieves the seeds the Backend API exports for the crawler.
//
// It sends a GET request to <baseURL>/v1/edu-search/seeds/export/for-crawler,
// bound to ctx, and decodes the JSON body into a SeedsExportResponse.
//
// An error is returned when the request cannot be built or sent, when the
// backend answers with anything other than 200 OK (the error then carries the
// status code and the leading part of the response body), or when the body
// cannot be read or parsed.
func (c *APIClient) FetchSeeds(ctx context.Context) (*SeedsExportResponse, error) {
	// Cap on how much of an error response is copied into the error text, so
	// an oversized error page cannot bloat errors and logs.
	const maxErrorBody = 4 << 10

	endpoint := fmt.Sprintf("%s/v1/edu-search/seeds/export/for-crawler", c.baseURL)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Accept", "application/json")
	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch seeds: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is diagnostic only, so a read error here is
		// deliberately ignored in favour of reporting the status code.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(errBody))
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	var result SeedsExportResponse
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	return &result, nil
}
// CrawlStatusReport is the payload describing the outcome of crawling one
// seed, as sent to the Backend.
type CrawlStatusReport struct {
	// SeedURL identifies the seed the report refers to.
	SeedURL string `json:"seed_url"`
	// Status is one of "success", "error" or "partial".
	Status string `json:"status"`
	// DocumentsCrawled is the number of documents crawled.
	DocumentsCrawled int `json:"documents_crawled"`
	// ErrorMessage is left out of the JSON when empty.
	ErrorMessage string `json:"error_message,omitempty"`
	// CrawlDuration is serialized as "crawl_duration_seconds".
	CrawlDuration float64 `json:"crawl_duration_seconds"`
}
// CrawlStatusResponse mirrors the JSON answer of the crawl status endpoint.
type CrawlStatusResponse struct {
	// Success is the backend's "success" flag.
	Success bool `json:"success"`
	// SeedURL is the seed URL carried in the response.
	SeedURL string `json:"seed_url"`
	// Message is the accompanying text from the backend.
	Message string `json:"message"`
}
// BulkCrawlStatusResponse mirrors the JSON answer of the bulk crawl status
// endpoint.
type BulkCrawlStatusResponse struct {
	// Updated is the value of the "updated" counter.
	Updated int `json:"updated"`
	// Failed is the value of the "failed" counter.
	Failed int `json:"failed"`
	// Errors lists the error strings returned by the backend.
	Errors []string `json:"errors"`
}
// ReportStatus sends the crawl status of a single seed to the Backend.
//
// The report is JSON-encoded and POSTed to
// <baseURL>/v1/edu-search/seeds/crawl-status, bound to ctx. The response body
// is not decoded; only the status code is evaluated.
//
// It returns nil on 200 OK. An error is returned when report is nil, when the
// request cannot be built or sent, or when the backend answers with any other
// status (the error then carries the status code and the leading part of the
// response body).
func (c *APIClient) ReportStatus(ctx context.Context, report *CrawlStatusReport) error {
	// Cap on how much of an error response is copied into the error text.
	const maxErrorBody = 4 << 10

	// A nil report would be encoded as JSON null; reject it before going on
	// the wire.
	if report == nil {
		return fmt.Errorf("failed to marshal report: report is nil")
	}

	endpoint := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status", c.baseURL)

	payload, err := json.Marshal(report)
	if err != nil {
		return fmt.Errorf("failed to marshal report: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")
	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("failed to report status: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is diagnostic only, so a read error here is
		// deliberately ignored in favour of reporting the status code.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(errBody))
	}

	// Read the unused body to EOF before closing so the transport can reuse
	// the connection for the next report. A failure here does not change the
	// outcome of the already-accepted report, so it is ignored.
	_, _ = io.Copy(io.Discard, resp.Body)
	return nil
}
// ReportStatusBulk sends the crawl status of several seeds in one request.
//
// The reports are wrapped as {"updates": [...]} and POSTed to
// <baseURL>/v1/edu-search/seeds/crawl-status/bulk, bound to ctx. On 200 OK the
// JSON body is decoded into a BulkCrawlStatusResponse.
//
// An error is returned when the payload cannot be encoded, when the request
// cannot be built or sent, when the backend answers with any other status (the
// error then carries the status code and the leading part of the response
// body), or when the body cannot be read or parsed.
func (c *APIClient) ReportStatusBulk(ctx context.Context, reports []*CrawlStatusReport) (*BulkCrawlStatusResponse, error) {
	// Cap on how much of an error response is copied into the error text.
	const maxErrorBody = 4 << 10

	// A nil slice is encoded as JSON null. Send an empty array instead so
	// "updates" is always a list (presumably what the backend schema expects;
	// confirm against the endpoint definition).
	if reports == nil {
		reports = []*CrawlStatusReport{}
	}

	endpoint := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status/bulk", c.baseURL)

	payload := struct {
		Updates []*CrawlStatusReport `json:"updates"`
	}{
		Updates: reports,
	}
	encoded, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal reports: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(encoded))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")
	req.Header.Set("User-Agent", "EduSearchCrawler/1.0")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to report status: %w", err)
	}
	defer resp.Body.Close()

	// Evaluate the status before reading the full body, so that a read
	// failure on an error response cannot hide the status code.
	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is diagnostic only, so a read error here is
		// deliberately ignored in favour of reporting the status code.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(errBody))
	}

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	var result BulkCrawlStatusResponse
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	return &result, nil
}