// Package crawler: client for the Python Backend's seed-management API.
// It fetches seed URLs to crawl and reports per-seed crawl results back.
package crawler

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"
)

// userAgent identifies this crawler to the Backend on every request.
const userAgent = "EduSearchCrawler/1.0"

// SeedFromAPI represents a seed URL from the Backend API.
type SeedFromAPI struct {
	URL      string  `json:"url"`
	Trust    float64 `json:"trust"`
	Source   string  `json:"source"`   // GOV, EDU, UNI, etc.
	Scope    string  `json:"scope"`    // FEDERAL, STATE, etc.
	State    string  `json:"state"`    // BW, BY, etc. (optional)
	Depth    int     `json:"depth"`    // Crawl depth for this seed
	Category string  `json:"category"` // Category name
}

// SeedsExportResponse represents the API response from /seeds/export/for-crawler.
type SeedsExportResponse struct {
	Seeds      []SeedFromAPI `json:"seeds"`
	Total      int           `json:"total"`
	ExportedAt string        `json:"exported_at"`
}

// APIClient handles communication with the Python Backend.
type APIClient struct {
	baseURL    string
	httpClient *http.Client
}

// NewAPIClient creates a new API client for fetching seeds.
// backendURL is the Backend's base URL without a trailing path.
func NewAPIClient(backendURL string) *APIClient {
	return &APIClient{
		baseURL: backendURL,
		httpClient: &http.Client{
			// Hard cap on the whole request (dial + headers + body);
			// per-call ctx can still cancel earlier.
			Timeout: 30 * time.Second,
		},
	}
}

// newJSONRequest builds a request carrying the Accept/User-Agent headers
// every Backend call uses; a non-nil body also gets a JSON Content-Type.
func (c *APIClient) newJSONRequest(ctx context.Context, method, url string, body io.Reader) (*http.Request, error) {
	req, err := http.NewRequestWithContext(ctx, method, url, body)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Accept", "application/json")
	req.Header.Set("User-Agent", userAgent)
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}
	return req, nil
}

// FetchSeeds retrieves enabled seeds from the Backend API.
func (c *APIClient) FetchSeeds(ctx context.Context) (*SeedsExportResponse, error) {
	url := fmt.Sprintf("%s/v1/edu-search/seeds/export/for-crawler", c.baseURL)

	req, err := c.newJSONRequest(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch seeds: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best-effort read of the error payload for diagnostics.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(body))
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	var result SeedsExportResponse
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	return &result, nil
}

// CrawlStatusReport represents a crawl status to report to the Backend.
type CrawlStatusReport struct {
	SeedURL          string  `json:"seed_url"`
	Status           string  `json:"status"` // "success", "error", "partial"
	DocumentsCrawled int     `json:"documents_crawled"`
	ErrorMessage     string  `json:"error_message,omitempty"`
	CrawlDuration    float64 `json:"crawl_duration_seconds"`
}

// CrawlStatusResponse represents the response from the crawl status endpoint.
type CrawlStatusResponse struct {
	Success bool   `json:"success"`
	SeedURL string `json:"seed_url"`
	Message string `json:"message"`
}

// BulkCrawlStatusResponse represents the response from the bulk crawl status endpoint.
type BulkCrawlStatusResponse struct {
	Updated int      `json:"updated"`
	Failed  int      `json:"failed"`
	Errors  []string `json:"errors"`
}

// ReportStatus sends crawl status for a single seed to the Backend.
func (c *APIClient) ReportStatus(ctx context.Context, report *CrawlStatusReport) error {
	url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status", c.baseURL)

	body, err := json.Marshal(report)
	if err != nil {
		return fmt.Errorf("failed to marshal report: %w", err)
	}

	req, err := c.newJSONRequest(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("failed to report status: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
	}

	// Drain the (unneeded) success body so the transport can reuse the connection.
	io.Copy(io.Discard, resp.Body) //nolint:errcheck // best-effort drain
	return nil
}

// ReportStatusBulk sends crawl status for multiple seeds in one request.
func (c *APIClient) ReportStatusBulk(ctx context.Context, reports []*CrawlStatusReport) (*BulkCrawlStatusResponse, error) {
	url := fmt.Sprintf("%s/v1/edu-search/seeds/crawl-status/bulk", c.baseURL)

	payload := struct {
		Updates []*CrawlStatusReport `json:"updates"`
	}{
		Updates: reports,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal reports: %w", err)
	}

	req, err := c.newJSONRequest(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to report status: %w", err)
	}
	defer resp.Body.Close()

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
	}

	var result BulkCrawlStatusResponse
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	return &result, nil
}