// Package indexer stores education documents in an OpenSearch index.
package indexer

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/opensearch-project/opensearch-go/v2"
	"github.com/opensearch-project/opensearch-go/v2/opensearchapi"
)

// IndexMapping defines the OpenSearch index settings and mapping for
// education documents. It is sent verbatim as the body of the index-creation
// request issued by CreateIndex.
const IndexMapping = `{ "settings": { "index": { "number_of_shards": 3, "number_of_replicas": 1, "refresh_interval": "5s" }, "analysis": { "analyzer": { "german_custom": { "type": "custom", "tokenizer": "standard", "filter": ["lowercase", "german_normalization", "german_stemmer"] } }, "filter": { "german_stemmer": { "type": "stemmer", "language": "german" } } } }, "mappings": { "properties": { "doc_id": { "type": "keyword" }, "url": { "type": "keyword" }, "canonical_url": { "type": "keyword" }, "domain": { "type": "keyword" }, "fetch_time": { "type": "date" }, "last_modified": { "type": "date" }, "content_hash": { "type": "keyword" }, "title": { "type": "text", "analyzer": "german_custom", "fields": { "keyword": { "type": "keyword", "ignore_above": 512 } } }, "content_text": { "type": "text", "analyzer": "german_custom" }, "snippet_text": { "type": "text", "index": false }, "content_type": { "type": "keyword" }, "language": { "type": "keyword" }, "country_hint": { "type": "keyword" }, "source_category": { "type": "keyword" }, "doc_type": { "type": "keyword" }, "school_level": { "type": "keyword" }, "subjects": { "type": "keyword" }, "state": { "type": "keyword" }, "trust_score": { "type": "float" }, "quality_score": { "type": "float" }, "spam_flags": { "type": "keyword" }, "outlinks": { "type": "keyword" }, "inlinks_count": { "type": "integer" }, "content_length": { "type": "integer" }, "raw_refs": { "properties": { "html_raw_ref": { "type": "keyword" }, "pdf_raw_ref": { "type": "keyword" } } }, "tag_reasons": { "type": "keyword" } } } }`

// Document represents an indexed education document. The JSON tags match the
// property names declared in IndexMapping.
//
// NOTE(review): encoding/json's omitempty option never omits struct values,
// so a zero UpdatedAt is still serialized as "last_modified". Changing the
// field to *time.Time would fix that but alters the public type, so it is
// left as-is here.
type Document struct {
	DocID          string    `json:"doc_id"`
	URL            string    `json:"url"`
	CanonicalURL   string    `json:"canonical_url,omitempty"`
	Domain         string    `json:"domain"`
	FetchedAt      time.Time `json:"fetch_time"`
	UpdatedAt      time.Time `json:"last_modified,omitempty"`
	ContentHash    string    `json:"content_hash"`
	Title          string    `json:"title"`
	ContentText    string    `json:"content_text"`
	SnippetText    string    `json:"snippet_text"`
	ContentType    string    `json:"content_type,omitempty"`
	Language       string    `json:"language"`
	CountryHint    string    `json:"country_hint,omitempty"`
	SourceCategory string    `json:"source_category,omitempty"`
	DocType        string    `json:"doc_type"`
	SchoolLevel    string    `json:"school_level"`
	Subjects       []string  `json:"subjects"`
	State          string    `json:"state,omitempty"`
	TrustScore     float64   `json:"trust_score"`
	QualityScore   float64   `json:"quality_score"`
	SpamFlags      []string  `json:"spam_flags,omitempty"`
	Outlinks       []string  `json:"outlinks,omitempty"`
	InlinksCount   int       `json:"inlinks_count,omitempty"`
	ContentLength  int       `json:"content_length,omitempty"`
	TagReasons     []string  `json:"tag_reasons,omitempty"`
}

// Client wraps the OpenSearch operations used by the indexer. All operations
// target the single index named at construction time.
type Client struct {
	client    *opensearch.Client
	indexName string
}

// NewClient creates a new OpenSearch indexer client that talks to the node at
// url with the given basic-auth credentials and operates on indexName.
// It returns an error if the underlying OpenSearch client cannot be built.
func NewClient(url, username, password, indexName string) (*Client, error) {
	cfg := opensearch.Config{
		Addresses: []string{url},
		Username:  username,
		Password:  password,
	}

	client, err := opensearch.NewClient(cfg)
	if err != nil {
		return nil, fmt.Errorf("creating opensearch client: %w", err)
	}

	return &Client{
		client:    client,
		indexName: indexName,
	}, nil
}

// checkResponse turns a response with an error status into a Go error that
// carries the status line and response body; it returns nil otherwise.
func checkResponse(op string, res *opensearchapi.Response) error {
	if !res.IsError() {
		return nil
	}
	return fmt.Errorf("%s: %s", op, res.String())
}

// closeBody drains and closes a response body so the transport can reuse the
// underlying connection. Failures are ignored: this is best-effort cleanup.
func closeBody(res *opensearchapi.Response) {
	if res == nil || res.Body == nil {
		return
	}
	_, _ = io.Copy(io.Discard, res.Body)
	_ = res.Body.Close()
}

// CreateIndex creates the index with IndexMapping unless it already exists.
//
// It returns nil when the index exists or was created, and an error when the
// existence check returns anything other than 200/404, or when OpenSearch
// rejects the create request.
func (c *Client) CreateIndex(ctx context.Context) error {
	existsReq := opensearchapi.IndicesExistsRequest{Index: []string{c.indexName}}
	existsRes, err := existsReq.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("checking index %q: %w", c.indexName, err)
	}
	defer closeBody(existsRes)

	switch existsRes.StatusCode {
	case http.StatusOK:
		// Index already exists.
		return nil
	case http.StatusNotFound:
		// Index is missing; fall through to creation.
	default:
		// Auth failures, server errors, etc. must not be mistaken for
		// "index does not exist".
		return fmt.Errorf("checking index %q: unexpected status %s", c.indexName, existsRes.Status())
	}

	createReq := opensearchapi.IndicesCreateRequest{
		Index: c.indexName,
		Body:  strings.NewReader(IndexMapping),
	}
	createRes, err := createReq.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("creating index %q: %w", c.indexName, err)
	}
	defer closeBody(createRes)

	return checkResponse(fmt.Sprintf("creating index %q", c.indexName), createRes)
}

// IndexDocument indexes a single document under its DocID without forcing an
// index refresh.
//
// It returns an error if doc is nil, if the document cannot be encoded, if
// the request fails, or if OpenSearch answers with an error status.
func (c *Client) IndexDocument(ctx context.Context, doc *Document) error {
	if doc == nil {
		return errors.New("indexing document: nil document")
	}

	body, err := json.Marshal(doc)
	if err != nil {
		return fmt.Errorf("encoding document %q: %w", doc.DocID, err)
	}

	req := opensearchapi.IndexRequest{
		Index:      c.indexName,
		DocumentID: doc.DocID,
		Body:       bytes.NewReader(body),
		Refresh:    "false",
	}
	res, err := req.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("indexing document %q: %w", doc.DocID, err)
	}
	defer closeBody(res)

	return checkResponse(fmt.Sprintf("indexing document %q", doc.DocID), res)
}

// bulkAction is the action line that precedes each document in a bulk body.
type bulkAction struct {
	Index bulkTarget `json:"index"`
}

// bulkTarget names the index and document ID of one bulk action.
type bulkTarget struct {
	Index string `json:"_index"`
	ID    string `json:"_id"`
}

// bulkResponse is the subset of the bulk API response needed to detect
// per-document failures.
type bulkResponse struct {
	Errors bool `json:"errors"`
	Items  []map[string]struct {
		ID     string          `json:"_id"`
		Status int             `json:"status"`
		Error  json.RawMessage `json:"error"`
	} `json:"items"`
}

// BulkIndex indexes multiple documents with a single bulk request. An empty
// docs slice is a no-op.
//
// It returns an error if any document cannot be encoded, if the request
// fails, if OpenSearch answers with an error status, or if the bulk response
// reports that individual documents were rejected.
func (c *Client) BulkIndex(ctx context.Context, docs []Document) error {
	if len(docs) == 0 {
		return nil
	}

	// json.Encoder terminates every value with '\n', which is exactly the
	// newline-delimited format the bulk endpoint expects.
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	for i := range docs { // index to avoid copying each Document
		doc := &docs[i]

		action := bulkAction{Index: bulkTarget{Index: c.indexName, ID: doc.DocID}}
		if err := enc.Encode(action); err != nil {
			return fmt.Errorf("encoding bulk action for %q: %w", doc.DocID, err)
		}
		if err := enc.Encode(doc); err != nil {
			return fmt.Errorf("encoding document %q: %w", doc.DocID, err)
		}
	}

	req := opensearchapi.BulkRequest{Body: &buf}
	res, err := req.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("bulk indexing %d documents: %w", len(docs), err)
	}
	defer closeBody(res)

	if err := checkResponse("bulk indexing", res); err != nil {
		return err
	}

	// A bulk request can succeed at the HTTP level while individual items
	// fail, so the response body has to be inspected as well.
	var result bulkResponse
	if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
		return fmt.Errorf("decoding bulk response: %w", err)
	}
	if !result.Errors {
		return nil
	}

	failed := 0
	first := ""
	for _, item := range result.Items {
		for action, detail := range item {
			if detail.Status < 300 {
				continue
			}
			if failed == 0 {
				first = fmt.Sprintf("%s %q (status %d): %s", action, detail.ID, detail.Status, detail.Error)
			}
			failed++
		}
	}
	if failed == 0 {
		return errors.New("bulk indexing: response reported errors without failed items")
	}
	return fmt.Errorf("bulk indexing: %d of %d documents failed; first failure: %s", failed, len(docs), first)
}

// Health checks OpenSearch cluster health and returns the "status" field of
// the cluster health response.
//
// It returns an error if the request fails, if OpenSearch answers with an
// error status, or if the response cannot be decoded or carries no status.
func (c *Client) Health(ctx context.Context) (string, error) {
	req := opensearchapi.ClusterHealthRequest{}
	res, err := req.Do(ctx, c.client)
	if err != nil {
		return "", fmt.Errorf("checking cluster health: %w", err)
	}
	defer closeBody(res)

	if err := checkResponse("checking cluster health", res); err != nil {
		return "", err
	}

	var result struct {
		Status string `json:"status"`
	}
	if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
		return "", fmt.Errorf("decoding cluster health: %w", err)
	}
	if result.Status == "" {
		return "", errors.New("checking cluster health: response has no status")
	}
	return result.Status, nil
}