Files
breakpilot-lehrer/edu-search-service/internal/indexer/mapping.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

244 lines
5.9 KiB
Go

package indexer
import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/opensearch-project/opensearch-go/v2"
	"github.com/opensearch-project/opensearch-go/v2/opensearchapi"
)
// IndexMapping is the JSON body that CreateIndex sends when it creates the
// index for education documents.
//
// Settings: 3 primary shards, 1 replica and a 5s refresh interval. A custom
// analyzer named "german_custom" is declared (standard tokenizer followed by
// the lowercase, german_normalization and german_stemmer filters, where
// german_stemmer is a stemmer filter configured for German).
//
// Mappings: "title" and "content_text" are text fields analyzed with
// german_custom; "title" additionally has a "keyword" sub-field with
// ignore_above 512. "snippet_text" is declared as text with "index": false.
// "fetch_time" and "last_modified" are dates, the scores are floats, the
// counters are integers, and the remaining properties are keywords.
// "raw_refs" is an object with the keyword properties "html_raw_ref" and
// "pdf_raw_ref".
//
// The string is sent verbatim, so it must remain valid JSON.
const IndexMapping = `{
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 1,
"refresh_interval": "5s"
},
"analysis": {
"analyzer": {
"german_custom": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "german_normalization", "german_stemmer"]
}
},
"filter": {
"german_stemmer": {
"type": "stemmer",
"language": "german"
}
}
}
},
"mappings": {
"properties": {
"doc_id": { "type": "keyword" },
"url": { "type": "keyword" },
"canonical_url": { "type": "keyword" },
"domain": { "type": "keyword" },
"fetch_time": { "type": "date" },
"last_modified": { "type": "date" },
"content_hash": { "type": "keyword" },
"title": {
"type": "text",
"analyzer": "german_custom",
"fields": {
"keyword": { "type": "keyword", "ignore_above": 512 }
}
},
"content_text": {
"type": "text",
"analyzer": "german_custom"
},
"snippet_text": { "type": "text", "index": false },
"content_type": { "type": "keyword" },
"language": { "type": "keyword" },
"country_hint": { "type": "keyword" },
"source_category": { "type": "keyword" },
"doc_type": { "type": "keyword" },
"school_level": { "type": "keyword" },
"subjects": { "type": "keyword" },
"state": { "type": "keyword" },
"trust_score": { "type": "float" },
"quality_score": { "type": "float" },
"spam_flags": { "type": "keyword" },
"outlinks": { "type": "keyword" },
"inlinks_count": { "type": "integer" },
"content_length": { "type": "integer" },
"raw_refs": {
"properties": {
"html_raw_ref": { "type": "keyword" },
"pdf_raw_ref": { "type": "keyword" }
}
},
"tag_reasons": { "type": "keyword" }
}
}
}`
// Document is the JSON shape of one education document as it is marshaled by
// IndexDocument and BulkIndex. The json tags correspond to property names
// declared in IndexMapping.
//
// NOTE(review): the mapping also declares a "raw_refs" object, but this
// struct has no field for it, so it is never written through this type.
//
// NOTE(review): encoding/json does not treat a struct as "empty", so the
// omitempty option on UpdatedAt (a time.Time) has no effect; a zero
// UpdatedAt is still emitted as "last_modified". Confirm whether that is
// intended before relying on the field being absent.
type Document struct {
// DocID is the value IndexDocument and BulkIndex use as the OpenSearch
// document ID.
DocID string `json:"doc_id"`
// URL, CanonicalURL and Domain are mapped as keyword fields.
URL string `json:"url"`
CanonicalURL string `json:"canonical_url,omitempty"`
Domain string `json:"domain"`
// FetchedAt and UpdatedAt are serialized as "fetch_time" and
// "last_modified"; both are mapped as dates.
FetchedAt time.Time `json:"fetch_time"`
UpdatedAt time.Time `json:"last_modified,omitempty"`
ContentHash string `json:"content_hash"`
// Title and ContentText are the two fields the mapping analyzes with the
// german_custom analyzer.
Title string `json:"title"`
ContentText string `json:"content_text"`
// SnippetText is mapped with "index": false.
SnippetText string `json:"snippet_text"`
// The following fields are mapped as keywords.
ContentType string `json:"content_type,omitempty"`
Language string `json:"language"`
CountryHint string `json:"country_hint,omitempty"`
SourceCategory string `json:"source_category,omitempty"`
DocType string `json:"doc_type"`
SchoolLevel string `json:"school_level"`
// Subjects has no omitempty, so a nil slice is marshaled as JSON null.
Subjects []string `json:"subjects"`
State string `json:"state,omitempty"`
// TrustScore and QualityScore are mapped as floats and are always emitted,
// including when zero.
TrustScore float64 `json:"trust_score"`
QualityScore float64 `json:"quality_score"`
SpamFlags []string `json:"spam_flags,omitempty"`
Outlinks []string `json:"outlinks,omitempty"`
// InlinksCount and ContentLength are mapped as integers; because of
// omitempty a value of 0 is left out of the JSON.
InlinksCount int `json:"inlinks_count,omitempty"`
ContentLength int `json:"content_length,omitempty"`
TagReasons []string `json:"tag_reasons,omitempty"`
}
// Client bundles an OpenSearch client with the name of the single index that
// CreateIndex, IndexDocument and BulkIndex operate on. Construct it with
// NewClient.
type Client struct {
// client is the underlying OpenSearch client used for every request.
client *opensearch.Client
// indexName is the target index for index creation and document writes.
indexName string
}
// NewClient builds a Client that talks to the OpenSearch endpoint at url,
// authenticating with username and password, and that targets indexName for
// all index operations.
//
// It returns the error from opensearch.NewClient unchanged when the
// underlying client cannot be constructed.
func NewClient(url, username, password, indexName string) (*Client, error) {
	osClient, err := opensearch.NewClient(opensearch.Config{
		Addresses: []string{url},
		Username:  username,
		Password:  password,
	})
	if err != nil {
		return nil, err
	}

	wrapped := &Client{
		client:    osClient,
		indexName: indexName,
	}
	return wrapped, nil
}
// CreateIndex ensures that the index named c.indexName exists, creating it
// with IndexMapping when it is missing.
//
// Both the existence probe and the create request are bound to ctx. It
// returns nil when the index already exists or was created. It returns an
// error when a request fails at the transport level, when the probe answers
// with a status other than 200 or 404, or when OpenSearch rejects the create
// request.
func (c *Client) CreateIndex(ctx context.Context) error {
	// Probe for the index: OpenSearch answers 200 if it exists, 404 if not.
	existsRes, err := c.client.Indices.Exists(
		[]string{c.indexName},
		c.client.Indices.Exists.WithContext(ctx),
	)
	if err != nil {
		return fmt.Errorf("checking index %q: %w", c.indexName, err)
	}
	defer existsRes.Body.Close()

	switch existsRes.StatusCode {
	case 200:
		// Index already exists; nothing to do.
		return nil
	case 404:
		// Index is missing; create it below.
	default:
		// Auth failures, server errors etc. must not be mistaken for
		// "index is missing".
		return fmt.Errorf("checking index %q: unexpected status %s", c.indexName, existsRes.Status())
	}

	createReq := opensearchapi.IndicesCreateRequest{
		Index: c.indexName,
		Body:  strings.NewReader(IndexMapping),
	}
	createRes, err := createReq.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("creating index %q: %w", c.indexName, err)
	}
	defer createRes.Body.Close()

	if createRes.IsError() {
		detail := createRes.String()
		// Another process may have created the index between the probe and
		// the create call; the index exists, which is all the caller needs.
		if strings.Contains(detail, "resource_already_exists_exception") {
			return nil
		}
		return fmt.Errorf("creating index %q: %s", c.indexName, detail)
	}
	return nil
}
// IndexDocument writes doc to the index c.indexName, using doc.DocID as the
// OpenSearch document ID. The request is sent with refresh disabled
// ("false"), so it does not force an index refresh.
//
// It returns an error when doc is nil, when doc cannot be marshaled to JSON,
// when the request fails at the transport level, or when OpenSearch answers
// with an error status.
func (c *Client) IndexDocument(ctx context.Context, doc *Document) error {
	// Guard against a nil pointer: json.Marshal would accept it (as "null"),
	// but reading doc.DocID below would panic.
	if doc == nil {
		return fmt.Errorf("indexing into %q: nil document", c.indexName)
	}

	body, err := json.Marshal(doc)
	if err != nil {
		return fmt.Errorf("encoding document %q: %w", doc.DocID, err)
	}

	req := opensearchapi.IndexRequest{
		Index:      c.indexName,
		DocumentID: doc.DocID,
		Body:       strings.NewReader(string(body)),
		Refresh:    "false",
	}
	res, err := req.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("indexing document %q: %w", doc.DocID, err)
	}
	defer res.Body.Close()

	// A completed HTTP exchange can still carry a rejection (mapping
	// conflict, auth failure, ...); surface it instead of reporting success.
	if res.IsError() {
		return fmt.Errorf("indexing document %q: %s", doc.DocID, res.String())
	}
	return nil
}
// BulkIndex writes all docs to the index c.indexName in a single bulk
// request. Each document is sent as an "index" action whose _id is the
// document's DocID. An empty docs slice is a no-op that returns nil.
//
// It returns an error when a document cannot be marshaled, when the request
// fails at the transport level, when OpenSearch answers with an error
// status, when the response cannot be decoded, or when the response reports
// that one or more items failed. In the last case the error names the number
// of failed items and describes the first one.
func (c *Client) BulkIndex(ctx context.Context, docs []Document) error {
	if len(docs) == 0 {
		return nil
	}

	// Shape of the action line that precedes every document in the payload.
	type bulkTarget struct {
		Index string `json:"_index"`
		ID    string `json:"_id"`
	}
	type bulkAction struct {
		Index bulkTarget `json:"index"`
	}

	var payload strings.Builder
	// Index into the slice rather than ranging by value to avoid copying
	// each Document.
	for i := range docs {
		doc := &docs[i]

		actionLine, err := json.Marshal(bulkAction{
			Index: bulkTarget{Index: c.indexName, ID: doc.DocID},
		})
		if err != nil {
			return fmt.Errorf("encoding bulk action for document %q: %w", doc.DocID, err)
		}
		payload.Write(actionLine)
		payload.WriteString("\n")

		docLine, err := json.Marshal(doc)
		if err != nil {
			return fmt.Errorf("encoding document %q: %w", doc.DocID, err)
		}
		payload.Write(docLine)
		payload.WriteString("\n")
	}

	req := opensearchapi.BulkRequest{
		Body: strings.NewReader(payload.String()),
	}
	res, err := req.Do(ctx, c.client)
	if err != nil {
		return fmt.Errorf("bulk indexing into %q: %w", c.indexName, err)
	}
	defer res.Body.Close()

	if res.IsError() {
		return fmt.Errorf("bulk indexing into %q: %s", c.indexName, res.String())
	}

	// The bulk API reports per-item failures in the body even when the HTTP
	// status is 200, so the "errors" flag has to be checked explicitly.
	var result struct {
		Errors bool `json:"errors"`
		Items  []map[string]struct {
			ID     string          `json:"_id"`
			Status int             `json:"status"`
			Error  json.RawMessage `json:"error"`
		} `json:"items"`
	}
	if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
		return fmt.Errorf("decoding bulk response for %q: %w", c.indexName, err)
	}
	if !result.Errors {
		return nil
	}

	failed := 0
	firstFailure := ""
	for _, item := range result.Items {
		// Each item is keyed by its action name (here always "index").
		for action, outcome := range item {
			if outcome.Status < 300 {
				continue
			}
			failed++
			if firstFailure == "" {
				firstFailure = fmt.Sprintf("%s %q: status %d: %s",
					action, outcome.ID, outcome.Status, outcome.Error)
			}
		}
	}
	if failed == 0 {
		return fmt.Errorf("bulk indexing into %q: response reported errors", c.indexName)
	}
	return fmt.Errorf("bulk indexing into %q: %d of %d documents failed; first failure: %s",
		c.indexName, failed, len(docs), firstFailure)
}
// Health queries the OpenSearch cluster health endpoint and returns the
// value of the "status" field of the response.
//
// The request is bound to ctx. It returns an error when the request fails at
// the transport level, when OpenSearch answers with an error status, or when
// the response body cannot be decoded. If a successful response has no
// "status" field, the returned status is the empty string.
func (c *Client) Health(ctx context.Context) (string, error) {
	res, err := c.client.Cluster.Health(c.client.Cluster.Health.WithContext(ctx))
	if err != nil {
		return "", fmt.Errorf("requesting cluster health: %w", err)
	}
	defer res.Body.Close()

	// An error response is not a health payload; report it instead of
	// decoding it and returning an empty status with a nil error.
	if res.IsError() {
		return "", fmt.Errorf("requesting cluster health: %s", res.String())
	}

	var health struct {
		Status string `json:"status"`
	}
	if err := json.NewDecoder(res.Body).Decode(&health); err != nil {
		return "", fmt.Errorf("decoding cluster health response: %w", err)
	}
	return health.Status, nil
}