All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
244 lines
5.9 KiB
Go
244 lines
5.9 KiB
Go
package indexer
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/opensearch-project/opensearch-go/v2"
	"github.com/opensearch-project/opensearch-go/v2/opensearchapi"
)
|
|
|
|
// IndexMapping is the JSON body sent when the education-document index is
// created. It carries two top-level sections:
//
//   - "settings": shard/replica counts, the refresh interval, and the
//     "german_custom" analyzer (standard tokenizer followed by the lowercase,
//     german_normalization and german_stemmer filters).
//   - "mappings": the field types for the indexed documents. "title" and
//     "content_text" are analyzed with "german_custom"; "snippet_text" is
//     stored with "index": false.
const IndexMapping = `{
	"settings": {
		"index": {
			"number_of_shards": 3,
			"number_of_replicas": 1,
			"refresh_interval": "5s"
		},
		"analysis": {
			"analyzer": {
				"german_custom": {
					"type": "custom",
					"tokenizer": "standard",
					"filter": ["lowercase", "german_normalization", "german_stemmer"]
				}
			},
			"filter": {
				"german_stemmer": {
					"type": "stemmer",
					"language": "german"
				}
			}
		}
	},
	"mappings": {
		"properties": {
			"doc_id": { "type": "keyword" },
			"url": { "type": "keyword" },
			"canonical_url": { "type": "keyword" },
			"domain": { "type": "keyword" },
			"fetch_time": { "type": "date" },
			"last_modified": { "type": "date" },
			"content_hash": { "type": "keyword" },
			"title": {
				"type": "text",
				"analyzer": "german_custom",
				"fields": {
					"keyword": { "type": "keyword", "ignore_above": 512 }
				}
			},
			"content_text": {
				"type": "text",
				"analyzer": "german_custom"
			},
			"snippet_text": { "type": "text", "index": false },
			"content_type": { "type": "keyword" },
			"language": { "type": "keyword" },
			"country_hint": { "type": "keyword" },
			"source_category": { "type": "keyword" },
			"doc_type": { "type": "keyword" },
			"school_level": { "type": "keyword" },
			"subjects": { "type": "keyword" },
			"state": { "type": "keyword" },
			"trust_score": { "type": "float" },
			"quality_score": { "type": "float" },
			"spam_flags": { "type": "keyword" },
			"outlinks": { "type": "keyword" },
			"inlinks_count": { "type": "integer" },
			"content_length": { "type": "integer" },
			"raw_refs": {
				"properties": {
					"html_raw_ref": { "type": "keyword" },
					"pdf_raw_ref": { "type": "keyword" }
				}
			},
			"tag_reasons": { "type": "keyword" }
		}
	}
}`
|
|
|
|
// Document is one education document as it is serialized into the index.
// The JSON tag of each field is the name of the corresponding index field.
//
// UpdatedAt is left out of the JSON when it is the zero time; see MarshalJSON.
type Document struct {
	DocID          string    `json:"doc_id"`
	URL            string    `json:"url"`
	CanonicalURL   string    `json:"canonical_url,omitempty"`
	Domain         string    `json:"domain"`
	FetchedAt      time.Time `json:"fetch_time"`
	UpdatedAt      time.Time `json:"last_modified,omitempty"`
	ContentHash    string    `json:"content_hash"`
	Title          string    `json:"title"`
	ContentText    string    `json:"content_text"`
	SnippetText    string    `json:"snippet_text"`
	ContentType    string    `json:"content_type,omitempty"`
	Language       string    `json:"language"`
	CountryHint    string    `json:"country_hint,omitempty"`
	SourceCategory string    `json:"source_category,omitempty"`
	DocType        string    `json:"doc_type"`
	SchoolLevel    string    `json:"school_level"`
	Subjects       []string  `json:"subjects"`
	State          string    `json:"state,omitempty"`
	TrustScore     float64   `json:"trust_score"`
	QualityScore   float64   `json:"quality_score"`
	SpamFlags      []string  `json:"spam_flags,omitempty"`
	Outlinks       []string  `json:"outlinks,omitempty"`
	InlinksCount   int       `json:"inlinks_count,omitempty"`
	ContentLength  int       `json:"content_length,omitempty"`
	TagReasons     []string  `json:"tag_reasons,omitempty"`
}

// MarshalJSON encodes the document exactly as its struct tags describe, except
// that "last_modified" is omitted when UpdatedAt is the zero time.
//
// encoding/json never treats a struct value such as time.Time as empty, so the
// omitempty tag on UpdatedAt alone would still emit "0001-01-01T00:00:00Z".
func (d Document) MarshalJSON() ([]byte, error) {
	// fields has Document's layout but none of its methods, so marshaling it
	// does not call back into this method.
	type fields Document

	// The outer UpdatedAt is less deeply nested than the embedded one with the
	// same JSON name, so encoding/json uses it; as a nil pointer it is dropped
	// by omitempty.
	out := struct {
		fields
		UpdatedAt *time.Time `json:"last_modified,omitempty"`
	}{fields: fields(d)}

	if !d.UpdatedAt.IsZero() {
		out.UpdatedAt = &d.UpdatedAt
	}
	return json.Marshal(out)
}
|
|
|
|
// Client wraps OpenSearch operations
|
|
type Client struct {
|
|
client *opensearch.Client
|
|
indexName string
|
|
}
|
|
|
|
// NewClient creates a new OpenSearch indexer client
|
|
func NewClient(url, username, password, indexName string) (*Client, error) {
|
|
cfg := opensearch.Config{
|
|
Addresses: []string{url},
|
|
Username: username,
|
|
Password: password,
|
|
}
|
|
|
|
client, err := opensearch.NewClient(cfg)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &Client{
|
|
client: client,
|
|
indexName: indexName,
|
|
}, nil
|
|
}
|
|
|
|
// CreateIndex creates the index with proper mapping
|
|
func (c *Client) CreateIndex(ctx context.Context) error {
|
|
// Check if index exists
|
|
res, err := c.client.Indices.Exists([]string{c.indexName})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode == 200 {
|
|
// Index already exists
|
|
return nil
|
|
}
|
|
|
|
// Create index with mapping
|
|
req := opensearchapi.IndicesCreateRequest{
|
|
Index: c.indexName,
|
|
Body: strings.NewReader(IndexMapping),
|
|
}
|
|
|
|
res, err = req.Do(ctx, c.client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
return nil
|
|
}
|
|
|
|
// IndexDocument indexes a single document
|
|
func (c *Client) IndexDocument(ctx context.Context, doc *Document) error {
|
|
body, err := json.Marshal(doc)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
req := opensearchapi.IndexRequest{
|
|
Index: c.indexName,
|
|
DocumentID: doc.DocID,
|
|
Body: strings.NewReader(string(body)),
|
|
Refresh: "false",
|
|
}
|
|
|
|
res, err := req.Do(ctx, c.client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
return nil
|
|
}
|
|
|
|
// BulkIndex indexes multiple documents efficiently
|
|
func (c *Client) BulkIndex(ctx context.Context, docs []Document) error {
|
|
if len(docs) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var builder strings.Builder
|
|
|
|
for _, doc := range docs {
|
|
// Action line
|
|
meta := map[string]interface{}{
|
|
"index": map[string]interface{}{
|
|
"_index": c.indexName,
|
|
"_id": doc.DocID,
|
|
},
|
|
}
|
|
metaBytes, _ := json.Marshal(meta)
|
|
builder.Write(metaBytes)
|
|
builder.WriteString("\n")
|
|
|
|
// Document line
|
|
docBytes, _ := json.Marshal(doc)
|
|
builder.Write(docBytes)
|
|
builder.WriteString("\n")
|
|
}
|
|
|
|
req := opensearchapi.BulkRequest{
|
|
Body: strings.NewReader(builder.String()),
|
|
}
|
|
|
|
res, err := req.Do(ctx, c.client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
return nil
|
|
}
|
|
|
|
// Health checks OpenSearch cluster health
|
|
func (c *Client) Health(ctx context.Context) (string, error) {
|
|
res, err := c.client.Cluster.Health()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
var result map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
status, _ := result["status"].(string)
|
|
return status, nil
|
|
}
|