feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
243
edu-search-service/internal/indexer/mapping.go
Normal file
243
edu-search-service/internal/indexer/mapping.go
Normal file
@@ -0,0 +1,243 @@
|
||||
package indexer
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/opensearch-project/opensearch-go/v2"
	"github.com/opensearch-project/opensearch-go/v2/opensearchapi"
)
|
||||
|
||||
// IndexMapping is the JSON body sent when the education-document index is
// created. It carries two sections:
//
//   - "settings": 3 shards, 1 replica, a 5s refresh interval, and a custom
//     analyzer named "german_custom" (standard tokenizer followed by the
//     lowercase, german_normalization and german_stemmer filters).
//   - "mappings": the field types for indexed documents. "title" and
//     "content_text" are analyzed with german_custom; "snippet_text" is stored
//     but not indexed; the remaining fields are keyword, date, float or
//     integer fields, plus a "raw_refs" object with two keyword sub-fields.
//
// The literal must remain valid JSON: it is passed verbatim as the request
// body of the index-create call.
const IndexMapping = `{
	"settings": {
		"index": {
			"number_of_shards": 3,
			"number_of_replicas": 1,
			"refresh_interval": "5s"
		},
		"analysis": {
			"analyzer": {
				"german_custom": {
					"type": "custom",
					"tokenizer": "standard",
					"filter": ["lowercase", "german_normalization", "german_stemmer"]
				}
			},
			"filter": {
				"german_stemmer": {
					"type": "stemmer",
					"language": "german"
				}
			}
		}
	},
	"mappings": {
		"properties": {
			"doc_id": { "type": "keyword" },
			"url": { "type": "keyword" },
			"canonical_url": { "type": "keyword" },
			"domain": { "type": "keyword" },
			"fetch_time": { "type": "date" },
			"last_modified": { "type": "date" },
			"content_hash": { "type": "keyword" },
			"title": {
				"type": "text",
				"analyzer": "german_custom",
				"fields": {
					"keyword": { "type": "keyword", "ignore_above": 512 }
				}
			},
			"content_text": {
				"type": "text",
				"analyzer": "german_custom"
			},
			"snippet_text": { "type": "text", "index": false },
			"content_type": { "type": "keyword" },
			"language": { "type": "keyword" },
			"country_hint": { "type": "keyword" },
			"source_category": { "type": "keyword" },
			"doc_type": { "type": "keyword" },
			"school_level": { "type": "keyword" },
			"subjects": { "type": "keyword" },
			"state": { "type": "keyword" },
			"trust_score": { "type": "float" },
			"quality_score": { "type": "float" },
			"spam_flags": { "type": "keyword" },
			"outlinks": { "type": "keyword" },
			"inlinks_count": { "type": "integer" },
			"content_length": { "type": "integer" },
			"raw_refs": {
				"properties": {
					"html_raw_ref": { "type": "keyword" },
					"pdf_raw_ref": { "type": "keyword" }
				}
			},
			"tag_reasons": { "type": "keyword" }
		}
	}
}`
|
||||
|
||||
// Document represents an indexed education document. Each field's JSON tag
// is the key under which the field is written to the index.
type Document struct {
	DocID          string    `json:"doc_id"`
	URL            string    `json:"url"`
	CanonicalURL   string    `json:"canonical_url,omitempty"`
	Domain         string    `json:"domain"`
	FetchedAt      time.Time `json:"fetch_time"`
	// UpdatedAt is omitted from the JSON output when it is the zero time; see
	// MarshalJSON. The omitempty tag alone does not achieve that for a struct
	// type such as time.Time.
	UpdatedAt      time.Time `json:"last_modified,omitempty"`
	ContentHash    string    `json:"content_hash"`
	Title          string    `json:"title"`
	ContentText    string    `json:"content_text"`
	SnippetText    string    `json:"snippet_text"`
	ContentType    string    `json:"content_type,omitempty"`
	Language       string    `json:"language"`
	CountryHint    string    `json:"country_hint,omitempty"`
	SourceCategory string    `json:"source_category,omitempty"`
	DocType        string    `json:"doc_type"`
	SchoolLevel    string    `json:"school_level"`
	Subjects       []string  `json:"subjects"`
	State          string    `json:"state,omitempty"`
	TrustScore     float64   `json:"trust_score"`
	QualityScore   float64   `json:"quality_score"`
	SpamFlags      []string  `json:"spam_flags,omitempty"`
	Outlinks       []string  `json:"outlinks,omitempty"`
	InlinksCount   int       `json:"inlinks_count,omitempty"`
	ContentLength  int       `json:"content_length,omitempty"`
	TagReasons     []string  `json:"tag_reasons,omitempty"`
}

// MarshalJSON encodes the document according to its struct tags, with one
// difference from the default encoder: "last_modified" is left out entirely
// when UpdatedAt is the zero time. encoding/json never treats a struct value
// as empty, so without this method an unset UpdatedAt would be written as
// "0001-01-01T00:00:00Z".
//
// It has a value receiver so that both Document and *Document use it.
func (d Document) MarshalJSON() ([]byte, error) {
	// plain has Document's fields but none of its methods, so the
	// json.Marshal call below does not recurse into MarshalJSON.
	type plain Document

	out := struct {
		plain
		// Declared at a shallower depth than plain.UpdatedAt with the same
		// JSON name, so this field wins and the embedded one is dropped.
		// A nil pointer is honoured by omitempty.
		UpdatedAt *time.Time `json:"last_modified,omitempty"`
	}{plain: plain(d)}

	if !d.UpdatedAt.IsZero() {
		out.UpdatedAt = &d.UpdatedAt
	}
	return json.Marshal(out)
}
|
||||
|
||||
// Client wraps OpenSearch operations
|
||||
type Client struct {
|
||||
client *opensearch.Client
|
||||
indexName string
|
||||
}
|
||||
|
||||
// NewClient creates a new OpenSearch indexer client
|
||||
func NewClient(url, username, password, indexName string) (*Client, error) {
|
||||
cfg := opensearch.Config{
|
||||
Addresses: []string{url},
|
||||
Username: username,
|
||||
Password: password,
|
||||
}
|
||||
|
||||
client, err := opensearch.NewClient(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Client{
|
||||
client: client,
|
||||
indexName: indexName,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// CreateIndex creates the index with proper mapping
|
||||
func (c *Client) CreateIndex(ctx context.Context) error {
|
||||
// Check if index exists
|
||||
res, err := c.client.Indices.Exists([]string{c.indexName})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode == 200 {
|
||||
// Index already exists
|
||||
return nil
|
||||
}
|
||||
|
||||
// Create index with mapping
|
||||
req := opensearchapi.IndicesCreateRequest{
|
||||
Index: c.indexName,
|
||||
Body: strings.NewReader(IndexMapping),
|
||||
}
|
||||
|
||||
res, err = req.Do(ctx, c.client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// IndexDocument indexes a single document
|
||||
func (c *Client) IndexDocument(ctx context.Context, doc *Document) error {
|
||||
body, err := json.Marshal(doc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req := opensearchapi.IndexRequest{
|
||||
Index: c.indexName,
|
||||
DocumentID: doc.DocID,
|
||||
Body: strings.NewReader(string(body)),
|
||||
Refresh: "false",
|
||||
}
|
||||
|
||||
res, err := req.Do(ctx, c.client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// BulkIndex indexes multiple documents efficiently
|
||||
func (c *Client) BulkIndex(ctx context.Context, docs []Document) error {
|
||||
if len(docs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var builder strings.Builder
|
||||
|
||||
for _, doc := range docs {
|
||||
// Action line
|
||||
meta := map[string]interface{}{
|
||||
"index": map[string]interface{}{
|
||||
"_index": c.indexName,
|
||||
"_id": doc.DocID,
|
||||
},
|
||||
}
|
||||
metaBytes, _ := json.Marshal(meta)
|
||||
builder.Write(metaBytes)
|
||||
builder.WriteString("\n")
|
||||
|
||||
// Document line
|
||||
docBytes, _ := json.Marshal(doc)
|
||||
builder.Write(docBytes)
|
||||
builder.WriteString("\n")
|
||||
}
|
||||
|
||||
req := opensearchapi.BulkRequest{
|
||||
Body: strings.NewReader(builder.String()),
|
||||
}
|
||||
|
||||
res, err := req.Do(ctx, c.client)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Health checks OpenSearch cluster health
|
||||
func (c *Client) Health(ctx context.Context) (string, error) {
|
||||
res, err := c.client.Cluster.Health()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
var result map[string]interface{}
|
||||
if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
status, _ := result["status"].(string)
|
||||
return status, nil
|
||||
}
|
||||
Reference in New Issue
Block a user