Files
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

327 lines
8.0 KiB
Go

package quality
import (
"regexp"
"strings"
)
// Scorer calculates quality scores for documents. It combines several
// per-signal sub-scores (each in [0,1]) into a weighted total via its
// configured Weights; see Calculate.
type Scorer struct {
	weights Weights // relative contribution of each signal; see DefaultWeights
}
// Weights defines the contribution of each factor to the quality score.
// The default weights (see DefaultWeights) sum to 1.0; custom weights
// that sum to more than 1.0 still yield a total clamped to [0,1] by
// Calculate.
type Weights struct {
	ContentLength    float64 // 0.20 - longer content often more valuable
	HeadingStructure float64 // 0.15 - well-structured documents
	LinkQuality      float64 // 0.15 - low ad/external link density
	TextToHTMLRatio  float64 // 0.15 - content-rich pages
	MetadataPresence float64 // 0.10 - proper title, description
	LanguageClarity  float64 // 0.10 - German content, no mixed languages
	ContentFreshness float64 // 0.10 - indication of update/recency
	PDFSpecific      float64 // 0.05 - PDF-specific quality signals
}
// DefaultWeights returns the default quality score weights.
// The individual weights sum to exactly 1.0, so a document that
// scores 1.0 on every signal receives a total of 1.0.
func DefaultWeights() Weights {
	var w Weights
	w.ContentLength = 0.20
	w.HeadingStructure = 0.15
	w.LinkQuality = 0.15
	w.TextToHTMLRatio = 0.15
	w.MetadataPresence = 0.10
	w.LanguageClarity = 0.10
	w.ContentFreshness = 0.10
	w.PDFSpecific = 0.05
	return w
}
// ContentFeatures holds extracted features for quality scoring.
// Zero values are safe: a missing signal simply produces a lower
// (or neutral) sub-score rather than an error.
type ContentFeatures struct {
	ContentLength   int      // extracted text length in characters (see calculateContentLengthScore thresholds)
	HeadingCount    int      // number of headings found in the document
	HeadingDepth    int      // max heading level depth (h1-h6)
	LinkDensity     float64  // link share of the content; compared against 0.2/0.3 thresholds — assumed to be a 0-1 ratio
	AdDensity       float64  // ad share of the content; any value > 0 is penalized
	TextToHTMLRatio float64  // extracted text vs. raw HTML size; sweet spot 0.2-0.6
	HasTitle        bool     // document declares a title
	HasDescription  bool     // document declares a meta description
	HasCanonical    bool     // document declares a canonical URL
	Language        string   // detected language; matched case-insensitively against de/en names
	IsPDF           bool     // document is a PDF (enables the PDF sub-score)
	PageCount       int      // for PDFs
	HasTOC          bool     // table of contents
	DateIndicators  []string // found date patterns (see ExtractDateIndicators)
}
// Score represents the quality score breakdown. Total is the weighted
// sum of the component scores (clamped to [0,1]); each component is
// itself in [0,1] and reported individually for debugging/tuning.
type Score struct {
	Total            float64 `json:"total"`
	ContentLength    float64 `json:"content_length"`
	HeadingStructure float64 `json:"heading_structure"`
	LinkQuality      float64 `json:"link_quality"`
	TextToHTMLRatio  float64 `json:"text_html_ratio"`
	MetadataPresence float64 `json:"metadata_presence"`
	LanguageClarity  float64 `json:"language_clarity"`
	ContentFreshness float64 `json:"content_freshness"`
	PDFSpecific      float64 `json:"pdf_specific"`
}
// NewScorer creates a quality scorer with default weights.
func NewScorer() *Scorer {
	return NewScorerWithWeights(DefaultWeights())
}
// NewScorerWithWeights creates a scorer with custom weights.
// No validation is performed; Calculate clamps the total to [0,1].
func NewScorerWithWeights(w Weights) *Scorer {
	scorer := new(Scorer)
	scorer.weights = w
	return scorer
}
// Calculate computes the quality score for the given features.
// Every component is scored in [0,1] by its dedicated helper, the
// components are combined via the configured weights, and the total
// is clamped to [0,1] as a safety net for custom weight sets.
func (s *Scorer) Calculate(features ContentFeatures) Score {
	// Non-PDFs receive the full PDF component so they are not
	// penalized for a signal that does not apply to them.
	pdfScore := 1.0
	if features.IsPDF {
		pdfScore = s.calculatePDFScore(features.PageCount, features.ContentLength)
	}

	result := Score{
		ContentLength:    s.calculateContentLengthScore(features.ContentLength),
		HeadingStructure: s.calculateHeadingScore(features.HeadingCount, features.HeadingDepth, features.HasTOC),
		LinkQuality:      s.calculateLinkQualityScore(features.LinkDensity, features.AdDensity),
		TextToHTMLRatio:  s.calculateTextRatioScore(features.TextToHTMLRatio),
		MetadataPresence: s.calculateMetadataScore(features.HasTitle, features.HasDescription, features.HasCanonical),
		LanguageClarity:  s.calculateLanguageScore(features.Language),
		ContentFreshness: s.calculateFreshnessScore(features.DateIndicators),
		PDFSpecific:      pdfScore,
	}

	// Weighted total over all components.
	w := s.weights
	total := result.ContentLength*w.ContentLength +
		result.HeadingStructure*w.HeadingStructure +
		result.LinkQuality*w.LinkQuality +
		result.TextToHTMLRatio*w.TextToHTMLRatio +
		result.MetadataPresence*w.MetadataPresence +
		result.LanguageClarity*w.LanguageClarity +
		result.ContentFreshness*w.ContentFreshness +
		result.PDFSpecific*w.PDFSpecific

	// Clamp to [0,1]; only reachable when custom weights sum past 1.0.
	if total > 1.0 {
		total = 1.0
	}
	if total < 0 {
		total = 0
	}
	result.Total = total
	return result
}
// calculateContentLengthScore scores based on content length.
// Optimal range is 1000-10000 characters; very short content (<500)
// is low quality, and very long content (>20000) is mildly penalized
// because it may contain noise/boilerplate.
func (s *Scorer) calculateContentLengthScore(length int) float64 {
	// Upper-bound table: the first bracket whose limit exceeds the
	// length determines the score.
	brackets := []struct {
		below int
		score float64
	}{
		{200, 0.1},
		{500, 0.3},
		{1000, 0.6},
		{3000, 0.8},
		{10000, 1.0},
		{20000, 0.9},
	}
	for _, b := range brackets {
		if length < b.below {
			return b.score
		}
	}
	return 0.7 // very long documents might have quality issues
}
// calculateHeadingScore scores heading structure. Points are awarded
// for having any headings (0.4), having at least three (0.2), using
// at least two heading levels (0.2), and having a table of contents
// (0.2); the sum is capped at 1.0.
func (s *Scorer) calculateHeadingScore(count, depth int, hasTOC bool) float64 {
	total := 0.0
	if count > 0 {
		total += 0.4 // headings present at all
	}
	if count >= 3 {
		total += 0.2 // enough headings for real structure
	}
	if depth >= 2 {
		total += 0.2 // depth variety indicates a proper hierarchy
	}
	if hasTOC {
		total += 0.2 // a TOC indicates a well-structured document
	}
	if total > 1.0 {
		total = 1.0
	}
	return total
}
// calculateLinkQualityScore scores based on link and ad density.
// Starts from a perfect 1.0 and subtracts penalties: high link
// density suggests a link farm/navigation page, and any ad density
// at all is penalized, scaling with severity. The result is floored
// at 0.
func (s *Scorer) calculateLinkQualityScore(linkDensity, adDensity float64) float64 {
	quality := 1.0

	// Link density penalty (only the most severe tier applies).
	switch {
	case linkDensity > 0.3:
		quality -= 0.3
	case linkDensity > 0.2:
		quality -= 0.1
	}

	// Ad density penalty — any ads at all cost something.
	switch {
	case adDensity > 0.1:
		quality -= 0.4
	case adDensity > 0.05:
		quality -= 0.2
	case adDensity > 0:
		quality -= 0.1
	}

	if quality < 0 {
		return 0
	}
	return quality
}
// calculateTextRatioScore scores the text-to-HTML ratio.
// The sweet spot is 0.2-0.6: lower ratios mean the page is mostly
// markup, while very high ratios suggest an unstructured text dump.
func (s *Scorer) calculateTextRatioScore(ratio float64) float64 {
	if ratio < 0.1 {
		return 0.3 // almost entirely markup
	}
	if ratio < 0.2 {
		return 0.6 // markup-heavy but usable
	}
	if ratio < 0.6 {
		return 1.0 // ideal content-to-markup balance
	}
	if ratio < 0.8 {
		return 0.8 // text-heavy, still fine
	}
	return 0.6 // likely a plain text dump
}
// calculateMetadataScore scores presence of document metadata.
// Title contributes 0.5, description 0.3, and canonical URL 0.2,
// so a fully annotated document scores exactly 1.0.
func (s *Scorer) calculateMetadataScore(hasTitle, hasDescription, hasCanonical bool) float64 {
	signals := []struct {
		present bool
		weight  float64
	}{
		{hasTitle, 0.5},
		{hasDescription, 0.3},
		{hasCanonical, 0.2},
	}
	total := 0.0
	for _, sig := range signals {
		if sig.present {
			total += sig.weight
		}
	}
	return total
}
// calculateLanguageScore scores language clarity. German content is
// the target (1.0), English is acceptable (0.8), an unknown/empty
// language is neutral (0.5), and anything else scores low (0.3).
func (s *Scorer) calculateLanguageScore(language string) float64 {
	switch lang := strings.ToLower(language); {
	case lang == "de" || lang == "german" || lang == "deutsch":
		return 1.0
	case lang == "en" || lang == "english" || lang == "englisch":
		return 0.8 // English is acceptable
	case lang == "":
		return 0.5 // unknown — no penalty, no bonus
	default:
		return 0.3 // other languages
	}
}
// Freshness year patterns, compiled once at package scope instead of
// on every call (regexp compilation is comparatively expensive).
var (
	// "Recent": any year 2020-2099. The previous pattern (`202[0-5]`)
	// contradicted its own "2020+" intent and stopped matching after
	// 2025, silently demoting newer content to the "old" bucket.
	freshnessRecentYear = regexp.MustCompile(`20[2-9][0-9]`)
	// "Modern": 2015-2019.
	freshnessModernYear = regexp.MustCompile(`201[5-9]`)
)

// calculateFreshnessScore scores content freshness based on date
// strings previously extracted from the document (see
// ExtractDateIndicators). It returns 1.0 when any indicator contains
// a recent year (2020+), 0.7 for 2015-2019, 0.4 when only older
// dates are present, and a neutral 0.5 when no indicators were found.
func (s *Scorer) calculateFreshnessScore(dateIndicators []string) float64 {
	if len(dateIndicators) == 0 {
		return 0.5 // no date signal either way — stay neutral
	}
	for _, indicator := range dateIndicators {
		if freshnessRecentYear.MatchString(indicator) {
			return 1.0
		}
	}
	for _, indicator := range dateIndicators {
		if freshnessModernYear.MatchString(indicator) {
			return 0.7
		}
	}
	// Only old (or unrecognized-format) dates found.
	return 0.4
}
// calculatePDFScore scores PDF-specific quality signals. Every PDF
// starts from a 0.5 base; multi-page documents (+0.2), longer
// documents with more than five pages (+0.1), and successful text
// extraction (+0.2) raise it, capped at 1.0.
func (s *Scorer) calculatePDFScore(pageCount, contentLength int) float64 {
	total := 0.5 // base score for any PDF

	// More pages usually means a substantial document.
	if pageCount > 1 {
		total += 0.2
	}
	if pageCount > 5 {
		total += 0.1
	}

	// Non-trivial extracted text means extraction actually worked.
	if contentLength > 100 {
		total += 0.2
	}

	if total > 1.0 {
		total = 1.0
	}
	return total
}
// Date-indicator patterns, compiled once at package scope instead of
// on every ExtractDateIndicators call.
var dateIndicatorPatterns = []*regexp.Regexp{
	regexp.MustCompile(`\d{2}\.\d{2}\.\d{4}`), // German style: DD.MM.YYYY
	regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),   // ISO style: YYYY-MM-DD
	// Bare years 2000-2099. The previous pattern (`20[012][0-9]`)
	// only covered 2000-2029 and would miss later years entirely.
	regexp.MustCompile(`\b20[0-9][0-9]\b`),
}

// ExtractDateIndicators finds date patterns in text: DD.MM.YYYY,
// YYYY-MM-DD, and bare four-digit years 2000-2099. At most five
// matches per pattern are collected to bound work on large documents.
// Returns nil when no patterns match.
func ExtractDateIndicators(text string) []string {
	var indicators []string
	for _, pattern := range dateIndicatorPatterns {
		indicators = append(indicators, pattern.FindAllString(text, 5)...)
	}
	return indicators
}