fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
326
edu-search-service/internal/quality/quality.go
Normal file
326
edu-search-service/internal/quality/quality.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package quality
|
||||
|
||||
import (
	"regexp"
	"strconv"
	"strings"
	"time"
)
|
||||
|
||||
// Scorer calculates quality scores for documents.
// It combines several weighted content signals (length, structure, links,
// metadata, language, freshness, PDF traits) into a single 0-1 total.
type Scorer struct {
	// weights controls how much each feature contributes to the total score.
	weights Weights
}
|
||||
|
||||
// Weights defines the contribution of each factor to the quality score.
// The defaults (see DefaultWeights) sum to 1.0 so that a document scoring
// 1.0 on every factor yields a total of exactly 1.0.
type Weights struct {
	ContentLength    float64 // 0.20 - longer content often more valuable
	HeadingStructure float64 // 0.15 - well-structured documents
	LinkQuality      float64 // 0.15 - low ad/external link density
	TextToHTMLRatio  float64 // 0.15 - content-rich pages
	MetadataPresence float64 // 0.10 - proper title, description
	LanguageClarity  float64 // 0.10 - German content, no mixed languages
	ContentFreshness float64 // 0.10 - indication of update/recency
	PDFSpecific      float64 // 0.05 - PDF-specific quality signals
}
|
||||
|
||||
// DefaultWeights returns the default quality score weights
|
||||
func DefaultWeights() Weights {
|
||||
return Weights{
|
||||
ContentLength: 0.20,
|
||||
HeadingStructure: 0.15,
|
||||
LinkQuality: 0.15,
|
||||
TextToHTMLRatio: 0.15,
|
||||
MetadataPresence: 0.10,
|
||||
LanguageClarity: 0.10,
|
||||
ContentFreshness: 0.10,
|
||||
PDFSpecific: 0.05,
|
||||
}
|
||||
}
|
||||
|
||||
// ContentFeatures holds extracted features for quality scoring.
// Callers populate it from a crawled document; zero values are treated as
// "feature absent" by the scoring functions.
type ContentFeatures struct {
	ContentLength   int     // extracted text length in characters
	HeadingCount    int     // number of headings found
	HeadingDepth    int     // max heading level depth (h1-h6)
	LinkDensity     float64 // fraction of content that is links
	AdDensity       float64 // fraction of content that is ad-related
	TextToHTMLRatio float64 // text bytes relative to total markup
	HasTitle        bool
	HasDescription  bool
	HasCanonical    bool
	Language        string   // detected language code or name (e.g. "de")
	IsPDF           bool     // enables the PDF-specific scoring branch
	PageCount       int      // for PDFs
	HasTOC          bool     // table of contents
	DateIndicators  []string // found date patterns
}
|
||||
|
||||
// Score represents the quality score breakdown.
// Each component is in [0, 1]; Total is the weighted sum, clamped to [0, 1].
type Score struct {
	Total            float64 `json:"total"`
	ContentLength    float64 `json:"content_length"`
	HeadingStructure float64 `json:"heading_structure"`
	LinkQuality      float64 `json:"link_quality"`
	TextToHTMLRatio  float64 `json:"text_html_ratio"`
	MetadataPresence float64 `json:"metadata_presence"`
	LanguageClarity  float64 `json:"language_clarity"`
	ContentFreshness float64 `json:"content_freshness"`
	PDFSpecific      float64 `json:"pdf_specific"`
}
|
||||
|
||||
// NewScorer creates a quality scorer with default weights
|
||||
func NewScorer() *Scorer {
|
||||
return &Scorer{weights: DefaultWeights()}
|
||||
}
|
||||
|
||||
// NewScorerWithWeights creates a scorer with custom weights
|
||||
func NewScorerWithWeights(w Weights) *Scorer {
|
||||
return &Scorer{weights: w}
|
||||
}
|
||||
|
||||
// Calculate computes the quality score for given features
|
||||
func (s *Scorer) Calculate(features ContentFeatures) Score {
|
||||
score := Score{}
|
||||
|
||||
// 1. Content Length Score (0-1)
|
||||
score.ContentLength = s.calculateContentLengthScore(features.ContentLength)
|
||||
|
||||
// 2. Heading Structure Score (0-1)
|
||||
score.HeadingStructure = s.calculateHeadingScore(features.HeadingCount, features.HeadingDepth, features.HasTOC)
|
||||
|
||||
// 3. Link Quality Score (0-1)
|
||||
score.LinkQuality = s.calculateLinkQualityScore(features.LinkDensity, features.AdDensity)
|
||||
|
||||
// 4. Text to HTML Ratio Score (0-1)
|
||||
score.TextToHTMLRatio = s.calculateTextRatioScore(features.TextToHTMLRatio)
|
||||
|
||||
// 5. Metadata Presence Score (0-1)
|
||||
score.MetadataPresence = s.calculateMetadataScore(features.HasTitle, features.HasDescription, features.HasCanonical)
|
||||
|
||||
// 6. Language Clarity Score (0-1)
|
||||
score.LanguageClarity = s.calculateLanguageScore(features.Language)
|
||||
|
||||
// 7. Content Freshness Score (0-1)
|
||||
score.ContentFreshness = s.calculateFreshnessScore(features.DateIndicators)
|
||||
|
||||
// 8. PDF-Specific Score (0-1)
|
||||
if features.IsPDF {
|
||||
score.PDFSpecific = s.calculatePDFScore(features.PageCount, features.ContentLength)
|
||||
} else {
|
||||
score.PDFSpecific = 1.0 // full score for non-PDFs (no penalty)
|
||||
}
|
||||
|
||||
// Calculate weighted total
|
||||
score.Total = score.ContentLength*s.weights.ContentLength +
|
||||
score.HeadingStructure*s.weights.HeadingStructure +
|
||||
score.LinkQuality*s.weights.LinkQuality +
|
||||
score.TextToHTMLRatio*s.weights.TextToHTMLRatio +
|
||||
score.MetadataPresence*s.weights.MetadataPresence +
|
||||
score.LanguageClarity*s.weights.LanguageClarity +
|
||||
score.ContentFreshness*s.weights.ContentFreshness +
|
||||
score.PDFSpecific*s.weights.PDFSpecific
|
||||
|
||||
// Clamp to 0-1
|
||||
if score.Total > 1.0 {
|
||||
score.Total = 1.0
|
||||
}
|
||||
if score.Total < 0 {
|
||||
score.Total = 0
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// calculateContentLengthScore scores based on content length
|
||||
func (s *Scorer) calculateContentLengthScore(length int) float64 {
|
||||
// Optimal range: 1000-10000 characters
|
||||
// Too short (<500): low quality
|
||||
// Too long (>20000): might be noise/boilerplate
|
||||
switch {
|
||||
case length < 200:
|
||||
return 0.1
|
||||
case length < 500:
|
||||
return 0.3
|
||||
case length < 1000:
|
||||
return 0.6
|
||||
case length < 3000:
|
||||
return 0.8
|
||||
case length < 10000:
|
||||
return 1.0
|
||||
case length < 20000:
|
||||
return 0.9
|
||||
default:
|
||||
return 0.7 // very long documents might have quality issues
|
||||
}
|
||||
}
|
||||
|
||||
// calculateHeadingScore scores heading structure
|
||||
func (s *Scorer) calculateHeadingScore(count, depth int, hasTOC bool) float64 {
|
||||
score := 0.0
|
||||
|
||||
// Headings present
|
||||
if count > 0 {
|
||||
score += 0.4
|
||||
}
|
||||
if count >= 3 {
|
||||
score += 0.2
|
||||
}
|
||||
|
||||
// Depth variety (proper hierarchy)
|
||||
if depth >= 2 {
|
||||
score += 0.2
|
||||
}
|
||||
|
||||
// Table of contents indicates well-structured document
|
||||
if hasTOC {
|
||||
score += 0.2
|
||||
}
|
||||
|
||||
if score > 1.0 {
|
||||
score = 1.0
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// calculateLinkQualityScore scores based on link/ad density
|
||||
func (s *Scorer) calculateLinkQualityScore(linkDensity, adDensity float64) float64 {
|
||||
score := 1.0
|
||||
|
||||
// High link density is bad
|
||||
if linkDensity > 0.3 {
|
||||
score -= 0.3
|
||||
} else if linkDensity > 0.2 {
|
||||
score -= 0.1
|
||||
}
|
||||
|
||||
// Any ad density is bad
|
||||
if adDensity > 0.1 {
|
||||
score -= 0.4
|
||||
} else if adDensity > 0.05 {
|
||||
score -= 0.2
|
||||
} else if adDensity > 0 {
|
||||
score -= 0.1
|
||||
}
|
||||
|
||||
if score < 0 {
|
||||
score = 0
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// calculateTextRatioScore scores text to HTML ratio
|
||||
func (s *Scorer) calculateTextRatioScore(ratio float64) float64 {
|
||||
// Good ratio: 0.2-0.6
|
||||
// Too low: too much markup
|
||||
// Too high: might be plain text dump
|
||||
switch {
|
||||
case ratio < 0.1:
|
||||
return 0.3
|
||||
case ratio < 0.2:
|
||||
return 0.6
|
||||
case ratio < 0.6:
|
||||
return 1.0
|
||||
case ratio < 0.8:
|
||||
return 0.8
|
||||
default:
|
||||
return 0.6
|
||||
}
|
||||
}
|
||||
|
||||
// calculateMetadataScore scores presence of metadata
|
||||
func (s *Scorer) calculateMetadataScore(hasTitle, hasDescription, hasCanonical bool) float64 {
|
||||
score := 0.0
|
||||
|
||||
if hasTitle {
|
||||
score += 0.5
|
||||
}
|
||||
if hasDescription {
|
||||
score += 0.3
|
||||
}
|
||||
if hasCanonical {
|
||||
score += 0.2
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// calculateLanguageScore scores language clarity
|
||||
func (s *Scorer) calculateLanguageScore(language string) float64 {
|
||||
switch strings.ToLower(language) {
|
||||
case "de", "german", "deutsch":
|
||||
return 1.0
|
||||
case "en", "english", "englisch":
|
||||
return 0.8 // English is acceptable
|
||||
case "":
|
||||
return 0.5 // unknown
|
||||
default:
|
||||
return 0.3 // other languages
|
||||
}
|
||||
}
|
||||
|
||||
// calculateFreshnessScore scores content freshness indicators
|
||||
func (s *Scorer) calculateFreshnessScore(dateIndicators []string) float64 {
|
||||
if len(dateIndicators) == 0 {
|
||||
return 0.5 // neutral
|
||||
}
|
||||
|
||||
// Check for recent years (2020+)
|
||||
recentYearPattern := regexp.MustCompile(`202[0-5]`)
|
||||
for _, indicator := range dateIndicators {
|
||||
if recentYearPattern.MatchString(indicator) {
|
||||
return 1.0
|
||||
}
|
||||
}
|
||||
|
||||
// Check for 2015-2019
|
||||
modernPattern := regexp.MustCompile(`201[5-9]`)
|
||||
for _, indicator := range dateIndicators {
|
||||
if modernPattern.MatchString(indicator) {
|
||||
return 0.7
|
||||
}
|
||||
}
|
||||
|
||||
// Older content
|
||||
return 0.4
|
||||
}
|
||||
|
||||
// calculatePDFScore scores PDF-specific quality
|
||||
func (s *Scorer) calculatePDFScore(pageCount, contentLength int) float64 {
|
||||
score := 0.5 // base
|
||||
|
||||
// Page count bonus
|
||||
if pageCount > 1 {
|
||||
score += 0.2
|
||||
}
|
||||
if pageCount > 5 {
|
||||
score += 0.1
|
||||
}
|
||||
|
||||
// Text extraction success
|
||||
if contentLength > 100 {
|
||||
score += 0.2
|
||||
}
|
||||
|
||||
if score > 1.0 {
|
||||
score = 1.0
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// extractionDatePatterns are the date shapes recognized by
// ExtractDateIndicators: German DD.MM.YYYY, ISO YYYY-MM-DD, and bare years
// 2000-2029. Compiled once at package scope — the previous implementation
// recompiled all three regexes on every call, which is wasteful on a path
// invoked per crawled document.
var extractionDatePatterns = []*regexp.Regexp{
	regexp.MustCompile(`\d{2}\.\d{2}\.\d{4}`),
	regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),
	regexp.MustCompile(`\b20[012][0-9]\b`), // years 2000-2029
}

// ExtractDateIndicators finds date patterns in text.
// Results are grouped by pattern in declaration order; matches are capped at
// five per pattern so pathological inputs stay cheap. A full date and its
// year may both be returned (e.g. "2024-01-15" and "2024").
func ExtractDateIndicators(text string) []string {
	var indicators []string
	for _, pattern := range extractionDatePatterns {
		indicators = append(indicators, pattern.FindAllString(text, 5)...)
	}
	return indicators
}
|
||||
333
edu-search-service/internal/quality/quality_test.go
Normal file
333
edu-search-service/internal/quality/quality_test.go
Normal file
@@ -0,0 +1,333 @@
|
||||
package quality
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewScorer(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
if scorer == nil {
|
||||
t.Fatal("Expected non-nil scorer")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewScorerWithWeights(t *testing.T) {
|
||||
weights := Weights{
|
||||
ContentLength: 0.5,
|
||||
HeadingStructure: 0.5,
|
||||
}
|
||||
scorer := NewScorerWithWeights(weights)
|
||||
|
||||
if scorer.weights.ContentLength != 0.5 {
|
||||
t.Errorf("Expected weight 0.5, got %f", scorer.weights.ContentLength)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculate_HighQualityDocument(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
features := ContentFeatures{
|
||||
ContentLength: 5000,
|
||||
HeadingCount: 5,
|
||||
HeadingDepth: 3,
|
||||
LinkDensity: 0.1,
|
||||
AdDensity: 0,
|
||||
TextToHTMLRatio: 0.4,
|
||||
HasTitle: true,
|
||||
HasDescription: true,
|
||||
HasCanonical: true,
|
||||
Language: "de",
|
||||
DateIndicators: []string{"2024-01-15"},
|
||||
}
|
||||
|
||||
score := scorer.Calculate(features)
|
||||
|
||||
if score.Total < 0.8 {
|
||||
t.Errorf("Expected high quality score (>0.8), got %f", score.Total)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculate_LowQualityDocument(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
features := ContentFeatures{
|
||||
ContentLength: 100,
|
||||
HeadingCount: 0,
|
||||
LinkDensity: 0.5,
|
||||
AdDensity: 0.2,
|
||||
TextToHTMLRatio: 0.05,
|
||||
HasTitle: false,
|
||||
HasDescription: false,
|
||||
Language: "",
|
||||
}
|
||||
|
||||
score := scorer.Calculate(features)
|
||||
|
||||
if score.Total > 0.5 {
|
||||
t.Errorf("Expected low quality score (<0.5), got %f", score.Total)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateContentLengthScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
tests := []struct {
|
||||
length int
|
||||
minScore float64
|
||||
maxScore float64
|
||||
}{
|
||||
{100, 0.0, 0.2}, // very short
|
||||
{500, 0.5, 0.7}, // short-medium
|
||||
{2000, 0.7, 0.9}, // good
|
||||
{5000, 0.9, 1.0}, // optimal
|
||||
{30000, 0.6, 0.8}, // very long
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run("", func(t *testing.T) {
|
||||
score := scorer.calculateContentLengthScore(tt.length)
|
||||
if score < tt.minScore || score > tt.maxScore {
|
||||
t.Errorf("Length %d: expected score in [%f, %f], got %f",
|
||||
tt.length, tt.minScore, tt.maxScore, score)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateHeadingScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// No headings
|
||||
score := scorer.calculateHeadingScore(0, 0, false)
|
||||
if score > 0.1 {
|
||||
t.Errorf("Expected low score for no headings, got %f", score)
|
||||
}
|
||||
|
||||
// Good heading structure
|
||||
score = scorer.calculateHeadingScore(5, 3, true)
|
||||
if score < 0.9 {
|
||||
t.Errorf("Expected high score for good headings, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLinkQualityScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// Good: low link and ad density
|
||||
score := scorer.calculateLinkQualityScore(0.1, 0)
|
||||
if score < 0.9 {
|
||||
t.Errorf("Expected high score for good link quality, got %f", score)
|
||||
}
|
||||
|
||||
// Bad: high ad density
|
||||
score = scorer.calculateLinkQualityScore(0.1, 0.2)
|
||||
if score > 0.6 {
|
||||
t.Errorf("Expected low score for high ad density, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateTextRatioScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
tests := []struct {
|
||||
ratio float64
|
||||
minScore float64
|
||||
}{
|
||||
{0.05, 0.0}, // too low
|
||||
{0.3, 0.9}, // optimal
|
||||
{0.9, 0.5}, // too high (plain text dump)
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
score := scorer.calculateTextRatioScore(tt.ratio)
|
||||
if score < tt.minScore {
|
||||
t.Errorf("Ratio %f: expected score >= %f, got %f", tt.ratio, tt.minScore, score)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateMetadataScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// All metadata present
|
||||
score := scorer.calculateMetadataScore(true, true, true)
|
||||
if score != 1.0 {
|
||||
t.Errorf("Expected 1.0 for all metadata, got %f", score)
|
||||
}
|
||||
|
||||
// No metadata
|
||||
score = scorer.calculateMetadataScore(false, false, false)
|
||||
if score != 0.0 {
|
||||
t.Errorf("Expected 0.0 for no metadata, got %f", score)
|
||||
}
|
||||
|
||||
// Only title
|
||||
score = scorer.calculateMetadataScore(true, false, false)
|
||||
if score != 0.5 {
|
||||
t.Errorf("Expected 0.5 for only title, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLanguageScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
tests := []struct {
|
||||
language string
|
||||
expected float64
|
||||
}{
|
||||
{"de", 1.0},
|
||||
{"german", 1.0},
|
||||
{"en", 0.8},
|
||||
{"", 0.5},
|
||||
{"fr", 0.3},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
score := scorer.calculateLanguageScore(tt.language)
|
||||
if score != tt.expected {
|
||||
t.Errorf("Language '%s': expected %f, got %f", tt.language, tt.expected, score)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateFreshnessScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// Recent date
|
||||
score := scorer.calculateFreshnessScore([]string{"2024-06-15"})
|
||||
if score < 0.9 {
|
||||
t.Errorf("Expected high score for recent date, got %f", score)
|
||||
}
|
||||
|
||||
// Older date
|
||||
score = scorer.calculateFreshnessScore([]string{"2016-01-01"})
|
||||
if score > 0.8 {
|
||||
t.Errorf("Expected moderate score for 2016, got %f", score)
|
||||
}
|
||||
|
||||
// No date indicators
|
||||
score = scorer.calculateFreshnessScore(nil)
|
||||
if score != 0.5 {
|
||||
t.Errorf("Expected neutral score for no dates, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculatePDFScore(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// Multi-page PDF with good content
|
||||
score := scorer.calculatePDFScore(10, 5000)
|
||||
if score < 0.8 {
|
||||
t.Errorf("Expected high score for good PDF, got %f", score)
|
||||
}
|
||||
|
||||
// Single page, little content
|
||||
score = scorer.calculatePDFScore(1, 50)
|
||||
if score > 0.6 {
|
||||
t.Errorf("Expected lower score for poor PDF, got %f", score)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractDateIndicators(t *testing.T) {
|
||||
text := "Lehrplan gültig ab 01.08.2023 - Stand: 2024-01-15. Aktualisiert 2024."
|
||||
|
||||
indicators := ExtractDateIndicators(text)
|
||||
|
||||
if len(indicators) == 0 {
|
||||
t.Error("Expected to find date indicators")
|
||||
}
|
||||
|
||||
// Should find at least the year patterns
|
||||
found2024 := false
|
||||
for _, ind := range indicators {
|
||||
if ind == "2024" || ind == "2023" || ind == "2024-01-15" || ind == "01.08.2023" {
|
||||
found2024 = true
|
||||
}
|
||||
}
|
||||
|
||||
if !found2024 {
|
||||
t.Errorf("Expected to find 2024 or 2023, got: %v", indicators)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractDateIndicators_Empty(t *testing.T) {
|
||||
text := "This text has no dates whatsoever."
|
||||
|
||||
indicators := ExtractDateIndicators(text)
|
||||
|
||||
if len(indicators) != 0 {
|
||||
t.Errorf("Expected no indicators, got: %v", indicators)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculate_PDFDocument(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
features := ContentFeatures{
|
||||
ContentLength: 3000,
|
||||
HeadingCount: 3,
|
||||
HeadingDepth: 2,
|
||||
Language: "de",
|
||||
IsPDF: true,
|
||||
PageCount: 8,
|
||||
DateIndicators: []string{"2023"},
|
||||
}
|
||||
|
||||
score := scorer.Calculate(features)
|
||||
|
||||
// PDF with 8 pages and good content should score well
|
||||
if score.PDFSpecific < 0.8 {
|
||||
t.Errorf("Expected good PDF-specific score, got %f", score.PDFSpecific)
|
||||
}
|
||||
|
||||
if score.Total < 0.5 {
|
||||
t.Errorf("Expected reasonable score for PDF, got %f", score.Total)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculate_ScoreClamping(t *testing.T) {
|
||||
scorer := NewScorer()
|
||||
|
||||
// Even with all perfect scores, total should not exceed 1.0
|
||||
features := ContentFeatures{
|
||||
ContentLength: 5000,
|
||||
HeadingCount: 10,
|
||||
HeadingDepth: 4,
|
||||
HasTOC: true,
|
||||
LinkDensity: 0,
|
||||
AdDensity: 0,
|
||||
TextToHTMLRatio: 0.4,
|
||||
HasTitle: true,
|
||||
HasDescription: true,
|
||||
HasCanonical: true,
|
||||
Language: "de",
|
||||
DateIndicators: []string{"2024"},
|
||||
}
|
||||
|
||||
score := scorer.Calculate(features)
|
||||
|
||||
if score.Total > 1.0 {
|
||||
t.Errorf("Score should be clamped to 1.0, got %f", score.Total)
|
||||
}
|
||||
if score.Total < 0 {
|
||||
t.Errorf("Score should not be negative, got %f", score.Total)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultWeights(t *testing.T) {
|
||||
weights := DefaultWeights()
|
||||
|
||||
// Sum should be approximately 1.0
|
||||
sum := weights.ContentLength +
|
||||
weights.HeadingStructure +
|
||||
weights.LinkQuality +
|
||||
weights.TextToHTMLRatio +
|
||||
weights.MetadataPresence +
|
||||
weights.LanguageClarity +
|
||||
weights.ContentFreshness +
|
||||
weights.PDFSpecific
|
||||
|
||||
if sum < 0.99 || sum > 1.01 {
|
||||
t.Errorf("Default weights should sum to 1.0, got %f", sum)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user