package quality import ( "testing" ) func TestNewScorer(t *testing.T) { scorer := NewScorer() if scorer == nil { t.Fatal("Expected non-nil scorer") } } func TestNewScorerWithWeights(t *testing.T) { weights := Weights{ ContentLength: 0.5, HeadingStructure: 0.5, } scorer := NewScorerWithWeights(weights) if scorer.weights.ContentLength != 0.5 { t.Errorf("Expected weight 0.5, got %f", scorer.weights.ContentLength) } } func TestCalculate_HighQualityDocument(t *testing.T) { scorer := NewScorer() features := ContentFeatures{ ContentLength: 5000, HeadingCount: 5, HeadingDepth: 3, LinkDensity: 0.1, AdDensity: 0, TextToHTMLRatio: 0.4, HasTitle: true, HasDescription: true, HasCanonical: true, Language: "de", DateIndicators: []string{"2024-01-15"}, } score := scorer.Calculate(features) if score.Total < 0.8 { t.Errorf("Expected high quality score (>0.8), got %f", score.Total) } } func TestCalculate_LowQualityDocument(t *testing.T) { scorer := NewScorer() features := ContentFeatures{ ContentLength: 100, HeadingCount: 0, LinkDensity: 0.5, AdDensity: 0.2, TextToHTMLRatio: 0.05, HasTitle: false, HasDescription: false, Language: "", } score := scorer.Calculate(features) if score.Total > 0.5 { t.Errorf("Expected low quality score (<0.5), got %f", score.Total) } } func TestCalculateContentLengthScore(t *testing.T) { scorer := NewScorer() tests := []struct { length int minScore float64 maxScore float64 }{ {100, 0.0, 0.2}, // very short {500, 0.5, 0.7}, // short-medium {2000, 0.7, 0.9}, // good {5000, 0.9, 1.0}, // optimal {30000, 0.6, 0.8}, // very long } for _, tt := range tests { t.Run("", func(t *testing.T) { score := scorer.calculateContentLengthScore(tt.length) if score < tt.minScore || score > tt.maxScore { t.Errorf("Length %d: expected score in [%f, %f], got %f", tt.length, tt.minScore, tt.maxScore, score) } }) } } func TestCalculateHeadingScore(t *testing.T) { scorer := NewScorer() // No headings score := scorer.calculateHeadingScore(0, 0, false) if score > 0.1 { t.Errorf("Expected low score for no headings, got %f", score) } // Good heading structure score = scorer.calculateHeadingScore(5, 3, true) if score < 0.9 { t.Errorf("Expected high score for good headings, got %f", score) } } func TestCalculateLinkQualityScore(t *testing.T) { scorer := NewScorer() // Good: low link and ad density score := scorer.calculateLinkQualityScore(0.1, 0) if score < 0.9 { t.Errorf("Expected high score for good link quality, got %f", score) } // Bad: high ad density score = scorer.calculateLinkQualityScore(0.1, 0.2) if score > 0.6 { t.Errorf("Expected low score for high ad density, got %f", score) } } func TestCalculateTextRatioScore(t *testing.T) { scorer := NewScorer() tests := []struct { ratio float64 minScore float64 }{ {0.05, 0.0}, // too low {0.3, 0.9}, // optimal {0.9, 0.5}, // too high (plain text dump) } for _, tt := range tests { score := scorer.calculateTextRatioScore(tt.ratio) if score < tt.minScore { t.Errorf("Ratio %f: expected score >= %f, got %f", tt.ratio, tt.minScore, score) } } } func TestCalculateMetadataScore(t *testing.T) { scorer := NewScorer() // All metadata present score := scorer.calculateMetadataScore(true, true, true) if score != 1.0 { t.Errorf("Expected 1.0 for all metadata, got %f", score) } // No metadata score = scorer.calculateMetadataScore(false, false, false) if score != 0.0 { t.Errorf("Expected 0.0 for no metadata, got %f", score) } // Only title score = scorer.calculateMetadataScore(true, false, false) if score != 0.5 { t.Errorf("Expected 0.5 for only title, got %f", score) } } func TestCalculateLanguageScore(t *testing.T) { scorer := NewScorer() tests := []struct { language string expected float64 }{ {"de", 1.0}, {"german", 1.0}, {"en", 0.8}, {"", 0.5}, {"fr", 0.3}, } for _, tt := range tests { score := scorer.calculateLanguageScore(tt.language) if score != tt.expected { t.Errorf("Language '%s': expected %f, got %f", tt.language, tt.expected, score) } } } func TestCalculateFreshnessScore(t *testing.T) { scorer := NewScorer() // Recent date score := scorer.calculateFreshnessScore([]string{"2024-06-15"}) if score < 0.9 { t.Errorf("Expected high score for recent date, got %f", score) } // Older date score = scorer.calculateFreshnessScore([]string{"2016-01-01"}) if score > 0.8 { t.Errorf("Expected moderate score for 2016, got %f", score) } // No date indicators score = scorer.calculateFreshnessScore(nil) if score != 0.5 { t.Errorf("Expected neutral score for no dates, got %f", score) } } func TestCalculatePDFScore(t *testing.T) { scorer := NewScorer() // Multi-page PDF with good content score := scorer.calculatePDFScore(10, 5000) if score < 0.8 { t.Errorf("Expected high score for good PDF, got %f", score) } // Single page, little content score = scorer.calculatePDFScore(1, 50) if score > 0.6 { t.Errorf("Expected lower score for poor PDF, got %f", score) } } func TestExtractDateIndicators(t *testing.T) { text := "Lehrplan gültig ab 01.08.2023 - Stand: 2024-01-15. Aktualisiert 2024." indicators := ExtractDateIndicators(text) if len(indicators) == 0 { t.Error("Expected to find date indicators") } // Should find at least the year patterns found2024 := false for _, ind := range indicators { if ind == "2024" || ind == "2023" || ind == "2024-01-15" || ind == "01.08.2023" { found2024 = true } } if !found2024 { t.Errorf("Expected to find 2024 or 2023, got: %v", indicators) } } func TestExtractDateIndicators_Empty(t *testing.T) { text := "This text has no dates whatsoever." indicators := ExtractDateIndicators(text) if len(indicators) != 0 { t.Errorf("Expected no indicators, got: %v", indicators) } } func TestCalculate_PDFDocument(t *testing.T) { scorer := NewScorer() features := ContentFeatures{ ContentLength: 3000, HeadingCount: 3, HeadingDepth: 2, Language: "de", IsPDF: true, PageCount: 8, DateIndicators: []string{"2023"}, } score := scorer.Calculate(features) // PDF with 8 pages and good content should score well if score.PDFSpecific < 0.8 { t.Errorf("Expected good PDF-specific score, got %f", score.PDFSpecific) } if score.Total < 0.5 { t.Errorf("Expected reasonable score for PDF, got %f", score.Total) } } func TestCalculate_ScoreClamping(t *testing.T) { scorer := NewScorer() // Even with all perfect scores, total should not exceed 1.0 features := ContentFeatures{ ContentLength: 5000, HeadingCount: 10, HeadingDepth: 4, HasTOC: true, LinkDensity: 0, AdDensity: 0, TextToHTMLRatio: 0.4, HasTitle: true, HasDescription: true, HasCanonical: true, Language: "de", DateIndicators: []string{"2024"}, } score := scorer.Calculate(features) if score.Total > 1.0 { t.Errorf("Score should be clamped to 1.0, got %f", score.Total) } if score.Total < 0 { t.Errorf("Score should not be negative, got %f", score.Total) } } func TestDefaultWeights(t *testing.T) { weights := DefaultWeights() // Sum should be approximately 1.0 sum := weights.ContentLength + weights.HeadingStructure + weights.LinkQuality + weights.TextToHTMLRatio + weights.MetadataPresence + weights.LanguageClarity + weights.ContentFreshness + weights.PDFSpecific if sum < 0.99 || sum > 1.01 { t.Errorf("Default weights should sum to 1.0, got %f", sum) } }