// breakpilot-lehrer/edu-search-service/internal/quality/quality_test.go

package quality

import (
	"testing"
)

// TestNewScorer verifies that NewScorer does not return nil.
func TestNewScorer(t *testing.T) {
	if s := NewScorer(); s == nil {
		t.Fatal("Expected non-nil scorer")
	}
}

// TestNewScorerWithWeights checks that each weight handed to
// NewScorerWithWeights is stored on the scorer unchanged.
//
// The two weights are deliberately different so that a swapped or dropped
// field assignment is detected.
func TestNewScorerWithWeights(t *testing.T) {
	weights := Weights{
		ContentLength:    0.6,
		HeadingStructure: 0.4,
	}
	scorer := NewScorerWithWeights(weights)

	if scorer.weights.ContentLength != weights.ContentLength {
		t.Errorf("ContentLength: expected weight %f, got %f",
			weights.ContentLength, scorer.weights.ContentLength)
	}
	if scorer.weights.HeadingStructure != weights.HeadingStructure {
		t.Errorf("HeadingStructure: expected weight %f, got %f",
			weights.HeadingStructure, scorer.weights.HeadingStructure)
	}
}

func TestCalculate_HighQualityDocument(t *testing.T) {
	scorer := NewScorer()

	features := ContentFeatures{
		ContentLength:   5000,
		HeadingCount:    5,
		HeadingDepth:    3,
		LinkDensity:     0.1,
		AdDensity:       0,
		TextToHTMLRatio: 0.4,
		HasTitle:        true,
		HasDescription:  true,
		HasCanonical:    true,
		Language:        "de",
		DateIndicators:  []string{"2024-01-15"},
	}

	score := scorer.Calculate(features)

	if score.Total < 0.8 {
		t.Errorf("Expected high quality score (>0.8), got %f", score.Total)
	}
}

func TestCalculate_LowQualityDocument(t *testing.T) {
	scorer := NewScorer()

	features := ContentFeatures{
		ContentLength:   100,
		HeadingCount:    0,
		LinkDensity:     0.5,
		AdDensity:       0.2,
		TextToHTMLRatio: 0.05,
		HasTitle:        false,
		HasDescription:  false,
		Language:        "",
	}

	score := scorer.Calculate(features)

	if score.Total > 0.5 {
		t.Errorf("Expected low quality score (<0.5), got %f", score.Total)
	}
}

// TestCalculateContentLengthScore checks that calculateContentLengthScore
// lands inside an expected [minScore, maxScore] band for a range of content
// lengths.
//
// Each case runs as a named subtest so a failure is reported under a
// descriptive name and can be selected with -run, instead of showing up as
// an anonymous "#00", "#01", ...
func TestCalculateContentLengthScore(t *testing.T) {
	scorer := NewScorer()

	tests := []struct {
		name     string
		length   int
		minScore float64
		maxScore float64
	}{
		{"very short", 100, 0.0, 0.2},
		{"short-medium", 500, 0.5, 0.7},
		{"good", 2000, 0.7, 0.9},
		{"optimal", 5000, 0.9, 1.0},
		{"very long", 30000, 0.6, 0.8},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			score := scorer.calculateContentLengthScore(tt.length)
			if score < tt.minScore || score > tt.maxScore {
				t.Errorf("Length %d: expected score in [%f, %f], got %f",
					tt.length, tt.minScore, tt.maxScore, score)
			}
		})
	}
}

// TestCalculateHeadingScore checks both ends of calculateHeadingScore: a
// call with all-zero/false arguments must score at most 0.1, and the call
// (5, 3, true) must score at least 0.9.
func TestCalculateHeadingScore(t *testing.T) {
	s := NewScorer()

	// No headings at all.
	if none := s.calculateHeadingScore(0, 0, false); none > 0.1 {
		t.Errorf("Expected low score for no headings, got %f", none)
	}

	// Well-structured headings.
	if good := s.calculateHeadingScore(5, 3, true); good < 0.9 {
		t.Errorf("Expected high score for good headings, got %f", good)
	}
}

// TestCalculateLinkQualityScore checks that low link density with no ads
// scores at least 0.9, and that an ad density of 0.2 pulls the score down
// to 0.6 or below.
func TestCalculateLinkQualityScore(t *testing.T) {
	s := NewScorer()

	// Few links, no ads.
	if clean := s.calculateLinkQualityScore(0.1, 0); clean < 0.9 {
		t.Errorf("Expected high score for good link quality, got %f", clean)
	}

	// Same link density, but ad-heavy.
	if adHeavy := s.calculateLinkQualityScore(0.1, 0.2); adHeavy > 0.6 {
		t.Errorf("Expected low score for high ad density, got %f", adHeavy)
	}
}

// TestCalculateTextRatioScore checks that calculateTextRatioScore does not
// fall below a per-case lower bound for several text-to-HTML ratios.
func TestCalculateTextRatioScore(t *testing.T) {
	s := NewScorer()

	cases := []struct {
		ratio float64
		floor float64
	}{
		{0.05, 0.0}, // too low
		{0.3, 0.9},  // optimal
		{0.9, 0.5},  // too high (plain text dump)
	}

	for i := range cases {
		c := cases[i]
		got := s.calculateTextRatioScore(c.ratio)
		if got < c.floor {
			t.Errorf("Ratio %f: expected score >= %f, got %f", c.ratio, c.floor, got)
		}
	}
}

// TestCalculateMetadataScore pins the exact scores expected from
// calculateMetadataScore for three flag combinations: all true (1.0),
// all false (0.0), and only the first flag true (0.5).
func TestCalculateMetadataScore(t *testing.T) {
	s := NewScorer()

	// Everything present.
	if all := s.calculateMetadataScore(true, true, true); all != 1.0 {
		t.Errorf("Expected 1.0 for all metadata, got %f", all)
	}

	// Nothing present.
	if none := s.calculateMetadataScore(false, false, false); none != 0.0 {
		t.Errorf("Expected 0.0 for no metadata, got %f", none)
	}

	// Title only.
	if titleOnly := s.calculateMetadataScore(true, false, false); titleOnly != 0.5 {
		t.Errorf("Expected 0.5 for only title, got %f", titleOnly)
	}
}

// TestCalculateLanguageScore pins the exact score expected from
// calculateLanguageScore for each language tag in the table.
func TestCalculateLanguageScore(t *testing.T) {
	s := NewScorer()

	cases := []struct {
		lang string
		want float64
	}{
		{lang: "de", want: 1.0},
		{lang: "german", want: 1.0},
		{lang: "en", want: 0.8},
		{lang: "", want: 0.5},
		{lang: "fr", want: 0.3},
	}

	for _, c := range cases {
		if got := s.calculateLanguageScore(c.lang); got != c.want {
			t.Errorf("Language '%s': expected %f, got %f", c.lang, c.want, got)
		}
	}
}

// TestCalculateFreshnessScore checks calculateFreshnessScore for a recent
// date (at least 0.9), an older date (at most 0.8), and no date indicators
// at all (exactly 0.5).
//
// NOTE(review): the dates are hard-coded. If the scorer measures age against
// the wall clock, the "recent" case will drift over time; confirm against
// the implementation.
func TestCalculateFreshnessScore(t *testing.T) {
	s := NewScorer()

	// Recent date.
	if recent := s.calculateFreshnessScore([]string{"2024-06-15"}); recent < 0.9 {
		t.Errorf("Expected high score for recent date, got %f", recent)
	}

	// Older date.
	if older := s.calculateFreshnessScore([]string{"2016-01-01"}); older > 0.8 {
		t.Errorf("Expected moderate score for 2016, got %f", older)
	}

	// No date indicators: expect the neutral score.
	if undated := s.calculateFreshnessScore(nil); undated != 0.5 {
		t.Errorf("Expected neutral score for no dates, got %f", undated)
	}
}

// TestCalculatePDFScore checks that calculatePDFScore(10, 5000) scores at
// least 0.8 and that calculatePDFScore(1, 50) scores at most 0.6.
func TestCalculatePDFScore(t *testing.T) {
	s := NewScorer()

	// Multi-page PDF with plenty of content.
	if rich := s.calculatePDFScore(10, 5000); rich < 0.8 {
		t.Errorf("Expected high score for good PDF, got %f", rich)
	}

	// Single page with very little content.
	if thin := s.calculatePDFScore(1, 50); thin > 0.6 {
		t.Errorf("Expected lower score for poor PDF, got %f", thin)
	}
}

// TestExtractDateIndicators runs ExtractDateIndicators over a sentence that
// contains a dotted date, an ISO date and a bare year, and expects at least
// one of the known date strings to be among the results.
func TestExtractDateIndicators(t *testing.T) {
	const text = "Lehrplan gültig ab 01.08.2023 - Stand: 2024-01-15. Aktualisiert 2024."

	indicators := ExtractDateIndicators(text)
	if len(indicators) == 0 {
		t.Error("Expected to find date indicators")
	}

	// Any one of the dates or years present in the text counts as a hit.
	matched := false
	for _, ind := range indicators {
		switch ind {
		case "2024", "2023", "2024-01-15", "01.08.2023":
			matched = true
		}
	}

	if !matched {
		t.Errorf("Expected to find 2024 or 2023, got: %v", indicators)
	}
}

// TestExtractDateIndicators_Empty expects ExtractDateIndicators to return
// nothing for text that contains no dates.
func TestExtractDateIndicators_Empty(t *testing.T) {
	got := ExtractDateIndicators("This text has no dates whatsoever.")
	if len(got) != 0 {
		t.Errorf("Expected no indicators, got: %v", got)
	}
}

func TestCalculate_PDFDocument(t *testing.T) {
	scorer := NewScorer()

	features := ContentFeatures{
		ContentLength:   3000,
		HeadingCount:    3,
		HeadingDepth:    2,
		Language:        "de",
		IsPDF:           true,
		PageCount:       8,
		DateIndicators:  []string{"2023"},
	}

	score := scorer.Calculate(features)

	// PDF with 8 pages and good content should score well
	if score.PDFSpecific < 0.8 {
		t.Errorf("Expected good PDF-specific score, got %f", score.PDFSpecific)
	}

	if score.Total < 0.5 {
		t.Errorf("Expected reasonable score for PDF, got %f", score.Total)
	}
}

func TestCalculate_ScoreClamping(t *testing.T) {
	scorer := NewScorer()

	// Even with all perfect scores, total should not exceed 1.0
	features := ContentFeatures{
		ContentLength:   5000,
		HeadingCount:    10,
		HeadingDepth:    4,
		HasTOC:          true,
		LinkDensity:     0,
		AdDensity:       0,
		TextToHTMLRatio: 0.4,
		HasTitle:        true,
		HasDescription:  true,
		HasCanonical:    true,
		Language:        "de",
		DateIndicators:  []string{"2024"},
	}

	score := scorer.Calculate(features)

	if score.Total > 1.0 {
		t.Errorf("Score should be clamped to 1.0, got %f", score.Total)
	}
	if score.Total < 0 {
		t.Errorf("Score should not be negative, got %f", score.Total)
	}
}

// TestDefaultWeights checks that the eight weights returned by
// DefaultWeights add up to 1.0, within a tolerance of 0.01.
func TestDefaultWeights(t *testing.T) {
	w := DefaultWeights()

	// Accumulate in declaration order so the floating-point result matches
	// a single left-to-right sum.
	sum := w.ContentLength
	sum += w.HeadingStructure
	sum += w.LinkQuality
	sum += w.TextToHTMLRatio
	sum += w.MetadataPresence
	sum += w.LanguageClarity
	sum += w.ContentFreshness
	sum += w.PDFSpecific

	if sum < 0.99 || sum > 1.01 {
		t.Errorf("Default weights should sum to 1.0, got %f", sum)
	}
}