fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,326 @@
package quality
import (
"regexp"
"strings"
)
// Scorer calculates quality scores for documents from a set of extracted
// content features. Construct with NewScorer (default weights) or
// NewScorerWithWeights (custom weights); the zero value scores everything 0.
type Scorer struct {
	weights Weights // per-factor contribution weights; defaults sum to 1.0
}
// Weights defines the contribution of each factor to the quality score.
// The default set (see DefaultWeights) sums to 1.0, so the weighted total
// produced by Scorer.Calculate naturally stays within [0, 1].
type Weights struct {
	ContentLength    float64 // 0.20 - longer content often more valuable
	HeadingStructure float64 // 0.15 - well-structured documents
	LinkQuality      float64 // 0.15 - low ad/external link density
	TextToHTMLRatio  float64 // 0.15 - content-rich pages
	MetadataPresence float64 // 0.10 - proper title, description
	LanguageClarity  float64 // 0.10 - German content, no mixed languages
	ContentFreshness float64 // 0.10 - indication of update/recency
	PDFSpecific      float64 // 0.05 - PDF-specific quality signals
}
// DefaultWeights returns the default quality score weights.
// The individual weights add up to 1.0 so that a document scoring the
// maximum on every factor reaches a total of exactly 1.0.
func DefaultWeights() Weights {
	var w Weights
	w.ContentLength = 0.20
	w.HeadingStructure = 0.15
	w.LinkQuality = 0.15
	w.TextToHTMLRatio = 0.15
	w.MetadataPresence = 0.10
	w.LanguageClarity = 0.10
	w.ContentFreshness = 0.10
	w.PDFSpecific = 0.05
	return w
}
// ContentFeatures holds extracted features for quality scoring. Instances
// are produced by an upstream extraction step (not visible in this file)
// and consumed by Scorer.Calculate.
type ContentFeatures struct {
	ContentLength   int      // extracted text length in characters
	HeadingCount    int      // number of headings found in the document
	HeadingDepth    int      // max heading level depth (h1-h6)
	LinkDensity     float64  // presumably link-to-content ratio — produced upstream; verify against extractor
	AdDensity       float64  // presumably ad-markup density; any value > 0 lowers link quality
	TextToHTMLRatio float64  // visible text vs. total HTML size; ~0.2-0.6 scores best
	HasTitle        bool     // document declares a title
	HasDescription  bool     // document declares a description
	HasCanonical    bool     // document declares a canonical reference
	Language        string   // detected language code or name (e.g. "de", "english"); empty if unknown
	IsPDF           bool     // document is a PDF; enables PDF-specific scoring
	PageCount       int      // for PDFs
	HasTOC          bool     // table of contents
	DateIndicators  []string // found date patterns
}
// Score represents the quality score breakdown. Each factor field is a
// sub-score in [0, 1]; Total is the weighted combination of the factors
// (weights from Weights), clamped to [0, 1] by Scorer.Calculate.
type Score struct {
	Total            float64 `json:"total"`
	ContentLength    float64 `json:"content_length"`
	HeadingStructure float64 `json:"heading_structure"`
	LinkQuality      float64 `json:"link_quality"`
	TextToHTMLRatio  float64 `json:"text_html_ratio"`
	MetadataPresence float64 `json:"metadata_presence"`
	LanguageClarity  float64 `json:"language_clarity"`
	ContentFreshness float64 `json:"content_freshness"`
	PDFSpecific      float64 `json:"pdf_specific"`
}
// NewScorer creates a quality scorer with default weights.
func NewScorer() *Scorer {
	s := Scorer{weights: DefaultWeights()}
	return &s
}
// NewScorerWithWeights creates a scorer with custom weights. Callers are
// responsible for choosing weights that sum to ~1.0 if they want the
// total to use the full [0, 1] range.
func NewScorerWithWeights(w Weights) *Scorer {
	scorer := new(Scorer)
	scorer.weights = w
	return scorer
}
// Calculate computes the quality score for the given features. Every
// factor sub-score lies in [0, 1]; the weighted total is clamped to [0, 1].
// Non-PDF documents get a full PDFSpecific sub-score so they are never
// penalized for not being PDFs.
func (s *Scorer) Calculate(features ContentFeatures) Score {
	result := Score{
		ContentLength:    s.calculateContentLengthScore(features.ContentLength),
		HeadingStructure: s.calculateHeadingScore(features.HeadingCount, features.HeadingDepth, features.HasTOC),
		LinkQuality:      s.calculateLinkQualityScore(features.LinkDensity, features.AdDensity),
		TextToHTMLRatio:  s.calculateTextRatioScore(features.TextToHTMLRatio),
		MetadataPresence: s.calculateMetadataScore(features.HasTitle, features.HasDescription, features.HasCanonical),
		LanguageClarity:  s.calculateLanguageScore(features.Language),
		ContentFreshness: s.calculateFreshnessScore(features.DateIndicators),
		PDFSpecific:      1.0, // default: no penalty for non-PDFs
	}
	if features.IsPDF {
		result.PDFSpecific = s.calculatePDFScore(features.PageCount, features.ContentLength)
	}

	// Weighted total, accumulated in the same factor order as the struct.
	total := result.ContentLength * s.weights.ContentLength
	total += result.HeadingStructure * s.weights.HeadingStructure
	total += result.LinkQuality * s.weights.LinkQuality
	total += result.TextToHTMLRatio * s.weights.TextToHTMLRatio
	total += result.MetadataPresence * s.weights.MetadataPresence
	total += result.LanguageClarity * s.weights.LanguageClarity
	total += result.ContentFreshness * s.weights.ContentFreshness
	total += result.PDFSpecific * s.weights.PDFSpecific

	// Clamp to the valid [0, 1] range (guards against custom weights
	// that sum to more than 1.0).
	switch {
	case total > 1.0:
		total = 1.0
	case total < 0:
		total = 0
	}
	result.Total = total
	return result
}
// calculateContentLengthScore scores based on content length in characters.
// The sweet spot is 1000-10000: shorter content carries less signal, while
// very long documents (>20000) often contain boilerplate or noise.
func (s *Scorer) calculateContentLengthScore(length int) float64 {
	// Upper bounds (exclusive) paired with the score for that band.
	bands := []struct {
		upTo  int
		score float64
	}{
		{200, 0.1},
		{500, 0.3},
		{1000, 0.6},
		{3000, 0.8},
		{10000, 1.0},
		{20000, 0.9},
	}
	for _, band := range bands {
		if length < band.upTo {
			return band.score
		}
	}
	return 0.7 // very long documents might have quality issues
}
// calculateHeadingScore scores heading structure: presence of headings,
// a reasonable number of them, hierarchy depth, and a table of contents
// each contribute a fixed share. Result is clamped to [0, 1].
func (s *Scorer) calculateHeadingScore(count, depth int, hasTOC bool) float64 {
	var total float64
	if count > 0 {
		total += 0.4 // any headings at all
		if count >= 3 {
			total += 0.2 // several headings suggest real structure
		}
	}
	if depth >= 2 {
		total += 0.2 // nested hierarchy (e.g. h1 + h2)
	}
	if hasTOC {
		total += 0.2 // a TOC indicates a well-structured document
	}
	if total > 1.0 {
		return 1.0
	}
	return total
}
// calculateLinkQualityScore scores based on link and ad density. It starts
// from a perfect 1.0 and subtracts penalties: high link density is mildly
// penalized, while any ad density at all costs points. Floored at 0.
func (s *Scorer) calculateLinkQualityScore(linkDensity, adDensity float64) float64 {
	score := 1.0
	// Link-heavy pages tend to be navigation/index pages, not content.
	switch {
	case linkDensity > 0.3:
		score -= 0.3
	case linkDensity > 0.2:
		score -= 0.1
	}
	// Ads are always a negative signal, scaled by how prevalent they are.
	switch {
	case adDensity > 0.1:
		score -= 0.4
	case adDensity > 0.05:
		score -= 0.2
	case adDensity > 0:
		score -= 0.1
	}
	if score < 0 {
		return 0
	}
	return score
}
// calculateTextRatioScore scores the text-to-HTML ratio. Ratios between
// 0.2 and 0.6 score best; very low ratios indicate markup-heavy pages,
// and very high ratios suggest a plain text dump rather than a document.
func (s *Scorer) calculateTextRatioScore(ratio float64) float64 {
	if ratio < 0.1 {
		return 0.3 // almost entirely markup
	}
	if ratio < 0.2 {
		return 0.6
	}
	if ratio < 0.6 {
		return 1.0 // content-rich sweet spot
	}
	if ratio < 0.8 {
		return 0.8
	}
	return 0.6 // likely an unstructured text dump
}
// calculateMetadataScore scores the presence of metadata. Title counts
// most (0.5), description next (0.3), canonical least (0.2); the three
// together reach exactly 1.0.
func (s *Scorer) calculateMetadataScore(hasTitle, hasDescription, hasCanonical bool) float64 {
	parts := []struct {
		present bool
		value   float64
	}{
		{hasTitle, 0.5},
		{hasDescription, 0.3},
		{hasCanonical, 0.2},
	}
	var total float64
	for _, p := range parts {
		if p.present {
			total += p.value
		}
	}
	return total
}
// calculateLanguageScore scores language clarity. German content scores
// highest, English is acceptable, an empty/unknown detection is neutral,
// and any other language scores low. Matching is case-insensitive.
func (s *Scorer) calculateLanguageScore(language string) float64 {
	switch lang := strings.ToLower(language); {
	case lang == "de" || lang == "german" || lang == "deutsch":
		return 1.0
	case lang == "en" || lang == "english" || lang == "englisch":
		return 0.8 // English is acceptable
	case lang == "":
		return 0.5 // language could not be detected
	default:
		return 0.3 // other languages
	}
}
// Freshness patterns are compiled once at package scope; the previous
// implementation recompiled them on every call, which is wasteful on a
// per-document hot path.
var (
	// recentYearRE matches years 2020-2029. The old pattern (202[0-5])
	// stopped recognizing content from 2026 onward as recent, silently
	// demoting current documents to the "older content" score.
	recentYearRE = regexp.MustCompile(`202[0-9]`)
	// modernYearRE matches years 2015-2019 ("modern" but not recent).
	modernYearRE = regexp.MustCompile(`201[5-9]`)
)

// calculateFreshnessScore scores content freshness from the date strings
// previously extracted from the document (see ExtractDateIndicators).
// It returns 1.0 if any indicator contains a recent year (2020-2029),
// 0.7 for 2015-2019, 0.4 when only older/unrecognized dates are present,
// and a neutral 0.5 when no date indicators were found at all.
func (s *Scorer) calculateFreshnessScore(dateIndicators []string) float64 {
	if len(dateIndicators) == 0 {
		return 0.5 // no signal either way
	}
	for _, indicator := range dateIndicators {
		if recentYearRE.MatchString(indicator) {
			return 1.0
		}
	}
	for _, indicator := range dateIndicators {
		if modernYearRE.MatchString(indicator) {
			return 0.7
		}
	}
	return 0.4 // only old (or unparsed) dates found
}
// calculatePDFScore scores PDF-specific quality signals. Every PDF starts
// from a 0.5 baseline; multi-page documents and successful text extraction
// (content longer than 100 chars) add bonuses. Result is clamped to 1.0.
func (s *Scorer) calculatePDFScore(pageCount, contentLength int) float64 {
	total := 0.5 // baseline for any PDF
	if pageCount > 1 {
		total += 0.2
	}
	if pageCount > 5 {
		total += 0.1
	}
	if contentLength > 100 {
		total += 0.2 // text extraction evidently succeeded
	}
	if total > 1.0 {
		return 1.0
	}
	return total
}
// dateIndicatorPatterns are compiled once at package scope; the previous
// implementation recompiled all three regexes on every call, which is
// needlessly expensive for a function run per document.
var dateIndicatorPatterns = []*regexp.Regexp{
	regexp.MustCompile(`\d{2}\.\d{2}\.\d{4}`), // DD.MM.YYYY (German style)
	regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),   // YYYY-MM-DD (ISO 8601)
	regexp.MustCompile(`\b20[012][0-9]\b`),    // bare years 2000-2029
}

// ExtractDateIndicators finds date patterns in text. At most 5 matches per
// pattern are collected to bound the work on large documents. Overlapping
// patterns may yield related entries (e.g. both "01.08.2023" and "2023");
// callers such as calculateFreshnessScore only need any match at all.
func ExtractDateIndicators(text string) []string {
	var indicators []string
	for _, pattern := range dateIndicatorPatterns {
		indicators = append(indicators, pattern.FindAllString(text, 5)...)
	}
	return indicators
}

View File

@@ -0,0 +1,333 @@
package quality
import (
"testing"
)
// TestNewScorer verifies that the default constructor returns a scorer.
func TestNewScorer(t *testing.T) {
	if got := NewScorer(); got == nil {
		t.Fatal("Expected non-nil scorer")
	}
}
// TestNewScorerWithWeights verifies that custom weights are stored as given.
func TestNewScorerWithWeights(t *testing.T) {
	custom := Weights{ContentLength: 0.5, HeadingStructure: 0.5}
	s := NewScorerWithWeights(custom)
	if got := s.weights.ContentLength; got != 0.5 {
		t.Errorf("Expected weight 0.5, got %f", got)
	}
}
func TestCalculate_HighQualityDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 5000,
HeadingCount: 5,
HeadingDepth: 3,
LinkDensity: 0.1,
AdDensity: 0,
TextToHTMLRatio: 0.4,
HasTitle: true,
HasDescription: true,
HasCanonical: true,
Language: "de",
DateIndicators: []string{"2024-01-15"},
}
score := scorer.Calculate(features)
if score.Total < 0.8 {
t.Errorf("Expected high quality score (>0.8), got %f", score.Total)
}
}
func TestCalculate_LowQualityDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 100,
HeadingCount: 0,
LinkDensity: 0.5,
AdDensity: 0.2,
TextToHTMLRatio: 0.05,
HasTitle: false,
HasDescription: false,
Language: "",
}
score := scorer.Calculate(features)
if score.Total > 0.5 {
t.Errorf("Expected low quality score (<0.5), got %f", score.Total)
}
}
// TestCalculateContentLengthScore checks each length band falls inside
// its expected score range.
func TestCalculateContentLengthScore(t *testing.T) {
	s := NewScorer()
	cases := []struct {
		length   int
		minScore float64
		maxScore float64
	}{
		{100, 0.0, 0.2},   // far too short
		{500, 0.5, 0.7},   // short-medium
		{2000, 0.7, 0.9},  // good
		{5000, 0.9, 1.0},  // optimal range
		{30000, 0.6, 0.8}, // suspiciously long
	}
	for _, tc := range cases {
		t.Run("", func(t *testing.T) {
			got := s.calculateContentLengthScore(tc.length)
			if got < tc.minScore || got > tc.maxScore {
				t.Errorf("Length %d: expected score in [%f, %f], got %f",
					tc.length, tc.minScore, tc.maxScore, got)
			}
		})
	}
}
// TestCalculateHeadingScore checks the extremes: no headings at all versus
// a deep hierarchy with a table of contents.
func TestCalculateHeadingScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateHeadingScore(0, 0, false); got > 0.1 {
		t.Errorf("Expected low score for no headings, got %f", got)
	}
	if got := s.calculateHeadingScore(5, 3, true); got < 0.9 {
		t.Errorf("Expected high score for good headings, got %f", got)
	}
}
// TestCalculateLinkQualityScore checks that a clean page scores high and
// an ad-heavy page is penalized.
func TestCalculateLinkQualityScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateLinkQualityScore(0.1, 0); got < 0.9 {
		t.Errorf("Expected high score for good link quality, got %f", got)
	}
	if got := s.calculateLinkQualityScore(0.1, 0.2); got > 0.6 {
		t.Errorf("Expected low score for high ad density, got %f", got)
	}
}
// TestCalculateTextRatioScore checks lower bounds for markup-heavy,
// optimal, and text-dump ratios.
func TestCalculateTextRatioScore(t *testing.T) {
	s := NewScorer()
	cases := []struct {
		ratio    float64
		minScore float64
	}{
		{0.05, 0.0}, // markup-heavy page
		{0.3, 0.9},  // optimal band
		{0.9, 0.5},  // likely plain-text dump
	}
	for _, tc := range cases {
		if got := s.calculateTextRatioScore(tc.ratio); got < tc.minScore {
			t.Errorf("Ratio %f: expected score >= %f, got %f", tc.ratio, tc.minScore, got)
		}
	}
}
// TestCalculateMetadataScore checks the exact scores for all, none, and
// title-only metadata combinations.
func TestCalculateMetadataScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateMetadataScore(true, true, true); got != 1.0 {
		t.Errorf("Expected 1.0 for all metadata, got %f", got)
	}
	if got := s.calculateMetadataScore(false, false, false); got != 0.0 {
		t.Errorf("Expected 0.0 for no metadata, got %f", got)
	}
	if got := s.calculateMetadataScore(true, false, false); got != 0.5 {
		t.Errorf("Expected 0.5 for only title, got %f", got)
	}
}
// TestCalculateLanguageScore checks the exact score for each supported
// language bucket, including the empty (unknown) case.
func TestCalculateLanguageScore(t *testing.T) {
	s := NewScorer()
	cases := map[string]float64{
		"de":     1.0,
		"german": 1.0,
		"en":     0.8,
		"":       0.5,
		"fr":     0.3,
	}
	for lang, want := range cases {
		if got := s.calculateLanguageScore(lang); got != want {
			t.Errorf("Language '%s': expected %f, got %f", lang, want, got)
		}
	}
}
// TestCalculateFreshnessScore checks recent, older, and missing date
// indicators.
func TestCalculateFreshnessScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateFreshnessScore([]string{"2024-06-15"}); got < 0.9 {
		t.Errorf("Expected high score for recent date, got %f", got)
	}
	if got := s.calculateFreshnessScore([]string{"2016-01-01"}); got > 0.8 {
		t.Errorf("Expected moderate score for 2016, got %f", got)
	}
	if got := s.calculateFreshnessScore(nil); got != 0.5 {
		t.Errorf("Expected neutral score for no dates, got %f", got)
	}
}
// TestCalculatePDFScore checks a content-rich multi-page PDF against a
// sparse single-page one.
func TestCalculatePDFScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculatePDFScore(10, 5000); got < 0.8 {
		t.Errorf("Expected high score for good PDF, got %f", got)
	}
	if got := s.calculatePDFScore(1, 50); got > 0.6 {
		t.Errorf("Expected lower score for poor PDF, got %f", got)
	}
}
// TestExtractDateIndicators checks that German-style dates, ISO dates,
// and bare years are all picked up from mixed text.
func TestExtractDateIndicators(t *testing.T) {
	text := "Lehrplan gültig ab 01.08.2023 - Stand: 2024-01-15. Aktualisiert 2024."
	indicators := ExtractDateIndicators(text)
	if len(indicators) == 0 {
		t.Error("Expected to find date indicators")
	}
	// Any of the expected date forms counts as a hit.
	expected := map[string]bool{
		"2024":       true,
		"2023":       true,
		"2024-01-15": true,
		"01.08.2023": true,
	}
	hit := false
	for _, ind := range indicators {
		if expected[ind] {
			hit = true
			break
		}
	}
	if !hit {
		t.Errorf("Expected to find 2024 or 2023, got: %v", indicators)
	}
}
// TestExtractDateIndicators_Empty checks that date-free text yields no
// indicators.
func TestExtractDateIndicators_Empty(t *testing.T) {
	if indicators := ExtractDateIndicators("This text has no dates whatsoever."); len(indicators) != 0 {
		t.Errorf("Expected no indicators, got: %v", indicators)
	}
}
func TestCalculate_PDFDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 3000,
HeadingCount: 3,
HeadingDepth: 2,
Language: "de",
IsPDF: true,
PageCount: 8,
DateIndicators: []string{"2023"},
}
score := scorer.Calculate(features)
// PDF with 8 pages and good content should score well
if score.PDFSpecific < 0.8 {
t.Errorf("Expected good PDF-specific score, got %f", score.PDFSpecific)
}
if score.Total < 0.5 {
t.Errorf("Expected reasonable score for PDF, got %f", score.Total)
}
}
func TestCalculate_ScoreClamping(t *testing.T) {
scorer := NewScorer()
// Even with all perfect scores, total should not exceed 1.0
features := ContentFeatures{
ContentLength: 5000,
HeadingCount: 10,
HeadingDepth: 4,
HasTOC: true,
LinkDensity: 0,
AdDensity: 0,
TextToHTMLRatio: 0.4,
HasTitle: true,
HasDescription: true,
HasCanonical: true,
Language: "de",
DateIndicators: []string{"2024"},
}
score := scorer.Calculate(features)
if score.Total > 1.0 {
t.Errorf("Score should be clamped to 1.0, got %f", score.Total)
}
if score.Total < 0 {
t.Errorf("Score should not be negative, got %f", score.Total)
}
}
// TestDefaultWeights checks that the default weights sum to roughly 1.0,
// which keeps the weighted total in the [0, 1] range.
func TestDefaultWeights(t *testing.T) {
	w := DefaultWeights()
	sum := 0.0
	for _, part := range []float64{
		w.ContentLength,
		w.HeadingStructure,
		w.LinkQuality,
		w.TextToHTMLRatio,
		w.MetadataPresence,
		w.LanguageClarity,
		w.ContentFreshness,
		w.PDFSpecific,
	} {
		sum += part
	}
	if sum < 0.99 || sum > 1.01 {
		t.Errorf("Default weights should sum to 1.0, got %f", sum)
	}
}