fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,326 @@
package quality
import (
"regexp"
"strings"
)
// Scorer calculates quality scores for documents from a set of extracted
// content features. Construct with NewScorer (default weights) or
// NewScorerWithWeights (custom weights); the zero value scores everything 0.
type Scorer struct {
	weights Weights // per-factor contribution weights; defaults sum to 1.0
}
// Weights defines the contribution of each factor to the quality score.
// The default set (see DefaultWeights) sums to 1.0, so the weighted total
// produced by Scorer.Calculate naturally stays within [0, 1].
type Weights struct {
	ContentLength    float64 // 0.20 - longer content often more valuable
	HeadingStructure float64 // 0.15 - well-structured documents
	LinkQuality      float64 // 0.15 - low ad/external link density
	TextToHTMLRatio  float64 // 0.15 - content-rich pages
	MetadataPresence float64 // 0.10 - proper title, description
	LanguageClarity  float64 // 0.10 - German content, no mixed languages
	ContentFreshness float64 // 0.10 - indication of update/recency
	PDFSpecific      float64 // 0.05 - PDF-specific quality signals
}
// DefaultWeights returns the default quality score weights.
// The individual weights add up to 1.0 so that a document scoring the
// maximum on every factor reaches a total of exactly 1.0.
func DefaultWeights() Weights {
	var w Weights
	w.ContentLength = 0.20
	w.HeadingStructure = 0.15
	w.LinkQuality = 0.15
	w.TextToHTMLRatio = 0.15
	w.MetadataPresence = 0.10
	w.LanguageClarity = 0.10
	w.ContentFreshness = 0.10
	w.PDFSpecific = 0.05
	return w
}
// ContentFeatures holds extracted features for quality scoring. Instances
// are produced by an upstream extraction step (not visible in this file)
// and consumed by Scorer.Calculate.
type ContentFeatures struct {
	ContentLength   int      // extracted text length in characters
	HeadingCount    int      // number of headings found in the document
	HeadingDepth    int      // max heading level depth (h1-h6)
	LinkDensity     float64  // presumably link-to-content ratio — produced upstream; verify against extractor
	AdDensity       float64  // presumably ad-markup density; any value > 0 lowers link quality
	TextToHTMLRatio float64  // visible text vs. total HTML size; ~0.2-0.6 scores best
	HasTitle        bool     // document declares a title
	HasDescription  bool     // document declares a description
	HasCanonical    bool     // document declares a canonical reference
	Language        string   // detected language code or name (e.g. "de", "english"); empty if unknown
	IsPDF           bool     // document is a PDF; enables PDF-specific scoring
	PageCount       int      // for PDFs
	HasTOC          bool     // table of contents
	DateIndicators  []string // found date patterns
}
// Score represents the quality score breakdown. Each factor field is a
// sub-score in [0, 1]; Total is the weighted combination of the factors
// (weights from Weights), clamped to [0, 1] by Scorer.Calculate.
type Score struct {
	Total            float64 `json:"total"`
	ContentLength    float64 `json:"content_length"`
	HeadingStructure float64 `json:"heading_structure"`
	LinkQuality      float64 `json:"link_quality"`
	TextToHTMLRatio  float64 `json:"text_html_ratio"`
	MetadataPresence float64 `json:"metadata_presence"`
	LanguageClarity  float64 `json:"language_clarity"`
	ContentFreshness float64 `json:"content_freshness"`
	PDFSpecific      float64 `json:"pdf_specific"`
}
// NewScorer creates a quality scorer with default weights.
func NewScorer() *Scorer {
	s := Scorer{weights: DefaultWeights()}
	return &s
}
// NewScorerWithWeights creates a scorer with custom weights. Callers are
// responsible for choosing weights that sum to ~1.0 if they want the
// total to use the full [0, 1] range.
func NewScorerWithWeights(w Weights) *Scorer {
	scorer := new(Scorer)
	scorer.weights = w
	return scorer
}
// Calculate computes the quality score for the given features. Every
// factor sub-score lies in [0, 1]; the weighted total is clamped to [0, 1].
// Non-PDF documents get a full PDFSpecific sub-score so they are never
// penalized for not being PDFs.
func (s *Scorer) Calculate(features ContentFeatures) Score {
	result := Score{
		ContentLength:    s.calculateContentLengthScore(features.ContentLength),
		HeadingStructure: s.calculateHeadingScore(features.HeadingCount, features.HeadingDepth, features.HasTOC),
		LinkQuality:      s.calculateLinkQualityScore(features.LinkDensity, features.AdDensity),
		TextToHTMLRatio:  s.calculateTextRatioScore(features.TextToHTMLRatio),
		MetadataPresence: s.calculateMetadataScore(features.HasTitle, features.HasDescription, features.HasCanonical),
		LanguageClarity:  s.calculateLanguageScore(features.Language),
		ContentFreshness: s.calculateFreshnessScore(features.DateIndicators),
		PDFSpecific:      1.0, // default: no penalty for non-PDFs
	}
	if features.IsPDF {
		result.PDFSpecific = s.calculatePDFScore(features.PageCount, features.ContentLength)
	}

	// Weighted total, accumulated in the same factor order as the struct.
	total := result.ContentLength * s.weights.ContentLength
	total += result.HeadingStructure * s.weights.HeadingStructure
	total += result.LinkQuality * s.weights.LinkQuality
	total += result.TextToHTMLRatio * s.weights.TextToHTMLRatio
	total += result.MetadataPresence * s.weights.MetadataPresence
	total += result.LanguageClarity * s.weights.LanguageClarity
	total += result.ContentFreshness * s.weights.ContentFreshness
	total += result.PDFSpecific * s.weights.PDFSpecific

	// Clamp to the valid [0, 1] range (guards against custom weights
	// that sum to more than 1.0).
	switch {
	case total > 1.0:
		total = 1.0
	case total < 0:
		total = 0
	}
	result.Total = total
	return result
}
// calculateContentLengthScore scores based on content length in characters.
// The sweet spot is 1000-10000: shorter content carries less signal, while
// very long documents (>20000) often contain boilerplate or noise.
func (s *Scorer) calculateContentLengthScore(length int) float64 {
	// Upper bounds (exclusive) paired with the score for that band.
	bands := []struct {
		upTo  int
		score float64
	}{
		{200, 0.1},
		{500, 0.3},
		{1000, 0.6},
		{3000, 0.8},
		{10000, 1.0},
		{20000, 0.9},
	}
	for _, band := range bands {
		if length < band.upTo {
			return band.score
		}
	}
	return 0.7 // very long documents might have quality issues
}
// calculateHeadingScore scores heading structure: presence of headings,
// a reasonable number of them, hierarchy depth, and a table of contents
// each contribute a fixed share. Result is clamped to [0, 1].
func (s *Scorer) calculateHeadingScore(count, depth int, hasTOC bool) float64 {
	var total float64
	if count > 0 {
		total += 0.4 // any headings at all
		if count >= 3 {
			total += 0.2 // several headings suggest real structure
		}
	}
	if depth >= 2 {
		total += 0.2 // nested hierarchy (e.g. h1 + h2)
	}
	if hasTOC {
		total += 0.2 // a TOC indicates a well-structured document
	}
	if total > 1.0 {
		return 1.0
	}
	return total
}
// calculateLinkQualityScore scores based on link and ad density. It starts
// from a perfect 1.0 and subtracts penalties: high link density is mildly
// penalized, while any ad density at all costs points. Floored at 0.
func (s *Scorer) calculateLinkQualityScore(linkDensity, adDensity float64) float64 {
	score := 1.0
	// Link-heavy pages tend to be navigation/index pages, not content.
	switch {
	case linkDensity > 0.3:
		score -= 0.3
	case linkDensity > 0.2:
		score -= 0.1
	}
	// Ads are always a negative signal, scaled by how prevalent they are.
	switch {
	case adDensity > 0.1:
		score -= 0.4
	case adDensity > 0.05:
		score -= 0.2
	case adDensity > 0:
		score -= 0.1
	}
	if score < 0 {
		return 0
	}
	return score
}
// calculateTextRatioScore scores the text-to-HTML ratio. Ratios between
// 0.2 and 0.6 score best; very low ratios indicate markup-heavy pages,
// and very high ratios suggest a plain text dump rather than a document.
func (s *Scorer) calculateTextRatioScore(ratio float64) float64 {
	if ratio < 0.1 {
		return 0.3 // almost entirely markup
	}
	if ratio < 0.2 {
		return 0.6
	}
	if ratio < 0.6 {
		return 1.0 // content-rich sweet spot
	}
	if ratio < 0.8 {
		return 0.8
	}
	return 0.6 // likely an unstructured text dump
}
// calculateMetadataScore scores the presence of metadata. Title counts
// most (0.5), description next (0.3), canonical least (0.2); the three
// together reach exactly 1.0.
func (s *Scorer) calculateMetadataScore(hasTitle, hasDescription, hasCanonical bool) float64 {
	parts := []struct {
		present bool
		value   float64
	}{
		{hasTitle, 0.5},
		{hasDescription, 0.3},
		{hasCanonical, 0.2},
	}
	var total float64
	for _, p := range parts {
		if p.present {
			total += p.value
		}
	}
	return total
}
// calculateLanguageScore scores language clarity. German content scores
// highest, English is acceptable, an empty/unknown detection is neutral,
// and any other language scores low. Matching is case-insensitive.
func (s *Scorer) calculateLanguageScore(language string) float64 {
	switch lang := strings.ToLower(language); {
	case lang == "de" || lang == "german" || lang == "deutsch":
		return 1.0
	case lang == "en" || lang == "english" || lang == "englisch":
		return 0.8 // English is acceptable
	case lang == "":
		return 0.5 // language could not be detected
	default:
		return 0.3 // other languages
	}
}
// Freshness patterns are compiled once at package scope; the previous
// implementation recompiled them on every call, which is wasteful on a
// per-document hot path.
var (
	// recentYearRE matches years 2020-2029. The old pattern (202[0-5])
	// stopped recognizing content from 2026 onward as recent, silently
	// demoting current documents to the "older content" score.
	recentYearRE = regexp.MustCompile(`202[0-9]`)
	// modernYearRE matches years 2015-2019 ("modern" but not recent).
	modernYearRE = regexp.MustCompile(`201[5-9]`)
)

// calculateFreshnessScore scores content freshness from the date strings
// previously extracted from the document (see ExtractDateIndicators).
// It returns 1.0 if any indicator contains a recent year (2020-2029),
// 0.7 for 2015-2019, 0.4 when only older/unrecognized dates are present,
// and a neutral 0.5 when no date indicators were found at all.
func (s *Scorer) calculateFreshnessScore(dateIndicators []string) float64 {
	if len(dateIndicators) == 0 {
		return 0.5 // no signal either way
	}
	for _, indicator := range dateIndicators {
		if recentYearRE.MatchString(indicator) {
			return 1.0
		}
	}
	for _, indicator := range dateIndicators {
		if modernYearRE.MatchString(indicator) {
			return 0.7
		}
	}
	return 0.4 // only old (or unparsed) dates found
}
// calculatePDFScore scores PDF-specific quality signals. Every PDF starts
// from a 0.5 baseline; multi-page documents and successful text extraction
// (content longer than 100 chars) add bonuses. Result is clamped to 1.0.
func (s *Scorer) calculatePDFScore(pageCount, contentLength int) float64 {
	total := 0.5 // baseline for any PDF
	if pageCount > 1 {
		total += 0.2
	}
	if pageCount > 5 {
		total += 0.1
	}
	if contentLength > 100 {
		total += 0.2 // text extraction evidently succeeded
	}
	if total > 1.0 {
		return 1.0
	}
	return total
}
// dateIndicatorPatterns are compiled once at package scope; the previous
// implementation recompiled all three regexes on every call, which is
// needlessly expensive for a function run per document.
var dateIndicatorPatterns = []*regexp.Regexp{
	regexp.MustCompile(`\d{2}\.\d{2}\.\d{4}`), // DD.MM.YYYY (German style)
	regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),   // YYYY-MM-DD (ISO 8601)
	regexp.MustCompile(`\b20[012][0-9]\b`),    // bare years 2000-2029
}

// ExtractDateIndicators finds date patterns in text. At most 5 matches per
// pattern are collected to bound the work on large documents. Overlapping
// patterns may yield related entries (e.g. both "01.08.2023" and "2023");
// callers such as calculateFreshnessScore only need any match at all.
func ExtractDateIndicators(text string) []string {
	var indicators []string
	for _, pattern := range dateIndicatorPatterns {
		indicators = append(indicators, pattern.FindAllString(text, 5)...)
	}
	return indicators
}

View File

@@ -0,0 +1,333 @@
package quality
import (
"testing"
)
// TestNewScorer verifies that the default constructor returns a scorer.
func TestNewScorer(t *testing.T) {
	if got := NewScorer(); got == nil {
		t.Fatal("Expected non-nil scorer")
	}
}
// TestNewScorerWithWeights verifies that custom weights are stored as given.
func TestNewScorerWithWeights(t *testing.T) {
	custom := Weights{ContentLength: 0.5, HeadingStructure: 0.5}
	s := NewScorerWithWeights(custom)
	if got := s.weights.ContentLength; got != 0.5 {
		t.Errorf("Expected weight 0.5, got %f", got)
	}
}
func TestCalculate_HighQualityDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 5000,
HeadingCount: 5,
HeadingDepth: 3,
LinkDensity: 0.1,
AdDensity: 0,
TextToHTMLRatio: 0.4,
HasTitle: true,
HasDescription: true,
HasCanonical: true,
Language: "de",
DateIndicators: []string{"2024-01-15"},
}
score := scorer.Calculate(features)
if score.Total < 0.8 {
t.Errorf("Expected high quality score (>0.8), got %f", score.Total)
}
}
func TestCalculate_LowQualityDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 100,
HeadingCount: 0,
LinkDensity: 0.5,
AdDensity: 0.2,
TextToHTMLRatio: 0.05,
HasTitle: false,
HasDescription: false,
Language: "",
}
score := scorer.Calculate(features)
if score.Total > 0.5 {
t.Errorf("Expected low quality score (<0.5), got %f", score.Total)
}
}
// TestCalculateContentLengthScore checks each length band falls inside
// its expected score range.
func TestCalculateContentLengthScore(t *testing.T) {
	s := NewScorer()
	cases := []struct {
		length   int
		minScore float64
		maxScore float64
	}{
		{100, 0.0, 0.2},   // far too short
		{500, 0.5, 0.7},   // short-medium
		{2000, 0.7, 0.9},  // good
		{5000, 0.9, 1.0},  // optimal range
		{30000, 0.6, 0.8}, // suspiciously long
	}
	for _, tc := range cases {
		t.Run("", func(t *testing.T) {
			got := s.calculateContentLengthScore(tc.length)
			if got < tc.minScore || got > tc.maxScore {
				t.Errorf("Length %d: expected score in [%f, %f], got %f",
					tc.length, tc.minScore, tc.maxScore, got)
			}
		})
	}
}
// TestCalculateHeadingScore checks the extremes: no headings at all versus
// a deep hierarchy with a table of contents.
func TestCalculateHeadingScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateHeadingScore(0, 0, false); got > 0.1 {
		t.Errorf("Expected low score for no headings, got %f", got)
	}
	if got := s.calculateHeadingScore(5, 3, true); got < 0.9 {
		t.Errorf("Expected high score for good headings, got %f", got)
	}
}
// TestCalculateLinkQualityScore checks that a clean page scores high and
// an ad-heavy page is penalized.
func TestCalculateLinkQualityScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateLinkQualityScore(0.1, 0); got < 0.9 {
		t.Errorf("Expected high score for good link quality, got %f", got)
	}
	if got := s.calculateLinkQualityScore(0.1, 0.2); got > 0.6 {
		t.Errorf("Expected low score for high ad density, got %f", got)
	}
}
// TestCalculateTextRatioScore checks lower bounds for markup-heavy,
// optimal, and text-dump ratios.
func TestCalculateTextRatioScore(t *testing.T) {
	s := NewScorer()
	cases := []struct {
		ratio    float64
		minScore float64
	}{
		{0.05, 0.0}, // markup-heavy page
		{0.3, 0.9},  // optimal band
		{0.9, 0.5},  // likely plain-text dump
	}
	for _, tc := range cases {
		if got := s.calculateTextRatioScore(tc.ratio); got < tc.minScore {
			t.Errorf("Ratio %f: expected score >= %f, got %f", tc.ratio, tc.minScore, got)
		}
	}
}
// TestCalculateMetadataScore checks the exact scores for all, none, and
// title-only metadata combinations.
func TestCalculateMetadataScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateMetadataScore(true, true, true); got != 1.0 {
		t.Errorf("Expected 1.0 for all metadata, got %f", got)
	}
	if got := s.calculateMetadataScore(false, false, false); got != 0.0 {
		t.Errorf("Expected 0.0 for no metadata, got %f", got)
	}
	if got := s.calculateMetadataScore(true, false, false); got != 0.5 {
		t.Errorf("Expected 0.5 for only title, got %f", got)
	}
}
// TestCalculateLanguageScore checks the exact score for each supported
// language bucket, including the empty (unknown) case.
func TestCalculateLanguageScore(t *testing.T) {
	s := NewScorer()
	cases := map[string]float64{
		"de":     1.0,
		"german": 1.0,
		"en":     0.8,
		"":       0.5,
		"fr":     0.3,
	}
	for lang, want := range cases {
		if got := s.calculateLanguageScore(lang); got != want {
			t.Errorf("Language '%s': expected %f, got %f", lang, want, got)
		}
	}
}
// TestCalculateFreshnessScore checks recent, older, and missing date
// indicators.
func TestCalculateFreshnessScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculateFreshnessScore([]string{"2024-06-15"}); got < 0.9 {
		t.Errorf("Expected high score for recent date, got %f", got)
	}
	if got := s.calculateFreshnessScore([]string{"2016-01-01"}); got > 0.8 {
		t.Errorf("Expected moderate score for 2016, got %f", got)
	}
	if got := s.calculateFreshnessScore(nil); got != 0.5 {
		t.Errorf("Expected neutral score for no dates, got %f", got)
	}
}
// TestCalculatePDFScore checks a content-rich multi-page PDF against a
// sparse single-page one.
func TestCalculatePDFScore(t *testing.T) {
	s := NewScorer()
	if got := s.calculatePDFScore(10, 5000); got < 0.8 {
		t.Errorf("Expected high score for good PDF, got %f", got)
	}
	if got := s.calculatePDFScore(1, 50); got > 0.6 {
		t.Errorf("Expected lower score for poor PDF, got %f", got)
	}
}
// TestExtractDateIndicators checks that German-style dates, ISO dates,
// and bare years are all picked up from mixed text.
func TestExtractDateIndicators(t *testing.T) {
	text := "Lehrplan gültig ab 01.08.2023 - Stand: 2024-01-15. Aktualisiert 2024."
	indicators := ExtractDateIndicators(text)
	if len(indicators) == 0 {
		t.Error("Expected to find date indicators")
	}
	// Any of the expected date forms counts as a hit.
	expected := map[string]bool{
		"2024":       true,
		"2023":       true,
		"2024-01-15": true,
		"01.08.2023": true,
	}
	hit := false
	for _, ind := range indicators {
		if expected[ind] {
			hit = true
			break
		}
	}
	if !hit {
		t.Errorf("Expected to find 2024 or 2023, got: %v", indicators)
	}
}
// TestExtractDateIndicators_Empty checks that date-free text yields no
// indicators.
func TestExtractDateIndicators_Empty(t *testing.T) {
	if indicators := ExtractDateIndicators("This text has no dates whatsoever."); len(indicators) != 0 {
		t.Errorf("Expected no indicators, got: %v", indicators)
	}
}
func TestCalculate_PDFDocument(t *testing.T) {
scorer := NewScorer()
features := ContentFeatures{
ContentLength: 3000,
HeadingCount: 3,
HeadingDepth: 2,
Language: "de",
IsPDF: true,
PageCount: 8,
DateIndicators: []string{"2023"},
}
score := scorer.Calculate(features)
// PDF with 8 pages and good content should score well
if score.PDFSpecific < 0.8 {
t.Errorf("Expected good PDF-specific score, got %f", score.PDFSpecific)
}
if score.Total < 0.5 {
t.Errorf("Expected reasonable score for PDF, got %f", score.Total)
}
}
func TestCalculate_ScoreClamping(t *testing.T) {
scorer := NewScorer()
// Even with all perfect scores, total should not exceed 1.0
features := ContentFeatures{
ContentLength: 5000,
HeadingCount: 10,
HeadingDepth: 4,
HasTOC: true,
LinkDensity: 0,
AdDensity: 0,
TextToHTMLRatio: 0.4,
HasTitle: true,
HasDescription: true,
HasCanonical: true,
Language: "de",
DateIndicators: []string{"2024"},
}
score := scorer.Calculate(features)
if score.Total > 1.0 {
t.Errorf("Score should be clamped to 1.0, got %f", score.Total)
}
if score.Total < 0 {
t.Errorf("Score should not be negative, got %f", score.Total)
}
}
// TestDefaultWeights checks that the default weights sum to roughly 1.0,
// which keeps the weighted total in the [0, 1] range.
func TestDefaultWeights(t *testing.T) {
	w := DefaultWeights()
	sum := 0.0
	for _, part := range []float64{
		w.ContentLength,
		w.HeadingStructure,
		w.LinkQuality,
		w.TextToHTMLRatio,
		w.MetadataPresence,
		w.LanguageClarity,
		w.ContentFreshness,
		w.PDFSpecific,
	} {
		sum += part
	}
	if sum < 0.99 || sum > 1.01 {
		t.Errorf("Default weights should sum to 1.0, got %f", sum)
	}
}