// Package quality calculates heuristic quality scores for documents
// (HTML pages and PDFs) from features extracted during crawling.
package quality

import (
	"regexp"
	"strings"
)

// Regular expressions are compiled once at package init rather than on
// every scoring call: regexp compilation is comparatively expensive and
// all of these patterns are constant.
var (
	// recentYearRe matches years 2020-2025, treated as "fresh" content.
	recentYearRe = regexp.MustCompile(`202[0-5]`)
	// modernYearRe matches years 2015-2019, treated as "modern" content.
	modernYearRe = regexp.MustCompile(`201[5-9]`)
	// dateRes are the patterns searched by ExtractDateIndicators:
	// DD.MM.YYYY, YYYY-MM-DD, and bare years 2000-2029.
	dateRes = []*regexp.Regexp{
		regexp.MustCompile(`\d{2}\.\d{2}\.\d{4}`),
		regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),
		regexp.MustCompile(`\b20[012][0-9]\b`), // years 2000-2029
	}
)

// Scorer calculates quality scores for documents.
type Scorer struct {
	weights Weights
}

// Weights defines the contribution of each factor to the quality score.
// With DefaultWeights the factors sum to 1.0, so a document that scores
// 1.0 on every factor has a total of 1.0.
type Weights struct {
	ContentLength    float64 // 0.20 - longer content often more valuable
	HeadingStructure float64 // 0.15 - well-structured documents
	LinkQuality      float64 // 0.15 - low ad/external link density
	TextToHTMLRatio  float64 // 0.15 - content-rich pages
	MetadataPresence float64 // 0.10 - proper title, description
	LanguageClarity  float64 // 0.10 - German content, no mixed languages
	ContentFreshness float64 // 0.10 - indication of update/recency
	PDFSpecific      float64 // 0.05 - PDF-specific quality signals
}

// DefaultWeights returns the default quality score weights.
func DefaultWeights() Weights {
	return Weights{
		ContentLength:    0.20,
		HeadingStructure: 0.15,
		LinkQuality:      0.15,
		TextToHTMLRatio:  0.15,
		MetadataPresence: 0.10,
		LanguageClarity:  0.10,
		ContentFreshness: 0.10,
		PDFSpecific:      0.05,
	}
}

// ContentFeatures holds extracted features for quality scoring.
type ContentFeatures struct {
	ContentLength   int
	HeadingCount    int
	HeadingDepth    int // max heading level depth (h1-h6)
	LinkDensity     float64
	AdDensity       float64
	TextToHTMLRatio float64
	HasTitle        bool
	HasDescription  bool
	HasCanonical    bool
	Language        string
	IsPDF           bool
	PageCount       int      // for PDFs
	HasTOC          bool     // table of contents
	DateIndicators  []string // found date patterns
}

// Score represents the quality score breakdown. Each factor is in the
// range [0, 1]; Total is the weighted combination, clamped to [0, 1].
type Score struct {
	Total            float64 `json:"total"`
	ContentLength    float64 `json:"content_length"`
	HeadingStructure float64 `json:"heading_structure"`
	LinkQuality      float64 `json:"link_quality"`
	TextToHTMLRatio  float64 `json:"text_html_ratio"`
	MetadataPresence float64 `json:"metadata_presence"`
	LanguageClarity  float64 `json:"language_clarity"`
	ContentFreshness float64 `json:"content_freshness"`
	PDFSpecific      float64 `json:"pdf_specific"`
}

// NewScorer creates a quality scorer with default weights.
func NewScorer() *Scorer {
	return &Scorer{weights: DefaultWeights()}
}

// NewScorerWithWeights creates a scorer with custom weights.
func NewScorerWithWeights(w Weights) *Scorer {
	return &Scorer{weights: w}
}

// Calculate computes the quality score for given features. Each factor
// is scored independently in [0, 1], then combined using the scorer's
// weights; the weighted total is clamped to [0, 1].
func (s *Scorer) Calculate(features ContentFeatures) Score {
	score := Score{
		ContentLength:    s.calculateContentLengthScore(features.ContentLength),
		HeadingStructure: s.calculateHeadingScore(features.HeadingCount, features.HeadingDepth, features.HasTOC),
		LinkQuality:      s.calculateLinkQualityScore(features.LinkDensity, features.AdDensity),
		TextToHTMLRatio:  s.calculateTextRatioScore(features.TextToHTMLRatio),
		MetadataPresence: s.calculateMetadataScore(features.HasTitle, features.HasDescription, features.HasCanonical),
		LanguageClarity:  s.calculateLanguageScore(features.Language),
		ContentFreshness: s.calculateFreshnessScore(features.DateIndicators),
	}

	if features.IsPDF {
		score.PDFSpecific = s.calculatePDFScore(features.PageCount, features.ContentLength)
	} else {
		score.PDFSpecific = 1.0 // full score for non-PDFs (no penalty)
	}

	// Weighted total of all factors.
	score.Total = score.ContentLength*s.weights.ContentLength +
		score.HeadingStructure*s.weights.HeadingStructure +
		score.LinkQuality*s.weights.LinkQuality +
		score.TextToHTMLRatio*s.weights.TextToHTMLRatio +
		score.MetadataPresence*s.weights.MetadataPresence +
		score.LanguageClarity*s.weights.LanguageClarity +
		score.ContentFreshness*s.weights.ContentFreshness +
		score.PDFSpecific*s.weights.PDFSpecific

	// Clamp to [0, 1]; float rounding (or non-default weights) can push
	// the sum slightly outside the range.
	if score.Total > 1.0 {
		score.Total = 1.0
	}
	if score.Total < 0 {
		score.Total = 0
	}
	return score
}

// calculateContentLengthScore scores based on content length in
// characters. Optimal range: 1000-10000. Too short (<500) is low
// quality; very long (>20000) may be noise/boilerplate.
func (s *Scorer) calculateContentLengthScore(length int) float64 {
	switch {
	case length < 200:
		return 0.1
	case length < 500:
		return 0.3
	case length < 1000:
		return 0.6
	case length < 3000:
		return 0.8
	case length < 10000:
		return 1.0
	case length < 20000:
		return 0.9
	default:
		return 0.7 // very long documents might have quality issues
	}
}

// calculateHeadingScore scores heading structure: presence and number
// of headings, hierarchy depth, and a table-of-contents bonus.
func (s *Scorer) calculateHeadingScore(count, depth int, hasTOC bool) float64 {
	score := 0.0
	// Headings present at all.
	if count > 0 {
		score += 0.4
	}
	if count >= 3 {
		score += 0.2
	}
	// Depth variety (proper hierarchy).
	if depth >= 2 {
		score += 0.2
	}
	// Table of contents indicates a well-structured document.
	if hasTOC {
		score += 0.2
	}
	if score > 1.0 {
		score = 1.0
	}
	return score
}

// calculateLinkQualityScore scores based on link/ad density: starts at
// 1.0 and subtracts penalties for high link density and any ad density.
func (s *Scorer) calculateLinkQualityScore(linkDensity, adDensity float64) float64 {
	score := 1.0
	// High link density is bad.
	if linkDensity > 0.3 {
		score -= 0.3
	} else if linkDensity > 0.2 {
		score -= 0.1
	}
	// Any ad density is bad.
	if adDensity > 0.1 {
		score -= 0.4
	} else if adDensity > 0.05 {
		score -= 0.2
	} else if adDensity > 0 {
		score -= 0.1
	}
	if score < 0 {
		score = 0
	}
	return score
}

// calculateTextRatioScore scores the text-to-HTML ratio. Good ratio:
// 0.2-0.6. Too low means too much markup; too high might be a plain
// text dump.
func (s *Scorer) calculateTextRatioScore(ratio float64) float64 {
	switch {
	case ratio < 0.1:
		return 0.3
	case ratio < 0.2:
		return 0.6
	case ratio < 0.6:
		return 1.0
	case ratio < 0.8:
		return 0.8
	default:
		return 0.6
	}
}

// calculateMetadataScore scores presence of metadata (title,
// description, canonical URL), weighted by importance.
func (s *Scorer) calculateMetadataScore(hasTitle, hasDescription, hasCanonical bool) float64 {
	score := 0.0
	if hasTitle {
		score += 0.5
	}
	if hasDescription {
		score += 0.3
	}
	if hasCanonical {
		score += 0.2
	}
	return score
}

// calculateLanguageScore scores language clarity. German scores
// highest, English is acceptable, unknown is neutral.
func (s *Scorer) calculateLanguageScore(language string) float64 {
	switch strings.ToLower(language) {
	case "de", "german", "deutsch":
		return 1.0
	case "en", "english", "englisch":
		return 0.8 // English is acceptable
	case "":
		return 0.5 // unknown
	default:
		return 0.3 // other languages
	}
}

// calculateFreshnessScore scores content freshness based on year
// patterns found in the date indicators: 2020-2025 is fresh (1.0),
// 2015-2019 is modern (0.7), anything older scores 0.4. No indicators
// at all is neutral (0.5).
func (s *Scorer) calculateFreshnessScore(dateIndicators []string) float64 {
	if len(dateIndicators) == 0 {
		return 0.5 // neutral
	}
	// Check for recent years (2020+).
	for _, indicator := range dateIndicators {
		if recentYearRe.MatchString(indicator) {
			return 1.0
		}
	}
	// Check for 2015-2019.
	for _, indicator := range dateIndicators {
		if modernYearRe.MatchString(indicator) {
			return 0.7
		}
	}
	// Older content.
	return 0.4
}

// calculatePDFScore scores PDF-specific quality: page count and whether
// text extraction actually yielded content.
func (s *Scorer) calculatePDFScore(pageCount, contentLength int) float64 {
	score := 0.5 // base
	// Page count bonus.
	if pageCount > 1 {
		score += 0.2
	}
	if pageCount > 5 {
		score += 0.1
	}
	// Text extraction success.
	if contentLength > 100 {
		score += 0.2
	}
	if score > 1.0 {
		score = 1.0
	}
	return score
}

// ExtractDateIndicators finds date patterns (DD.MM.YYYY, YYYY-MM-DD,
// bare years 2000-2029) in text, capped at 5 matches per pattern. Note
// that a full date's year may also match the bare-year pattern, so the
// result can contain overlapping indicators.
func ExtractDateIndicators(text string) []string {
	var indicators []string
	for _, pattern := range dateRes {
		matches := pattern.FindAllString(text, 5) // limit matches
		indicators = append(indicators, matches...)
	}
	return indicators
}