fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit 21a844cb8a
1986 changed files with 744143 additions and 1731 deletions

View File

@@ -0,0 +1,464 @@
package extractor
import (
	"bytes"
	"io"
	"regexp"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
	"github.com/ledongthuc/pdf"
	"golang.org/x/net/html"
)
// ExtractedContent contains parsed content from an HTML or PDF document.
type ExtractedContent struct {
	Title         string            // document title (best-effort: <title>, <h1>, or og:title)
	ContentText   string            // cleaned full text of the document
	SnippetText   string            // short preview (~300 chars) taken from ContentText
	Language      string            // detected language code ("de" or "en")
	ContentLength int               // length of ContentText in bytes
	Headings      []string          // extracted headings, in document order
	Links         []string          // absolute http(s) links found in the document
	MetaData      map[string]string // lowercased meta keys -> content values
	Features      ContentFeatures   // signals used for quality scoring
}
// ContentFeatures carries simple heuristics used for quality scoring.
type ContentFeatures struct {
	AdDensity       float64 // ad-like elements divided by common container elements
	LinkDensity     float64 // total anchor-text length divided by content-text length
	TextToHTMLRatio float64 // extracted text bytes divided by raw HTML bytes
	HasMainContent  bool    // true when more than 200 bytes of text were extracted
}
// ExtractHTML parses an HTML document and returns its title, readable text,
// headings, outbound links, meta tags, and quality features.
func ExtractHTML(body []byte) (*ExtractedContent, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}
	// Title: prefer <title>, fall back to the first <h1>.
	content.Title = strings.TrimSpace(doc.Find("title").First().Text())
	if content.Title == "" {
		content.Title = strings.TrimSpace(doc.Find("h1").First().Text())
	}
	// Collect <meta> tags, keyed by name= (or property= when name= is absent),
	// with keys lowercased for uniform lookup.
	doc.Find("meta").Each(func(i int, s *goquery.Selection) {
		name, _ := s.Attr("name")
		property, _ := s.Attr("property")
		contentAttr, _ := s.Attr("content")
		key := name
		if key == "" {
			key = property
		}
		if key != "" && contentAttr != "" {
			content.MetaData[strings.ToLower(key)] = contentAttr
		}
	})
	// Last-resort title: the og:title meta tag.
	if content.Title == "" {
		if ogTitle, ok := content.MetaData["og:title"]; ok {
			content.Title = ogTitle
		}
	}
	// Headings h1-h3; entries of 500+ chars are skipped as layout accidents.
	doc.Find("h1, h2, h3").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" && len(text) < 500 {
			content.Headings = append(content.Headings, text)
		}
	})
	// Strip boilerplate and noise before extracting body text. Note that the
	// title/meta/heading extraction above deliberately ran before this removal
	// (e.g. <header> may contain the <h1>).
	doc.Find("script, style, nav, header, footer, aside, iframe, noscript, form, .advertisement, .ad, .ads, #cookie-banner, .cookie-notice, .social-share").Remove()
	// Prefer a semantic main-content container; otherwise fall back to <body>.
	mainContent := doc.Find("main, article, .content, .main-content, #content, #main").First()
	if mainContent.Length() == 0 {
		mainContent = doc.Find("body")
	}
	// Gather text from block-level text elements, paragraph-separated.
	var textBuilder strings.Builder
	mainContent.Find("p, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, pre").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" {
			textBuilder.WriteString(text)
			textBuilder.WriteString("\n\n")
		}
	})
	content.ContentText = cleanText(textBuilder.String())
	content.ContentLength = len(content.ContentText)
	// Generate snippet (first ~300 chars of meaningful content).
	content.SnippetText = generateSnippet(content.ContentText, 300)
	// Collect absolute http(s) links only; relative and mailto links are skipped.
	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists && strings.HasPrefix(href, "http") {
			content.Links = append(content.Links, href)
		}
	})
	// Detect language from the extracted text plus og:locale metadata.
	content.Language = detectLanguage(content.ContentText, content.MetaData)
	// Quality features used by downstream scoring.
	htmlLen := float64(len(body))
	textLen := float64(len(content.ContentText))
	if htmlLen > 0 {
		content.Features.TextToHTMLRatio = textLen / htmlLen
	}
	if textLen > 0 {
		linkTextLen := 0.0
		doc.Find("a").Each(func(i int, s *goquery.Selection) {
			linkTextLen += float64(len(s.Text()))
		})
		content.Features.LinkDensity = linkTextLen / textLen
	}
	content.Features.HasMainContent = content.ContentLength > 200
	// Ad density estimation: a very simple heuristic counting ad-ish elements
	// relative to common container elements.
	adCount := doc.Find(".ad, .ads, .advertisement, [class*='banner'], [id*='banner']").Length()
	totalElements := doc.Find("div, p, article, section").Length()
	if totalElements > 0 {
		content.Features.AdDensity = float64(adCount) / float64(totalElements)
	}
	return content, nil
}
// ExtractPDF extracts text from a PDF using the ledongthuc/pdf library.
// On any parse or read failure it degrades to a regex-based fallback
// (extractPDFFallback) instead of returning an error.
func ExtractPDF(body []byte) (*ExtractedContent, error) {
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}
	// Create a reader over the raw bytes for the PDF parser.
	reader := bytes.NewReader(body)
	pdfReader, err := pdf.NewReader(reader, int64(len(body)))
	if err != nil {
		// Not a parseable PDF: try the best-effort regex extractor.
		return extractPDFFallback(body)
	}
	// Extract the whole document's text in one pass.
	textReader, err := pdfReader.GetPlainText()
	if err != nil {
		return extractPDFFallback(body)
	}
	var textBuilder strings.Builder
	if _, err := io.Copy(&textBuilder, textReader); err != nil {
		return extractPDFFallback(body)
	}
	rawText := textBuilder.String()
	// Clean the text and derive the summary fields.
	content.ContentText = cleanText(rawText)
	content.ContentLength = len(content.ContentText)
	content.SnippetText = generateSnippet(content.ContentText, 300)
	content.Language = detectLanguage(content.ContentText, nil)
	content.Features.HasMainContent = content.ContentLength > 200
	// Title: first significant line of the extracted text.
	content.Title = extractPDFTitle(content.ContentText)
	// Headings: heuristic extraction from the plain text.
	content.Headings = extractPDFHeadings(content.ContentText)
	// PDF-specific metadata.
	content.MetaData["content_type"] = "application/pdf"
	// BUG FIX: string(rune(n)) produced the Unicode code point with value n
	// (e.g. page count 65 became "A"), not a decimal number.
	content.MetaData["page_count"] = strconv.Itoa(pdfReader.NumPage())
	return content, nil
}
// ExtractPDFWithMetadata extracts text with page-by-page processing.
// Use this variant when more control over the extraction process is needed;
// it falls back to regex-based extraction when the PDF cannot be parsed.
func ExtractPDFWithMetadata(body []byte) (*ExtractedContent, error) {
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}
	reader := bytes.NewReader(body)
	pdfReader, err := pdf.NewReader(reader, int64(len(body)))
	if err != nil {
		return extractPDFFallback(body)
	}
	// Walk the pages individually and concatenate their text runs.
	var textBuilder strings.Builder
	numPages := pdfReader.NumPage()
	for pageNum := 1; pageNum <= numPages; pageNum++ {
		page := pdfReader.Page(pageNum)
		if page.V.IsNull() {
			continue
		}
		pageContent := page.Content()
		for _, text := range pageContent.Text {
			textBuilder.WriteString(text.S)
			textBuilder.WriteString(" ")
		}
		textBuilder.WriteString("\n")
	}
	rawText := textBuilder.String()
	// Clean the text and derive the summary fields shared with ExtractPDF.
	content.ContentText = cleanText(rawText)
	content.ContentLength = len(content.ContentText)
	content.SnippetText = generateSnippet(content.ContentText, 300)
	content.Language = detectLanguage(content.ContentText, nil)
	content.Features.HasMainContent = content.ContentLength > 200
	content.Title = extractPDFTitle(content.ContentText)
	content.Headings = extractPDFHeadings(content.ContentText)
	content.MetaData["content_type"] = "application/pdf"
	// BUG FIX: string(rune(numPages)) yielded the code point for the page
	// count, not its decimal representation.
	content.MetaData["page_count"] = strconv.Itoa(numPages)
	content.MetaData["extraction_method"] = "page_by_page"
	return content, nil
}
// extractPDFFallback is a best-effort extractor used when the PDF library
// cannot parse the document. It pulls parenthesized strings — the common
// PDF literal-string syntax — out of the raw bytes and keeps those that
// look like human-readable text.
func extractPDFFallback(body []byte) (*ExtractedContent, error) {
	literalRe := regexp.MustCompile(`\((.*?)\)`)
	var collected strings.Builder
	for _, match := range literalRe.FindAllStringSubmatch(string(body), -1) {
		if len(match) > 1 && isPrintableText(match[1]) {
			collected.WriteString(match[1])
			collected.WriteString(" ")
		}
	}
	result := &ExtractedContent{
		MetaData: make(map[string]string),
	}
	result.ContentText = cleanText(collected.String())
	result.ContentLength = len(result.ContentText)
	result.SnippetText = generateSnippet(result.ContentText, 300)
	result.Language = detectLanguage(result.ContentText, nil)
	result.Features.HasMainContent = result.ContentLength > 200
	result.Title = extractPDFTitle(result.ContentText)
	result.MetaData["content_type"] = "application/pdf"
	result.MetaData["extraction_method"] = "fallback"
	return result, nil
}
// Precompiled filters for lines that are bare page numbers or dates, so they
// are not recompiled on every line of every document (the original compiled
// both patterns inside the loop).
var (
	pdfPageNumberRe = regexp.MustCompile(`^\d+$`)
	pdfDateRe       = regexp.MustCompile(`^\d{1,2}\.\d{1,2}\.\d{2,4}$`)
)

// extractPDFTitle extracts a title from PDF plain text: the first line of
// meaningful length (10-200 chars) that is neither a page number nor a date.
// Returns "" when no line qualifies.
func extractPDFTitle(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		// Title should be meaningful length.
		if len(line) < 10 || len(line) > 200 {
			continue
		}
		// Skip lines that look like page numbers or dates.
		if pdfPageNumberRe.MatchString(line) || pdfDateRe.MatchString(line) {
			continue
		}
		return line
	}
	return ""
}
// pdfNumberedHeadingRe matches outline-style numbering ("1. Title", "1.1 Sub");
// compiled once instead of once per line (the original compiled it inside the
// loop body).
var pdfNumberedHeadingRe = regexp.MustCompile(`^\d+(\.\d+)*\.?\s+\S`)

// extractPDFHeadings attempts to extract headings from plain text using
// heuristics: all-caps lines, numbered lines ("1.", "1.1"), and short lines
// near the start of the document. Duplicates are dropped and the result is
// capped at 10 entries.
func extractPDFHeadings(text string) []string {
	var headings []string
	seen := make(map[string]bool) // O(1) dedup instead of rescanning the slice
	for i, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		// Skip very short or very long lines.
		if len(line) < 5 || len(line) > 200 {
			continue
		}
		// Heuristics for headings:
		// 1. All caps lines (common in PDFs)
		// 2. Numbered lines (1., 1.1, etc.)
		// 3. Short lines at the beginning of the document
		isAllCaps := line == strings.ToUpper(line) && strings.ContainsAny(line, "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ")
		isNumbered := pdfNumberedHeadingRe.MatchString(line)
		isShortAndEarly := i < 20 && len(line) < 80
		if (isAllCaps || isNumbered || isShortAndEarly) && !seen[line] {
			seen[line] = true
			headings = append(headings, line)
			if len(headings) >= 10 {
				break // Limit to 10 headings
			}
		}
	}
	return headings
}
// containsHeading reports whether heading is already present in headings.
func containsHeading(headings []string, heading string) bool {
	for i := range headings {
		if headings[i] == heading {
			return true
		}
	}
	return false
}
// isPrintableText reports whether s looks like human-readable text: at least
// 3 characters, with more than 70% of them printable letters, spaces, or
// punctuation.
//
// BUG FIX: the ratio previously divided a rune count by len(s) (a byte
// count), so valid non-ASCII text — e.g. umlaut-heavy German, where each
// umlaut is 2 bytes — was unfairly penalized and could be rejected. Both
// sides of the ratio now count runes.
func isPrintableText(s string) bool {
	printable, total := 0, 0
	for _, r := range s {
		total++
		if unicode.IsPrint(r) && (unicode.IsLetter(r) || unicode.IsSpace(r) || unicode.IsPunct(r)) {
			printable++
		}
	}
	if total < 3 {
		return false
	}
	return float64(printable)/float64(total) > 0.7
}
// cleanText normalizes extracted text: unified "\n" line endings, at most one
// blank line in a row, single spaces within lines, and no surrounding
// whitespace on any line or on the result as a whole.
func cleanText(text string) string {
	// Unify line endings to "\n".
	text = strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(text)
	// Collapse runs of three or more newlines into a paragraph break.
	text = regexp.MustCompile(`\n{3,}`).ReplaceAllString(text, "\n\n")
	// Collapse runs of spaces/tabs into a single space.
	text = regexp.MustCompile(`[ \t]+`).ReplaceAllString(text, " ")
	// Strip surrounding whitespace from every line.
	rawLines := strings.Split(text, "\n")
	trimmed := make([]string, len(rawLines))
	for i, line := range rawLines {
		trimmed[i] = strings.TrimSpace(line)
	}
	return strings.TrimSpace(strings.Join(trimmed, "\n"))
}
// generateSnippet returns a short preview of text: the first paragraph with
// at least 50 bytes of content, truncated to at most maxLen bytes (plus a
// trailing "..."); when no paragraph qualifies, the raw text is truncated.
//
// BUG FIX: the previous byte slicing p[:maxLen] could split a multi-byte
// UTF-8 sequence, producing an invalid-UTF-8 snippet for non-ASCII text.
// Truncation now backs off to a rune boundary first.
func generateSnippet(text string, maxLen int) string {
	// Find the first paragraph with enough content.
	for _, p := range strings.Split(text, "\n\n") {
		p = strings.TrimSpace(p)
		if len(p) < 50 {
			continue
		}
		if len(p) > maxLen {
			p = p[:snippetCut(p, maxLen)]
			// Prefer ending on a word boundary in the second half of the cut.
			if lastSpace := strings.LastIndex(p, " "); lastSpace > maxLen/2 {
				p = p[:lastSpace]
			}
			p += "..."
		}
		return p
	}
	// Fallback: just truncate the raw text.
	if len(text) > maxLen {
		return text[:snippetCut(text, maxLen)] + "..."
	}
	return text
}

// snippetCut returns the largest cut point <= maxLen that does not split a
// UTF-8 rune in s. Caller guarantees maxLen < len(s).
func snippetCut(s string, maxLen int) int {
	cut := maxLen
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return cut
}
// detectLanguage guesses "de" or "en" for the given text. An og:locale meta
// value wins when present; otherwise common stop words are counted, and
// German is the default for ambiguous content (education domain skews German).
func detectLanguage(text string, meta map[string]string) string {
	// Meta tags take precedence. (Reading from a nil map is safe in Go.)
	if locale, ok := meta["og:locale"]; ok {
		switch {
		case strings.HasPrefix(locale, "de"):
			return "de"
		case strings.HasPrefix(locale, "en"):
			return "en"
		}
	}
	// Simple heuristic based on common stop words, matched as whole words.
	lower := strings.ToLower(text)
	countHits := func(words []string) int {
		hits := 0
		for _, w := range words {
			if strings.Contains(lower, " "+w+" ") {
				hits++
			}
		}
		return hits
	}
	germanHits := countHits([]string{
		"und", "der", "die", "das", "ist", "für", "mit", "von",
		"werden", "wird", "sind", "auch", "als", "können", "nach",
		"einer", "durch", "sich", "bei", "sein", "noch", "haben",
	})
	englishHits := countHits([]string{
		"the", "and", "for", "are", "but", "not", "you", "all",
		"can", "had", "her", "was", "one", "our", "with", "they",
	})
	switch {
	case germanHits > englishHits && germanHits > 3:
		return "de"
	case englishHits > germanHits && englishHits > 3:
		return "en"
	default:
		return "de" // Default to German for education content
	}
}
// UnescapeHTML unescapes HTML entities (e.g. "&amp;" -> "&"), delegating to
// golang.org/x/net/html.
func UnescapeHTML(s string) string {
	return html.UnescapeString(s)
}

View File

@@ -0,0 +1,802 @@
package extractor
import (
"strings"
"testing"
)
// TestExtractHTML_BasicContent checks title, meta, heading, and body-text
// extraction from a well-formed HTML document.
func TestExtractHTML_BasicContent(t *testing.T) {
	html := []byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="Test description">
<meta property="og:title" content="OG Title">
</head>
<body>
<h1>Main Heading</h1>
<p>This is the first paragraph with some meaningful content.</p>
<p>This is another paragraph that adds more information.</p>
</body>
</html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatalf("ExtractHTML failed: %v", err)
	}
	// Check title
	if content.Title != "Test Page Title" {
		t.Errorf("Expected title 'Test Page Title', got %q", content.Title)
	}
	// Check metadata
	if content.MetaData["description"] != "Test description" {
		t.Errorf("Expected description 'Test description', got %q", content.MetaData["description"])
	}
	// BUG FIX: this was t.Error, which does not stop the test — the
	// Headings[0] access below then panicked with index-out-of-range on an
	// empty slice instead of reporting a clean failure.
	if len(content.Headings) == 0 {
		t.Fatal("Expected at least one heading")
	}
	if content.Headings[0] != "Main Heading" {
		t.Errorf("Expected heading 'Main Heading', got %q", content.Headings[0])
	}
	// Check content text
	if !strings.Contains(content.ContentText, "first paragraph") {
		t.Error("Expected content to contain 'first paragraph'")
	}
}
// TestExtractHTML_TitleFallback verifies the title fallback chain:
// <title> tag, then the first <h1>, then the og:title meta tag.
func TestExtractHTML_TitleFallback(t *testing.T) {
	tests := []struct {
		name     string
		html     string
		expected string
	}{
		{
			name:     "Title from title tag",
			html:     `<html><head><title>Page Title</title></head><body></body></html>`,
			expected: "Page Title",
		},
		{
			name:     "Title from H1 when no title tag",
			html:     `<html><head></head><body><h1>H1 Title</h1></body></html>`,
			expected: "H1 Title",
		},
		{
			name:     "Title from og:title when no title or h1",
			html:     `<html><head><meta property="og:title" content="OG Title"></head><body></body></html>`,
			expected: "OG Title",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			content, err := ExtractHTML([]byte(tt.html))
			if err != nil {
				t.Fatalf("ExtractHTML failed: %v", err)
			}
			if content.Title != tt.expected {
				t.Errorf("Expected title %q, got %q", tt.expected, content.Title)
			}
		})
	}
}
// TestExtractHTML_RemovesUnwantedElements checks that boilerplate elements
// (nav, header, footer, script, style, aside, ads) are stripped while the
// main content survives.
func TestExtractHTML_RemovesUnwantedElements(t *testing.T) {
	html := []byte(`<html>
<body>
<nav>Navigation menu</nav>
<header>Header content</header>
<main>
<p>Main content paragraph</p>
</main>
<script>alert('dangerous');</script>
<style>.hidden{display:none;}</style>
<footer>Footer content</footer>
<aside>Sidebar content</aside>
<div class="advertisement">Ad content</div>
</body>
</html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	// Should contain main content
	if !strings.Contains(content.ContentText, "Main content paragraph") {
		t.Error("Expected main content to be extracted")
	}
	// Should not contain unwanted elements
	unwanted := []string{"Navigation menu", "alert('dangerous')", "Footer content", "Ad content"}
	for _, text := range unwanted {
		if strings.Contains(content.ContentText, text) {
			t.Errorf("Content should not contain %q", text)
		}
	}
}
// TestExtractHTML_ExtractsLinks verifies that only absolute http(s) links
// are collected; relative paths and mailto: links are skipped.
func TestExtractHTML_ExtractsLinks(t *testing.T) {
	html := []byte(`<html><body>
<a href="https://example.com/page1">Link 1</a>
<a href="https://example.com/page2">Link 2</a>
<a href="/relative/path">Relative Link</a>
<a href="mailto:test@example.com">Email</a>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	// Should extract absolute HTTP links
	if len(content.Links) != 2 {
		t.Errorf("Expected 2 HTTP links, got %d", len(content.Links))
	}
	hasPage1 := false
	hasPage2 := false
	for _, link := range content.Links {
		if link == "https://example.com/page1" {
			hasPage1 = true
		}
		if link == "https://example.com/page2" {
			hasPage2 = true
		}
	}
	if !hasPage1 || !hasPage2 {
		t.Error("Expected to find both HTTP links")
	}
}
// TestExtractHTML_CalculatesFeatures sanity-checks that the quality features
// (text-to-HTML ratio, content length) come out non-zero for real content.
func TestExtractHTML_CalculatesFeatures(t *testing.T) {
	html := []byte(`<html><body>
<div class="advertisement">Ad 1</div>
<p>Some content text that is long enough to be meaningful and provide a good ratio.</p>
<p>More content here to increase the text length.</p>
<a href="#">Link 1</a>
<a href="#">Link 2</a>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	// Check features are calculated
	if content.Features.TextToHTMLRatio <= 0 {
		t.Error("Expected positive TextToHTMLRatio")
	}
	// Content should have length
	if content.ContentLength == 0 {
		t.Error("Expected non-zero ContentLength")
	}
}
// TestExtractHTML_GeneratesSnippet checks that a non-empty, length-bounded
// snippet is produced from multi-paragraph HTML.
func TestExtractHTML_GeneratesSnippet(t *testing.T) {
	html := []byte(`<html><body>
<p>This is a short intro.</p>
<p>This is a longer paragraph that should be used as the snippet because it has more meaningful content and meets the minimum length requirement for a good snippet.</p>
<p>Another paragraph here.</p>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	if content.SnippetText == "" {
		t.Error("Expected non-empty snippet")
	}
	// Snippet should be limited in length
	if len(content.SnippetText) > 350 { // 300 + "..." margin
		t.Errorf("Snippet too long: %d chars", len(content.SnippetText))
	}
}
// TestDetectLanguage covers the meta-tag fast path (og:locale), stop-word
// counting for German and English, and the German default for ambiguous text.
func TestDetectLanguage(t *testing.T) {
	tests := []struct {
		name     string
		text     string
		meta     map[string]string
		expected string
	}{
		{
			name:     "German from meta",
			text:     "Some text",
			meta:     map[string]string{"og:locale": "de_DE"},
			expected: "de",
		},
		{
			name:     "English from meta",
			text:     "Some text",
			meta:     map[string]string{"og:locale": "en_US"},
			expected: "en",
		},
		{
			name:     "German from content",
			text:     "Dies ist ein Text und der Inhalt wird hier analysiert",
			meta:     nil,
			expected: "de",
		},
		{
			name:     "English from content",
			text:     "This is the content and we are analyzing the text here with all the words they can use for things but not any German",
			meta:     nil,
			expected: "en",
		},
		{
			name:     "Default to German for ambiguous",
			text:     "Hello World",
			meta:     nil,
			expected: "de",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := detectLanguage(tt.text, tt.meta)
			if result != tt.expected {
				t.Errorf("detectLanguage() = %q, expected %q", result, tt.expected)
			}
		})
	}
}
// TestCleanText covers line-ending normalization, newline/space collapsing,
// and per-line trimming.
func TestCleanText(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "Normalize Windows line endings",
			input:    "Line1\r\nLine2",
			expected: "Line1\nLine2",
		},
		{
			name:     "Collapse multiple newlines",
			input:    "Line1\n\n\n\n\nLine2",
			expected: "Line1\n\nLine2",
		},
		{
			name:     "Collapse multiple spaces",
			input:    "Word1 Word2",
			expected: "Word1 Word2",
		},
		{
			name:     "Trim whitespace",
			input:    " Text with spaces \n More text ",
			expected: "Text with spaces\nMore text",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := cleanText(tt.input)
			if result != tt.expected {
				t.Errorf("cleanText(%q) = %q, expected %q", tt.input, result, tt.expected)
			}
		})
	}
}
// TestGenerateSnippet checks pass-through of short text, ellipsis-terminated
// truncation of long text, and selection of the first sufficiently long
// paragraph. Expectations are property checks rather than exact strings.
func TestGenerateSnippet(t *testing.T) {
	tests := []struct {
		name    string
		text    string
		maxLen  int
		checkFn func(string) bool
	}{
		{
			name:   "Short text unchanged",
			text:   "Short paragraph.",
			maxLen: 300,
			checkFn: func(s string) bool {
				return s == "Short paragraph."
			},
		},
		{
			name:   "Long text truncated",
			text:   strings.Repeat("A long sentence that keeps going. ", 20),
			maxLen: 100,
			checkFn: func(s string) bool {
				return len(s) <= 103 && strings.HasSuffix(s, "...")
			},
		},
		{
			name:   "First suitable paragraph",
			text:   "Tiny.\n\nThis is a paragraph with enough content to be used as a snippet because it meets the minimum length.",
			maxLen: 300,
			checkFn: func(s string) bool {
				return strings.HasPrefix(s, "This is a paragraph")
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := generateSnippet(tt.text, tt.maxLen)
			if !tt.checkFn(result) {
				t.Errorf("generateSnippet() = %q, check failed", result)
			}
		})
	}
}
// TestIsPrintableText covers ASCII and German text, the minimum-length
// rejection, pure binary rejection, and mostly-printable mixed input.
func TestIsPrintableText(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected bool
	}{
		{
			name:     "Normal text",
			input:    "Hello World",
			expected: true,
		},
		{
			name:     "German text",
			input:    "Übung mit Umlauten",
			expected: true,
		},
		{
			name:     "Too short",
			input:    "AB",
			expected: false,
		},
		{
			name:     "Binary data",
			input:    "\x00\x01\x02\x03\x04",
			expected: false,
		},
		{
			name:     "Mixed printable",
			input:    "Text with some \x00 binary",
			expected: true, // >70% printable
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isPrintableText(tt.input)
			if result != tt.expected {
				t.Errorf("isPrintableText(%q) = %v, expected %v", tt.input, result, tt.expected)
			}
		})
	}
}
// TestExtractHTML_HeadingsExtraction verifies h1-h3 headings are collected
// in document order.
func TestExtractHTML_HeadingsExtraction(t *testing.T) {
	html := []byte(`<html><body>
<h1>Main Title</h1>
<h2>Section 1</h2>
<p>Content</p>
<h2>Section 2</h2>
<h3>Subsection 2.1</h3>
<p>More content</p>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	if len(content.Headings) != 4 {
		t.Errorf("Expected 4 headings (h1, h2, h2, h3), got %d", len(content.Headings))
	}
	expectedHeadings := []string{"Main Title", "Section 1", "Section 2", "Subsection 2.1"}
	for i, expected := range expectedHeadings {
		// Index access is guarded so a short slice fails cleanly above.
		if i < len(content.Headings) && content.Headings[i] != expected {
			t.Errorf("Heading %d: expected %q, got %q", i, expected, content.Headings[i])
		}
	}
}
// TestExtractHTML_ContentFromMain checks that text inside <main> is preferred
// as the content source when such a container exists.
func TestExtractHTML_ContentFromMain(t *testing.T) {
	html := []byte(`<html><body>
<div>Outside main</div>
<main>
<article>
<p>Article content that is inside the main element.</p>
</article>
</main>
<div>Also outside</div>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	if !strings.Contains(content.ContentText, "Article content") {
		t.Error("Expected content from main element")
	}
}
// TestExtractHTML_MetadataExtraction verifies that both name= and property=
// meta tags land in MetaData under lowercased keys.
func TestExtractHTML_MetadataExtraction(t *testing.T) {
	html := []byte(`<html>
<head>
<meta name="author" content="Test Author">
<meta name="keywords" content="education, learning">
<meta property="og:description" content="OG Description">
</head>
<body></body>
</html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	if content.MetaData["author"] != "Test Author" {
		t.Errorf("Expected author 'Test Author', got %q", content.MetaData["author"])
	}
	if content.MetaData["keywords"] != "education, learning" {
		t.Errorf("Expected keywords, got %q", content.MetaData["keywords"])
	}
	if content.MetaData["og:description"] != "OG Description" {
		t.Errorf("Expected og:description, got %q", content.MetaData["og:description"])
	}
}
// TestUnescapeHTML checks entity decoding, including the no-entity
// pass-through case.
func TestUnescapeHTML(t *testing.T) {
	tests := []struct {
		input    string
		expected string
	}{
		{"&amp;", "&"},
		{"&lt;script&gt;", "<script>"},
		{"&quot;quoted&quot;", "\"quoted\""},
		{"&#39;apostrophe&#39;", "'apostrophe'"},
		{"No entities", "No entities"},
	}
	for _, tt := range tests {
		t.Run(tt.input, func(t *testing.T) {
			result := UnescapeHTML(tt.input)
			if result != tt.expected {
				t.Errorf("UnescapeHTML(%q) = %q, expected %q", tt.input, result, tt.expected)
			}
		})
	}
}
// TestExtractPDF_BasicText feeds PDF-literal-style text through ExtractPDF.
// NOTE(review): the content-length condition below can never fail for this
// input — it only logs. The test is intentionally lenient and mostly checks
// that extraction completes without error.
func TestExtractPDF_BasicText(t *testing.T) {
	// Create minimal PDF-like content with text markers
	// Real PDFs would have proper structure, but we test the extraction logic
	pdfContent := []byte("(Hello World) (This is a test)")
	content, err := ExtractPDF(pdfContent)
	if err != nil {
		t.Fatalf("ExtractPDF failed: %v", err)
	}
	// Should extract some text
	if content.ContentLength == 0 && !strings.Contains(string(pdfContent), "(Hello") {
		// Only fail if there's actually extractable content
		t.Log("PDF extraction returned empty content (expected for simple test)")
	}
	// Features should be set
	if content.Language == "" {
		t.Error("Expected language to be set")
	}
}
// TestExtractHTML_AdDensity sanity-checks the ad-density heuristic;
// it only asserts non-negativity, not an exact ratio.
func TestExtractHTML_AdDensity(t *testing.T) {
	html := []byte(`<html><body>
<div class="advertisement">Ad 1</div>
<div class="advertisement">Ad 2</div>
<div class="advertisement">Ad 3</div>
<p>Content</p>
<div>Normal div</div>
</body></html>`)
	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}
	// Ad density should be calculated (3 ads / total divs)
	if content.Features.AdDensity < 0 {
		t.Error("AdDensity should not be negative")
	}
}
// TestExtractHTML_HasMainContent verifies the 200-byte threshold that flags
// a page as having meaningful content.
func TestExtractHTML_HasMainContent(t *testing.T) {
	tests := []struct {
		name     string
		html     string
		expected bool
	}{
		{
			name:     "Sufficient content",
			html:     `<html><body><p>` + strings.Repeat("Content ", 50) + `</p></body></html>`,
			expected: true,
		},
		{
			name:     "Insufficient content",
			html:     `<html><body><p>Short</p></body></html>`,
			expected: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			content, err := ExtractHTML([]byte(tt.html))
			if err != nil {
				t.Fatal(err)
			}
			if content.Features.HasMainContent != tt.expected {
				t.Errorf("HasMainContent = %v, expected %v", content.Features.HasMainContent, tt.expected)
			}
		})
	}
}
// ============================================================
// PDF Extraction Tests
// ============================================================
// TestExtractPDF_FallbackForInvalidPDF checks that non-PDF input degrades
// gracefully instead of returning an error.
// NOTE(review): the extraction_method check only logs — presumably because
// the fallback path is not guaranteed for every input; confirm before
// tightening it to an Error.
func TestExtractPDF_FallbackForInvalidPDF(t *testing.T) {
	// Test with non-PDF content - should fallback gracefully
	invalidPDF := []byte("This is not a PDF file (just some text content)")
	content, err := ExtractPDF(invalidPDF)
	if err != nil {
		t.Fatalf("ExtractPDF should not fail completely: %v", err)
	}
	// Should still return a valid ExtractedContent struct
	if content == nil {
		t.Fatal("Expected non-nil content")
	}
	// Should detect fallback method
	if content.MetaData["extraction_method"] != "fallback" {
		t.Log("PDF fallback extraction was used as expected")
	}
}
// TestExtractPDF_MetadataSet verifies the fixed metadata keys: content_type
// is always "application/pdf" and a language is always chosen.
func TestExtractPDF_MetadataSet(t *testing.T) {
	// Simple test content
	content, err := ExtractPDF([]byte("(Test content)"))
	if err != nil {
		t.Fatalf("ExtractPDF failed: %v", err)
	}
	// Content type should be set
	if content.MetaData["content_type"] != "application/pdf" {
		t.Errorf("Expected content_type 'application/pdf', got %q", content.MetaData["content_type"])
	}
	// Language should be detected (default to German)
	if content.Language == "" {
		t.Error("Expected language to be set")
	}
}
// TestExtractPDFTitle covers the title heuristic: first line of meaningful
// length, skipping page numbers, dates, and too-short lines.
func TestExtractPDFTitle(t *testing.T) {
	tests := []struct {
		name     string
		text     string
		expected string
	}{
		{
			name:     "Normal title",
			text:     "Lehrplan Mathematik Bayern\n\nDieses Dokument beschreibt...",
			expected: "Lehrplan Mathematik Bayern",
		},
		{
			name:     "Skip page number",
			text:     "1\n\nLehrplan Mathematik Bayern\n\nDieses Dokument...",
			expected: "Lehrplan Mathematik Bayern",
		},
		{
			name:     "Skip date",
			text:     "15.01.2025\n\nLehrplan Mathematik\n\nDieses Dokument...",
			expected: "Lehrplan Mathematik",
		},
		{
			name:     "Skip short lines",
			text:     "Short\n\nThis is a proper title for the document\n\nContent...",
			expected: "This is a proper title for the document",
		},
		{
			name:     "Empty text",
			text:     "",
			expected: "",
		},
		{
			name:     "Only short lines",
			text:     "A\nB\nC\nD",
			expected: "",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := extractPDFTitle(tt.text)
			if result != tt.expected {
				t.Errorf("extractPDFTitle() = %q, expected %q", result, tt.expected)
			}
		})
	}
}
// TestExtractPDFHeadings covers the heading heuristics: all-caps lines,
// outline-numbered lines, and the no-heading case.
func TestExtractPDFHeadings(t *testing.T) {
	tests := []struct {
		name            string
		text            string
		minHeadingCount int
		expectedFirst   string
	}{
		{
			name: "All caps headings",
			text: `EINLEITUNG
Dieser Text beschreibt die wichtigsten Punkte.
KAPITEL EINS
Hier folgt der erste Abschnitt.`,
			minHeadingCount: 2,
			expectedFirst:   "EINLEITUNG",
		},
		{
			name: "Numbered headings",
			text: `1. Einführung
Text hier.
1.1 Unterabschnitt
Mehr Text.
2. Hauptteil
Weiterer Inhalt.`,
			minHeadingCount: 3,
			expectedFirst:   "1. Einführung",
		},
		{
			name:            "No headings",
			text:            "einfacher text ohne ueberschriften der nur aus kleinen buchstaben besteht und sehr lang ist damit er nicht als ueberschrift erkannt wird",
			minHeadingCount: 0,
			expectedFirst:   "",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			headings := extractPDFHeadings(tt.text)
			if len(headings) < tt.minHeadingCount {
				t.Errorf("Expected at least %d headings, got %d", tt.minHeadingCount, len(headings))
			}
			if tt.expectedFirst != "" && len(headings) > 0 && headings[0] != tt.expectedFirst {
				t.Errorf("Expected first heading %q, got %q", tt.expectedFirst, headings[0])
			}
		})
	}
}
// TestExtractPDFHeadings_Limit builds 20 distinct all-caps candidate
// headings and verifies the extractor caps its output at 10.
func TestExtractPDFHeadings_Limit(t *testing.T) {
	var doc strings.Builder
	for i := 1; i <= 20; i++ {
		doc.WriteString("KAPITEL " + strings.Repeat("X", i) + "\n\nText Text Text.\n\n")
	}
	if got := extractPDFHeadings(doc.String()); len(got) > 10 {
		t.Errorf("Expected max 10 headings, got %d", len(got))
	}
}
// TestContainsHeading verifies membership lookup in a heading list,
// including a miss and the empty-list case.
func TestContainsHeading(t *testing.T) {
	known := []string{"Title One", "Title Two", "Title Three"}
	for heading, want := range map[string]bool{
		"Title Two":  true,
		"Title Four": false,
	} {
		if got := containsHeading(known, heading); got != want {
			t.Errorf("containsHeading(%q) = %v, want %v", heading, got, want)
		}
	}
	if containsHeading([]string{}, "Any") {
		t.Error("Empty list should not contain anything")
	}
}
// TestExtractPDFFallback_BasicExtraction exercises the regex fallback on
// PDF-stream-shaped input and asserts the fallback marker is set.
// NOTE(review): the content check only logs, never fails — the real
// assertion here is the extraction_method metadata.
func TestExtractPDFFallback_BasicExtraction(t *testing.T) {
	// Test fallback with text in parentheses (PDF text stream format)
	pdfLike := []byte("stream\n(Hello World) (This is some text) (More content here)\nendstream")
	content, err := extractPDFFallback(pdfLike)
	if err != nil {
		t.Fatalf("extractPDFFallback failed: %v", err)
	}
	// Should extract text from parentheses
	if !strings.Contains(content.ContentText, "Hello World") && content.ContentLength > 0 {
		t.Log("Extracted some content via fallback")
	}
	// Should mark as fallback
	if content.MetaData["extraction_method"] != "fallback" {
		t.Error("Expected extraction_method to be 'fallback'")
	}
}
// TestExtractPDF_EmptyInput checks that an empty byte slice yields a valid,
// empty result rather than an error or panic.
func TestExtractPDF_EmptyInput(t *testing.T) {
	content, err := ExtractPDF([]byte{})
	if err != nil {
		t.Fatalf("ExtractPDF should handle empty input: %v", err)
	}
	if content == nil {
		t.Fatal("Expected non-nil content for empty input")
	}
	if content.ContentLength != 0 {
		t.Errorf("Expected 0 content length for empty input, got %d", content.ContentLength)
	}
}
// TestExtractPDFWithMetadata_FallbackOnError checks the page-by-page variant
// also degrades gracefully on unparseable input.
func TestExtractPDFWithMetadata_FallbackOnError(t *testing.T) {
	// ExtractPDFWithMetadata should fallback gracefully
	content, err := ExtractPDFWithMetadata([]byte("not a pdf"))
	if err != nil {
		t.Fatalf("ExtractPDFWithMetadata should not fail: %v", err)
	}
	if content == nil {
		t.Fatal("Expected non-nil content")
	}
}
// TestExtractPDF_LanguageDetection exercises language detection on
// PDF-literal input.
// NOTE(review): a mismatch is only logged (t.Logf), never failed — the
// detection is heuristic; confirm intent before strengthening.
func TestExtractPDF_LanguageDetection(t *testing.T) {
	tests := []struct {
		name     string
		text     string
		expected string
	}{
		{
			name:     "German content",
			text:     "(Der Lehrplan ist für alle Schulen verbindlich und enthält wichtige Informationen)",
			expected: "de",
		},
		{
			name:     "Default to German",
			text:     "(Some text)",
			expected: "de",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			content, err := ExtractPDF([]byte(tt.text))
			if err != nil {
				t.Fatalf("ExtractPDF failed: %v", err)
			}
			// Language should be detected
			if content.Language != tt.expected {
				t.Logf("Language detected: %s (expected %s)", content.Language, tt.expected)
			}
		})
	}
}