package extractor

import (
	"strings"
	"testing"
)

// NOTE(review): the HTML fixtures in this file contain plain text only, with
// no tags. Several assertions cannot be satisfied by the fixtures as written:
// for example TestExtractHTML_BasicContent expects a "description" metadata
// value that never appears in its input, and the og:title case in
// TestExtractHTML_TitleFallback passes an empty document. Presumably the
// markup was stripped at some point; confirm against version control and
// restore the fixtures. The fixture strings are deliberately left untouched.

// TestExtractHTML_BasicContent checks that ExtractHTML reports the title, the
// description metadata, the first heading and the paragraph text of a page.
func TestExtractHTML_BasicContent(t *testing.T) {
	html := []byte(` Test Page Title

Main Heading

This is the first paragraph with some meaningful content.

This is another paragraph that adds more information.

`)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatalf("ExtractHTML failed: %v", err)
	}

	// Check title
	if content.Title != "Test Page Title" {
		t.Errorf("Expected title 'Test Page Title', got %q", content.Title)
	}

	// Check metadata
	if content.MetaData["description"] != "Test description" {
		t.Errorf("Expected description 'Test description', got %q", content.MetaData["description"])
	}

	// Check headings. Fatal rather than Error: indexing Headings[0] below
	// would panic on an empty slice.
	if len(content.Headings) == 0 {
		t.Fatal("Expected at least one heading")
	}
	if content.Headings[0] != "Main Heading" {
		t.Errorf("Expected heading 'Main Heading', got %q", content.Headings[0])
	}

	// Check content text
	if !strings.Contains(content.ContentText, "first paragraph") {
		t.Error("Expected content to contain 'first paragraph'")
	}
}

// TestExtractHTML_TitleFallback checks which source ExtractHTML uses for the
// title: the title tag, then the H1, then og:title, per the case names.
func TestExtractHTML_TitleFallback(t *testing.T) {
	tests := []struct {
		name     string
		html     string
		expected string
	}{
		{
			name:     "Title from title tag",
			html:     `Page Title`,
			expected: "Page Title",
		},
		{
			name: "Title from H1 when no title tag",
			html: `

H1 Title

`,
			expected: "H1 Title",
		},
		{
			name:     "Title from og:title when no title or h1",
			html:     ``,
			expected: "OG Title",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			content, err := ExtractHTML([]byte(tt.html))
			if err != nil {
				t.Fatalf("ExtractHTML failed: %v", err)
			}
			if content.Title != tt.expected {
				t.Errorf("Expected title %q, got %q", tt.expected, content.Title)
			}
		})
	}
}

// TestExtractHTML_RemovesUnwantedElements checks that the main paragraph is
// kept in ContentText while navigation, script, footer and ad text is not.
func TestExtractHTML_RemovesUnwantedElements(t *testing.T) {
	html := []byte(`
Header content

Main content paragraph

`)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	// Should contain main content
	if !strings.Contains(content.ContentText, "Main content paragraph") {
		t.Error("Expected main content to be extracted")
	}

	// Should not contain unwanted elements
	unwanted := []string{"Navigation menu", "alert('dangerous')", "Footer content", "Ad content"}
	for _, text := range unwanted {
		if strings.Contains(content.ContentText, text) {
			t.Errorf("Content should not contain %q", text)
		}
	}
}

// TestExtractHTML_ExtractsLinks checks that exactly the two absolute HTTP
// links end up in content.Links.
func TestExtractHTML_ExtractsLinks(t *testing.T) {
	html := []byte(` Link 1 Link 2 Relative Link Email `)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	// Should extract absolute HTTP links
	if len(content.Links) != 2 {
		t.Errorf("Expected 2 HTTP links, got %d", len(content.Links))
	}

	hasPage1 := false
	hasPage2 := false
	for _, link := range content.Links {
		if link == "https://example.com/page1" {
			hasPage1 = true
		}
		if link == "https://example.com/page2" {
			hasPage2 = true
		}
	}
	if !hasPage1 || !hasPage2 {
		t.Error("Expected to find both HTTP links")
	}
}

// TestExtractHTML_CalculatesFeatures checks that ExtractHTML fills in a
// positive TextToHTMLRatio and a non-zero ContentLength.
func TestExtractHTML_CalculatesFeatures(t *testing.T) {
	html := []byte(`

Some content text that is long enough to be meaningful and provide a good ratio.

More content here to increase the text length.

Link 1 Link 2 `)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	// Check features are calculated
	if content.Features.TextToHTMLRatio <= 0 {
		t.Error("Expected positive TextToHTMLRatio")
	}

	// Content should have length
	if content.ContentLength == 0 {
		t.Error("Expected non-zero ContentLength")
	}
}

// TestExtractHTML_GeneratesSnippet checks that a non-empty snippet of bounded
// length is produced.
func TestExtractHTML_GeneratesSnippet(t *testing.T) {
	html := []byte(`

This is a short intro.

This is a longer paragraph that should be used as the snippet because it has more meaningful content and meets the minimum length requirement for a good snippet.

Another paragraph here.

`)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	if content.SnippetText == "" {
		t.Error("Expected non-empty snippet")
	}

	// Snippet should be limited in length
	if len(content.SnippetText) > 350 { // 300 + "..." margin
		t.Errorf("Snippet too long: %d chars", len(content.SnippetText))
	}
}

// TestDetectLanguage checks detectLanguage against og:locale metadata, against
// content-only input, and against ambiguous input (expected to yield "de").
func TestDetectLanguage(t *testing.T) {
	tests := []struct {
		name     string
		text     string
		meta     map[string]string
		expected string
	}{
		{
			name:     "German from meta",
			text:     "Some text",
			meta:     map[string]string{"og:locale": "de_DE"},
			expected: "de",
		},
		{
			name:     "English from meta",
			text:     "Some text",
			meta:     map[string]string{"og:locale": "en_US"},
			expected: "en",
		},
		{
			name:     "German from content",
			text:     "Dies ist ein Text und der Inhalt wird hier analysiert",
			meta:     nil,
			expected: "de",
		},
		{
			name:     "English from content",
			text:     "This is the content and we are analyzing the text here with all the words they can use for things but not any German",
			meta:     nil,
			expected: "en",
		},
		{
			name:     "Default to German for ambiguous",
			text:     "Hello World",
			meta:     nil,
			expected: "de",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := detectLanguage(tt.text, tt.meta)
			if result != tt.expected {
				t.Errorf("detectLanguage() = %q, expected %q", result, tt.expected)
			}
		})
	}
}

// TestCleanText checks cleanText's handling of line endings, repeated
// newlines, repeated spaces and surrounding whitespace.
func TestCleanText(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "Normalize Windows line endings",
			input:    "Line1\r\nLine2",
			expected: "Line1\nLine2",
		},
		{
			name:     "Collapse multiple newlines",
			input:    "Line1\n\n\n\n\nLine2",
			expected: "Line1\n\nLine2",
		},
		{
			// NOTE(review): input and expected are identical here, so this
			// case does not exercise space collapsing. Confirm the intended
			// input (it presumably had several spaces between the words).
			name:     "Collapse multiple spaces",
			input:    "Word1 Word2",
			expected: "Word1 Word2",
		},
		{
			name:     "Trim whitespace",
			input:    " Text with spaces \n More text ",
			expected: "Text with spaces\nMore text",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := cleanText(tt.input)
			if result != tt.expected {
				t.Errorf("cleanText(%q) = %q, expected %q", tt.input, result, tt.expected)
			}
		})
	}
}

// TestGenerateSnippet checks generateSnippet with short text, text longer
// than maxLen, and text whose first paragraph is very short. Each case
// supplies its own predicate over the result.
func TestGenerateSnippet(t *testing.T) {
	tests := []struct {
		name    string
		text    string
		maxLen  int
		checkFn func(string) bool
	}{
		{
			name:   "Short text unchanged",
			text:   "Short paragraph.",
			maxLen: 300,
			checkFn: func(s string) bool {
				return s == "Short paragraph."
			},
		},
		{
			name:   "Long text truncated",
			text:   strings.Repeat("A long sentence that keeps going. ", 20),
			maxLen: 100,
			checkFn: func(s string) bool {
				return len(s) <= 103 && strings.HasSuffix(s, "...")
			},
		},
		{
			name:   "First suitable paragraph",
			text:   "Tiny.\n\nThis is a paragraph with enough content to be used as a snippet because it meets the minimum length.",
			maxLen: 300,
			checkFn: func(s string) bool {
				return strings.HasPrefix(s, "This is a paragraph")
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := generateSnippet(tt.text, tt.maxLen)
			if !tt.checkFn(result) {
				t.Errorf("generateSnippet() = %q, check failed", result)
			}
		})
	}
}

// TestIsPrintableText checks isPrintableText on plain text, text with
// umlauts, very short input, binary bytes, and mostly-printable mixed input.
func TestIsPrintableText(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected bool
	}{
		{
			name:     "Normal text",
			input:    "Hello World",
			expected: true,
		},
		{
			name:     "German text",
			input:    "Übung mit Umlauten",
			expected: true,
		},
		{
			name:     "Too short",
			input:    "AB",
			expected: false,
		},
		{
			name:     "Binary data",
			input:    "\x00\x01\x02\x03\x04",
			expected: false,
		},
		{
			name:     "Mixed printable",
			input:    "Text with some \x00 binary",
			expected: true, // >70% printable
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isPrintableText(tt.input)
			if result != tt.expected {
				t.Errorf("isPrintableText(%q) = %v, expected %v", tt.input, result, tt.expected)
			}
		})
	}
}

// TestExtractHTML_HeadingsExtraction checks that four headings are returned
// in document order.
func TestExtractHTML_HeadingsExtraction(t *testing.T) {
	html := []byte(`

Main Title

Section 1

Content

Section 2

Subsection 2.1

More content

`)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	if len(content.Headings) != 4 {
		t.Errorf("Expected 4 headings (h1, h2, h2, h3), got %d", len(content.Headings))
	}

	expectedHeadings := []string{"Main Title", "Section 1", "Section 2", "Subsection 2.1"}
	for i, expected := range expectedHeadings {
		// The length guard keeps a short Headings slice from panicking; the
		// count mismatch itself is already reported above.
		if i < len(content.Headings) && content.Headings[i] != expected {
			t.Errorf("Heading %d: expected %q, got %q", i, expected, content.Headings[i])
		}
	}
}

// TestExtractHTML_ContentFromMain checks that the article text appears in
// ContentText.
func TestExtractHTML_ContentFromMain(t *testing.T) {
	html := []byte(`
Outside main

Article content that is inside the main element.

Also outside
`)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	if !strings.Contains(content.ContentText, "Article content") {
		t.Error("Expected content from main element")
	}
}

// TestExtractHTML_MetadataExtraction checks the author, keywords and
// og:description entries of content.MetaData.
func TestExtractHTML_MetadataExtraction(t *testing.T) {
	html := []byte(` `)

	content, err := ExtractHTML(html)
	if err != nil {
		t.Fatal(err)
	}

	if content.MetaData["author"] != "Test Author" {
		t.Errorf("Expected author 'Test Author', got %q", content.MetaData["author"])
	}
	if content.MetaData["keywords"] != "education, learning" {
		t.Errorf("Expected keywords, got %q", content.MetaData["keywords"])
	}
	if content.MetaData["og:description"] != "OG Description" {
		t.Errorf("Expected og:description, got %q", content.MetaData["og:description"])
	}
}

func TestUnescapeHTML(t *testing.T) {
	tests := []struct {
		input    string
		expected string
	}{
		{"&", "&"},
		{"<script>", "