All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
803 lines
19 KiB
Go
803 lines
19 KiB
Go
package extractor
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestExtractHTML_BasicContent(t *testing.T) {
|
|
html := []byte(`<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Test Page Title</title>
|
|
<meta name="description" content="Test description">
|
|
<meta property="og:title" content="OG Title">
|
|
</head>
|
|
<body>
|
|
<h1>Main Heading</h1>
|
|
<p>This is the first paragraph with some meaningful content.</p>
|
|
<p>This is another paragraph that adds more information.</p>
|
|
</body>
|
|
</html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatalf("ExtractHTML failed: %v", err)
|
|
}
|
|
|
|
// Check title
|
|
if content.Title != "Test Page Title" {
|
|
t.Errorf("Expected title 'Test Page Title', got %q", content.Title)
|
|
}
|
|
|
|
// Check metadata
|
|
if content.MetaData["description"] != "Test description" {
|
|
t.Errorf("Expected description 'Test description', got %q", content.MetaData["description"])
|
|
}
|
|
|
|
// Check headings
|
|
if len(content.Headings) == 0 {
|
|
t.Error("Expected at least one heading")
|
|
}
|
|
if content.Headings[0] != "Main Heading" {
|
|
t.Errorf("Expected heading 'Main Heading', got %q", content.Headings[0])
|
|
}
|
|
|
|
// Check content text
|
|
if !strings.Contains(content.ContentText, "first paragraph") {
|
|
t.Error("Expected content to contain 'first paragraph'")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_TitleFallback(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
html string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "Title from title tag",
|
|
html: `<html><head><title>Page Title</title></head><body></body></html>`,
|
|
expected: "Page Title",
|
|
},
|
|
{
|
|
name: "Title from H1 when no title tag",
|
|
html: `<html><head></head><body><h1>H1 Title</h1></body></html>`,
|
|
expected: "H1 Title",
|
|
},
|
|
{
|
|
name: "Title from og:title when no title or h1",
|
|
html: `<html><head><meta property="og:title" content="OG Title"></head><body></body></html>`,
|
|
expected: "OG Title",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
content, err := ExtractHTML([]byte(tt.html))
|
|
if err != nil {
|
|
t.Fatalf("ExtractHTML failed: %v", err)
|
|
}
|
|
if content.Title != tt.expected {
|
|
t.Errorf("Expected title %q, got %q", tt.expected, content.Title)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_RemovesUnwantedElements(t *testing.T) {
|
|
html := []byte(`<html>
|
|
<body>
|
|
<nav>Navigation menu</nav>
|
|
<header>Header content</header>
|
|
<main>
|
|
<p>Main content paragraph</p>
|
|
</main>
|
|
<script>alert('dangerous');</script>
|
|
<style>.hidden{display:none;}</style>
|
|
<footer>Footer content</footer>
|
|
<aside>Sidebar content</aside>
|
|
<div class="advertisement">Ad content</div>
|
|
</body>
|
|
</html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Should contain main content
|
|
if !strings.Contains(content.ContentText, "Main content paragraph") {
|
|
t.Error("Expected main content to be extracted")
|
|
}
|
|
|
|
// Should not contain unwanted elements
|
|
unwanted := []string{"Navigation menu", "alert('dangerous')", "Footer content", "Ad content"}
|
|
for _, text := range unwanted {
|
|
if strings.Contains(content.ContentText, text) {
|
|
t.Errorf("Content should not contain %q", text)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_ExtractsLinks(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<a href="https://example.com/page1">Link 1</a>
|
|
<a href="https://example.com/page2">Link 2</a>
|
|
<a href="/relative/path">Relative Link</a>
|
|
<a href="mailto:test@example.com">Email</a>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Should extract absolute HTTP links
|
|
if len(content.Links) != 2 {
|
|
t.Errorf("Expected 2 HTTP links, got %d", len(content.Links))
|
|
}
|
|
|
|
hasPage1 := false
|
|
hasPage2 := false
|
|
for _, link := range content.Links {
|
|
if link == "https://example.com/page1" {
|
|
hasPage1 = true
|
|
}
|
|
if link == "https://example.com/page2" {
|
|
hasPage2 = true
|
|
}
|
|
}
|
|
|
|
if !hasPage1 || !hasPage2 {
|
|
t.Error("Expected to find both HTTP links")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_CalculatesFeatures(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<div class="advertisement">Ad 1</div>
|
|
<p>Some content text that is long enough to be meaningful and provide a good ratio.</p>
|
|
<p>More content here to increase the text length.</p>
|
|
<a href="#">Link 1</a>
|
|
<a href="#">Link 2</a>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Check features are calculated
|
|
if content.Features.TextToHTMLRatio <= 0 {
|
|
t.Error("Expected positive TextToHTMLRatio")
|
|
}
|
|
|
|
// Content should have length
|
|
if content.ContentLength == 0 {
|
|
t.Error("Expected non-zero ContentLength")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_GeneratesSnippet(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<p>This is a short intro.</p>
|
|
<p>This is a longer paragraph that should be used as the snippet because it has more meaningful content and meets the minimum length requirement for a good snippet.</p>
|
|
<p>Another paragraph here.</p>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if content.SnippetText == "" {
|
|
t.Error("Expected non-empty snippet")
|
|
}
|
|
|
|
// Snippet should be limited in length
|
|
if len(content.SnippetText) > 350 { // 300 + "..." margin
|
|
t.Errorf("Snippet too long: %d chars", len(content.SnippetText))
|
|
}
|
|
}
|
|
|
|
func TestDetectLanguage(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
text string
|
|
meta map[string]string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "German from meta",
|
|
text: "Some text",
|
|
meta: map[string]string{"og:locale": "de_DE"},
|
|
expected: "de",
|
|
},
|
|
{
|
|
name: "English from meta",
|
|
text: "Some text",
|
|
meta: map[string]string{"og:locale": "en_US"},
|
|
expected: "en",
|
|
},
|
|
{
|
|
name: "German from content",
|
|
text: "Dies ist ein Text und der Inhalt wird hier analysiert",
|
|
meta: nil,
|
|
expected: "de",
|
|
},
|
|
{
|
|
name: "English from content",
|
|
text: "This is the content and we are analyzing the text here with all the words they can use for things but not any German",
|
|
meta: nil,
|
|
expected: "en",
|
|
},
|
|
{
|
|
name: "Default to German for ambiguous",
|
|
text: "Hello World",
|
|
meta: nil,
|
|
expected: "de",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := detectLanguage(tt.text, tt.meta)
|
|
if result != tt.expected {
|
|
t.Errorf("detectLanguage() = %q, expected %q", result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCleanText(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "Normalize Windows line endings",
|
|
input: "Line1\r\nLine2",
|
|
expected: "Line1\nLine2",
|
|
},
|
|
{
|
|
name: "Collapse multiple newlines",
|
|
input: "Line1\n\n\n\n\nLine2",
|
|
expected: "Line1\n\nLine2",
|
|
},
|
|
{
|
|
name: "Collapse multiple spaces",
|
|
input: "Word1 Word2",
|
|
expected: "Word1 Word2",
|
|
},
|
|
{
|
|
name: "Trim whitespace",
|
|
input: " Text with spaces \n More text ",
|
|
expected: "Text with spaces\nMore text",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := cleanText(tt.input)
|
|
if result != tt.expected {
|
|
t.Errorf("cleanText(%q) = %q, expected %q", tt.input, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestGenerateSnippet(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
text string
|
|
maxLen int
|
|
checkFn func(string) bool
|
|
}{
|
|
{
|
|
name: "Short text unchanged",
|
|
text: "Short paragraph.",
|
|
maxLen: 300,
|
|
checkFn: func(s string) bool {
|
|
return s == "Short paragraph."
|
|
},
|
|
},
|
|
{
|
|
name: "Long text truncated",
|
|
text: strings.Repeat("A long sentence that keeps going. ", 20),
|
|
maxLen: 100,
|
|
checkFn: func(s string) bool {
|
|
return len(s) <= 103 && strings.HasSuffix(s, "...")
|
|
},
|
|
},
|
|
{
|
|
name: "First suitable paragraph",
|
|
text: "Tiny.\n\nThis is a paragraph with enough content to be used as a snippet because it meets the minimum length.",
|
|
maxLen: 300,
|
|
checkFn: func(s string) bool {
|
|
return strings.HasPrefix(s, "This is a paragraph")
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := generateSnippet(tt.text, tt.maxLen)
|
|
if !tt.checkFn(result) {
|
|
t.Errorf("generateSnippet() = %q, check failed", result)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestIsPrintableText(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input string
|
|
expected bool
|
|
}{
|
|
{
|
|
name: "Normal text",
|
|
input: "Hello World",
|
|
expected: true,
|
|
},
|
|
{
|
|
name: "German text",
|
|
input: "Übung mit Umlauten",
|
|
expected: true,
|
|
},
|
|
{
|
|
name: "Too short",
|
|
input: "AB",
|
|
expected: false,
|
|
},
|
|
{
|
|
name: "Binary data",
|
|
input: "\x00\x01\x02\x03\x04",
|
|
expected: false,
|
|
},
|
|
{
|
|
name: "Mixed printable",
|
|
input: "Text with some \x00 binary",
|
|
expected: true, // >70% printable
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := isPrintableText(tt.input)
|
|
if result != tt.expected {
|
|
t.Errorf("isPrintableText(%q) = %v, expected %v", tt.input, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_HeadingsExtraction(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<h1>Main Title</h1>
|
|
<h2>Section 1</h2>
|
|
<p>Content</p>
|
|
<h2>Section 2</h2>
|
|
<h3>Subsection 2.1</h3>
|
|
<p>More content</p>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if len(content.Headings) != 4 {
|
|
t.Errorf("Expected 4 headings (h1, h2, h2, h3), got %d", len(content.Headings))
|
|
}
|
|
|
|
expectedHeadings := []string{"Main Title", "Section 1", "Section 2", "Subsection 2.1"}
|
|
for i, expected := range expectedHeadings {
|
|
if i < len(content.Headings) && content.Headings[i] != expected {
|
|
t.Errorf("Heading %d: expected %q, got %q", i, expected, content.Headings[i])
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_ContentFromMain(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<div>Outside main</div>
|
|
<main>
|
|
<article>
|
|
<p>Article content that is inside the main element.</p>
|
|
</article>
|
|
</main>
|
|
<div>Also outside</div>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if !strings.Contains(content.ContentText, "Article content") {
|
|
t.Error("Expected content from main element")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_MetadataExtraction(t *testing.T) {
|
|
html := []byte(`<html>
|
|
<head>
|
|
<meta name="author" content="Test Author">
|
|
<meta name="keywords" content="education, learning">
|
|
<meta property="og:description" content="OG Description">
|
|
</head>
|
|
<body></body>
|
|
</html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if content.MetaData["author"] != "Test Author" {
|
|
t.Errorf("Expected author 'Test Author', got %q", content.MetaData["author"])
|
|
}
|
|
if content.MetaData["keywords"] != "education, learning" {
|
|
t.Errorf("Expected keywords, got %q", content.MetaData["keywords"])
|
|
}
|
|
if content.MetaData["og:description"] != "OG Description" {
|
|
t.Errorf("Expected og:description, got %q", content.MetaData["og:description"])
|
|
}
|
|
}
|
|
|
|
func TestUnescapeHTML(t *testing.T) {
|
|
tests := []struct {
|
|
input string
|
|
expected string
|
|
}{
|
|
{"&", "&"},
|
|
{"<script>", "<script>"},
|
|
{""quoted"", "\"quoted\""},
|
|
{"'apostrophe'", "'apostrophe'"},
|
|
{"No entities", "No entities"},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.input, func(t *testing.T) {
|
|
result := UnescapeHTML(tt.input)
|
|
if result != tt.expected {
|
|
t.Errorf("UnescapeHTML(%q) = %q, expected %q", tt.input, result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractPDF_BasicText(t *testing.T) {
|
|
// Create minimal PDF-like content with text markers
|
|
// Real PDFs would have proper structure, but we test the extraction logic
|
|
pdfContent := []byte("(Hello World) (This is a test)")
|
|
|
|
content, err := ExtractPDF(pdfContent)
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDF failed: %v", err)
|
|
}
|
|
|
|
// Should extract some text
|
|
if content.ContentLength == 0 && !strings.Contains(string(pdfContent), "(Hello") {
|
|
// Only fail if there's actually extractable content
|
|
t.Log("PDF extraction returned empty content (expected for simple test)")
|
|
}
|
|
|
|
// Features should be set
|
|
if content.Language == "" {
|
|
t.Error("Expected language to be set")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_AdDensity(t *testing.T) {
|
|
html := []byte(`<html><body>
|
|
<div class="advertisement">Ad 1</div>
|
|
<div class="advertisement">Ad 2</div>
|
|
<div class="advertisement">Ad 3</div>
|
|
<p>Content</p>
|
|
<div>Normal div</div>
|
|
</body></html>`)
|
|
|
|
content, err := ExtractHTML(html)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Ad density should be calculated (3 ads / total divs)
|
|
if content.Features.AdDensity < 0 {
|
|
t.Error("AdDensity should not be negative")
|
|
}
|
|
}
|
|
|
|
func TestExtractHTML_HasMainContent(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
html string
|
|
expected bool
|
|
}{
|
|
{
|
|
name: "Sufficient content",
|
|
html: `<html><body><p>` + strings.Repeat("Content ", 50) + `</p></body></html>`,
|
|
expected: true,
|
|
},
|
|
{
|
|
name: "Insufficient content",
|
|
html: `<html><body><p>Short</p></body></html>`,
|
|
expected: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
content, err := ExtractHTML([]byte(tt.html))
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if content.Features.HasMainContent != tt.expected {
|
|
t.Errorf("HasMainContent = %v, expected %v", content.Features.HasMainContent, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// ============================================================
|
|
// PDF Extraction Tests
|
|
// ============================================================
|
|
|
|
func TestExtractPDF_FallbackForInvalidPDF(t *testing.T) {
|
|
// Test with non-PDF content - should fallback gracefully
|
|
invalidPDF := []byte("This is not a PDF file (just some text content)")
|
|
|
|
content, err := ExtractPDF(invalidPDF)
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDF should not fail completely: %v", err)
|
|
}
|
|
|
|
// Should still return a valid ExtractedContent struct
|
|
if content == nil {
|
|
t.Fatal("Expected non-nil content")
|
|
}
|
|
|
|
// Should detect fallback method
|
|
if content.MetaData["extraction_method"] != "fallback" {
|
|
t.Log("PDF fallback extraction was used as expected")
|
|
}
|
|
}
|
|
|
|
func TestExtractPDF_MetadataSet(t *testing.T) {
|
|
// Simple test content
|
|
content, err := ExtractPDF([]byte("(Test content)"))
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDF failed: %v", err)
|
|
}
|
|
|
|
// Content type should be set
|
|
if content.MetaData["content_type"] != "application/pdf" {
|
|
t.Errorf("Expected content_type 'application/pdf', got %q", content.MetaData["content_type"])
|
|
}
|
|
|
|
// Language should be detected (default to German)
|
|
if content.Language == "" {
|
|
t.Error("Expected language to be set")
|
|
}
|
|
}
|
|
|
|
func TestExtractPDFTitle(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
text string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "Normal title",
|
|
text: "Lehrplan Mathematik Bayern\n\nDieses Dokument beschreibt...",
|
|
expected: "Lehrplan Mathematik Bayern",
|
|
},
|
|
{
|
|
name: "Skip page number",
|
|
text: "1\n\nLehrplan Mathematik Bayern\n\nDieses Dokument...",
|
|
expected: "Lehrplan Mathematik Bayern",
|
|
},
|
|
{
|
|
name: "Skip date",
|
|
text: "15.01.2025\n\nLehrplan Mathematik\n\nDieses Dokument...",
|
|
expected: "Lehrplan Mathematik",
|
|
},
|
|
{
|
|
name: "Skip short lines",
|
|
text: "Short\n\nThis is a proper title for the document\n\nContent...",
|
|
expected: "This is a proper title for the document",
|
|
},
|
|
{
|
|
name: "Empty text",
|
|
text: "",
|
|
expected: "",
|
|
},
|
|
{
|
|
name: "Only short lines",
|
|
text: "A\nB\nC\nD",
|
|
expected: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := extractPDFTitle(tt.text)
|
|
if result != tt.expected {
|
|
t.Errorf("extractPDFTitle() = %q, expected %q", result, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractPDFHeadings(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
text string
|
|
minHeadingCount int
|
|
expectedFirst string
|
|
}{
|
|
{
|
|
name: "All caps headings",
|
|
text: `EINLEITUNG
|
|
|
|
Dieser Text beschreibt die wichtigsten Punkte.
|
|
|
|
KAPITEL EINS
|
|
|
|
Hier folgt der erste Abschnitt.`,
|
|
minHeadingCount: 2,
|
|
expectedFirst: "EINLEITUNG",
|
|
},
|
|
{
|
|
name: "Numbered headings",
|
|
text: `1. Einführung
|
|
|
|
Text hier.
|
|
|
|
1.1 Unterabschnitt
|
|
|
|
Mehr Text.
|
|
|
|
2. Hauptteil
|
|
|
|
Weiterer Inhalt.`,
|
|
minHeadingCount: 3,
|
|
expectedFirst: "1. Einführung",
|
|
},
|
|
{
|
|
name: "No headings",
|
|
text: "einfacher text ohne ueberschriften der nur aus kleinen buchstaben besteht und sehr lang ist damit er nicht als ueberschrift erkannt wird",
|
|
minHeadingCount: 0,
|
|
expectedFirst: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
headings := extractPDFHeadings(tt.text)
|
|
|
|
if len(headings) < tt.minHeadingCount {
|
|
t.Errorf("Expected at least %d headings, got %d", tt.minHeadingCount, len(headings))
|
|
}
|
|
|
|
if tt.expectedFirst != "" && len(headings) > 0 && headings[0] != tt.expectedFirst {
|
|
t.Errorf("Expected first heading %q, got %q", tt.expectedFirst, headings[0])
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractPDFHeadings_Limit(t *testing.T) {
|
|
// Test that headings are limited to 10
|
|
text := ""
|
|
for i := 1; i <= 20; i++ {
|
|
text += "KAPITEL " + strings.Repeat("X", i) + "\n\nText Text Text.\n\n"
|
|
}
|
|
|
|
headings := extractPDFHeadings(text)
|
|
|
|
if len(headings) > 10 {
|
|
t.Errorf("Expected max 10 headings, got %d", len(headings))
|
|
}
|
|
}
|
|
|
|
func TestContainsHeading(t *testing.T) {
|
|
headings := []string{"Title One", "Title Two", "Title Three"}
|
|
|
|
if !containsHeading(headings, "Title Two") {
|
|
t.Error("Expected to find 'Title Two'")
|
|
}
|
|
|
|
if containsHeading(headings, "Title Four") {
|
|
t.Error("Should not find 'Title Four'")
|
|
}
|
|
|
|
if containsHeading([]string{}, "Any") {
|
|
t.Error("Empty list should not contain anything")
|
|
}
|
|
}
|
|
|
|
func TestExtractPDFFallback_BasicExtraction(t *testing.T) {
|
|
// Test fallback with text in parentheses (PDF text stream format)
|
|
pdfLike := []byte("stream\n(Hello World) (This is some text) (More content here)\nendstream")
|
|
|
|
content, err := extractPDFFallback(pdfLike)
|
|
if err != nil {
|
|
t.Fatalf("extractPDFFallback failed: %v", err)
|
|
}
|
|
|
|
// Should extract text from parentheses
|
|
if !strings.Contains(content.ContentText, "Hello World") && content.ContentLength > 0 {
|
|
t.Log("Extracted some content via fallback")
|
|
}
|
|
|
|
// Should mark as fallback
|
|
if content.MetaData["extraction_method"] != "fallback" {
|
|
t.Error("Expected extraction_method to be 'fallback'")
|
|
}
|
|
}
|
|
|
|
func TestExtractPDF_EmptyInput(t *testing.T) {
|
|
content, err := ExtractPDF([]byte{})
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDF should handle empty input: %v", err)
|
|
}
|
|
|
|
if content == nil {
|
|
t.Fatal("Expected non-nil content for empty input")
|
|
}
|
|
|
|
if content.ContentLength != 0 {
|
|
t.Errorf("Expected 0 content length for empty input, got %d", content.ContentLength)
|
|
}
|
|
}
|
|
|
|
func TestExtractPDFWithMetadata_FallbackOnError(t *testing.T) {
|
|
// ExtractPDFWithMetadata should fallback gracefully
|
|
content, err := ExtractPDFWithMetadata([]byte("not a pdf"))
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDFWithMetadata should not fail: %v", err)
|
|
}
|
|
|
|
if content == nil {
|
|
t.Fatal("Expected non-nil content")
|
|
}
|
|
}
|
|
|
|
func TestExtractPDF_LanguageDetection(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
text string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "German content",
|
|
text: "(Der Lehrplan ist für alle Schulen verbindlich und enthält wichtige Informationen)",
|
|
expected: "de",
|
|
},
|
|
{
|
|
name: "Default to German",
|
|
text: "(Some text)",
|
|
expected: "de",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
content, err := ExtractPDF([]byte(tt.text))
|
|
if err != nil {
|
|
t.Fatalf("ExtractPDF failed: %v", err)
|
|
}
|
|
|
|
// Language should be detected
|
|
if content.Language != tt.expected {
|
|
t.Logf("Language detected: %s (expected %s)", content.Language, tt.expected)
|
|
}
|
|
})
|
|
}
|
|
}
|