All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
465 lines
12 KiB
Go
465 lines
12 KiB
Go
package extractor
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/ledongthuc/pdf"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// ExtractedContent contains parsed content from HTML/PDF
|
|
type ExtractedContent struct {
|
|
Title string
|
|
ContentText string
|
|
SnippetText string
|
|
Language string
|
|
ContentLength int
|
|
Headings []string
|
|
Links []string
|
|
MetaData map[string]string
|
|
Features ContentFeatures
|
|
}
|
|
|
|
// ContentFeatures for quality scoring
|
|
type ContentFeatures struct {
|
|
AdDensity float64
|
|
LinkDensity float64
|
|
TextToHTMLRatio float64
|
|
HasMainContent bool
|
|
}
|
|
|
|
// ExtractHTML extracts content from HTML
|
|
func ExtractHTML(body []byte) (*ExtractedContent, error) {
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
content := &ExtractedContent{
|
|
MetaData: make(map[string]string),
|
|
}
|
|
|
|
// Extract title
|
|
content.Title = strings.TrimSpace(doc.Find("title").First().Text())
|
|
if content.Title == "" {
|
|
content.Title = strings.TrimSpace(doc.Find("h1").First().Text())
|
|
}
|
|
|
|
// Extract meta tags
|
|
doc.Find("meta").Each(func(i int, s *goquery.Selection) {
|
|
name, _ := s.Attr("name")
|
|
property, _ := s.Attr("property")
|
|
contentAttr, _ := s.Attr("content")
|
|
|
|
key := name
|
|
if key == "" {
|
|
key = property
|
|
}
|
|
|
|
if key != "" && contentAttr != "" {
|
|
content.MetaData[strings.ToLower(key)] = contentAttr
|
|
}
|
|
})
|
|
|
|
// Try to get og:title if main title is empty
|
|
if content.Title == "" {
|
|
if ogTitle, ok := content.MetaData["og:title"]; ok {
|
|
content.Title = ogTitle
|
|
}
|
|
}
|
|
|
|
// Extract headings
|
|
doc.Find("h1, h2, h3").Each(func(i int, s *goquery.Selection) {
|
|
text := strings.TrimSpace(s.Text())
|
|
if text != "" && len(text) < 500 {
|
|
content.Headings = append(content.Headings, text)
|
|
}
|
|
})
|
|
|
|
// Remove unwanted elements
|
|
doc.Find("script, style, nav, header, footer, aside, iframe, noscript, form, .advertisement, .ad, .ads, #cookie-banner, .cookie-notice, .social-share").Remove()
|
|
|
|
// Try to find main content area
|
|
mainContent := doc.Find("main, article, .content, .main-content, #content, #main").First()
|
|
if mainContent.Length() == 0 {
|
|
mainContent = doc.Find("body")
|
|
}
|
|
|
|
// Extract text content
|
|
var textBuilder strings.Builder
|
|
mainContent.Find("p, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, pre").Each(func(i int, s *goquery.Selection) {
|
|
text := strings.TrimSpace(s.Text())
|
|
if text != "" {
|
|
textBuilder.WriteString(text)
|
|
textBuilder.WriteString("\n\n")
|
|
}
|
|
})
|
|
|
|
content.ContentText = cleanText(textBuilder.String())
|
|
content.ContentLength = len(content.ContentText)
|
|
|
|
// Generate snippet (first ~300 chars of meaningful content)
|
|
content.SnippetText = generateSnippet(content.ContentText, 300)
|
|
|
|
// Extract links
|
|
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
|
href, exists := s.Attr("href")
|
|
if exists && strings.HasPrefix(href, "http") {
|
|
content.Links = append(content.Links, href)
|
|
}
|
|
})
|
|
|
|
// Detect language
|
|
content.Language = detectLanguage(content.ContentText, content.MetaData)
|
|
|
|
// Calculate features
|
|
htmlLen := float64(len(body))
|
|
textLen := float64(len(content.ContentText))
|
|
|
|
if htmlLen > 0 {
|
|
content.Features.TextToHTMLRatio = textLen / htmlLen
|
|
}
|
|
|
|
if textLen > 0 {
|
|
linkTextLen := 0.0
|
|
doc.Find("a").Each(func(i int, s *goquery.Selection) {
|
|
linkTextLen += float64(len(s.Text()))
|
|
})
|
|
content.Features.LinkDensity = linkTextLen / textLen
|
|
}
|
|
|
|
content.Features.HasMainContent = content.ContentLength > 200
|
|
|
|
// Ad density estimation (very simple heuristic)
|
|
adCount := doc.Find(".ad, .ads, .advertisement, [class*='banner'], [id*='banner']").Length()
|
|
totalElements := doc.Find("div, p, article, section").Length()
|
|
if totalElements > 0 {
|
|
content.Features.AdDensity = float64(adCount) / float64(totalElements)
|
|
}
|
|
|
|
return content, nil
|
|
}
|
|
|
|
// ExtractPDF extracts text from PDF using ledongthuc/pdf library
|
|
func ExtractPDF(body []byte) (*ExtractedContent, error) {
|
|
content := &ExtractedContent{
|
|
MetaData: make(map[string]string),
|
|
}
|
|
|
|
// Create a reader from the byte slice
|
|
reader := bytes.NewReader(body)
|
|
pdfReader, err := pdf.NewReader(reader, int64(len(body)))
|
|
if err != nil {
|
|
// Fallback to basic extraction if PDF parsing fails
|
|
return extractPDFFallback(body)
|
|
}
|
|
|
|
// Extract text using GetPlainText
|
|
textReader, err := pdfReader.GetPlainText()
|
|
if err != nil {
|
|
// Fallback to basic extraction
|
|
return extractPDFFallback(body)
|
|
}
|
|
|
|
// Read all text content
|
|
var textBuilder strings.Builder
|
|
_, err = io.Copy(&textBuilder, textReader)
|
|
if err != nil {
|
|
return extractPDFFallback(body)
|
|
}
|
|
|
|
rawText := textBuilder.String()
|
|
|
|
// Clean and process text
|
|
content.ContentText = cleanText(rawText)
|
|
content.ContentLength = len(content.ContentText)
|
|
content.SnippetText = generateSnippet(content.ContentText, 300)
|
|
content.Language = detectLanguage(content.ContentText, nil)
|
|
content.Features.HasMainContent = content.ContentLength > 200
|
|
|
|
// Extract title from first significant line
|
|
content.Title = extractPDFTitle(content.ContentText)
|
|
|
|
// Try to extract headings (larger font text often appears first in lines)
|
|
content.Headings = extractPDFHeadings(content.ContentText)
|
|
|
|
// Set PDF-specific metadata
|
|
content.MetaData["content_type"] = "application/pdf"
|
|
content.MetaData["page_count"] = string(rune(pdfReader.NumPage()))
|
|
|
|
return content, nil
|
|
}
|
|
|
|
// ExtractPDFWithMetadata extracts text with page-by-page processing
|
|
// Use this when you need more control over the extraction process
|
|
func ExtractPDFWithMetadata(body []byte) (*ExtractedContent, error) {
|
|
content := &ExtractedContent{
|
|
MetaData: make(map[string]string),
|
|
}
|
|
|
|
reader := bytes.NewReader(body)
|
|
pdfReader, err := pdf.NewReader(reader, int64(len(body)))
|
|
if err != nil {
|
|
return extractPDFFallback(body)
|
|
}
|
|
|
|
// Extract text page by page for better control
|
|
var textBuilder strings.Builder
|
|
numPages := pdfReader.NumPage()
|
|
|
|
for pageNum := 1; pageNum <= numPages; pageNum++ {
|
|
page := pdfReader.Page(pageNum)
|
|
if page.V.IsNull() {
|
|
continue
|
|
}
|
|
|
|
// Get page content
|
|
pageContent := page.Content()
|
|
for _, text := range pageContent.Text {
|
|
textBuilder.WriteString(text.S)
|
|
textBuilder.WriteString(" ")
|
|
}
|
|
textBuilder.WriteString("\n")
|
|
}
|
|
|
|
rawText := textBuilder.String()
|
|
|
|
// Clean and process text
|
|
content.ContentText = cleanText(rawText)
|
|
content.ContentLength = len(content.ContentText)
|
|
content.SnippetText = generateSnippet(content.ContentText, 300)
|
|
content.Language = detectLanguage(content.ContentText, nil)
|
|
content.Features.HasMainContent = content.ContentLength > 200
|
|
|
|
// Extract title and headings from plain text
|
|
content.Title = extractPDFTitle(content.ContentText)
|
|
content.Headings = extractPDFHeadings(content.ContentText)
|
|
|
|
content.MetaData["content_type"] = "application/pdf"
|
|
content.MetaData["page_count"] = string(rune(numPages))
|
|
content.MetaData["extraction_method"] = "page_by_page"
|
|
|
|
return content, nil
|
|
}
|
|
|
|
// extractPDFFallback uses basic regex extraction when PDF library fails
|
|
func extractPDFFallback(body []byte) (*ExtractedContent, error) {
|
|
content := &ExtractedContent{
|
|
MetaData: make(map[string]string),
|
|
}
|
|
|
|
// Basic PDF text extraction using regex (fallback)
|
|
pdfContent := string(body)
|
|
var textBuilder strings.Builder
|
|
|
|
// Find text content in PDF streams
|
|
re := regexp.MustCompile(`\((.*?)\)`)
|
|
matches := re.FindAllStringSubmatch(pdfContent, -1)
|
|
|
|
for _, match := range matches {
|
|
if len(match) > 1 {
|
|
text := match[1]
|
|
if isPrintableText(text) {
|
|
textBuilder.WriteString(text)
|
|
textBuilder.WriteString(" ")
|
|
}
|
|
}
|
|
}
|
|
|
|
content.ContentText = cleanText(textBuilder.String())
|
|
content.ContentLength = len(content.ContentText)
|
|
content.SnippetText = generateSnippet(content.ContentText, 300)
|
|
content.Language = detectLanguage(content.ContentText, nil)
|
|
content.Features.HasMainContent = content.ContentLength > 200
|
|
content.Title = extractPDFTitle(content.ContentText)
|
|
content.MetaData["content_type"] = "application/pdf"
|
|
content.MetaData["extraction_method"] = "fallback"
|
|
|
|
return content, nil
|
|
}
|
|
|
|
// extractPDFTitle extracts title from PDF content (first significant line)
|
|
func extractPDFTitle(text string) string {
|
|
lines := strings.Split(text, "\n")
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
// Title should be meaningful length
|
|
if len(line) >= 10 && len(line) <= 200 {
|
|
// Skip lines that look like page numbers or dates
|
|
if !regexp.MustCompile(`^\d+$`).MatchString(line) &&
|
|
!regexp.MustCompile(`^\d{1,2}\.\d{1,2}\.\d{2,4}$`).MatchString(line) {
|
|
return line
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractPDFHeadings attempts to extract headings from plain text
|
|
func extractPDFHeadings(text string) []string {
|
|
var headings []string
|
|
lines := strings.Split(text, "\n")
|
|
|
|
for i, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
// Skip very short or very long lines
|
|
if len(line) < 5 || len(line) > 200 {
|
|
continue
|
|
}
|
|
|
|
// Heuristics for headings:
|
|
// 1. All caps lines (common in PDFs)
|
|
// 2. Lines followed by empty line or starting with numbers (1., 1.1, etc.)
|
|
// 3. Short lines at beginning of document
|
|
|
|
isAllCaps := line == strings.ToUpper(line) && strings.ContainsAny(line, "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ")
|
|
isNumbered := regexp.MustCompile(`^\d+(\.\d+)*\.?\s+\S`).MatchString(line)
|
|
isShortAndEarly := i < 20 && len(line) < 80
|
|
|
|
if (isAllCaps || isNumbered || isShortAndEarly) && !containsHeading(headings, line) {
|
|
headings = append(headings, line)
|
|
if len(headings) >= 10 {
|
|
break // Limit to 10 headings
|
|
}
|
|
}
|
|
}
|
|
|
|
return headings
|
|
}
|
|
|
|
// containsHeading checks if a heading already exists in the list
|
|
func containsHeading(headings []string, heading string) bool {
|
|
for _, h := range headings {
|
|
if h == heading {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isPrintableText(s string) bool {
|
|
if len(s) < 3 {
|
|
return false
|
|
}
|
|
|
|
printable := 0
|
|
for _, r := range s {
|
|
if unicode.IsPrint(r) && (unicode.IsLetter(r) || unicode.IsSpace(r) || unicode.IsPunct(r)) {
|
|
printable++
|
|
}
|
|
}
|
|
|
|
return float64(printable)/float64(len(s)) > 0.7
|
|
}
|
|
|
|
func cleanText(text string) string {
|
|
// Normalize whitespace
|
|
text = strings.ReplaceAll(text, "\r\n", "\n")
|
|
text = strings.ReplaceAll(text, "\r", "\n")
|
|
|
|
// Replace multiple newlines with double newline
|
|
re := regexp.MustCompile(`\n{3,}`)
|
|
text = re.ReplaceAllString(text, "\n\n")
|
|
|
|
// Replace multiple spaces with single space
|
|
re = regexp.MustCompile(`[ \t]+`)
|
|
text = re.ReplaceAllString(text, " ")
|
|
|
|
// Trim each line
|
|
lines := strings.Split(text, "\n")
|
|
for i, line := range lines {
|
|
lines[i] = strings.TrimSpace(line)
|
|
}
|
|
text = strings.Join(lines, "\n")
|
|
|
|
return strings.TrimSpace(text)
|
|
}
|
|
|
|
func generateSnippet(text string, maxLen int) string {
|
|
// Find first paragraph with enough content
|
|
paragraphs := strings.Split(text, "\n\n")
|
|
|
|
for _, p := range paragraphs {
|
|
p = strings.TrimSpace(p)
|
|
if len(p) >= 50 {
|
|
if len(p) > maxLen {
|
|
// Find word boundary
|
|
p = p[:maxLen]
|
|
lastSpace := strings.LastIndex(p, " ")
|
|
if lastSpace > maxLen/2 {
|
|
p = p[:lastSpace]
|
|
}
|
|
p += "..."
|
|
}
|
|
return p
|
|
}
|
|
}
|
|
|
|
// Fallback: just truncate
|
|
if len(text) > maxLen {
|
|
text = text[:maxLen] + "..."
|
|
}
|
|
return text
|
|
}
|
|
|
|
func detectLanguage(text string, meta map[string]string) string {
|
|
// Check meta tags first
|
|
if meta != nil {
|
|
if lang, ok := meta["og:locale"]; ok {
|
|
if strings.HasPrefix(lang, "de") {
|
|
return "de"
|
|
}
|
|
if strings.HasPrefix(lang, "en") {
|
|
return "en"
|
|
}
|
|
}
|
|
}
|
|
|
|
// Simple heuristic based on common German words
|
|
germanWords := []string{
|
|
"und", "der", "die", "das", "ist", "für", "mit", "von",
|
|
"werden", "wird", "sind", "auch", "als", "können", "nach",
|
|
"einer", "durch", "sich", "bei", "sein", "noch", "haben",
|
|
}
|
|
|
|
englishWords := []string{
|
|
"the", "and", "for", "are", "but", "not", "you", "all",
|
|
"can", "had", "her", "was", "one", "our", "with", "they",
|
|
}
|
|
|
|
lowerText := strings.ToLower(text)
|
|
|
|
germanCount := 0
|
|
for _, word := range germanWords {
|
|
if strings.Contains(lowerText, " "+word+" ") {
|
|
germanCount++
|
|
}
|
|
}
|
|
|
|
englishCount := 0
|
|
for _, word := range englishWords {
|
|
if strings.Contains(lowerText, " "+word+" ") {
|
|
englishCount++
|
|
}
|
|
}
|
|
|
|
if germanCount > englishCount && germanCount > 3 {
|
|
return "de"
|
|
}
|
|
if englishCount > germanCount && englishCount > 3 {
|
|
return "en"
|
|
}
|
|
|
|
return "de" // Default to German for education content
|
|
}
|
|
|
|
// UnescapeHTML unescapes HTML entities
|
|
func UnescapeHTML(s string) string {
|
|
return html.UnescapeString(s)
|
|
}
|