// Package extractor parses HTML and PDF payloads into plain-text content
// used for indexing, snippet generation, and quality scoring.
package extractor

import (
	"bytes"
	"io"
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"github.com/PuerkitoBio/goquery"
	"github.com/ledongthuc/pdf"
	"golang.org/x/net/html"
)

// Package-level regular expressions, compiled once. The originals were
// recompiled on every call — and in extractPDFTitle/extractPDFHeadings on
// every LINE of every document — which is a needless hot-path cost.
var (
	pdfParenTextRe   = regexp.MustCompile(`\((.*?)\)`)                  // literal text runs inside PDF streams
	pageNumberLineRe = regexp.MustCompile(`^\d+$`)                      // bare page-number line
	dateLineRe       = regexp.MustCompile(`^\d{1,2}\.\d{1,2}\.\d{2,4}$`) // d.m.yyyy style date line
	numberedHeadRe   = regexp.MustCompile(`^\d+(\.\d+)*\.?\s+\S`)       // "1.", "1.1", "2.3.4" heading prefixes
	multiNewlineRe   = regexp.MustCompile(`\n{3,}`)
	multiSpaceRe     = regexp.MustCompile(`[ \t]+`)
)

// ExtractedContent contains parsed content from HTML/PDF.
type ExtractedContent struct {
	Title         string
	ContentText   string
	SnippetText   string
	Language      string
	ContentLength int // byte length of ContentText
	Headings      []string
	Links         []string
	MetaData      map[string]string
	Features      ContentFeatures
}

// ContentFeatures for quality scoring.
type ContentFeatures struct {
	AdDensity       float64
	LinkDensity     float64
	TextToHTMLRatio float64
	HasMainContent  bool
}

// ExtractHTML extracts title, metadata, headings, links, plain text, and
// quality features from an HTML document.
func ExtractHTML(body []byte) (*ExtractedContent, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}

	// Extract title, falling back to the first <h1>.
	content.Title = strings.TrimSpace(doc.Find("title").First().Text())
	if content.Title == "" {
		content.Title = strings.TrimSpace(doc.Find("h1").First().Text())
	}

	// Extract meta tags; accept both name= and property= (OpenGraph) keys.
	doc.Find("meta").Each(func(i int, s *goquery.Selection) {
		name, _ := s.Attr("name")
		property, _ := s.Attr("property")
		contentAttr, _ := s.Attr("content")
		key := name
		if key == "" {
			key = property
		}
		if key != "" && contentAttr != "" {
			content.MetaData[strings.ToLower(key)] = contentAttr
		}
	})

	// Try og:title if the main title is still empty.
	if content.Title == "" {
		if ogTitle, ok := content.MetaData["og:title"]; ok {
			content.Title = ogTitle
		}
	}

	// Extract headings; the length bound skips decorative/garbage nodes.
	doc.Find("h1, h2, h3").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" && len(text) < 500 {
			content.Headings = append(content.Headings, text)
		}
	})

	// Remove boilerplate and non-content elements before text extraction.
	doc.Find("script, style, nav, header, footer, aside, iframe, noscript, form, .advertisement, .ad, .ads, #cookie-banner, .cookie-notice, .social-share").Remove()

	// Prefer a semantic main-content container; fall back to <body>.
	mainContent := doc.Find("main, article, .content, .main-content, #content, #main").First()
	if mainContent.Length() == 0 {
		mainContent = doc.Find("body")
	}

	// Extract text from block-level elements, paragraph-separated.
	var textBuilder strings.Builder
	mainContent.Find("p, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, pre").Each(func(i int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" {
			textBuilder.WriteString(text)
			textBuilder.WriteString("\n\n")
		}
	})

	content.ContentText = cleanText(textBuilder.String())
	content.ContentLength = len(content.ContentText)

	// Generate snippet (first ~300 chars of meaningful content).
	content.SnippetText = generateSnippet(content.ContentText, 300)

	// Extract absolute links only (http/https prefixes).
	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists && strings.HasPrefix(href, "http") {
			content.Links = append(content.Links, href)
		}
	})

	content.Language = detectLanguage(content.ContentText, content.MetaData)

	// Quality features: text/HTML ratio and link density.
	htmlLen := float64(len(body))
	textLen := float64(len(content.ContentText))
	if htmlLen > 0 {
		content.Features.TextToHTMLRatio = textLen / htmlLen
	}
	if textLen > 0 {
		linkTextLen := 0.0
		doc.Find("a").Each(func(i int, s *goquery.Selection) {
			linkTextLen += float64(len(s.Text()))
		})
		content.Features.LinkDensity = linkTextLen / textLen
	}
	content.Features.HasMainContent = content.ContentLength > 200

	// Ad density estimation (very simple heuristic).
	adCount := doc.Find(".ad, .ads, .advertisement, [class*='banner'], [id*='banner']").Length()
	totalElements := doc.Find("div, p, article, section").Length()
	if totalElements > 0 {
		content.Features.AdDensity = float64(adCount) / float64(totalElements)
	}

	return content, nil
}

// ExtractPDF extracts text from PDF using the ledongthuc/pdf library.
// On any parse/read failure it degrades to the regex-based fallback.
func ExtractPDF(body []byte) (*ExtractedContent, error) {
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}

	reader := bytes.NewReader(body)
	pdfReader, err := pdf.NewReader(reader, int64(len(body)))
	if err != nil {
		// Fallback to basic extraction if PDF parsing fails.
		return extractPDFFallback(body)
	}

	// Extract text using GetPlainText.
	textReader, err := pdfReader.GetPlainText()
	if err != nil {
		return extractPDFFallback(body)
	}

	var textBuilder strings.Builder
	if _, err = io.Copy(&textBuilder, textReader); err != nil {
		return extractPDFFallback(body)
	}

	rawText := textBuilder.String()

	content.ContentText = cleanText(rawText)
	content.ContentLength = len(content.ContentText)
	content.SnippetText = generateSnippet(content.ContentText, 300)
	content.Language = detectLanguage(content.ContentText, nil)
	content.Features.HasMainContent = content.ContentLength > 200

	// Title/headings are inferred heuristically from the plain text.
	content.Title = extractPDFTitle(content.ContentText)
	content.Headings = extractPDFHeadings(content.ContentText)

	content.MetaData["content_type"] = "application/pdf"
	// BUG FIX: string(rune(n)) yields the Unicode code point n (12 pages
	// became "\x0c"), not a decimal string. strconv.Itoa gives "12".
	content.MetaData["page_count"] = strconv.Itoa(pdfReader.NumPage())

	return content, nil
}

// ExtractPDFWithMetadata extracts text with page-by-page processing.
// Use this when you need more control over the extraction process.
func ExtractPDFWithMetadata(body []byte) (*ExtractedContent, error) {
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}

	reader := bytes.NewReader(body)
	pdfReader, err := pdf.NewReader(reader, int64(len(body)))
	if err != nil {
		return extractPDFFallback(body)
	}

	// Extract text page by page for better control.
	var textBuilder strings.Builder
	numPages := pdfReader.NumPage()
	for pageNum := 1; pageNum <= numPages; pageNum++ {
		page := pdfReader.Page(pageNum)
		if page.V.IsNull() {
			continue // skip pages with no content dictionary
		}
		pageContent := page.Content()
		for _, text := range pageContent.Text {
			textBuilder.WriteString(text.S)
			textBuilder.WriteString(" ")
		}
		textBuilder.WriteString("\n")
	}

	rawText := textBuilder.String()

	content.ContentText = cleanText(rawText)
	content.ContentLength = len(content.ContentText)
	content.SnippetText = generateSnippet(content.ContentText, 300)
	content.Language = detectLanguage(content.ContentText, nil)
	content.Features.HasMainContent = content.ContentLength > 200

	content.Title = extractPDFTitle(content.ContentText)
	content.Headings = extractPDFHeadings(content.ContentText)

	content.MetaData["content_type"] = "application/pdf"
	// BUG FIX: decimal page count, not string(rune(numPages)).
	content.MetaData["page_count"] = strconv.Itoa(numPages)
	content.MetaData["extraction_method"] = "page_by_page"

	return content, nil
}

// extractPDFFallback uses basic regex extraction when the PDF library
// fails: it scrapes parenthesized literal strings out of the raw stream.
func extractPDFFallback(body []byte) (*ExtractedContent, error) {
	content := &ExtractedContent{
		MetaData: make(map[string]string),
	}

	pdfContent := string(body)
	var textBuilder strings.Builder

	// Find text content in PDF streams.
	matches := pdfParenTextRe.FindAllStringSubmatch(pdfContent, -1)
	for _, match := range matches {
		if len(match) > 1 {
			text := match[1]
			if isPrintableText(text) {
				textBuilder.WriteString(text)
				textBuilder.WriteString(" ")
			}
		}
	}

	content.ContentText = cleanText(textBuilder.String())
	content.ContentLength = len(content.ContentText)
	content.SnippetText = generateSnippet(content.ContentText, 300)
	content.Language = detectLanguage(content.ContentText, nil)
	content.Features.HasMainContent = content.ContentLength > 200
	content.Title = extractPDFTitle(content.ContentText)

	content.MetaData["content_type"] = "application/pdf"
	content.MetaData["extraction_method"] = "fallback"

	return content, nil
}

// extractPDFTitle extracts a title from PDF content: the first line of
// meaningful length that is not a bare page number or a date.
func extractPDFTitle(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		// Title should be meaningful length.
		if len(line) >= 10 && len(line) <= 200 {
			// Skip lines that look like page numbers or dates.
			if !pageNumberLineRe.MatchString(line) && !dateLineRe.MatchString(line) {
				return line
			}
		}
	}
	return ""
}

// extractPDFHeadings attempts to extract headings from plain text using
// simple heuristics: all-caps lines, "1.2"-numbered lines, and short
// lines near the start of the document. Returns at most 10 headings.
func extractPDFHeadings(text string) []string {
	var headings []string
	lines := strings.Split(text, "\n")
	for i, line := range lines {
		line = strings.TrimSpace(line)
		// Skip very short or very long lines.
		if len(line) < 5 || len(line) > 200 {
			continue
		}
		// ContainsAny guards against lines that are all digits/punctuation
		// being treated as "all caps".
		isAllCaps := line == strings.ToUpper(line) && strings.ContainsAny(line, "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ")
		isNumbered := numberedHeadRe.MatchString(line)
		isShortAndEarly := i < 20 && len(line) < 80

		if (isAllCaps || isNumbered || isShortAndEarly) && !containsHeading(headings, line) {
			headings = append(headings, line)
			if len(headings) >= 10 {
				break // limit to 10 headings
			}
		}
	}
	return headings
}

// containsHeading checks if a heading already exists in the list.
func containsHeading(headings []string, heading string) bool {
	for _, h := range headings {
		if h == heading {
			return true
		}
	}
	return false
}

// isPrintableText reports whether at least 70% of the runes in s are
// printable letters, spaces, or punctuation. Used to filter binary noise
// captured by the regex fallback.
func isPrintableText(s string) bool {
	if len(s) < 3 {
		return false
	}
	printable, total := 0, 0
	for _, r := range s {
		total++
		if unicode.IsPrint(r) && (unicode.IsLetter(r) || unicode.IsSpace(r) || unicode.IsPunct(r)) {
			printable++
		}
	}
	// BUG FIX: the ratio previously divided a rune count by len(s)
	// (bytes), which under-scored multibyte text such as German umlauts.
	return float64(printable)/float64(total) > 0.7
}

// cleanText normalizes line endings, collapses runs of blank lines and
// horizontal whitespace, and trims every line.
func cleanText(text string) string {
	// Normalize whitespace.
	text = strings.ReplaceAll(text, "\r\n", "\n")
	text = strings.ReplaceAll(text, "\r", "\n")

	// Replace 3+ newlines with a double newline (paragraph break).
	text = multiNewlineRe.ReplaceAllString(text, "\n\n")

	// Collapse runs of spaces/tabs into a single space.
	text = multiSpaceRe.ReplaceAllString(text, " ")

	// Trim each line.
	lines := strings.Split(text, "\n")
	for i, line := range lines {
		lines[i] = strings.TrimSpace(line)
	}
	text = strings.Join(lines, "\n")

	return strings.TrimSpace(text)
}

// generateSnippet returns the first paragraph of at least 50 bytes,
// truncated to maxLen at a word boundary, with "..." appended when cut.
func generateSnippet(text string, maxLen int) string {
	// Find first paragraph with enough content.
	for _, p := range strings.Split(text, "\n\n") {
		p = strings.TrimSpace(p)
		if len(p) >= 50 {
			if len(p) > maxLen {
				// Cut, then back up to a word boundary if one exists in
				// the second half of the snippet.
				p = p[:maxLen]
				lastSpace := strings.LastIndex(p, " ")
				if lastSpace > maxLen/2 {
					p = p[:lastSpace]
				}
				p += "..."
			}
			return p
		}
	}
	// Fallback: just truncate.
	if len(text) > maxLen {
		text = text[:maxLen] + "..."
	}
	return text
}

// detectLanguage returns "de" or "en" using meta tags when available,
// otherwise a stop-word frequency heuristic. Defaults to German, since
// this extractor targets German education content.
func detectLanguage(text string, meta map[string]string) string {
	// Check meta tags first.
	if meta != nil {
		if lang, ok := meta["og:locale"]; ok {
			if strings.HasPrefix(lang, "de") {
				return "de"
			}
			if strings.HasPrefix(lang, "en") {
				return "en"
			}
		}
	}

	// Simple heuristic based on common German/English stop words.
	germanWords := []string{
		"und", "der", "die", "das", "ist", "für", "mit", "von",
		"werden", "wird", "sind", "auch", "als", "können", "nach",
		"einer", "durch", "sich", "bei", "sein", "noch", "haben",
	}
	englishWords := []string{
		"the", "and", "for", "are", "but", "not", "you", "all",
		"can", "had", "her", "was", "one", "our", "with", "they",
	}

	lowerText := strings.ToLower(text)
	germanCount := 0
	for _, word := range germanWords {
		if strings.Contains(lowerText, " "+word+" ") {
			germanCount++
		}
	}
	englishCount := 0
	for _, word := range englishWords {
		if strings.Contains(lowerText, " "+word+" ") {
			englishCount++
		}
	}

	if germanCount > englishCount && germanCount > 3 {
		return "de"
	}
	if englishCount > germanCount && englishCount > 3 {
		return "en"
	}
	return "de" // Default to German for education content
}

// UnescapeHTML unescapes HTML entities.
func UnescapeHTML(s string) string {
	return html.UnescapeString(s)
}