"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Functions:
- generate_worksheet_html(): Build HTML for various worksheet types
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
- convert_pdf_page_to_image(): Render single PDF page to PNG
- convert_pdf_to_images(): Render multiple PDF pages to PNG
"""
import io
import logging
import os
from typing import List, Optional
from fastapi import HTTPException
from vocab_worksheet_models import VocabularyEntry, WorksheetType
logger = logging.getLogger(__name__)

# Optional dependency: WeasyPrint (HTML -> PDF rendering).
# OSError is handled alongside ImportError; presumably this covers a failing
# native-library load at import time (confirm against the WeasyPrint docs).
try:
    from weasyprint import HTML as _WeasyHTML
except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")
else:
    WEASYPRINT_AVAILABLE = True

# Optional dependency: PyMuPDF (PDF inspection and page rendering).
try:
    import fitz  # PyMuPDF
except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")
else:
    FITZ_AVAILABLE = True
# =============================================================================
# Worksheet HTML Generation
# =============================================================================
def generate_worksheet_html(
vocabulary: List[VocabularyEntry],
worksheet_type: WorksheetType,
title: str,
show_solutions: bool = False,
repetitions: int = 3,
line_height: str = "normal"
) -> str:
"""Build the markup for one vocabulary worksheet and return it as a string.

The output starts with the title and a name/date line, followed by one
exercise section chosen by ``worksheet_type``.

Args:
vocabulary: Entries to put on the sheet. The code reads ``english``,
``german`` and ``example_sentence`` from each entry.
worksheet_type: Selects the section that is generated: ``EN_TO_DE``,
``DE_TO_EN``, ``COPY_PRACTICE`` or ``GAP_FILL``. Any other value adds
no exercise section.
title: Heading text interpolated at the top of the document.
show_solutions: When true, translation rows also contain the answer,
copy-practice rows contain the repeated English word, and gap-fill
items show "Loesung: <english>" instead of the German hint.
repetitions: How many times the English word is repeated in a
copy-practice row. In the code as shown it is only used when
``show_solutions`` is true.
line_height: One of "normal", "large" or "extra-large"; any other value
falls back to "2.5em".

Returns:
The assembled worksheet markup.
"""
# NOTE(review): the string literals in this copy look as if their markup
# (tags/CSS) was stripped during extraction -- several single-quoted literals
# span two lines and `lh` is never referenced below. Confirm against the
# original file before changing any literal.
# NOTE(review): title and entry fields are interpolated directly into the
# output; no escaping call is visible here. Verify whether callers sanitize.
# Map the symbolic line-height name to a CSS length; unknown names use "2.5em".
line_heights = {
"normal": "2.5em",
"large": "3.5em",
"extra-large": "4.5em"
}
lh = line_heights.get(line_height, "2.5em")
html = f"""
{title}
Name: _________________________ Datum: _____________
"""
# English -> German: one row per entry; the German column is filled only
# when solutions are requested.
if worksheet_type == WorksheetType.EN_TO_DE:
html += 'Uebersetze ins Deutsche:
'
html += '
'
for entry in vocabulary:
if show_solutions:
html += f'| {entry.english} | {entry.german} |
'
else:
html += f'| {entry.english} | |
'
html += '
'
# German -> English: mirror image of the branch above.
elif worksheet_type == WorksheetType.DE_TO_EN:
html += 'Uebersetze ins Englische:
'
html += '
'
for entry in vocabulary:
if show_solutions:
html += f'| {entry.german} | {entry.english} |
'
else:
html += f'| {entry.german} | |
'
html += '
'
# Copy practice: one row per entry, starting with the English word.
elif worksheet_type == WorksheetType.COPY_PRACTICE:
html += 'Schreibe jedes Wort mehrmals:
'
html += '
'
for entry in vocabulary:
html += f'| {entry.english} | '
html += ''
# With solutions, the word is written out `repetitions` times.
if show_solutions:
html += f' {entry.english} ' * repetitions
html += ' |
'
html += '
'
# Gap fill: only entries that have a non-empty example sentence are used;
# if there are none, no section is emitted at all.
elif worksheet_type == WorksheetType.GAP_FILL:
entries_with_examples = [e for e in vocabulary if e.example_sentence]
if entries_with_examples:
html += 'Fuege das passende Wort ein:
'
# Items are numbered starting at 1.
for i, entry in enumerate(entries_with_examples, 1):
# Build the gap sentence: look for a word of the English term in the
# example sentence (case-insensitive test), then replace it as written,
# capitalized and lower-cased (the replace calls are case-sensitive).
# The loop stops at `break`; presumably that is after the first word
# found -- indentation is lost in this copy, confirm.
gap_sentence = entry.example_sentence
for word in entry.english.split():
if word.lower() in gap_sentence.lower():
gap_sentence = gap_sentence.replace(word, '
')
gap_sentence = gap_sentence.replace(word.capitalize(), '
')
gap_sentence = gap_sentence.replace(word.lower(), '
')
break
html += f'
{i}. {gap_sentence}
'
# Show either the solution or the German word as a hint.
if show_solutions:
html += f'
Loesung: {entry.english}
'
else:
html += f'
({entry.german})
'
html += '
'
html += ''
return html
# =============================================================================
# Worksheet PDF Generation
# =============================================================================
async def generate_worksheet_pdf(html: str) -> bytes:
    """Render an HTML document to PDF bytes using WeasyPrint.

    Args:
        html: The complete HTML document as a string.

    Returns:
        The rendered PDF as bytes. If WeasyPrint cannot be imported, the
        UTF-8 encoded HTML is returned instead, so callers must not assume
        the result is always a PDF.

    Raises:
        Exception: Any error raised while rendering is logged and re-raised
            unchanged.
    """
    # Only the import is guarded here, so a failure during rendering is never
    # mistaken for "WeasyPrint is not installed". OSError is caught together
    # with ImportError, matching the module-level availability probe.
    try:
        from weasyprint import HTML
    except (ImportError, OSError):
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')

    try:
        return HTML(string=html).write_pdf()
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise
# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return how many pages the given PDF contains.

    Args:
        pdf_data: The PDF file content as bytes.

    Returns:
        The page count, or 0 when the PDF cannot be opened or PyMuPDF is
        unavailable (the failure is logged, not raised).
    """
    try:
        import fitz

        document = fitz.open(stream=pdf_data, filetype="pdf")
        total_pages = document.page_count
        document.close()
    except Exception as exc:
        # Best effort: report "no pages" instead of propagating the error.
        logger.error(f"Failed to get PDF page count: {exc}")
        return 0
    return total_pages
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Render one page of a PDF to a PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes.
        page_number: 0-indexed page number. Negative values are rejected
            rather than being interpreted as "count from the end".
        thumbnail: If True, render at zoom 0.5 (small preview); otherwise
            render at zoom 2.0.

    Returns:
        The rendered page as PNG-encoded bytes.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF has
            no pages, the page number is out of range, or rendering fails.
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        # Close the document on every path, including the validation errors
        # below and any failure while rendering.
        try:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")
            if not 0 <= page_number < page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {page_count} pages)")

            page = pdf_document[page_number]

            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")
        finally:
            pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data
    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Render several pages of a PDF to PNG images using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes.
        pages: List of 0-indexed page numbers to convert. If None, all pages
            are converted. Numbers outside ``0 .. page_count - 1`` (including
            negative ones) are skipped silently.

    Returns:
        One PNG-encoded bytes object per converted page, in the order the
        pages were requested. Rendering uses zoom 2.0.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF has
            no pages or conversion fails.
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        # Close the document on every path, including the empty-PDF error
        # and any failure while rendering a page.
        try:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(page_count))

            mat = fitz.Matrix(2.0, 2.0)
            images = []
            for page_num in pages:
                # Skip out-of-range requests; the lower bound keeps negative
                # numbers from wrapping around to pages counted from the end.
                if 0 <= page_num < page_count:
                    pix = pdf_document[page_num].get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
        finally:
            pdf_document.close()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images
    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e