"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.

Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.

Functions:
- generate_worksheet_html(): Build HTML for various worksheet types
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
- convert_pdf_page_to_image(): Render single PDF page to PNG
- convert_pdf_to_images(): Render multiple PDF pages to PNG
"""

import io
import logging
import os
import re
from contextlib import closing
from html import escape
from typing import List, Optional

from fastapi import HTTPException

from vocab_worksheet_models import VocabularyEntry, WorksheetType

logger = logging.getLogger(__name__)

# Optional dependency: WeasyPrint
try:
    from weasyprint import HTML as _WeasyHTML
    WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")

# Optional dependency: PyMuPDF
try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")


# =============================================================================
# Worksheet HTML Generation
# =============================================================================

# NOTE(review): this module reached review with its whitespace collapsed and
# the markup stripped out of the HTML string literals. The visible text
# (headings, "Name:/Datum:" line, "Loesung:" hints) is unchanged; the tags and
# the stylesheet below are a minimal reconstruction. Compare against version
# control before relying on the exact layout or class names.

# Maps the public ``line_height`` option to the CSS value used for table rows.
_LINE_HEIGHTS = {
    "normal": "2.5em",
    "large": "3.5em",
    "extra-large": "4.5em",
}

# Placeholder rendered where the vocabulary word was removed from a sentence.
_GAP_MARKUP = '<span class="gap"></span>'


def _esc(value) -> str:
    """Return *value* as text that is safe to place inside HTML."""
    return escape(str(value))


def _two_column_table(rows) -> str:
    """Render ``(left, right)`` pairs of already-escaped cell content as a table."""
    body = ''.join(
        f'<tr><td>{left}</td><td>{right}</td></tr>' for left, right in rows
    )
    return f'<table>{body}</table>'


def _gap_sentence(sentence: str, english: str) -> str:
    """Return *sentence* as escaped HTML with the vocabulary word blanked out.

    The words of *english* are tried in order; the first one that occurs in
    the sentence (case-insensitively, as a substring) has every occurrence
    replaced by the gap placeholder. If none occurs, the sentence is returned
    unchanged apart from escaping.
    """
    for word in english.split():
        # Splitting (rather than replacing) lets the surrounding text be
        # escaped without touching the placeholder markup, and IGNORECASE
        # makes the replacement agree with the case-insensitive detection.
        pieces = re.split(re.escape(word), sentence, flags=re.IGNORECASE)
        if len(pieces) > 1:
            return _GAP_MARKUP.join(_esc(piece) for piece in pieces)
    return _esc(sentence)


def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate HTML for a worksheet.

    Args:
        vocabulary: Entries to place on the worksheet. The ``english``,
            ``german`` and ``example_sentence`` attributes are read.
        worksheet_type: Which exercise to render. Types other than
            EN_TO_DE, DE_TO_EN, COPY_PRACTICE and GAP_FILL produce only the
            page header.
        title: Heading printed at the top of the page.
        show_solutions: If True, fill in the answers (teacher copy).
        repetitions: For COPY_PRACTICE with solutions, how many times each
            word is written out.
        line_height: One of "normal", "large", "extra-large"; unknown values
            fall back to "normal".

    Returns:
        The complete HTML document as a string. All caller-supplied text is
        HTML-escaped.
    """
    lh = _LINE_HEIGHTS.get(line_height, "2.5em")

    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: sans-serif; }}
table {{ width: 100%; border-collapse: collapse; }}
td {{ width: 50%; line-height: {lh}; border-bottom: 1px solid #ccc; }}
.gap {{ display: inline-block; min-width: 6em; border-bottom: 1px solid #000; }}
.hint {{ color: #666; font-style: italic; }}
</style>
</head>
<body>
<h1>{_esc(title)}</h1>
<p>Name: _________________________ Datum: _____________</p>
"""]

    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append('<h2>Uebersetze ins Deutsche:</h2>')
        parts.append(_two_column_table(
            (_esc(entry.english), _esc(entry.german) if show_solutions else '')
            for entry in vocabulary
        ))

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append('<h2>Uebersetze ins Englische:</h2>')
        parts.append(_two_column_table(
            (_esc(entry.german), _esc(entry.english) if show_solutions else '')
            for entry in vocabulary
        ))

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<h2>Schreibe jedes Wort mehrmals:</h2>')
        rows = []
        for entry in vocabulary:
            word = _esc(entry.english)
            # The practice cell stays empty for pupils; the solution sheet
            # shows the word written out ``repetitions`` times.
            sample = f' {word} ' * repetitions if show_solutions else ''
            rows.append((word, sample))
        parts.append(_two_column_table(rows))

    elif worksheet_type == WorksheetType.GAP_FILL:
        # Only entries that come with an example sentence can be used.
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<h2>Fuege das passende Wort ein:</h2>')
            for i, entry in enumerate(entries_with_examples, 1):
                sentence = _gap_sentence(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {sentence}</p>')
                if show_solutions:
                    parts.append(
                        f'<p class="hint">Loesung: {_esc(entry.english)}</p>'
                    )
                else:
                    # Without solutions the German translation is the hint.
                    parts.append(f'<p class="hint">({_esc(entry.german)})</p>')

    parts.append('</body></html>')
    return ''.join(parts)


# =============================================================================
# Worksheet PDF Generation
# =============================================================================

async def generate_worksheet_pdf(html: str) -> bytes:
    """Generate PDF from HTML using WeasyPrint.

    Args:
        html: Complete HTML document to render.

    Returns:
        The rendered PDF bytes. If WeasyPrint cannot be imported, the
        UTF-8 encoded HTML is returned instead (best-effort fallback), so
        callers that need a real PDF should check the content.

    Raises:
        Exception: Any rendering error other than ImportError is logged and
            re-raised unchanged.
    """
    try:
        from weasyprint import HTML
        pdf_bytes = HTML(string=html).write_pdf()
        return pdf_bytes
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    except Exception as e:
        logger.error("PDF generation failed: %s", e)
        raise


# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================

def get_pdf_page_count(pdf_data: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_data: PDF file as bytes.

    Returns:
        The page count, or 0 if the data cannot be opened (including when
        PyMuPDF is not installed); the failure is logged, not raised.
    """
    try:
        import fitz
        # closing() guarantees the document is released even if reading
        # page_count fails.
        with closing(fitz.open(stream=pdf_data, filetype="pdf")) as pdf_document:
            return pdf_document.page_count
    except Exception as e:
        logger.error("Failed to get PDF page count: %s", e)
        return 0


async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        PNG image data for the requested page.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF is
            empty, the page does not exist, or rendering fails.
    """
    try:
        import fitz  # PyMuPDF

        # closing() releases the document on the error paths as well.
        with closing(fitz.open(stream=pdf_data, filetype="pdf")) as pdf_document:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")
            if page_number >= page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {page_count} pages)")

            page = pdf_document[page_number]

            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")

        logger.info(
            "Converted PDF page %s to PNG: %s bytes (thumbnail=%s)",
            page_number, len(png_data), thumbnail,
        )
        return png_data

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed") from e
    except Exception as e:
        logger.error("PDF conversion failed: %s", e)
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e


async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert
            all pages. Page numbers at or beyond the page count are skipped
            silently.

    Returns:
        One PNG byte string per converted page, in the order requested.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 if the PDF is
            empty or rendering fails.
    """
    try:
        import fitz

        # closing() releases the document on the error paths as well.
        with closing(fitz.open(stream=pdf_data, filetype="pdf")) as pdf_document:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(page_count))

            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)
            images = [
                pdf_document[page_num].get_pixmap(matrix=mat).tobytes("png")
                for page_num in pages
                if page_num < page_count
            ]

        logger.info("Converted %s PDF pages to images", len(images))
        return images

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from e
    except Exception as e:
        logger.error("PDF conversion failed: %s", e)
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e