# breakpilot-lehrer/klausur-service/backend/vocab_worksheet_generation.py

"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.

Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.

Functions:
  - generate_worksheet_html(): Build HTML for various worksheet types
  - generate_worksheet_pdf():  Convert HTML to PDF via WeasyPrint
  - get_pdf_page_count():      Count pages in a PDF (PyMuPDF)
  - convert_pdf_page_to_image(): Render single PDF page to PNG
  - convert_pdf_to_images():     Render multiple PDF pages to PNG
"""

import io
import logging
import os
from typing import List, Optional

from fastapi import HTTPException

from vocab_worksheet_models import VocabularyEntry, WorksheetType

logger = logging.getLogger(__name__)

# Optional dependency: WeasyPrint
# Probe the import once at module load and record the outcome in
# WEASYPRINT_AVAILABLE; a failure is logged as a warning instead of aborting
# the import of this module.
# OSError is caught in addition to ImportError — presumably because missing
# native libraries surface as OSError while importing WeasyPrint (TODO confirm).
# NOTE(review): neither _WeasyHTML nor WEASYPRINT_AVAILABLE is referenced in the
# visible part of this module; generate_worksheet_pdf() re-imports WeasyPrint.
try:
    from weasyprint import HTML as _WeasyHTML
    WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")

# Optional dependency: PyMuPDF
# Probe the import once at module load and record the outcome in
# FITZ_AVAILABLE; a failure is logged as a warning instead of aborting the
# import of this module.
# NOTE(review): FITZ_AVAILABLE is not referenced in the visible part of this
# module; the PDF utility functions below import fitz again locally.
try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")


# =============================================================================
# Worksheet HTML Generation
# =============================================================================

def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Build the complete HTML document for one vocabulary worksheet.

    All dynamic text (title, vocabulary words, example sentences) is
    HTML-escaped before it is embedded, so characters such as ``<`` or ``&``
    in the input cannot break or inject markup.

    Args:
        vocabulary: Entries to print. The ``english`` and ``german`` attributes
            are used; ``example_sentence`` is used for gap-fill worksheets.
        worksheet_type: Selects the exercise layout (EN_TO_DE, DE_TO_EN,
            COPY_PRACTICE or GAP_FILL). Any other value yields a document that
            contains only the title and the name/date line.
        title: Heading printed at the top of the sheet.
        show_solutions: If True, the answers are printed instead of blanks.
        repetitions: For COPY_PRACTICE with ``show_solutions``: how many times
            each word is repeated in the writing cell.
        line_height: One of "normal", "large", "extra-large"; unknown values
            fall back to the "normal" row height.

    Returns:
        The worksheet as a single HTML string.
    """
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    # Fragments are collected in a list and joined once at the end instead of
    # growing one string with repeated ``+=``.
    parts = [f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
</head>
<body>
    <h1>{_escape_text(title)}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
"""]

    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append(_render_translation_table(
            'Uebersetze ins Deutsche:',
            [(entry.english, entry.german) for entry in vocabulary],
            show_solutions,
        ))

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append(_render_translation_table(
            'Uebersetze ins Englische:',
            [(entry.german, entry.english) for entry in vocabulary],
            show_solutions,
        ))

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            word = _escape_text(entry.english)
            # Without solutions the cell stays empty so pupils can write in it.
            practice = f' {word} ' * repetitions if show_solutions else ''
            parts.append(
                f'<tr><td class="vocab-word">{word}</td>'
                f'<td class="vocab-blank">{practice}</td></tr>'
            )
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        # Only entries that come with an example sentence can be turned into
        # a gap exercise; if there are none, no section is emitted at all.
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                gap_sentence = _build_gap_sentence(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {_escape_text(entry.english)}</p>')
                else:
                    # The German translation serves as the hint for the gap.
                    parts.append(f'<p class="hint">({_escape_text(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return ''.join(parts)


# Markup that stands in for the removed word in gap-fill sentences.
_GAP_MARKUP = '<span class="gap"></span>'


def _escape_text(value) -> str:
    """Return ``str(value)`` escaped for use as HTML element content."""
    # Imported locally under its function name: this module does not import
    # the ``html`` package at file level.
    from html import escape

    return escape(str(value))


def _render_translation_table(heading: str, pairs, show_solutions: bool) -> str:
    """Render one translation section with a row per (prompt, answer) pair."""
    rows = []
    for prompt, answer in pairs:
        if show_solutions:
            second_cell = f'<td class="vocab-answer">{_escape_text(answer)}</td>'
        else:
            second_cell = '<td class="vocab-blank"></td>'
        rows.append(f'<tr><td class="vocab-word">{_escape_text(prompt)}</td>{second_cell}</tr>')
    return (
        f'<div class="section"><div class="section-title">{heading}</div>'
        '<table class="vocab-table">' + ''.join(rows) + '</table></div>'
    )


def _build_gap_sentence(sentence: str, answer: str) -> str:
    """Return *sentence* as escaped HTML with the answer word replaced by gaps.

    The first word of *answer* that occurs in *sentence* (compared
    case-insensitively, as a substring so inflected forms still match) is
    replaced everywhere by the gap markup. If no word occurs, the escaped
    sentence is returned unchanged.
    """
    import re

    for word in answer.split():
        # Split in a single case-insensitive pass and escape only the text
        # between matches. The inserted markup is therefore never scanned
        # again, which chained str.replace() calls could not guarantee.
        pieces = re.split(re.escape(word), sentence, flags=re.IGNORECASE)
        if len(pieces) > 1:
            return _GAP_MARKUP.join(_escape_text(piece) for piece in pieces)
    return _escape_text(sentence)


# =============================================================================
# Worksheet PDF Generation
# =============================================================================

async def generate_worksheet_pdf(html: str) -> bytes:
    """Render *html* to PDF bytes using WeasyPrint.

    Args:
        html: Complete HTML document to render.

    Returns:
        The PDF as bytes. If WeasyPrint cannot be imported, the UTF-8 encoded
        HTML is returned instead as a best-effort fallback, so callers must
        not assume the result is always a PDF.

    Raises:
        Exception: Any error raised by WeasyPrint while rendering is logged
            (with traceback) and re-raised unchanged.
    """
    try:
        from weasyprint import HTML
    except (ImportError, OSError):
        # OSError is treated like a missing package, matching the optional
        # import guard at the top of this module.
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')

    # NOTE(review): write_pdf() is a synchronous call made directly inside
    # this coroutine; nothing here is awaited.
    try:
        return HTML(string=html).write_pdf()
    except Exception as e:
        logger.exception("PDF generation failed: %s", e)
        raise


# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================

def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return how many pages the PDF in *pdf_data* has.

    Any failure (PyMuPDF missing, unreadable data, ...) is logged and
    reported as a page count of 0 rather than raised.
    """
    try:
        import fitz

        document = fitz.open(stream=pdf_data, filetype="pdf")
        total_pages = document.page_count
        document.close()
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
    return total_pages


async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number; must satisfy
            ``0 <= page_number < page count``
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: status 500 if PyMuPDF is not installed; status 400 if
            the PDF cannot be opened, has no pages, the page number is out of
            range, or rendering fails.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(
            status_code=500,
            detail="PDF conversion not available - PyMuPDF not installed",
        ) from exc

    try:
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")

            # Reject negative numbers explicitly: document indexing would
            # otherwise resolve them to a page counted from the end.
            if not 0 <= page_number < page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {page_count} pages)")

            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            pix = pdf_document[page_number].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")
        finally:
            # Close the document on the error paths as well.
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e

    logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
    return png_data


async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.
            Page numbers outside ``0 <= n < page count`` are skipped with a
            warning, so the result may be shorter than ``pages``.

    Returns:
        One PNG (as bytes) per converted page, in the order requested.

    Raises:
        HTTPException: status 500 if PyMuPDF is not installed; status 400 if
            the PDF cannot be opened, has no pages, or rendering fails.
    """
    try:
        import fitz
    except ImportError as exc:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from exc

    try:
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(page_count))

            images = []
            mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for every page

            for page_num in pages:
                # Negative numbers are skipped like any other out-of-range
                # value instead of being resolved to a page from the end.
                if 0 <= page_num < page_count:
                    pix = pdf_document[page_num].get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
                else:
                    logger.warning(
                        "Skipping page %s: PDF has %s pages", page_num, page_count
                    )
        finally:
            # Close the document on the error paths as well.
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e

    logger.info(f"Converted {len(images)} PDF pages to images")
    return images