# breakpilot-lehrer/klausur-service/backend/nru_worksheet_generator.py

"""
NRU Worksheet Generator - Generate vocabulary worksheets in NRU format.

Format:
- Page 1 (Vokabeln): 3-column table
  - Column 1: English vocabulary
  - Column 2: Empty (child writes German translation)
  - Column 3: Empty (child writes corrected English after parent review)

- Page 2 (Lernsätze): Full-width table
  - Row 1: German sentence (pre-filled)
  - Row 2-3: Empty lines (child writes English translation)

Per scanned page, we generate 2 worksheet pages.
"""

import io
import logging
from dataclasses import dataclass
from html import escape
from typing import Dict, List, Optional, Tuple

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class VocabEntry:
    """A single word or short phrase shown in the 3-column vocabulary table."""

    # English term; printed in the first table column.
    english: str
    # German translation; only rendered when solutions are shown.
    german: str
    # Page number this entry belongs to; used to group entries per page.
    source_page: int = 1


@dataclass
class SentenceEntry:
    """A full sentence shown on the sentence-practice page."""

    # German sentence; pre-filled as the header row of each sentence table.
    german: str
    # English translation; only rendered on the solution sheet.
    english: str
    # Page number this entry belongs to; used to group entries per page.
    source_page: int = 1


def separate_vocab_and_sentences(entries: List[Dict]) -> Tuple[List[VocabEntry], List[SentenceEntry]]:
    """
    Separate vocabulary entries into single words/phrases and full sentences.

    Each entry is a dict read via the keys "english", "german" and
    "source_page". Entries whose English or German text is missing, ``None``
    or blank after stripping are skipped. A missing or ``None`` source page
    falls back to 1.

    An entry counts as a sentence when its English text:
    - ends with punctuation (. ! ?), or
    - is longer than 50 characters, or
    - has more than 5 words and a capitalised word after the first one.

    Args:
        entries: Raw vocabulary entries.

    Returns:
        Tuple of (vocab_list, sentence_list), each preserving input order.
    """
    max_vocab_chars = 50
    max_vocab_words = 5

    vocab_list = []
    sentence_list = []

    for entry in entries:
        # `or ""` also covers keys that are present but explicitly None,
        # which would otherwise crash on .strip().
        english = (entry.get("english") or "").strip()
        german = (entry.get("german") or "").strip()

        if not english or not german:
            continue

        # A None page would never match a page filter and cannot be sorted
        # together with integer page numbers, so normalise it to the default.
        source_page = entry.get("source_page")
        if source_page is None:
            source_page = 1

        # Detect if this is a sentence
        words = english.split()
        is_sentence = (
            english.endswith(('.', '!', '?'))
            or len(english) > max_vocab_chars
            or (
                len(words) > max_vocab_words
                and any(word[0].isupper() for word in words[1:])
            )
        )

        if is_sentence:
            sentence_list.append(SentenceEntry(
                german=german,
                english=english,
                source_page=source_page
            ))
        else:
            vocab_list.append(VocabEntry(
                english=english,
                german=german,
                source_page=source_page
            ))

    return vocab_list, sentence_list


def generate_nru_html(
    vocab_list: List[VocabEntry],
    sentence_list: List[SentenceEntry],
    page_number: int,
    title: str = "Vokabeltest",
    show_solutions: bool = False,
    line_height_px: int = 28
) -> str:
    """
    Generate a standalone HTML document in NRU format for one source page.

    Only entries whose ``source_page`` equals ``page_number`` are rendered.
    The document contains up to two worksheet pages:
    - Vocabulary table (3 columns), emitted only if the page has vocabulary
    - Sentence practice (full width), emitted only if the page has sentences

    The title and all entry texts are HTML-escaped before being inserted, so
    characters such as ``&`` or ``<`` appear literally on the worksheet
    instead of corrupting the markup.

    Args:
        vocab_list: Vocabulary entries (all pages; filtered here).
        sentence_list: Sentence entries (all pages; filtered here).
        page_number: Source page to render.
        title: Worksheet title used in the page headings.
        show_solutions: When True, German vocabulary answers and English
            sentence answers are filled in.
        line_height_px: Height of vocabulary rows in pixels; sentence
            writing lines are 4px taller.

    Returns:
        Complete HTML document as a string.
    """

    def esc(value) -> str:
        # Values are only ever placed in element content (never attributes),
        # so quotes can stay as-is; str() keeps non-string values working.
        return escape(str(value), quote=False)

    # Filter by page
    page_vocab = [v for v in vocab_list if v.source_page == page_number]
    page_sentences = [s for s in sentence_list if s.source_page == page_number]
    safe_title = esc(title)

    # Fragments are collected in a list and joined once at the end.
    parts = [f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {{
            size: A4;
            margin: 1.5cm 2cm;
        }}
        * {{
            box-sizing: border-box;
        }}
        body {{
            font-family: Arial, Helvetica, sans-serif;
            font-size: 12pt;
            line-height: 1.4;
            margin: 0;
            padding: 0;
        }}
        .page {{
            page-break-after: always;
            min-height: 100%;
        }}
        .page:last-child {{
            page-break-after: avoid;
        }}
        h1 {{
            font-size: 16pt;
            margin: 0 0 8px 0;
            text-align: center;
        }}
        .header {{
            margin-bottom: 15px;
        }}
        .name-line {{
            font-size: 11pt;
            margin-bottom: 10px;
        }}

        /* Vocabulary Table - 3 columns */
        .vocab-table {{
            width: 100%;
            border-collapse: collapse;
            table-layout: fixed;
        }}
        .vocab-table th {{
            background: #f0f0f0;
            border: 1px solid #333;
            padding: 6px 8px;
            font-weight: bold;
            font-size: 11pt;
            text-align: left;
        }}
        .vocab-table td {{
            border: 1px solid #333;
            padding: 4px 8px;
            height: {line_height_px}px;
            vertical-align: middle;
        }}
        .vocab-table .col-english {{ width: 35%; }}
        .vocab-table .col-german {{ width: 35%; }}
        .vocab-table .col-correction {{ width: 30%; }}
        .vocab-answer {{
            color: #0066cc;
            font-style: italic;
        }}

        /* Sentence Table - full width */
        .sentence-table {{
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 15px;
        }}
        .sentence-table td {{
            border: 1px solid #333;
            padding: 6px 10px;
        }}
        .sentence-header {{
            background: #f5f5f5;
            font-weight: normal;
            min-height: 30px;
        }}
        .sentence-line {{
            height: {line_height_px + 4}px;
        }}
        .sentence-answer {{
            color: #0066cc;
            font-style: italic;
            font-size: 11pt;
        }}

        .page-info {{
            font-size: 9pt;
            color: #666;
            text-align: right;
            margin-top: 10px;
        }}
    </style>
</head>
<body>
"""]

    # ========== PAGE 1: VOCABULARY TABLE ==========
    if page_vocab:
        parts.append(f"""
    <div class="page">
        <div class="header">
            <h1>{safe_title} - Vokabeln (Seite {page_number})</h1>
            <div class="name-line">Name: _________________________ Datum: _____________</div>
        </div>

        <table class="vocab-table">
            <thead>
                <tr>
                    <th class="col-english">Englisch</th>
                    <th class="col-german">Deutsch</th>
                    <th class="col-correction">Korrektur</th>
                </tr>
            </thead>
            <tbody>
""")
        for v in page_vocab:
            # The German column stays empty on the worksheet and carries the
            # answer on the solution sheet.
            if show_solutions:
                german_cell = f'<td class="vocab-answer">{esc(v.german)}</td>'
            else:
                german_cell = "<td></td>"
            parts.append(f"""
                <tr>
                    <td>{esc(v.english)}</td>
                    {german_cell}
                    <td></td>
                </tr>
""")

        parts.append("""
            </tbody>
        </table>
        <div class="page-info">Vokabeln aus Unit</div>
    </div>
""")

    # ========== PAGE 2: SENTENCE PRACTICE ==========
    if page_sentences:
        parts.append(f"""
    <div class="page">
        <div class="header">
            <h1>{safe_title} - Lernsaetze (Seite {page_number})</h1>
            <div class="name-line">Name: _________________________ Datum: _____________</div>
        </div>
""")
        for s in page_sentences:
            parts.append(f"""
        <table class="sentence-table">
            <tr>
                <td class="sentence-header">{esc(s.german)}</td>
            </tr>
""")
            # First writing line holds the English answer on the solution
            # sheet; the second line is always left blank.
            if show_solutions:
                first_line = f'<td class="sentence-line sentence-answer">{esc(s.english)}</td>'
            else:
                first_line = '<td class="sentence-line"></td>'
            parts.append(f"""
            <tr>
                {first_line}
            </tr>
            <tr>
                <td class="sentence-line"></td>
            </tr>
""")
            parts.append("""
        </table>
""")

        parts.append("""
        <div class="page-info">Lernsaetze aus Unit</div>
    </div>
""")

    parts.append("""
</body>
</html>
""")
    return "".join(parts)


def generate_nru_worksheet_html(
    entries: List[Dict],
    title: str = "Vokabeltest",
    show_solutions: bool = False,
    specific_pages: Optional[List[int]] = None
) -> str:
    """
    Generate complete NRU worksheet HTML for all pages.

    Entries are split into vocabulary and sentences, grouped by source page,
    and rendered in ascending page order. Each source page contributes a
    vocabulary page and/or a sentence page, depending on which kinds of
    entries it has.

    The title and all entry texts are HTML-escaped before being inserted, so
    characters such as ``&`` or ``<`` appear literally on the worksheet
    instead of corrupting the markup.

    Args:
        entries: List of vocabulary entries with source_page
        title: Worksheet title
        show_solutions: Whether to show answers
        specific_pages: List of specific page numbers to include (1-indexed).
            ``None`` or an empty list means "all pages".

    Returns:
        Complete HTML document
    """

    def esc(value) -> str:
        # Values are only ever placed in element content (never attributes),
        # so quotes can stay as-is; str() keeps non-string values working.
        return escape(str(value), quote=False)

    # Separate into vocab and sentences
    vocab_list, sentence_list = separate_vocab_and_sentences(entries)

    # Group once by source page instead of re-filtering the lists per page
    vocab_by_page = {}
    for v in vocab_list:
        vocab_by_page.setdefault(v.source_page, []).append(v)
    sentences_by_page = {}
    for s in sentence_list:
        sentences_by_page.setdefault(s.source_page, []).append(s)

    # Get unique page numbers
    all_pages = set(vocab_by_page) | set(sentences_by_page)

    # Filter to specific pages if requested
    if specific_pages:
        all_pages &= set(specific_pages)

    pages_sorted = sorted(all_pages)

    logger.info("Generating NRU worksheet for pages %s", pages_sorted)
    logger.info("Total vocab: %s, Total sentences: %s", len(vocab_list), len(sentence_list))

    safe_title = esc(title)

    # Fragments are collected in a list and joined once at the end.
    parts = ["""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {
            size: A4;
            margin: 1.5cm 2cm;
        }
        * {
            box-sizing: border-box;
        }
        body {
            font-family: Arial, Helvetica, sans-serif;
            font-size: 12pt;
            line-height: 1.4;
            margin: 0;
            padding: 0;
        }
        .page {
            page-break-after: always;
            min-height: 100%;
        }
        .page:last-child {
            page-break-after: avoid;
        }
        h1 {
            font-size: 16pt;
            margin: 0 0 8px 0;
            text-align: center;
        }
        .header {
            margin-bottom: 15px;
        }
        .name-line {
            font-size: 11pt;
            margin-bottom: 10px;
        }

        /* Vocabulary Table - 3 columns */
        .vocab-table {
            width: 100%;
            border-collapse: collapse;
            table-layout: fixed;
        }
        .vocab-table th {
            background: #f0f0f0;
            border: 1px solid #333;
            padding: 6px 8px;
            font-weight: bold;
            font-size: 11pt;
            text-align: left;
        }
        .vocab-table td {
            border: 1px solid #333;
            padding: 4px 8px;
            height: 28px;
            vertical-align: middle;
        }
        .vocab-table .col-english { width: 35%; }
        .vocab-table .col-german { width: 35%; }
        .vocab-table .col-correction { width: 30%; }
        .vocab-answer {
            color: #0066cc;
            font-style: italic;
        }

        /* Sentence Table - full width */
        .sentence-table {
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 15px;
        }
        .sentence-table td {
            border: 1px solid #333;
            padding: 6px 10px;
        }
        .sentence-header {
            background: #f5f5f5;
            font-weight: normal;
            min-height: 30px;
        }
        .sentence-line {
            height: 32px;
        }
        .sentence-answer {
            color: #0066cc;
            font-style: italic;
            font-size: 11pt;
        }

        .page-info {
            font-size: 9pt;
            color: #666;
            text-align: right;
            margin-top: 10px;
        }
    </style>
</head>
<body>
"""]

    for page_num in pages_sorted:
        page_vocab = vocab_by_page.get(page_num, [])
        page_sentences = sentences_by_page.get(page_num, [])

        # PAGE 1: VOCABULARY TABLE
        if page_vocab:
            parts.append(f"""
    <div class="page">
        <div class="header">
            <h1>{safe_title} - Vokabeln (Seite {page_num})</h1>
            <div class="name-line">Name: _________________________ Datum: _____________</div>
        </div>

        <table class="vocab-table">
            <thead>
                <tr>
                    <th class="col-english">Englisch</th>
                    <th class="col-german">Deutsch</th>
                    <th class="col-correction">Korrektur</th>
                </tr>
            </thead>
            <tbody>
""")
            for v in page_vocab:
                # The German column stays empty on the worksheet and carries
                # the answer on the solution sheet.
                if show_solutions:
                    german_cell = f'<td class="vocab-answer">{esc(v.german)}</td>'
                else:
                    german_cell = "<td></td>"
                parts.append(f"""
                <tr>
                    <td>{esc(v.english)}</td>
                    {german_cell}
                    <td></td>
                </tr>
""")

            parts.append(f"""
            </tbody>
        </table>
        <div class="page-info">{safe_title} - Seite {page_num}</div>
    </div>
""")

        # PAGE 2: SENTENCE PRACTICE
        if page_sentences:
            parts.append(f"""
    <div class="page">
        <div class="header">
            <h1>{safe_title} - Lernsaetze (Seite {page_num})</h1>
            <div class="name-line">Name: _________________________ Datum: _____________</div>
        </div>
""")
            for s in page_sentences:
                parts.append(f"""
        <table class="sentence-table">
            <tr>
                <td class="sentence-header">{esc(s.german)}</td>
            </tr>
""")
                # First writing line holds the English answer on the solution
                # sheet; the second line is always left blank.
                if show_solutions:
                    first_line = f'<td class="sentence-line sentence-answer">{esc(s.english)}</td>'
                else:
                    first_line = '<td class="sentence-line"></td>'
                parts.append(f"""
            <tr>
                {first_line}
            </tr>
            <tr>
                <td class="sentence-line"></td>
            </tr>
""")
                parts.append("""
        </table>
""")

            parts.append(f"""
        <div class="page-info">{safe_title} - Seite {page_num}</div>
    </div>
""")

    parts.append("""
</body>
</html>
""")
    return "".join(parts)


async def generate_nru_pdf(entries: List[Dict], title: str = "Vokabeltest", include_solutions: bool = True) -> Tuple[bytes, Optional[bytes]]:
    """
    Generate NRU worksheet PDFs.

    Args:
        entries: List of vocabulary entries with source_page
        title: Worksheet title
        include_solutions: Whether to also render the solution sheet

    Returns:
        Tuple of (worksheet_pdf_bytes, solution_pdf_bytes). The second
        element is None when include_solutions is False.
    """
    # Imported here rather than at module level, so importing this module
    # does not itself import weasyprint.
    from weasyprint import HTML

    # NOTE(review): write_pdf() is a synchronous call inside a coroutine;
    # if rendering is slow it will hold up the event loop — confirm whether
    # callers need this offloaded to a worker thread.
    def render(show_solutions: bool) -> bytes:
        markup = generate_nru_worksheet_html(entries, title, show_solutions=show_solutions)
        return HTML(string=markup).write_pdf()

    worksheet_pdf = render(False)
    solution_pdf = render(True) if include_solutions else None

    return worksheet_pdf, solution_pdf