# breakpilot-lehrer/klausur-service/backend/nru_worksheet_models.py

"""
NRU Worksheet Models — data classes and entry separation logic.

Extracted from nru_worksheet_generator.py for modularity.
"""

import logging
from typing import List, Dict, Tuple
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class VocabEntry:
    """A single vocabulary item: an English word or short phrase with its German counterpart."""

    # English side of the pair.
    english: str
    # German side of the pair.
    german: str
    # Defaults to 1 when not supplied; presumably the page of the source
    # document the entry was taken from — TODO confirm against callers.
    source_page: int = 1


@dataclass
class SentenceEntry:
    """A full-sentence item holding the German text and its English counterpart.

    Note that the field order is ``german`` first, then ``english``, so
    positional construction is ``SentenceEntry(german, english)``.
    """

    # German side of the pair.
    german: str
    # English side of the pair; intended for the solution sheet.
    english: str
    # Defaults to 1 when not supplied; presumably the page of the source
    # document the entry was taken from — TODO confirm against callers.
    source_page: int = 1


def separate_vocab_and_sentences(entries: List[Dict]) -> Tuple[List[VocabEntry], List[SentenceEntry]]:
    """
    Split raw entries into short vocabulary items and full sentences.

    Each entry is a dict read through the keys ``"english"``, ``"german"``
    and ``"source_page"``. Entries whose English or German text is missing,
    ``None``, not a string, or blank after stripping are skipped.

    The stripped English text is classified as a sentence when any of the
    following holds:
    - it ends with ``.``, ``!`` or ``?``
    - it is longer than 50 characters
    - it has more than 5 words and at least one word after the first
      starts with an uppercase letter

    Everything else is classified as vocabulary.

    Args:
        entries: Dicts describing one English/German pair each.

    Returns:
        A ``(vocab_list, sentence_list)`` tuple; both lists keep the order
        in which the entries were given.
    """
    vocab_list: List[VocabEntry] = []
    sentence_list: List[SentenceEntry] = []

    for entry in entries:
        # dict.get's default only applies when the key is absent; a key that
        # is present but holds None (or another non-string) would otherwise
        # blow up on .strip(). Treat such values like blank text.
        raw_english = entry.get("english")
        raw_german = entry.get("german")
        english = raw_english.strip() if isinstance(raw_english, str) else ""
        german = raw_german.strip() if isinstance(raw_german, str) else ""

        if not english or not german:
            continue

        # An explicit None page is treated the same as a missing one.
        source_page = entry.get("source_page")
        if source_page is None:
            source_page = 1

        # No-argument split() never yields empty strings, so w[0] is safe.
        words = english.split()
        is_sentence = (
            english.endswith(('.', '!', '?'))
            or len(english) > 50
            or (len(words) > 5 and any(w[0].isupper() for w in words[1:]))
        )

        if is_sentence:
            sentence_list.append(SentenceEntry(
                german=german,
                english=english,
                source_page=source_page,
            ))
        else:
            vocab_list.append(VocabEntry(
                english=english,
                german=german,
                source_page=source_page,
            ))

    return vocab_list, sentence_list