# breakpilot-lehrer/klausur-service/backend/cv_cell_grid_merge.py

"""
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).

Extracted from cv_cell_grid.py.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
import re
from typing import Any, Dict, List

from cv_ocr_engines import _RE_ALPHA

logger = logging.getLogger(__name__)

# Regex meant to recognise a line that holds phonetic bracket content only
# (no real word before it): optional leading whitespace, any run of opening
# brackets/parentheses/quotes, any run of characters other than ']', then any
# run of ']' / ')' / whitespace up to the end of the string.
# NOTE(review): every part of the pattern is optional, so it also matches
# text without any bracket (e.g. a plain word). Nothing in the visible part
# of this module references it -- _is_phonetic_only_text() performs its own
# checks. Confirm external users before relying on or removing it.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)


def _is_phonetic_only_text(text: str) -> bool:
    """Tell whether *text* is nothing but a bracketed phonetic transcription.

    Examples:
      ['mani serva]    -> True
      [dance]          -> True
      ["a:mand]        -> True
      almond ['a:mand] -> False (a real word precedes the bracket)
      Mandel           -> False (no bracket at all)

    Returns False for blank text and for text without any square bracket.
    """
    candidate = text.strip()
    has_bracket = '[' in candidate or ']' in candidate
    # Blank text, or text lacking a square bracket, can never be phonetic-only.
    if not (candidate and has_bracket):
        return False

    # Drop each complete [...] group first (non-greedy), then any stray
    # brackets, quotes, parentheses and whitespace that are left over.
    outside = re.sub(r"\[.*?\]", '', candidate)
    outside = re.sub(r"[\[\]'\"()\s]", '', outside)

    # Whatever _RE_ALPHA still finds lies outside the transcription; fewer
    # than two such characters are tolerated (presumably OCR noise).
    leftover = ''.join(_RE_ALPHA.findall(outside))
    return len(leftover) < 2


def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Fold phonetic-only rows into the entry that precedes them.

    On dictionary pages the phonetic transcription sometimes wraps onto its
    own row, e.g.:
      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
      Row 29: EN="['mani serva]"       DE=""

    Row 29 holds only phonetics and no German text, so its EN text (and its
    example, if any) is appended to row 28.

    The entry dicts are updated in place.  With fewer than two entries the
    input list itself is returned; otherwise a new list is returned that
    omits the rows that were folded away.
    """
    if len(entries) < 2:
        return entries

    def _field(row: Dict[str, Any], key: str) -> str:
        # Missing keys and None both normalise to an empty string.
        return (row.get(key) or '').strip()

    result: List[Dict[str, Any]] = []
    for entry in entries:
        en, de, ex = (_field(entry, k) for k in ('english', 'german', 'example'))

        # Only a row after the first, with phonetic-only EN and empty DE,
        # counts as a wrapped transcription.
        is_phonetic_tail = bool(result) and _is_phonetic_only_text(en) and not de
        if not is_phonetic_tail:
            result.append(entry)
            continue

        prev = result[-1]
        prev_en = _field(prev, 'english')
        prev['english'] = f"{prev_en} {en}" if prev_en else en

        # An example on the phonetic row belongs to the previous entry too.
        if ex:
            prev_ex = _field(prev, 'example')
            prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex

        logger.debug(
            f"Merged phonetic row {entry.get('row_index')} "
            f"into previous entry: {prev['english']!r}"
        )

    return result


def _join_wrapped_cell(head: str, tail: str) -> str:
    """Join a cell's text with the wrapped continuation that follows it.

    Both arguments are expected to be stripped already.  The parts are
    concatenated directly when *head* ends with '-' or '(' (a split
    hyphenated word or a just-opened parenthesis) and are separated by a
    single space in every other case, including after a comma.
    """
    if not head or not tail:
        return head or tail
    glue = '' if head.endswith(('-', '(')) else ' '
    return head + glue + tail


def _merge_wrapped_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that are cell-wrap continuations of the previous row.

    In textbook vocabulary tables, columns are often narrow, so the author
    wraps text within a cell. OCR treats each physical line as a separate row.

    Two situations are recognised:

    * Case 1 -- EN is empty but DE and/or the example have text, and the
      previous entry has EN text: DE and example are appended to the
      previous entry.
    * Case 2 -- DE is empty, the previous entry has DE text, and EN looks
      like a continuation (starts with '(' or its first letter is lowercase,
      and it has fewer than 5 words): EN (and the example) are appended to
      the previous entry.

    Example (original textbook has ONE row):
      Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
      Row 3: EN=""                DE="(bei)"                        EX="part in the concert."
      -> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."

    EN and DE fragments are joined with a space, except directly after '-'
    or '(' where they are concatenated.  Examples are always joined with a
    space.

    Args:
        entries: Row dicts read via the keys 'english', 'german', 'example'
            and 'row_index'; missing keys and None count as empty text.

    Returns:
        The input list itself when it has fewer than two entries; otherwise
        a new list without the rows that were merged away.  Entries that
        receive merged text are modified in place.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        # The first row has nothing to merge into.
        if not merged:
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()
        prev_de = (prev.get('german') or '').strip()
        prev_ex = (prev.get('example') or '').strip()

        # Case 1: EN is empty -> continuation of previous row
        if not en and (de or ex) and prev_en:
            if de:
                prev['german'] = _join_wrapped_cell(prev_de, de)
            if ex:
                prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex
            # A row carrying only an example can be merged into an entry
            # that has no 'german' key at all, so read it with .get() --
            # indexing here used to raise KeyError.
            logger.debug(
                "Merged wrapped row %s into previous (empty EN): DE=%r, EX=%r",
                entry.get('row_index'),
                prev.get('german', ''),
                prev.get('example', ''),
            )
            continue

        # Case 2: DE is empty, EN has text that looks like continuation
        if en and not de and prev_de:
            first_alpha = next((c for c in en if c.isalpha()), '')
            looks_like_continuation = en.startswith('(') or first_alpha.islower()

            # Longer lowercase text is not treated as a wrapped cell.
            if looks_like_continuation and len(en.split()) < 5:
                # Same join rule as the DE column: a trailing comma is
                # followed by a space ("big," + "large" -> "big, large").
                prev['english'] = _join_wrapped_cell(prev_en, en)
                if ex:
                    prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex
                logger.debug(
                    "Merged wrapped row %s into previous (empty DE): EN=%r",
                    entry.get('row_index'),
                    prev['english'],
                )
                continue

        merged.append(entry)

    if len(merged) < len(entries):
        logger.info(
            "_merge_wrapped_rows: merged %d continuation rows (%d -> %d)",
            len(entries) - len(merged),
            len(entries),
            len(merged),
        )
    return merged


def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Join vocabulary entries whose EN text wrapped onto the following row.

    A row is folded into the previous entry when all of these hold:
    - it is not the first row, its EN has text and its DE is empty
    - its EN is not phonetic-only (such rows are kept as they are)
    - the first letter of its EN is lowercase (not a new vocab entry)
    - its EN has fewer than 4 words (not an example sentence)
    - the previous entry's EN does not end with '.', '!' or '?'

    Example:
      Row 5: EN="to put up"       DE="aufstellen"
      Row 6: EN="with sth."       DE=""
      -> Merged: EN="to put up with sth."  DE="aufstellen"

    The entry dicts are updated in place.  With fewer than two entries the
    input list itself is returned; otherwise a new list is returned that
    omits the rows that were folded away.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()

        # Rows that cannot be continuations, and phonetic-only rows, pass
        # through untouched.
        candidate = bool(merged) and bool(en) and not de
        if not candidate or _is_phonetic_only_text(en):
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()

        lead_letter = next(filter(str.isalpha, en), '')
        mergeable = (
            lead_letter.islower()
            and len(en.split()) <= 3
            and not prev_en.endswith(('.', '!', '?'))
        )
        if not mergeable:
            merged.append(entry)
            continue

        prev['english'] = f"{prev_en} {en}".strip()

        # Carry an example on the continuation row over as well.
        ex = (entry.get('example') or '').strip()
        if ex:
            prev_ex = (prev.get('example') or '').strip()
            prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex

        logger.debug(
            f"Merged continuation row {entry.get('row_index')} "
            f"into previous entry: {prev['english']!r}"
        )

    return merged