# breakpilot-lehrer/klausur-service/backend/cv_syllable_detect.py

"""
Syllable divider insertion for dictionary pages.

For confirmed dictionary pages (is_dictionary=True), processes all content
column cells:
  1. Strips existing | dividers for clean normalization
  2. Merges pipe-gap spaces (where OCR split a word at a divider position)
  3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
  4. Only modifies words that pyphen recognizes — garbled OCR stays as-is

No CV gate needed — the dictionary detection confidence is sufficient.
pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

# Matches a literal square bracket or one of a set of IPA/phonetic symbols
# (stress marks, the length mark and non-ASCII phonetic letters).  Callers in
# this module first remove complete "[...]" spans from a cell's text and then
# skip the cell if whatever remains still matches this pattern.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')

# German function words that the gap-merging helpers must never glue onto a
# neighbouring token.  On dictionary pages these show up as standalone words
# between headwords and definitions, so a short token that equals one of them
# is a real word rather than a split-off syllable.  Lookups are done on the
# lower-cased alphabetic core of a token.
_STOP_WORDS = frozenset({
    # articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # pronouns
    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
    # prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
    'zwischen', 'ohne', 'gegen',
    # conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # negative articles
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
})

# Module-level cache for the pyphen hyphenators (German, English).  Both stay
# None until _get_hyphenators() has imported pyphen and built them.
_hyph_de = None
_hyph_en = None

# Module-level cache for the German SpellChecker instance used by the pipe
# autocorrection helpers.  Stays None until _get_spellchecker() builds it.
_spell_de = None


def _get_hyphenators():
    """Return the cached ``(de_DE, en_US)`` pyphen hyphenators, building them once.

    pyphen is imported lazily so the module stays importable when the
    package is missing.

    Returns:
        A ``(hyph_de, hyph_en)`` tuple of ``pyphen.Pyphen`` objects, or
        ``(None, None)`` when pyphen cannot be imported.  The two values are
        always either both set or both ``None``.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    # Build both hyphenators before publishing either one.  If the second
    # constructor raised after the first global had already been assigned,
    # every later call would take the fast path above and hand callers a
    # half-initialised (hyph_de, None) pair.
    german = pyphen.Pyphen(lang='de_DE')
    english = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = german, english
    return _hyph_de, _hyph_en


def _get_spellchecker():
    """Return the shared German ``SpellChecker``, creating it on first use.

    Returns:
        The cached ``SpellChecker(language='de')`` instance, or ``None`` when
        the ``spellchecker`` package cannot be imported.
    """
    global _spell_de
    if _spell_de is None:
        try:
            from spellchecker import SpellChecker
        except ImportError:
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de


def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Report whether either hyphenator finds a break point in *word*.

    Words shorter than two characters are rejected without consulting the
    hyphenators.  The German hyphenator is asked first; the English one is
    only consulted when German yields no break point.
    """
    if len(word) < 2:
        return False
    for hyphenator in (hyph_de, hyph_en):
        if '|' in hyphenator.inserted(word, hyphen='|'):
            return True
    return False


def _is_real_word(word: str) -> bool:
    """Report whether the German spellchecker contains *word* (lower-cased).

    Returns ``False`` when no spellchecker is available.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker


def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Hyphenate *word* with ``|`` separators, preferring German over English.

    Returns:
        The first hyphenation (German, then English) that contains at least
        one ``|``, or ``None`` when neither hyphenator inserts a break.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None


def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    """Repair a single word that carries OCR pipe artifacts.

    Printed syllable divider lines confuse OCR: the vertical stroke may be
    read as ``|`` or as an extra stroke-like letter (``l``, ``I``, ``1``,
    ``i``, ``t``), and the stray letter is not necessarily next to a
    detected pipe.  Validation uses the spellchecker word list.

    Steps, in order:
        1. Remove every ``|``.  Results shorter than three characters are
           returned as-is without validation.
        2. If the spellchecker knows the pipe-free word, return it.
        3. Delete one stroke-like character at a time (left to right) and
           return the first variant of length >= 3 the spellchecker knows.
        4. Ask the spellchecker for a correction of the lower-cased word;
           if it proposes a different word, return it with the first letter
           upper-cased when the input's first letter was upper-case.

    Returns:
        The repaired word, or ``None`` when no step produced a result.
    """
    bare = word_with_pipes.replace('|', '')
    if len(bare) < 3:
        # Too short to validate (this also covers the empty string).
        return bare

    if _is_real_word(bare):
        return bare

    # Characters OCR tends to produce for a vertical divider stroke.
    stroke_chars = frozenset('lI1it')
    for pos, char in enumerate(bare):
        if char not in stroke_chars:
            continue
        shortened = bare[:pos] + bare[pos + 1:]
        if len(shortened) >= 3 and _is_real_word(shortened):
            return shortened

    checker = _get_spellchecker()
    if checker is None:
        return None

    lowered = bare.lower()
    guess = checker.correction(lowered)
    if not guess or guess == lowered:
        return None
    if bare[0].isupper():
        guess = guess[0].upper() + guess[1:]
    return guess


def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    For every cell whose ``col_type`` starts with ``column_``:

    1. Each word box whose alphabetic core contains ``|`` is passed to
       ``_autocorrect_piped_word``; when that yields a different word, the
       word-box text is replaced (leading/trailing non-letters are kept).
    2. If any word box changed, the cell text is rebuilt by joining all
       word-box texts with single spaces.
    3. Any ``|`` still present in the cell text is removed.  Word boxes
       that could not be corrected keep their original text.

    Missing or ``None`` values for ``cells``, ``col_type``, ``word_boxes``
    and ``text`` are treated as empty.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list.
            Cells are modified in place.
        session_id: Identifier used only in the log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # Pipes are still stripped below; only the word correction is lost.
        logger.warning("spellchecker not available — pipe autocorrect limited")

    # Splits a token into leading non-letters, core, trailing non-letters.
    split_re = re.compile(
        r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
        r'(.*?)'
        r'([^a-zA-ZäöüÄÖÜßẞ]*)$'
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue

            word_boxes = cell.get("word_boxes") or []
            cell_changed = False

            # --- Fix word boxes ---
            for wb in word_boxes:
                # A key that is present but None must behave like "" here,
                # exactly as it does when the cell text is rebuilt below.
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue

                m = split_re.match(wb_text)
                if not m:
                    continue
                lead, core, trail = m.groups()
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # (covers cells without word boxes and words left uncorrected)
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified


def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Collapse single spaces that OCR introduced at a syllable divider.

    Example: "Kaf fee" -> "Kaffee".  Merging is cumulative, so
    "Ka bel jau" becomes "Kabel jau" and then "Kabeljau".

    A pair (left, right) of space-separated tokens is merged only when all
    of the following hold:
    - left consists of letters only (no attached punctuation);
    - both tokens contain at least one letter (right may carry other
      characters, e.g. a trailing comma, which stay attached);
    - neither letter core is a German function word from ``_STOP_WORDS``;
    - the shorter letter core has at most 3 letters;
    - the two letter cores together have at least 4 letters;
    - ``hyph_de`` inserts at least one break into the joined letter cores.

    Returns:
        The text with qualifying gaps removed; unchanged if there is no space.
    """
    fragments = text.split(' ')
    if len(fragments) < 2:
        return text

    non_letters = re.compile(r'[^a-zA-ZäöüÄÖÜßẞ]')
    merged = fragments[:1]
    for fragment in fragments[1:]:
        left = merged[-1]
        left_core = non_letters.sub('', left)
        right_core = non_letters.sub('', fragment)

        candidate = (
            left == left_core
            and left_core and right_core
            and left_core.lower() not in _STOP_WORDS
            and right_core.lower() not in _STOP_WORDS
            and min(len(left_core), len(right_core)) <= 3
            and len(left_core) + len(right_core) >= 4
        )

        # The hyphenator is only consulted for pairs that passed every guard.
        if candidate and '-' in hyph_de.inserted(
            left_core + right_core, hyphen='-'
        ):
            merged[-1] = left + fragment
        else:
            merged.append(fragment)

    return ' '.join(merged)


def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Re-join words that OCR split into space-separated fragments.

    OCR often breaks a word at a syllable boundary, yielding cell text such
    as "zerknit tert" instead of "zerknittert".  Every cell whose
    ``col_type`` starts with ``column_`` and whose text contains a space is
    run through ``_try_merge_word_gaps`` (short-fragment limit 5), unless
    its text, with complete ``[...]`` spans removed, still matches
    ``_IPA_RE``.  Only the cell's ``text`` entry is updated.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list.
            Cells are modified in place.
        session_id: Identifier used only in the log message.

    Returns:
        The number of cells whose text changed; 0 when pyphen is unavailable.
    """
    german, _english = _get_hyphenators()
    if german is None:
        return 0

    changed_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue

            original = cell.get("text", "")
            if not original or " " not in original:
                continue

            # Phonetic transcription outside brackets: leave the cell alone.
            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
                continue

            rejoined = _try_merge_word_gaps(original, german)
            if rejoined == original:
                continue
            cell["text"] = rejoined
            changed_cells += 1

    if changed_cells:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, changed_cells,
        )
    return changed_cells


def _try_merge_word_gaps(text: str, hyph_de) -> str:
    """Join space-separated word fragments, allowing fragments up to 5 letters.

    Works like ``_try_merge_pipe_gaps`` but with a looser length limit: the
    shorter letter core of a pair may have up to 5 letters instead of 3.
    A pair is merged only when the left token is letters-only, both tokens
    contain letters, neither letter core is in ``_STOP_WORDS``, the joined
    cores have at least 4 letters, and ``hyph_de`` inserts at least one
    break into the joined cores.  Merging is cumulative from left to right.
    """
    pieces = text.split(' ')
    if len(pieces) < 2:
        return text

    strip_non_letters = re.compile(r'[^a-zA-ZäöüÄÖÜßẞ]').sub
    out = [pieces[0]]
    for piece in pieces[1:]:
        head = out[-1]
        head_letters = strip_non_letters('', head)
        tail_letters = strip_non_letters('', piece)

        eligible = (
            head == head_letters
            and head_letters and tail_letters
            and head_letters.lower() not in _STOP_WORDS
            and tail_letters.lower() not in _STOP_WORDS
            and min(len(head_letters), len(tail_letters)) <= 5
            and len(head_letters) + len(tail_letters) >= 4
        )

        if eligible:
            joined = hyph_de.inserted(head_letters + tail_letters, hyphen='-')
            if '-' in joined:
                out[-1] = head + piece
                continue

        out.append(piece)

    return ' '.join(out)


def _syllabify_plain(segment: str, hyph_de, hyph_en) -> str:
    """Merge pipe-gap spaces in *segment* and hyphenate each of its words."""
    segment = _try_merge_pipe_gaps(segment, hyph_de)

    # Split on whitespace and comma/semicolon/colon runs, keeping separators.
    pieces = []
    for tok in re.split(r'(\s+|[,;:]+\s*)', segment):
        if not tok or re.match(r'^[\s,;:]+$', tok):
            pieces.append(tok)
            continue

        # Peel leading/trailing non-letters off before the pyphen lookup.
        m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
        if not m:
            pieces.append(tok)
            continue
        lead, word, trail = m.groups()

        if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word):
            pieces.append(tok)
            continue

        hyph = _hyphenate_word(word, hyph_de, hyph_en)
        pieces.append(lead + hyph + trail if hyph else tok)

    return ''.join(pieces)


def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Syllabify all significant words in a text string.

    1. Return the text unchanged if it is empty, or if it still matches
       ``_IPA_RE`` after complete ``[...]`` spans have been removed.
    2. Strip existing ``|`` dividers.
    3. Outside ``[...]`` spans: merge pipe-gap spaces where possible and
       apply pyphen (DE, then EN) to each word of at least 3 characters.
    4. Words pyphen does not split stay as they are (minus any ``|``).

    ``[...]`` spans are copied through verbatim (apart from step 2).  They
    hold phonetic transcription, so they must neither receive syllable
    dividers nor be glued onto a neighbouring fragment by the gap merge.

    Args:
        text: Cell text to normalise.
        hyph_de: German hyphenator exposing ``inserted(word, hyphen=...)``.
        hyph_en: English hyphenator with the same interface.

    Returns:
        The syllabified text, or *text* itself when step 1 applies.
    """
    if not text:
        return text

    bracket_span = r'\[[^\]]*\]'

    # Skip cells that contain IPA characters (or a stray bracket) outside
    # complete bracket spans.  Bracketed transcription such as [bɪltʃøn]
    # must not block syllabification of the surrounding text.
    if _IPA_RE.search(re.sub(bracket_span, '', text)):
        return text

    # Strip existing pipe dividers for clean normalization.
    clean = text.replace('|', '')

    # A capturing split puts the bracket spans at the odd indices.  The gate
    # above guarantees no unmatched bracket exists outside those spans.
    out = []
    for idx, segment in enumerate(re.split('(' + bracket_span + ')', clean)):
        if idx % 2 == 1 or not segment:
            out.append(segment)
        else:
            out.append(_syllabify_plain(segment, hyph_de, hyph_en))

    return ''.join(out)


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert or normalise ``|`` syllable dividers in dictionary cells.

    Content cells (``col_type`` starting with ``column_``) are run through
    ``_syllabify_text``, which strips existing pipes, merges pipe-gap
    spaces and re-syllabifies with pyphen.

    Unless *force* is set, two restrictions apply:
    - Page-level pre-check: if fewer than 1% of all content cells contain
      ``|``, nothing is changed and 0 is returned.  The pre-check counts
      every content cell and does not take *col_filter* into account.
    - Cell-level: only cells whose text already contains ``|`` are touched.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list.
            Cells are modified in place.
        img_bgr: Not used by this function.
        session_id: Identifier used only in log messages.
        force: If True, skip both restrictions above and syllabify every
            content cell that has text.
        col_filter: If set, only process cells whose col_type is in this
            set.  None means process all content columns.

    Returns:
        The number of cells modified; 0 when pyphen is not installed.
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    def content_cells():
        """Yield every cell whose col_type marks it as a content column."""
        for zone in zones_data:
            for candidate in zone.get("cells", []):
                if candidate.get("col_type", "").startswith("column_"):
                    yield candidate

    if not force:
        # Pages with printed syllable dividers show OCR-detected pipes in
        # many cells; pages without them show (almost) none and are skipped
        # to avoid false syllabification.
        all_content = list(content_cells())
        piped = sum(1 for c in all_content if "|" in c.get("text", ""))
        if all_content:
            pipe_ratio = piped / len(all_content)
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion — "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for cell in content_cells():
        if col_filter is not None and cell.get("col_type", "") not in col_filter:
            continue

        before = cell.get("text", "")
        if not before:
            continue

        # Auto mode only normalises cells that already carry a printed
        # divider; it never adds syllable marks to other cells.
        if not force and "|" not in before:
            continue

        after = _syllabify_text(before, hyph_de, hyph_en)
        if after != before:
            cell["text"] = after
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions