# breakpilot-lehrer/klausur-service/backend/cv_syllable_detect.py

"""
CV-based syllable divider detection and insertion for dictionary pages.

Two-step approach:
  1. CV: morphological vertical line detection checks if a word_box image
     contains thin, isolated pipe-like vertical lines (syllable dividers).
  2. pyphen: inserts syllable breaks at linguistically correct positions
     for words where CV confirmed the presence of dividers.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""

import logging
import re
from typing import Any, Dict, List

import cv2
import numpy as np

logger = logging.getLogger(__name__)


def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
    """CV check: does this word_box image show thin vertical pipe dividers?

    Uses morphological opening with a tall thin kernel to isolate vertical
    structures, then filters for thin (≤4px), isolated contours that are
    NOT at the word edges (those would be l, I, 1 etc.).
    """
    x = wb.get("left", 0)
    y = wb.get("top", 0)
    w = wb.get("width", 0)
    h = wb.get("height", 0)
    if w < 30 or h < 12:
        return False
    ih, iw = img_gray.shape[:2]
    y1, y2 = max(0, y), min(ih, y + h)
    x1, x2 = max(0, x), min(iw, x + w)
    roi = img_gray[y1:y2, x1:x2]
    if roi.size == 0:
        return False
    rh, rw = roi.shape

    # Binarize (ink = white on black background)
    _, binary = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Morphological opening: keep only tall vertical structures (≥55% height)
    kern_h = max(int(rh * 0.55), 8)
    kernel = np.ones((kern_h, 1), np.uint8)
    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Find surviving contours
    contours, _ = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    margin = max(int(rw * 0.08), 3)
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        if cw > 4:
            continue  # too wide for a pipe
        if cx < margin or cx + cw > rw - margin:
            continue  # at word edge — likely l, I, 1
        # Check isolation: adjacent columns should be mostly empty (ink-free)
        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
        left_ink = np.mean(left_zone) if left_zone.size else 255
        right_ink = np.mean(right_zone) if right_zone.size else 255
        if left_ink < 80 and right_ink < 80:
            return True  # isolated thin vertical line = pipe divider
    return False


# Characters that indicate a phonetic transcription: square brackets, the IPA
# stress marks (ˈ ˌ), the length mark (ː) and a set of IPA letters. Cell text
# containing any of them is skipped so transcriptions are never hyphenated.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')


# Splits cell text on whitespace runs or comma/semicolon runs. The capturing
# group keeps the separators as tokens so the text can be re-joined unchanged.
_TOKEN_SPLIT_RE = re.compile(r'(\s+|[,;]+\s*)')
# A token consisting only of whitespace / commas / semicolons.
_SEPARATOR_RE = re.compile(r'^[\s,;]+$')
# Punctuation ignored when measuring the length of a word.
_WORD_PUNCT_RE = re.compile(r'[().\-]')
# At least one Latin or German letter must be present to hyphenate a token.
_LETTER_RE = re.compile(r'[a-zA-ZäöüÄÖÜß]')


def _hyphenate_cell_text(text: str, hyph_de: Any, hyph_en: Any) -> str:
    """Return *text* with ``|`` syllable breaks inserted into eligible words.

    A token is eligible when, after removing ``( ) . -``, it has at least
    four characters and contains at least one letter. German hyphenation is
    tried first; English is used only if German yields no break. Separators
    and ineligible tokens are copied through untouched, so the result equals
    *text* when nothing could be hyphenated.
    """
    pieces = []
    for tok in _TOKEN_SPLIT_RE.split(text):
        # Whitespace/punctuation separators pass through unchanged.
        if _SEPARATOR_RE.match(tok):
            pieces.append(tok)
            continue
        clean = _WORD_PUNCT_RE.sub('', tok)
        if len(clean) < 4 or not _LETTER_RE.search(clean):
            pieces.append(tok)
            continue
        # Try DE first, then EN
        hyph = hyph_de.inserted(tok, hyphen='|')
        if '|' not in hyph:
            hyph = hyph_en.inserted(tok, hyphen='|')
        pieces.append(hyph if '|' in hyph and hyph != tok else tok)
    return ''.join(pieces)


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
) -> int:
    """Insert pipe syllable dividers into dictionary cells where CV confirms them.

    For each cell whose ``col_type`` starts with ``column_``:
      1. Skip it if the text is empty, already contains ``|`` or looks like a
         phonetic transcription.
      2. Check if ANY word_box has CV-detected pipe lines.
      3. If yes, apply pyphen to EACH word (≥4 chars) in the cell, trying DE
         hyphenation first, then EN.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list.
            Cell ``text`` is rewritten in place when dividers are inserted.
            ``None`` (for the list itself or for ``cells``, ``col_type``,
            ``text``, ``word_boxes``) is treated as empty.
        img_bgr: Page image. A 2-D or single-channel array is used as
            grayscale directly; anything else is converted with
            ``cv2.COLOR_BGR2GRAY``.
        session_id: Identifier used only in log messages.

    Returns:
        The number of cells modified. 0 if pyphen is not installed or no
        usable image was supplied.
    """
    try:
        import pyphen
    except ImportError:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    # Without pixels the CV gate can never confirm a divider; bail out
    # instead of letting cv2.cvtColor raise on an empty/None image.
    if img_bgr is None or getattr(img_bgr, "size", 0) == 0:
        logger.warning(
            "build-grid session %s: no page image — skipping syllable insertion",
            session_id,
        )
        return 0

    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')

    # COLOR_BGR2GRAY rejects images that are already single-channel.
    if img_bgr.ndim == 2:
        img_gray = img_bgr
    elif img_bgr.ndim == 3 and img_bgr.shape[2] == 1:
        img_gray = img_bgr[:, :, 0]
    else:
        img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    insertions = 0
    for zone in zones_data or []:
        for cell in zone.get("cells") or []:
            col_type = cell.get("col_type") or ""
            if not isinstance(col_type, str) or not col_type.startswith("column_"):
                continue
            text = cell.get("text") or ""
            if not isinstance(text, str) or not text or "|" in text:
                continue
            if _IPA_RE.search(text):
                continue

            # CV gate: check if ANY word_box in this cell has pipe lines
            word_boxes = cell.get("word_boxes") or []
            if not any(_word_has_pipe_lines(img_gray, wb) for wb in word_boxes):
                continue

            # The input text has no '|', so any difference means at least
            # one word received a divider.
            new_text = _hyphenate_cell_text(text, hyph_de, hyph_en)
            if new_text != text:
                cell["text"] = new_text
                insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: inserted syllable dividers in %d cells "
            "(CV-validated)",
            session_id, insertions,
        )
    return insertions