refactor: extract grid helpers + generic CV-gated syllable insertion

1. Extracted 1367 lines of helper functions (filters, detectors, zone grid building) from grid_editor_api.py (3051→1620 lines) into grid_editor_helpers.py. 2. Created cv_syllable_detect.py with generic CV+pyphen logic: it checks EVERY word_box for vertical pipe lines (not just the first word); it has no article-column dependency, so it works with any dictionary layout; and CV morphological detection gates pyphen insertion. 3. Grid editor scroll: calc(100vh-200px) for reliable scrolling. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 14:39:33 +01:00
parent d9b2aa82e9
commit 12b4c61bac
3 changed files with 1572 additions and 1459 deletions
@@ -0,0 +1,155 @@
+"""
+CV-based syllable divider detection and insertion for dictionary pages.
+
+Two-step approach:
+  1. CV: morphological vertical line detection checks if a word_box image
+     contains thin, isolated pipe-like vertical lines (syllable dividers).
+  2. pyphen: inserts syllable breaks at linguistically correct positions
+     for words where CV confirmed the presence of dividers.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
+    """Report whether a word box shows a thin, isolated vertical pipe divider.
+
+    The word box is cropped from ``img_gray``, binarized with an inverted
+    Otsu threshold (ink becomes 255) and opened with a 1-px-wide kernel
+    spanning at least 55% of the crop height, so only tall vertical strokes
+    survive. A surviving stroke counts as a pipe when it is at most 4 px
+    wide, lies outside the left/right edge margins, and the 3-px strips on
+    both sides of it are mostly free of ink.
+
+    Args:
+        img_gray: Page image. The crop is unpacked as a 2-D array and fed
+            to an Otsu threshold, so this is presumably an 8-bit
+            single-channel image — confirm against the caller.
+        wb: Word box with ``left``, ``top``, ``width`` and ``height`` in
+            pixels. Missing or ``None`` entries count as 0; float values
+            are truncated to int.
+
+    Returns:
+        True as soon as one stroke passes all filters; False otherwise,
+        including when the part of the box that lies inside the image is
+        narrower than 30 px or shorter than 12 px.
+    """
+    # Slice bounds must be integers, and a key may be present but None.
+    x, y, w, h = (
+        int(wb.get(key) or 0) for key in ("left", "top", "width", "height")
+    )
+    ih, iw = img_gray.shape[:2]
+    y1, x1 = max(0, y), max(0, x)
+    # Clamp the end to the start: a box lying wholly left of / above the
+    # image would otherwise yield a negative end index, which NumPy reads
+    # as "count from the far edge" and crops an unrelated region.
+    y2 = max(y1, min(ih, y + h))
+    x2 = max(x1, min(iw, x + w))
+    roi = img_gray[y1:y2, x1:x2]
+    rh, rw = roi.shape
+    # Size gate on the clipped crop (what is actually analysed), which also
+    # covers empty crops and boxes that were too small to begin with.
+    if rw < 30 or rh < 12:
+        return False
+
+    # Binarize (ink = white on black background)
+    _, binary = cv2.threshold(
+        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+    )
+
+    # Morphological opening: keep only tall vertical structures (≥55% height)
+    kern_h = max(int(rh * 0.55), 8)
+    kernel = np.ones((kern_h, 1), np.uint8)
+    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
+
+    # Find surviving contours
+    contours, _ = cv2.findContours(
+        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+    )
+
+    # Strokes within 8% of the crop width (min. 3 px) of either side are
+    # ignored; the margin also guarantees the 3-px side strips below exist.
+    margin = max(int(rw * 0.08), 3)
+    for cnt in contours:
+        cx, cy, cw, ch = cv2.boundingRect(cnt)
+        if cw > 4:
+            continue  # too wide for a pipe
+        if cx < margin or cx + cw > rw - margin:
+            continue  # at word edge — likely l, I, 1
+        # Isolation: mean intensity of the neighbouring strips in the
+        # binarized crop (0 = no ink, 255 = all ink) must stay below 80.
+        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
+        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
+        left_ink = np.mean(left_zone) if left_zone.size else 255
+        right_ink = np.mean(right_zone) if right_zone.size else 255
+        if left_ink < 80 and right_ink < 80:
+            return True  # isolated thin vertical line = pipe divider
+    return False
+
+
+# Matches a single square bracket or one of the listed IPA symbols (stress
+# marks, length mark, phonetic letters). insert_syllable_dividers() skips any
+# cell whose text contains such a character, so phonetic transcriptions are
+# never hyphenated.
+_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+
+
+# Splits cell text into words and separators; the capture group keeps the
+# separators, so joining the pieces reproduces the text exactly.
+_TOKEN_SPLIT_RE = re.compile(r'(\s+|[,;]+\s*)')
+# A token consisting only of whitespace, commas and semicolons.
+_SEPARATOR_RE = re.compile(r'^[\s,;]+$')
+# Punctuation that is ignored when measuring a word's length.
+_WORD_NOISE_RE = re.compile(r'[().\-]')
+# At least one Latin or German letter.
+_LETTER_RE = re.compile(r'[a-zA-ZäöüÄÖÜß]')
+
+
+def _hyphenate_token(tok: str, hyphenators: List[Any]) -> str:
+    """Return ``tok`` with '|' syllable breaks, or ``tok`` itself if none apply."""
+    if _SEPARATOR_RE.match(tok):
+        return tok
+    # Only words with ≥ 4 chars (punctuation stripped) and at least one letter
+    clean = _WORD_NOISE_RE.sub('', tok)
+    if len(clean) < 4 or not _LETTER_RE.search(clean):
+        return tok
+    # First hyphenator that yields a break wins (caller passes DE, then EN).
+    for hyphenator in hyphenators:
+        candidate = hyphenator.inserted(tok, hyphen='|')
+        if '|' in candidate:
+            return candidate
+    return tok
+
+
+def _hyphenate_text(text: str, hyphenators: List[Any]) -> str:
+    """Hyphenate every word of ``text``, leaving separators untouched."""
+    return ''.join(
+        _hyphenate_token(tok, hyphenators)
+        for tok in _TOKEN_SPLIT_RE.split(text)
+    )
+
+
+def insert_syllable_dividers(
+    zones_data: List[Dict],
+    img_bgr: np.ndarray,
+    session_id: str,
+) -> int:
+    """Insert pipe syllable dividers into dictionary cells where CV confirms them.
+
+    A cell is processed only when all of the following hold:
+      * its ``col_type`` starts with ``"column_"``;
+      * its ``text`` is non-empty and contains no ``|`` yet;
+      * its ``text`` contains no bracket / IPA character (``_IPA_RE``);
+      * at least one of its ``word_boxes`` passes ``_word_has_pipe_lines``.
+
+    For such a cell pyphen is applied to EACH word (≥4 chars), trying DE
+    hyphenation first and EN second, with ``|`` as the break marker.
+
+    Args:
+        zones_data: Zone dicts, each with an optional ``cells`` list of
+            cell dicts. Matching cells get their ``text`` rewritten in
+            place.
+        img_bgr: Page image. A 2-D array is used as the grayscale image
+            directly; anything else is converted with ``COLOR_BGR2GRAY``.
+        session_id: Used only in the summary log message.
+
+    Returns:
+        The number of cells modified; 0 if pyphen is not installed.
+    """
+    try:
+        import pyphen
+    except ImportError:
+        logger.warning("pyphen not installed — skipping syllable insertion")
+        return 0
+
+    hyphenators = [pyphen.Pyphen(lang='de_DE'), pyphen.Pyphen(lang='en_US')]
+    # cvtColor(BGR2GRAY) rejects single-channel input, so pass an image
+    # that is already grayscale straight through.
+    if img_bgr.ndim == 2:
+        img_gray = img_bgr
+    else:
+        img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+    insertions = 0
+    for z in zones_data:
+        # `or` guards against keys that are present but None, matching the
+        # handling of "word_boxes" below.
+        for cell in z.get("cells") or []:
+            ct = cell.get("col_type") or ""
+            if not ct.startswith("column_"):
+                continue
+            text = cell.get("text", "")
+            if not text or "|" in text:
+                continue
+            if _IPA_RE.search(text):
+                continue
+
+            # CV gate: check if ANY word_box in this cell has pipe lines
+            wbs = cell.get("word_boxes") or []
+            if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
+                continue
+
+            # The input text has no '|', so any difference means at least
+            # one word received a syllable break.
+            new_text = _hyphenate_text(text, hyphenators)
+            if new_text != text:
+                cell["text"] = new_text
+                insertions += 1
+
+    if insertions:
+        logger.info(
+            "build-grid session %s: inserted syllable dividers in %d cells "
+            "(CV-validated)",
+            session_id, insertions,
+        )
+    return insertions