""" CV-based syllable divider detection and insertion for dictionary pages. Two-step approach: 1. CV: morphological vertical line detection checks if a word_box image contains thin, isolated pipe-like vertical lines (syllable dividers). 2. pyphen: inserts syllable breaks at linguistically correct positions for words where CV confirmed the presence of dividers. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List import cv2 import numpy as np logger = logging.getLogger(__name__) def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool: """CV check: does this word_box image show thin vertical pipe dividers? Uses morphological opening with a tall thin kernel to isolate vertical structures, then filters for thin (≤4px), isolated contours that are NOT at the word edges (those would be l, I, 1 etc.). """ x = wb.get("left", 0) y = wb.get("top", 0) w = wb.get("width", 0) h = wb.get("height", 0) if w < 30 or h < 12: return False ih, iw = img_gray.shape[:2] y1, y2 = max(0, y), min(ih, y + h) x1, x2 = max(0, x), min(iw, x + w) roi = img_gray[y1:y2, x1:x2] if roi.size == 0: return False rh, rw = roi.shape # Binarize (ink = white on black background) _, binary = cv2.threshold( roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU ) # Morphological opening: keep only tall vertical structures (≥55% height) kern_h = max(int(rh * 0.55), 8) kernel = np.ones((kern_h, 1), np.uint8) vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel) # Find surviving contours contours, _ = cv2.findContours( vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE ) margin = max(int(rw * 0.08), 3) for cnt in contours: cx, cy, cw, ch = cv2.boundingRect(cnt) if cw > 4: continue # too wide for a pipe if cx < margin or cx + cw > rw - margin: continue # at word edge — likely l, I, 1 # Check isolation: adjacent columns should be mostly empty (ink-free) left_zone = binary[cy:cy + ch, max(0, cx - 3):cx] right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)] left_ink = np.mean(left_zone) if left_zone.size else 255 right_ink = np.mean(right_zone) if right_zone.size else 255 if left_ink < 80 and right_ink < 80: return True # isolated thin vertical line = pipe divider return False # IPA/phonetic bracket pattern — don't hyphenate transcriptions _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') def insert_syllable_dividers( zones_data: List[Dict], img_bgr: np.ndarray, session_id: str, ) -> int: """Insert pipe syllable dividers into dictionary cells where CV confirms them. For each cell on a dictionary page: 1. Check if ANY word_box has CV-detected pipe lines 2. If yes, apply pyphen to EACH word (≥4 chars) in the cell 3. Try DE hyphenation first, then EN Returns the number of cells modified. """ try: import pyphen except ImportError: logger.warning("pyphen not installed — skipping syllable insertion") return 0 _hyph_de = pyphen.Pyphen(lang='de_DE') _hyph_en = pyphen.Pyphen(lang='en_US') img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) insertions = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue text = cell.get("text", "") if not text or "|" in text: continue if _IPA_RE.search(text): continue # CV gate: check if ANY word_box in this cell has pipe lines wbs = cell.get("word_boxes") or [] if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs): continue # Apply pyphen to each significant word in the cell tokens = re.split(r'(\s+|[,;]+\s*)', text) new_tokens = [] changed = False for tok in tokens: # Skip whitespace/punctuation separators if re.match(r'^[\s,;]+$', tok): new_tokens.append(tok) continue # Only hyphenate words ≥ 4 alpha chars clean = re.sub(r'[().\-]', '', tok) if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean): new_tokens.append(tok) continue # Try DE first, then EN hyph = _hyph_de.inserted(tok, hyphen='|') if '|' not in hyph: hyph = _hyph_en.inserted(tok, hyphen='|') if '|' in hyph and hyph != tok: new_tokens.append(hyph) changed = True else: new_tokens.append(tok) if changed: cell["text"] = ''.join(new_tokens) insertions += 1 if insertions: logger.info( "build-grid session %s: inserted syllable dividers in %d cells " "(CV-validated)", session_id, insertions, ) return insertions