""" Syllable Merge — word gap merging, syllabification, divider insertion. Extracted from cv_syllable_detect.py for modularity. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional import numpy as np from cv_syllable_core import ( _get_hyphenators, _hyphenate_word, _IPA_RE, _STOP_WORDS, ) logger = logging.getLogger(__name__) def _try_merge_pipe_gaps(text: str, hyph_de) -> str: """Merge fragments separated by single spaces where OCR split at a pipe. Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word). Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau". Guards against false merges: - The FIRST token must be pure alpha (word start -- no attached punctuation) - The second token may have trailing punctuation (comma, period) which stays attached to the merged word: "Ka" + "fer," -> "Kafer," - Common German function words (der, die, das, ...) are never merged - At least one fragment must be very short (<=3 alpha chars) """ parts = text.split(' ') if len(parts) < 2: return text result = [parts[0]] i = 1 while i < len(parts): prev = result[-1] curr = parts[i] # Extract alpha-only core for lookup prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev) curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr) # Guard 1: first token must be pure alpha (word-start fragment) # second token may have trailing punctuation # Guard 2: neither alpha core can be a common German function word # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal) # Guard 4: combined length must be >= 4 should_try = ( prev == prev_alpha # first token: pure alpha (word start) and prev_alpha and curr_alpha and prev_alpha.lower() not in _STOP_WORDS and curr_alpha.lower() not in _STOP_WORDS and min(len(prev_alpha), len(curr_alpha)) <= 3 and len(prev_alpha) + len(curr_alpha) >= 4 ) if should_try: merged_alpha = prev_alpha + curr_alpha hyph = hyph_de.inserted(merged_alpha, hyphen='-') if '-' in hyph: # pyphen recognizes merged word -- collapse the space result[-1] = prev + curr i += 1 continue result.append(curr) i += 1 return ' '.join(result) def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int: """Merge OCR word-gap fragments in cell texts using pyphen validation. OCR often splits words at syllable boundaries into separate word_boxes, producing text like "zerknit tert" instead of "zerknittert". This function tries to merge adjacent fragments in every content cell. More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3) but still guarded by pyphen dictionary lookup and stop-word exclusion. Returns the number of cells modified. """ hyph_de, _ = _get_hyphenators() if hyph_de is None: return 0 modified = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue text = cell.get("text", "") if not text or " " not in text: continue # Skip IPA cells text_no_brackets = re.sub(r'\[[^\]]*\]', '', text) if _IPA_RE.search(text_no_brackets): continue new_text = _try_merge_word_gaps(text, hyph_de) if new_text != text: cell["text"] = new_text modified += 1 if modified: logger.info( "build-grid session %s: merged word gaps in %d cells", session_id, modified, ) return modified def _try_merge_word_gaps(text: str, hyph_de) -> str: """Merge OCR word fragments with relaxed threshold (max_short=5). Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments (max_short=5 instead of 3). Still requires pyphen to recognize the merged word. """ parts = text.split(' ') if len(parts) < 2: return text result = [parts[0]] i = 1 while i < len(parts): prev = result[-1] curr = parts[i] prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev) curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr) should_try = ( prev == prev_alpha and prev_alpha and curr_alpha and prev_alpha.lower() not in _STOP_WORDS and curr_alpha.lower() not in _STOP_WORDS and min(len(prev_alpha), len(curr_alpha)) <= 5 and len(prev_alpha) + len(curr_alpha) >= 4 ) if should_try: merged_alpha = prev_alpha + curr_alpha hyph = hyph_de.inserted(merged_alpha, hyphen='-') if '-' in hyph: result[-1] = prev + curr i += 1 continue result.append(curr) i += 1 return ' '.join(result) def _syllabify_text(text: str, hyph_de, hyph_en) -> str: """Syllabify all significant words in a text string. 1. Strip existing | dividers 2. Merge pipe-gap spaces where possible 3. Apply pyphen to each word >= 3 alphabetic chars 4. Words pyphen doesn't recognize stay as-is (no bad guesses) """ if not text: return text # Skip cells that contain IPA transcription characters outside brackets. text_no_brackets = re.sub(r'\[[^\]]*\]', '', text) if _IPA_RE.search(text_no_brackets): return text # Phase 1: strip existing pipe dividers for clean normalization clean = text.replace('|', '') # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting) clean = _try_merge_pipe_gaps(clean, hyph_de) # Phase 3: tokenize and syllabify each word # Split on whitespace and comma/semicolon sequences, keeping separators tokens = re.split(r'(\s+|[,;:]+\s*)', clean) result = [] for tok in tokens: if not tok or re.match(r'^[\s,;:]+$', tok): result.append(tok) continue # Strip trailing/leading punctuation for pyphen lookup m = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', tok) if not m: result.append(tok) continue lead, word, trail = m.group(1), m.group(2), m.group(3) if len(word) < 3 or not re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', word): result.append(tok) continue hyph = _hyphenate_word(word, hyph_de, hyph_en) if hyph: result.append(lead + hyph + trail) else: result.append(tok) return ''.join(result) def insert_syllable_dividers( zones_data: List[Dict], img_bgr: np.ndarray, session_id: str, *, force: bool = False, col_filter: Optional[set] = None, ) -> int: """Insert pipe syllable dividers into dictionary cells. For dictionary pages: process all content column cells, strip existing pipes, merge pipe-gap spaces, and re-syllabify using pyphen. Pre-check: at least 1% of content cells must already contain ``|`` from OCR. This guards against pages with zero pipe characters. Args: force: If True, skip the pipe-ratio pre-check and syllabify all content words regardless of whether the original has pipe dividers. col_filter: If set, only process cells whose col_type is in this set. None means process all content columns. Returns the number of cells modified. """ hyph_de, hyph_en = _get_hyphenators() if hyph_de is None: logger.warning("pyphen not installed -- skipping syllable insertion") return 0 # Pre-check: count cells that already have | from OCR. if not force: total_col_cells = 0 cells_with_pipes = 0 for z in zones_data: for cell in z.get("cells", []): if cell.get("col_type", "").startswith("column_"): total_col_cells += 1 if "|" in cell.get("text", ""): cells_with_pipes += 1 if total_col_cells > 0: pipe_ratio = cells_with_pipes / total_col_cells if pipe_ratio < 0.01: logger.info( "build-grid session %s: skipping syllable insertion -- " "only %.1f%% of cells have existing pipes (need >=1%%)", session_id, pipe_ratio * 100, ) return 0 insertions = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue if col_filter is not None and ct not in col_filter: continue text = cell.get("text", "") if not text: continue # In auto mode (force=False), only normalize cells that already # have | from OCR (i.e. printed syllable dividers on the original # scan). Don't add new syllable marks to other words. if not force and "|" not in text: continue new_text = _syllabify_text(text, hyph_de, hyph_en) if new_text != text: cell["text"] = new_text insertions += 1 if insertions: logger.info( "build-grid session %s: syllable dividers inserted/normalized " "in %d cells (pyphen)", session_id, insertions, ) return insertions