""" Syllable divider insertion for dictionary pages. For confirmed dictionary pages (is_dictionary=True), processes all content column cells: 1. Strips existing | dividers for clean normalization 2. Merges pipe-gap spaces (where OCR split a word at a divider position) 3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN) 4. Only modifies words that pyphen recognizes — garbled OCR stays as-is No CV gate needed — the dictionary detection confidence is sufficient. pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional, Tuple import numpy as np logger = logging.getLogger(__name__) # IPA/phonetic characters — skip cells containing these _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') # Common German words that should NOT be merged with adjacent tokens. # These are function words that appear as standalone words between # headwords/definitions on dictionary pages. 
_STOP_WORDS = frozenset([
    # Articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    # Prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
    'zwischen', 'ohne', 'gegen',
    # Conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # Adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # Verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # Other
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
])

# Letters that count as "alphabetic" throughout this module: ASCII letters
# plus German umlauts and both cases of eszett. Kept in one place so every
# pattern below agrees on what a letter is.
_ALPHA_CLASS = 'a-zA-ZäöüÄÖÜßẞ'
_ALPHA_RE = re.compile('[' + _ALPHA_CLASS + ']')
_NON_ALPHA_RE = re.compile('[^' + _ALPHA_CLASS + ']')
# Splits a token into (leading non-letters, word, trailing non-letters).
_WORD_PARTS_RE = re.compile(
    '^([^' + _ALPHA_CLASS + ']*)(.*?)([^' + _ALPHA_CLASS + ']*)$'
)
# Whitespace runs and comma/semicolon/colon runs; captured so that
# re.split() keeps them and the text can be re-joined losslessly.
_TOKEN_SPLIT_RE = re.compile(r'(\s+|[,;:]+\s*)')
_SEPARATOR_RE = re.compile(r'^[\s,;:]+$')
_BRACKETED_RE = re.compile(r'\[[^\]]*\]')

# Cached hyphenators
_hyph_de = None
_hyph_en = None


def _get_hyphenators():
    """Lazy-load pyphen hyphenators (cached across calls).

    Returns:
        ``(hyph_de, hyph_en)``, or ``(None, None)`` when pyphen cannot be
        imported.

    Any exception raised by the pyphen constructors propagates to the
    caller. The cache is only written once BOTH hyphenators exist, so a
    failure while building the second one cannot leave a half-initialised
    cache behind.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    german = pyphen.Pyphen(lang='de_DE')
    english = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = german, english
    return _hyph_de, _hyph_en


def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Try to hyphenate a word using the DE, then the EN hyphenator.

    Args:
        word: The bare word to look up.
        hyph_de: Object exposing ``inserted(word, hyphen=...)``; tried first.
        hyph_en: Same interface; tried only if ``hyph_de`` adds no divider.

    Returns:
        The word with ``|`` separators, or ``None`` if neither hyphenator
        inserted a divider.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None


def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

    Example: "Kaf fee" -> "Kaffee" (the hyphenator recognizes the merged
    word). Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".

    Guards against false merges:
    - The FIRST token must be pure alpha (word start — no attached
      punctuation).
    - The second token must START with a letter; it may carry trailing
      punctuation (comma, period), which stays attached to the merged word:
      "Kä" + "fer," -> "Käfer,". A token opening with punctuation, such as
      "(fer" or a bracketed "[bʊs]", is never a word continuation and is
      left alone.
    - Common German function words (der, die, das, ...) are never merged.
    - At least one fragment must be very short (<= 3 alpha chars).
    - The combined alpha length must be >= 4.

    Args:
        text: Cell text with pipe dividers already removed.
        hyph_de: Object exposing ``inserted(word, hyphen=...)``.

    Returns:
        The text with recognized pipe-gap spaces collapsed.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    result = [parts[0]]
    for curr in parts[1:]:
        # Always compare against the last emitted token so that an earlier
        # merge ("Ka"+"bel") can take part in the next one ("Kabel"+"jau").
        prev = result[-1]
        prev_alpha = _NON_ALPHA_RE.sub('', prev)
        curr_alpha = _NON_ALPHA_RE.sub('', curr)

        should_try = bool(
            prev_alpha
            and curr_alpha
            and prev == prev_alpha                  # word-start fragment
            and _ALPHA_RE.match(curr)               # continuation starts with a letter
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            and min(len(prev_alpha), len(curr_alpha)) <= 3  # pipe-gap signal
            and len(prev_alpha) + len(curr_alpha) >= 4
        )
        if should_try and '-' in hyph_de.inserted(
            prev_alpha + curr_alpha, hyphen='-'
        ):
            # Hyphenator recognizes the merged word — collapse the space.
            result[-1] = prev + curr
        else:
            result.append(curr)

    return ' '.join(result)


def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Syllabify all significant words in a text string.

    1. Strip existing | dividers
    2. Merge pipe-gap spaces where possible
    3. Apply the hyphenators to each word of length >= 3 that contains a
       letter
    4. Words the hyphenators do not split stay as-is (no bad guesses)

    Text containing IPA/phonetic characters outside ``[...]`` is returned
    untouched. Bracketed content does not block processing of the rest.

    Args:
        text: Raw cell text.
        hyph_de: Primary hyphenator (``inserted(word, hyphen=...)``).
        hyph_en: Fallback hyphenator with the same interface.

    Returns:
        The normalized text with ``|`` syllable dividers.
    """
    if not text:
        return text

    # Skip cells that contain IPA transcription characters outside brackets.
    # Bracket content like [bɪltʃøn] is programmatically inserted and should
    # not block syllabification of the surrounding text.
    if _IPA_RE.search(_BRACKETED_RE.sub('', text)):
        return text

    # Phase 1: strip existing pipe dividers for clean normalization.
    clean = text.replace('|', '')

    # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting).
    clean = _try_merge_pipe_gaps(clean, hyph_de)

    # Phase 3: tokenize (keeping separators) and syllabify each word.
    pieces = []
    for tok in _TOKEN_SPLIT_RE.split(clean):
        if not tok or _SEPARATOR_RE.match(tok):
            pieces.append(tok)
            continue

        # Peel leading/trailing punctuation off before the lookup.
        parts = _WORD_PARTS_RE.match(tok)
        if not parts:
            pieces.append(tok)
            continue
        lead, word, trail = parts.groups()

        if len(word) < 3 or not _ALPHA_RE.search(word):
            pieces.append(tok)
            continue

        divided = _hyphenate_word(word, hyph_de, hyph_en)
        pieces.append(lead + divided + trail if divided else tok)

    return ''.join(pieces)


def _iter_content_cells(zones_data):
    """Yield ``(cell, col_type)`` for every cell whose col_type starts with
    ``column_``. Missing or ``None`` ``cells`` / ``col_type`` values are
    treated as empty rather than raising."""
    for zone in zones_data:
        for cell in zone.get("cells") or []:
            col_type = cell.get("col_type") or ""
            if col_type.startswith("column_"):
                yield cell, col_type


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert pipe syllable dividers into dictionary cells.

    For dictionary pages: process all content column cells, strip existing
    pipes, merge pipe-gap spaces, and re-syllabify using pyphen. Cell dicts
    are updated in place.

    Pre-check: at least 1% of content cells must already contain ``|`` from
    OCR. This guards against pages with zero pipe characters (the primary
    guard — article_col_index — is checked at the call site). The pre-check
    counts all content cells, independent of ``col_filter``.

    Args:
        zones_data: Zone dicts; each may hold a ``cells`` list of cell dicts
            with ``col_type`` and ``text`` entries. ``None`` values for
            these entries are treated like missing ones.
        img_bgr: Not used by this function; kept for interface stability.
        session_id: Only used in log messages.
        force: If True, skip the pipe-ratio pre-check and syllabify all
            content words regardless of whether the original has pipe
            dividers.
        col_filter: If set, only process cells whose col_type is in this
            set. None means process all content columns.

    Returns:
        The number of cells whose text was modified (0 if pyphen is not
        installed or the pre-check rejects the page).
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    # Pre-check: count cells that already have | from OCR.
    # Real dictionary pages with printed syllable dividers will have OCR-
    # detected pipes in many cells. Pages without syllable dividers will
    # have zero — skip those to avoid false syllabification.
    if not force:
        total_col_cells = 0
        cells_with_pipes = 0
        for cell, _ in _iter_content_cells(zones_data):
            total_col_cells += 1
            if "|" in (cell.get("text") or ""):
                cells_with_pipes += 1
        if total_col_cells > 0:
            pipe_ratio = cells_with_pipes / total_col_cells
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion — "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for cell, col_type in _iter_content_cells(zones_data):
        if col_filter is not None and col_type not in col_filter:
            continue
        text = cell.get("text") or ""
        if not text:
            continue

        new_text = _syllabify_text(text, hyph_de, hyph_en)
        if new_text != text:
            cell["text"] = new_text
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions