""" Syllable divider insertion for dictionary pages. For confirmed dictionary pages (is_dictionary=True), processes all content column cells: 1. Strips existing | dividers for clean normalization 2. Merges pipe-gap spaces (where OCR split a word at a divider position) 3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN) 4. Only modifies words that pyphen recognizes — garbled OCR stays as-is No CV gate needed — the dictionary detection confidence is sufficient. pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional, Tuple import numpy as np logger = logging.getLogger(__name__) # IPA/phonetic characters — skip cells containing these _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') # Common German words that should NOT be merged with adjacent tokens. # These are function words that appear as standalone words between # headwords/definitions on dictionary pages. _STOP_WORDS = frozenset([ # Articles 'der', 'die', 'das', 'dem', 'den', 'des', 'ein', 'eine', 'einem', 'einen', 'einer', # Pronouns 'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', 'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn', # Prepositions 'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im', 'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter', 'zwischen', 'ohne', 'gegen', # Conjunctions 'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber', # Adverbs 'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht', # Verbs 'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf', 'sein', 'haben', # Other 'kein', 'keine', 'keinem', 'keinen', 'keiner', ]) # Cached hyphenators _hyph_de = None _hyph_en = None # Cached spellchecker (for autocorrect_pipe_artifacts) _spell_de = None def _get_hyphenators(): """Lazy-load pyphen hyphenators (cached across calls).""" global _hyph_de, _hyph_en if _hyph_de is not None: return _hyph_de, _hyph_en try: import pyphen except ImportError: return None, None _hyph_de = pyphen.Pyphen(lang='de_DE') _hyph_en = pyphen.Pyphen(lang='en_US') return _hyph_de, _hyph_en def _get_spellchecker(): """Lazy-load German spellchecker (cached across calls).""" global _spell_de if _spell_de is not None: return _spell_de try: from spellchecker import SpellChecker except ImportError: return None _spell_de = SpellChecker(language='de') return _spell_de def _is_known_word(word: str, hyph_de, hyph_en) -> bool: """Check whether pyphen recognises a word (DE or EN).""" if len(word) < 2: return False return ('|' in hyph_de.inserted(word, hyphen='|') or '|' in hyph_en.inserted(word, hyphen='|')) def _is_real_word(word: str) -> bool: """Check whether spellchecker knows this word (case-insensitive).""" spell = _get_spellchecker() if spell is None: return False return word.lower() in spell def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]: """Try to hyphenate a word using DE then EN dictionary. Returns word with | separators, or None if not recognized. """ hyph = hyph_de.inserted(word, hyphen='|') if '|' in hyph: return hyph hyph = hyph_en.inserted(word, hyphen='|') if '|' in hyph: return hyph return None def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]: """Try to correct a word that has OCR pipe artifacts. Printed syllable divider lines on dictionary pages confuse OCR: the vertical stroke is often read as an extra character (commonly ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears. Sometimes OCR reads one divider as ``|`` and another as a letter, so the garbled character may be far from any detected pipe. Uses ``spellchecker`` (frequency-based word list) for validation — unlike pyphen which is a pattern-based hyphenator and accepts nonsense strings like "Zeplpelin". Strategy: 1. Strip ``|`` — if spellchecker knows the result, done. 2. Try deleting each pipe-like character (l, I, 1, i, t). OCR inserts extra chars that resemble vertical strokes. 3. Fall back to spellchecker's own ``correction()`` method. 4. Preserve the original casing of the first letter. """ stripped = word_with_pipes.replace('|', '') if not stripped or len(stripped) < 3: return stripped # too short to validate # Step 1: if the stripped word is already a real word, done if _is_real_word(stripped): return stripped # Step 2: try deleting pipe-like characters (most likely artifacts) _PIPE_LIKE = frozenset('lI1it') for idx in range(len(stripped)): if stripped[idx] not in _PIPE_LIKE: continue candidate = stripped[:idx] + stripped[idx + 1:] if len(candidate) >= 3 and _is_real_word(candidate): return candidate # Step 3: use spellchecker's built-in correction spell = _get_spellchecker() if spell is not None: suggestion = spell.correction(stripped.lower()) if suggestion and suggestion != stripped.lower(): # Preserve original first-letter case if stripped[0].isupper(): suggestion = suggestion[0].upper() + suggestion[1:] return suggestion return None # could not fix def autocorrect_pipe_artifacts( zones_data: List[Dict], session_id: str, ) -> int: """Strip OCR pipe artifacts and correct garbled words in-place. Printed syllable divider lines on dictionary scans are read by OCR as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``). This function: 1. Strips ``|`` from every word in content cells. 2. Validates with spellchecker (real dictionary lookup). 3. If not recognised, tries deleting pipe-like characters or uses spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``). 4. Updates both word-box texts and cell text. Returns the number of cells modified. """ spell = _get_spellchecker() if spell is None: logger.warning("spellchecker not available — pipe autocorrect limited") # Fall back: still strip pipes even without spellchecker pass modified = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue cell_changed = False # --- Fix word boxes --- for wb in cell.get("word_boxes", []): wb_text = wb.get("text", "") if "|" not in wb_text: continue # Separate trailing punctuation m = re.match( r'^([^a-zA-ZäöüÄÖÜßẞ]*)' r'(.*?)' r'([^a-zA-ZäöüÄÖÜßẞ]*)$', wb_text, ) if not m: continue lead, core, trail = m.group(1), m.group(2), m.group(3) if "|" not in core: continue corrected = _autocorrect_piped_word(core) if corrected is not None and corrected != core: wb["text"] = lead + corrected + trail cell_changed = True # --- Rebuild cell text from word boxes --- if cell_changed: wbs = cell.get("word_boxes", []) if wbs: cell["text"] = " ".join( (wb.get("text") or "") for wb in wbs ) modified += 1 # --- Fallback: strip residual | from cell text --- # (covers cases where word_boxes don't exist or weren't fixed) text = cell.get("text", "") if "|" in text: clean = text.replace("|", "") if clean != text: cell["text"] = clean if not cell_changed: modified += 1 if modified: logger.info( "build-grid session %s: autocorrected pipe artifacts in %d cells", session_id, modified, ) return modified def _try_merge_pipe_gaps(text: str, hyph_de) -> str: """Merge fragments separated by single spaces where OCR split at a pipe. Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word). Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau". Guards against false merges: - The FIRST token must be pure alpha (word start — no attached punctuation) - The second token may have trailing punctuation (comma, period) which stays attached to the merged word: "Kä" + "fer," -> "Käfer," - Common German function words (der, die, das, ...) are never merged - At least one fragment must be very short (<=3 alpha chars) """ parts = text.split(' ') if len(parts) < 2: return text result = [parts[0]] i = 1 while i < len(parts): prev = result[-1] curr = parts[i] # Extract alpha-only core for lookup prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev) curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr) # Guard 1: first token must be pure alpha (word-start fragment) # second token may have trailing punctuation # Guard 2: neither alpha core can be a common German function word # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal) # Guard 4: combined length must be >= 4 should_try = ( prev == prev_alpha # first token: pure alpha (word start) and prev_alpha and curr_alpha and prev_alpha.lower() not in _STOP_WORDS and curr_alpha.lower() not in _STOP_WORDS and min(len(prev_alpha), len(curr_alpha)) <= 3 and len(prev_alpha) + len(curr_alpha) >= 4 ) if should_try: merged_alpha = prev_alpha + curr_alpha hyph = hyph_de.inserted(merged_alpha, hyphen='-') if '-' in hyph: # pyphen recognizes merged word — collapse the space result[-1] = prev + curr i += 1 continue result.append(curr) i += 1 return ' '.join(result) def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int: """Merge OCR word-gap fragments in cell texts using pyphen validation. OCR often splits words at syllable boundaries into separate word_boxes, producing text like "zerknit tert" instead of "zerknittert". This function tries to merge adjacent fragments in every content cell. More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3) but still guarded by pyphen dictionary lookup and stop-word exclusion. Returns the number of cells modified. """ hyph_de, _ = _get_hyphenators() if hyph_de is None: return 0 modified = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue text = cell.get("text", "") if not text or " " not in text: continue # Skip IPA cells text_no_brackets = re.sub(r'\[[^\]]*\]', '', text) if _IPA_RE.search(text_no_brackets): continue new_text = _try_merge_word_gaps(text, hyph_de) if new_text != text: cell["text"] = new_text modified += 1 if modified: logger.info( "build-grid session %s: merged word gaps in %d cells", session_id, modified, ) return modified def _try_merge_word_gaps(text: str, hyph_de) -> str: """Merge OCR word fragments with relaxed threshold (max_short=5). Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments (max_short=5 instead of 3). Still requires pyphen to recognize the merged word. """ parts = text.split(' ') if len(parts) < 2: return text result = [parts[0]] i = 1 while i < len(parts): prev = result[-1] curr = parts[i] prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev) curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr) should_try = ( prev == prev_alpha and prev_alpha and curr_alpha and prev_alpha.lower() not in _STOP_WORDS and curr_alpha.lower() not in _STOP_WORDS and min(len(prev_alpha), len(curr_alpha)) <= 5 and len(prev_alpha) + len(curr_alpha) >= 4 ) if should_try: merged_alpha = prev_alpha + curr_alpha hyph = hyph_de.inserted(merged_alpha, hyphen='-') if '-' in hyph: result[-1] = prev + curr i += 1 continue result.append(curr) i += 1 return ' '.join(result) def _syllabify_text(text: str, hyph_de, hyph_en) -> str: """Syllabify all significant words in a text string. 1. Strip existing | dividers 2. Merge pipe-gap spaces where possible 3. Apply pyphen to each word >= 3 alphabetic chars 4. Words pyphen doesn't recognize stay as-is (no bad guesses) """ if not text: return text # Skip cells that contain IPA transcription characters outside brackets. # Bracket content like [bɪltʃøn] is programmatically inserted and should # not block syllabification of the surrounding text. text_no_brackets = re.sub(r'\[[^\]]*\]', '', text) if _IPA_RE.search(text_no_brackets): return text # Phase 1: strip existing pipe dividers for clean normalization clean = text.replace('|', '') # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting) clean = _try_merge_pipe_gaps(clean, hyph_de) # Phase 3: tokenize and syllabify each word # Split on whitespace and comma/semicolon sequences, keeping separators tokens = re.split(r'(\s+|[,;:]+\s*)', clean) result = [] for tok in tokens: if not tok or re.match(r'^[\s,;:]+$', tok): result.append(tok) continue # Strip trailing/leading punctuation for pyphen lookup m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok) if not m: result.append(tok) continue lead, word, trail = m.group(1), m.group(2), m.group(3) if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word): result.append(tok) continue hyph = _hyphenate_word(word, hyph_de, hyph_en) if hyph: result.append(lead + hyph + trail) else: result.append(tok) return ''.join(result) def insert_syllable_dividers( zones_data: List[Dict], img_bgr: np.ndarray, session_id: str, *, force: bool = False, col_filter: Optional[set] = None, ) -> int: """Insert pipe syllable dividers into dictionary cells. For dictionary pages: process all content column cells, strip existing pipes, merge pipe-gap spaces, and re-syllabify using pyphen. Pre-check: at least 1% of content cells must already contain ``|`` from OCR. This guards against pages with zero pipe characters (the primary guard — article_col_index — is checked at the call site). Args: force: If True, skip the pipe-ratio pre-check and syllabify all content words regardless of whether the original has pipe dividers. col_filter: If set, only process cells whose col_type is in this set. None means process all content columns. Returns the number of cells modified. """ hyph_de, hyph_en = _get_hyphenators() if hyph_de is None: logger.warning("pyphen not installed — skipping syllable insertion") return 0 # Pre-check: count cells that already have | from OCR. # Real dictionary pages with printed syllable dividers will have OCR- # detected pipes in many cells. Pages without syllable dividers will # have zero — skip those to avoid false syllabification. if not force: total_col_cells = 0 cells_with_pipes = 0 for z in zones_data: for cell in z.get("cells", []): if cell.get("col_type", "").startswith("column_"): total_col_cells += 1 if "|" in cell.get("text", ""): cells_with_pipes += 1 if total_col_cells > 0: pipe_ratio = cells_with_pipes / total_col_cells if pipe_ratio < 0.01: logger.info( "build-grid session %s: skipping syllable insertion — " "only %.1f%% of cells have existing pipes (need >=1%%)", session_id, pipe_ratio * 100, ) return 0 insertions = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue if col_filter is not None and ct not in col_filter: continue text = cell.get("text", "") if not text: continue # In auto mode (force=False), only normalize cells that already # have | from OCR (i.e. printed syllable dividers on the original # scan). Don't add new syllable marks to other words. if not force and "|" not in text: continue new_text = _syllabify_text(text, hyph_de, hyph_en) if new_text != text: cell["text"] = new_text insertions += 1 if insertions: logger.info( "build-grid session %s: syllable dividers inserted/normalized " "in %d cells (pyphen)", session_id, insertions, ) return insertions