""" Syllable Core — hyphenator init, word validation, pipe autocorrect. Extracted from cv_syllable_detect.py for modularity. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) # IPA/phonetic characters -- skip cells containing these _IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]') # Common German words that should NOT be merged with adjacent tokens. _STOP_WORDS = frozenset([ # Articles 'der', 'die', 'das', 'dem', 'den', 'des', 'ein', 'eine', 'einem', 'einen', 'einer', # Pronouns 'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', 'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn', # Prepositions 'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im', 'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter', 'zwischen', 'ohne', 'gegen', # Conjunctions 'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber', # Adverbs 'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht', # Verbs 'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf', 'sein', 'haben', # Other 'kein', 'keine', 'keinem', 'keinen', 'keiner', ]) # Cached hyphenators _hyph_de = None _hyph_en = None # Cached spellchecker (for autocorrect_pipe_artifacts) _spell_de = None def _get_hyphenators(): """Lazy-load pyphen hyphenators (cached across calls).""" global _hyph_de, _hyph_en if _hyph_de is not None: return _hyph_de, _hyph_en try: import pyphen except ImportError: return None, None _hyph_de = pyphen.Pyphen(lang='de_DE') _hyph_en = pyphen.Pyphen(lang='en_US') return _hyph_de, _hyph_en def _get_spellchecker(): """Lazy-load German spellchecker (cached across calls).""" global _spell_de if _spell_de is not None: return _spell_de try: from spellchecker import SpellChecker except ImportError: return None _spell_de = SpellChecker(language='de') return _spell_de def _is_known_word(word: str, hyph_de, hyph_en) -> bool: """Check whether pyphen recognises a word (DE or EN).""" if len(word) < 2: return False return ('|' in hyph_de.inserted(word, hyphen='|') or '|' in hyph_en.inserted(word, hyphen='|')) def _is_real_word(word: str) -> bool: """Check whether spellchecker knows this word (case-insensitive).""" spell = _get_spellchecker() if spell is None: return False return word.lower() in spell def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]: """Try to hyphenate a word using DE then EN dictionary. Returns word with | separators, or None if not recognized. """ hyph = hyph_de.inserted(word, hyphen='|') if '|' in hyph: return hyph hyph = hyph_en.inserted(word, hyphen='|') if '|' in hyph: return hyph return None def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]: """Try to correct a word that has OCR pipe artifacts. Printed syllable divider lines on dictionary pages confuse OCR: the vertical stroke is often read as an extra character (commonly ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears. Uses ``spellchecker`` (frequency-based word list) for validation. Strategy: 1. Strip ``|`` -- if spellchecker knows the result, done. 2. Try deleting each pipe-like character (l, I, 1, i, t). 3. Fall back to spellchecker's own ``correction()`` method. 4. Preserve the original casing of the first letter. """ stripped = word_with_pipes.replace('|', '') if not stripped or len(stripped) < 3: return stripped # too short to validate # Step 1: if the stripped word is already a real word, done if _is_real_word(stripped): return stripped # Step 2: try deleting pipe-like characters (most likely artifacts) _PIPE_LIKE = frozenset('lI1it') for idx in range(len(stripped)): if stripped[idx] not in _PIPE_LIKE: continue candidate = stripped[:idx] + stripped[idx + 1:] if len(candidate) >= 3 and _is_real_word(candidate): return candidate # Step 3: use spellchecker's built-in correction spell = _get_spellchecker() if spell is not None: suggestion = spell.correction(stripped.lower()) if suggestion and suggestion != stripped.lower(): # Preserve original first-letter case if stripped[0].isupper(): suggestion = suggestion[0].upper() + suggestion[1:] return suggestion return None # could not fix def autocorrect_pipe_artifacts( zones_data: List[Dict], session_id: str, ) -> int: """Strip OCR pipe artifacts and correct garbled words in-place. Printed syllable divider lines on dictionary scans are read by OCR as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``). This function: 1. Strips ``|`` from every word in content cells. 2. Validates with spellchecker (real dictionary lookup). 3. If not recognised, tries deleting pipe-like characters or uses spellchecker's correction (e.g. ``Zeplpelin`` -> ``Zeppelin``). 4. Updates both word-box texts and cell text. Returns the number of cells modified. """ spell = _get_spellchecker() if spell is None: logger.warning("spellchecker not available -- pipe autocorrect limited") # Fall back: still strip pipes even without spellchecker pass modified = 0 for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") if not ct.startswith("column_"): continue cell_changed = False # --- Fix word boxes --- for wb in cell.get("word_boxes", []): wb_text = wb.get("text", "") if "|" not in wb_text: continue # Separate trailing punctuation m = re.match( r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)' r'(.*?)' r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', wb_text, ) if not m: continue lead, core, trail = m.group(1), m.group(2), m.group(3) if "|" not in core: continue corrected = _autocorrect_piped_word(core) if corrected is not None and corrected != core: wb["text"] = lead + corrected + trail cell_changed = True # --- Rebuild cell text from word boxes --- if cell_changed: wbs = cell.get("word_boxes", []) if wbs: cell["text"] = " ".join( (wb.get("text") or "") for wb in wbs ) modified += 1 # --- Fallback: strip residual | from cell text --- text = cell.get("text", "") if "|" in text: clean = text.replace("|", "") if clean != text: cell["text"] = clean if not cell_changed: modified += 1 if modified: logger.info( "build-grid session %s: autocorrected pipe artifacts in %d cells", session_id, modified, ) return modified