# Review notes (leading commentary; everything from the "diff --git" header
# onward is the patch itself):
#
# Purpose: switch OCR pipe-artifact correction in cv_syllable_detect.py from
# pyphen-based validation to a spellchecker word-list lookup. Adds
# _get_spellchecker() and _is_real_word(), drops the hyph_de/hyph_en
# parameters from _autocorrect_piped_word(), and updates its single visible
# call site in autocorrect_pipe_artifacts().
#
# NOTE(review): _get_spellchecker() returns None on ImportError without
# recording the failure, so the import is attempted again on every call
# (it is reached once per candidate word through _is_real_word()).
#
# NOTE(review): when the spellchecker is unavailable, _is_real_word() always
# returns False and step 3 is skipped, so _autocorrect_piped_word() returns
# None for any input of three or more characters. Whether pipes are still
# stripped in that case, as the "Fall back" comment says, depends on caller
# code that this patch does not show - TODO confirm.
#
# NOTE(review): the `pass` following logger.warning(...) is a no-op.
#
# NOTE(review): removing `pipe_positions: List[int]` may leave `List` unused
# in the module; the import block is not part of this patch - verify.
#
# NOTE(review): this layout was reconstructed from a whitespace-collapsed
# copy. Line breaks agree with the hunk headers' line counts, but the
# indentation and blank context lines are inferred - TODO confirm against
# the target file before applying.
diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py
index 96b21b6..65e0ae9 100644
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
 _hyph_de = None
 _hyph_en = None
 
+# Cached spellchecker (for autocorrect_pipe_artifacts)
+_spell_de = None
+
 
 def _get_hyphenators():
     """Lazy-load pyphen hyphenators (cached across calls)."""
@@ -70,6 +73,19 @@ def _get_hyphenators():
     return _hyph_de, _hyph_en
 
 
+def _get_spellchecker():
+    """Lazy-load German spellchecker (cached across calls)."""
+    global _spell_de
+    if _spell_de is not None:
+        return _spell_de
+    try:
+        from spellchecker import SpellChecker
+    except ImportError:
+        return None
+    _spell_de = SpellChecker(language='de')
+    return _spell_de
+
+
 def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
     """Check whether pyphen recognises a word (DE or EN)."""
     if len(word) < 2:
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
             or '|' in hyph_en.inserted(word, hyphen='|'))
 
 
+def _is_real_word(word: str) -> bool:
+    """Check whether spellchecker knows this word (case-insensitive)."""
+    spell = _get_spellchecker()
+    if spell is None:
+        return False
+    return word.lower() in spell
+
+
 def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
     """Try to hyphenate a word using DE then EN dictionary.
 
@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
     return None
 
 
-def _autocorrect_piped_word(
-    word_with_pipes: str, hyph_de, hyph_en,
-) -> Optional[str]:
+def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
     """Try to correct a word that has OCR pipe artifacts.
 
     Printed syllable divider lines on dictionary pages confuse OCR:
     the vertical stroke is often read as an extra character (commonly
     ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
+    Sometimes OCR reads one divider as ``|`` and another as a letter,
+    so the garbled character may be far from any detected pipe.
+
+    Uses ``spellchecker`` (frequency-based word list) for validation —
+    unlike pyphen which is a pattern-based hyphenator and accepts
+    nonsense strings like "Zeplpelin".
 
     Strategy:
-    1. Strip ``|`` — if pyphen recognises the result, done.
-    2. Record where the pipes were in the stripped string.
-    3. Try deleting one character near each pipe position (the extra
-       character the OCR inserted). If pyphen recognises the
-       candidate, return it.
+    1. Strip ``|`` — if spellchecker knows the result, done.
+    2. Try deleting each pipe-like character (l, I, 1, i, t).
+       OCR inserts extra chars that resemble vertical strokes.
+    3. Fall back to spellchecker's own ``correction()`` method.
+    4. Preserve the original casing of the first letter.
     """
     stripped = word_with_pipes.replace('|', '')
     if not stripped or len(stripped) < 3:
         return stripped  # too short to validate
 
-    # Case-preserved check; pyphen is case-insensitive internally
-    if _is_known_word(stripped, hyph_de, hyph_en):
+    # Step 1: if the stripped word is already a real word, done
+    if _is_real_word(stripped):
         return stripped
 
-    # Map pipe positions into the stripped string.
-    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
-    # which map to positions 2, 5 in "Zeplpelin".
-    pipe_positions: List[int] = []
-    offset = 0
-    for i, c in enumerate(word_with_pipes):
-        if c == '|':
-            pipe_positions.append(i - offset)
-            offset += 1
+    # Step 2: try deleting pipe-like characters (most likely artifacts)
+    _PIPE_LIKE = frozenset('lI1it')
+    for idx in range(len(stripped)):
+        if stripped[idx] not in _PIPE_LIKE:
+            continue
+        candidate = stripped[:idx] + stripped[idx + 1:]
+        if len(candidate) >= 3 and _is_real_word(candidate):
+            return candidate
 
-    # Try single-character deletion near each pipe position.
-    # OCR typically inserts ONE extra char per pipe stroke.
-    seen: set = set()
-    for pos in pipe_positions:
-        for delta in (0, 1, -1, 2, -2):
-            idx = pos + delta
-            if idx < 0 or idx >= len(stripped):
-                continue
-            candidate = stripped[:idx] + stripped[idx + 1:]
-            if candidate in seen or len(candidate) < 3:
-                continue
-            seen.add(candidate)
-            if _is_known_word(candidate, hyph_de, hyph_en):
-                return candidate
+    # Step 3: use spellchecker's built-in correction
+    spell = _get_spellchecker()
+    if spell is not None:
+        suggestion = spell.correction(stripped.lower())
+        if suggestion and suggestion != stripped.lower():
+            # Preserve original first-letter case
+            if stripped[0].isupper():
+                suggestion = suggestion[0].upper() + suggestion[1:]
+            return suggestion
 
     return None  # could not fix
 
@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
 
     This function:
     1. Strips ``|`` from every word in content cells.
-    2. Validates the stripped word with pyphen.
-    3. If not recognised, tries deleting characters that the OCR inserted
-       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
+    2. Validates with spellchecker (real dictionary lookup).
+    3. If not recognised, tries deleting pipe-like characters or uses
+       spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
     4. Updates both word-box texts and cell text.
 
     Returns the number of cells modified.
     """
-    hyph_de, hyph_en = _get_hyphenators()
-    if hyph_de is None:
-        return 0
+    spell = _get_spellchecker()
+    if spell is None:
+        logger.warning("spellchecker not available — pipe autocorrect limited")
+        # Fall back: still strip pipes even without spellchecker
+        pass
 
     modified = 0
     for z in zones_data:
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
                 if "|" not in core:
                     continue
 
-                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
+                corrected = _autocorrect_piped_word(core)
                 if corrected is not None and corrected != core:
                     wb["text"] = lead + corrected + trail
                     cell_changed = True