diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index ee43b73..6fca67e 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -829,12 +829,20 @@ def _is_grammar_bracket_content(content: str) -> bool: return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens) -def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: +def _replace_phonetics_in_text( + text: str, + pronunciation: str = 'british', + strip_orphans: bool = True, +) -> str: """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA. Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno]. We match any bracket type and replace with dictionary IPA if found. Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved. + + Args: + strip_orphans: If True, strip orphan brackets that look like garbled IPA. + Set to False for column_text where brackets may be German content. """ if not IPA_AVAILABLE: return text @@ -864,28 +872,30 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str text = _PHONETIC_BRACKET_RE.sub(replacer, text) - # Second pass: strip remaining orphan brackets that are garbled IPA. - # These have no word before them (the main regex requires \b word \s* bracket). - # Examples: "[mais]", "{'mani setva]", trailing "(kros]" - # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" - def _strip_orphan_bracket(m): - content = m.group(1).strip() - # Keep grammar info: (sich beschweren), (about/of) - if _is_grammar_bracket_content(content): - return m.group(0) - # Keep correct IPA (contains Unicode IPA characters) - if any(ch in _IPA_CHARS for ch in content): - return m.group(0) - # Keep real-word parentheticals like (probieren), (Profit), (Geld). - # Garbled IPA fragments are short nonsense like (kros), (cy), (mais) - # — they never contain a real word ≥4 letters with proper casing. - content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content) - if len(content_alpha) >= 4: - return m.group(0) - logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") - return '' + if strip_orphans: + # Second pass: strip remaining orphan brackets that are garbled IPA. + # These have no word before them (the main regex requires \b word \s* bracket). + # Examples: "[mais]", "{'mani setva]", trailing "(kros]" + # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" + def _strip_orphan_bracket(m): + content = m.group(1).strip() + # Keep grammar info: (sich beschweren), (about/of) + if _is_grammar_bracket_content(content): + return m.group(0) + # Keep correct IPA (contains Unicode IPA characters) + if any(ch in _IPA_CHARS for ch in content): + return m.group(0) + # Keep real-word parentheticals like (probieren), (Profit), (Geld). + # Garbled IPA fragments are short nonsense like (kros), (cy), (mais) + # — they never contain a real word ≥4 letters with proper casing. + content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content) + if len(content_alpha) >= 4: + return m.group(0) + logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") + return '' + + text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) - text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) text = text.strip() return text @@ -952,17 +962,17 @@ def fix_cell_phonetics( (entry['english']). But the overlay reads cell['text'] directly, so phonetic fixes must be applied to cells too. - This function: - 1. Replaces garbled IPA brackets with correct dictionary IPA - 2. Inserts missing IPA for English headwords that have no brackets - - Only processes cells in English-like columns (column_en, column_text). - German columns are never processed (they contain meaningful parentheses). + Processing depends on column type: + - column_en: Full processing (replace garbled IPA + strip orphan brackets + + insert missing IPA). Safe because these cells contain only English + headwords. + - column_text: Light processing (replace garbled IPA ONLY). No orphan + bracket stripping (brackets may be German content like "(probieren)") + and no IPA insertion (would add tokens and break overlay positioning). """ if not IPA_AVAILABLE: return cells - # Column types where IPA processing makes sense ipa_col_types = {'column_en', 'column_text'} replaced = 0 @@ -974,11 +984,17 @@ def fix_cell_phonetics( if not text.strip(): continue - # Step 1: replace garbled IPA brackets - new_text = _replace_phonetics_in_text(text, pronunciation) - # Step 2: insert missing IPA if no brackets were present - if new_text == text: - new_text = _insert_missing_ipa(text, pronunciation) + if col_type == 'column_en': + # Full processing: replace garbled IPA, strip orphan brackets, + # insert missing IPA + new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True) + if new_text == text: + new_text = _insert_missing_ipa(text, pronunciation) + else: + # column_text: only replace garbled IPA brackets, nothing else. + # No orphan stripping (would remove German parentheticals). + # No IPA insertion (would add tokens, breaking overlay positioning). + new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False) if new_text != text: logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")