From d98dba90986d109a98886e006a18ad48e1a248d2 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 23:25:38 +0100 Subject: [PATCH] fix: Headword-IPA auch in langen column_text Zeilen einfuegen _insert_missing_ipa ueberspringt Texte mit >6 Woertern oder Klammern. Neue _insert_headword_ipa fuer column_text: prueft nur das erste Wort der Zeile, unabhaengig von Textlaenge oder vorhandenen Klammern. Ausserdem _sync_word_boxes_after_ipa_insert gefixt: Token-Vergleich nutzt jetzt paralleles Durchlaufen statt zip (verschobene Positionen). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 81 ++++++++++++++++------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index d2e603c..ee7fad5 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -952,6 +952,49 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: return ' '.join(words) +def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str: + """Insert IPA for the first English headword in a long mixed-language line. + + Unlike _insert_missing_ipa (for short column_en cells), this handles + column_text lines of any length. It only inserts IPA for the FIRST word + if that word: + - has no bracket following it already + - has an IPA entry in the dictionary + - is not a number/symbol prefix like "».55" + + Returns the text with [ipa] inserted after the first word, or unchanged. 
+ """ + if not IPA_AVAILABLE: + return text + if not text or not text.strip(): + return text + + words = text.strip().split() + if not words: + return text + + # Check if text already starts with a bracket (IPA already present) + if len(words) > 1 and words[1].startswith(('[', '{', '(')): + return text + + # Try the first few words (skip numeric prefixes like "».55", "0.56") + for i in range(min(3, len(words))): + w = words[i] + clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w) + if not clean or len(clean) < 2: + continue + if clean.lower() in _GRAMMAR_BRACKET_WORDS: + continue + ipa = _lookup_ipa(clean, pronunciation) + if ipa: + words[i] = f"{w} [{ipa}]" + return ' '.join(words) + # Stop at first real word even if no IPA found + break + + return text + + def fix_cell_phonetics( cells: List[Dict[str, Any]], pronunciation: str = 'british', @@ -993,13 +1036,12 @@ def fix_cell_phonetics( else: # column_text: replace garbled IPA, no orphan stripping new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False) - # Insert missing IPA AND sync word_boxes so overlay positioning - # stays consistent (1:1 token-to-box mapping). + # Insert headword IPA for long mixed-language lines AND sync + # word_boxes so overlay positioning stays consistent. if new_text == text: - inserted = _insert_missing_ipa(text, pronunciation) + inserted = _insert_headword_ipa(text, pronunciation) if inserted != text: new_text = inserted - # Sync word_boxes: insert a synthetic box for the IPA token _sync_word_boxes_after_ipa_insert(cell, text, new_text) if new_text != text: @@ -1017,10 +1059,10 @@ def _sync_word_boxes_after_ipa_insert( old_text: str, new_text: str, ) -> None: - """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa. + """Insert a synthetic word_box for an IPA token added by IPA insertion. - _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]". - This adds a new word_box right after the headword's box so the 1:1 + E.g. 
"challenge ..." → "challenge [tʃælɪndʒ] ..." + Adds a new word_box right after the headword's box so the 1:1 token-to-box mapping in the frontend overlay stays consistent. """ word_boxes = cell.get('word_boxes') @@ -1030,23 +1072,17 @@ def _sync_word_boxes_after_ipa_insert( old_tokens = old_text.split() new_tokens = new_text.split() - # Find the inserted IPA token (the one that's new) if len(new_tokens) != len(old_tokens) + 1: return # unexpected change, skip + # Find the inserted token by walking both lists in parallel. + # One token in new_tokens won't match — that's the inserted IPA. insert_idx = -1 - for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)): - if ot != nt: - # old token was modified (shouldn't happen with _insert_missing_ipa) - return - # The extra token is at the position where old and new diverge - # _insert_missing_ipa inserts "[ipa]" right after the word, so - # new_tokens has one extra element. + j = 0 # index into old_tokens for i in range(len(new_tokens)): - if i >= len(old_tokens): - insert_idx = i - break - if old_tokens[i] != new_tokens[i]: + if j < len(old_tokens) and new_tokens[i] == old_tokens[j]: + j += 1 + else: insert_idx = i break @@ -1055,20 +1091,17 @@ def _sync_word_boxes_after_ipa_insert( ipa_token = new_tokens[insert_idx] - # Find the corresponding word_box to place the IPA after. - # The headword is at insert_idx - 1 in the new tokens, which corresponds - # to insert_idx - 1 in the old tokens (and thus in word_boxes). + # The headword is at insert_idx - 1 in old_tokens (and word_boxes) ref_idx = insert_idx - 1 if ref_idx < 0 or ref_idx >= len(word_boxes): return ref_box = word_boxes[ref_idx] - # Create synthetic box: same height/top, placed right after the headword ipa_box = { 'text': ipa_token, 'left': ref_box['left'] + ref_box['width'] + 2, 'top': ref_box['top'], - 'width': ref_box['width'], # approximate same width + 'width': ref_box['width'], 'height': ref_box['height'], 'conf': ref_box.get('conf', 90), }