From cd13eca29062e24774c55275c158afa58511a3c6 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 23:15:26 +0100 Subject: [PATCH] fix: IPA-Einfuegung fuer column_text mit word_boxes Synchronisation Fuer column_text werden fehlende IPA-Lautschriften (challenge, profit, film, badge) wieder eingefuegt, aber gleichzeitig eine synthetische word_box erzeugt, damit die 1:1 Token-zu-Box Zuordnung im Overlay erhalten bleibt. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 75 ++++++++++++++++++++++- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 6fca67e..d2e603c 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -991,10 +991,16 @@ def fix_cell_phonetics( if new_text == text: new_text = _insert_missing_ipa(text, pronunciation) else: - # column_text: only replace garbled IPA brackets, nothing else. - # No orphan stripping (would remove German parentheticals). - # No IPA insertion (would add tokens, breaking overlay positioning). + # column_text: replace garbled IPA, no orphan stripping new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False) + # Insert missing IPA AND sync word_boxes so overlay positioning + # stays consistent (1:1 token-to-box mapping). + if new_text == text: + inserted = _insert_missing_ipa(text, pronunciation) + if inserted != text: + new_text = inserted + # Sync word_boxes: insert a synthetic box for the IPA token + _sync_word_boxes_after_ipa_insert(cell, text, new_text) if new_text != text: logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'") @@ -1006,6 +1012,69 @@ def fix_cell_phonetics( return cells +def _sync_word_boxes_after_ipa_insert( + cell: Dict[str, Any], + old_text: str, + new_text: str, +) -> None: + """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa. + + _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]". + This adds a new word_box right after the headword's box so the 1:1 + token-to-box mapping in the frontend overlay stays consistent. + """ + word_boxes = cell.get('word_boxes') + if not word_boxes: + return + + old_tokens = old_text.split() + new_tokens = new_text.split() + + # Find the inserted IPA token (the one that's new) + if len(new_tokens) != len(old_tokens) + 1: + return # unexpected change, skip + + insert_idx = -1 + for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)): + if ot != nt: + # old token was modified (shouldn't happen with _insert_missing_ipa) + return + # The extra token is at the position where old and new diverge + # _insert_missing_ipa inserts "[ipa]" right after the word, so + # new_tokens has one extra element. + for i in range(len(new_tokens)): + if i >= len(old_tokens): + insert_idx = i + break + if old_tokens[i] != new_tokens[i]: + insert_idx = i + break + + if insert_idx < 0 or insert_idx >= len(new_tokens): + return + + ipa_token = new_tokens[insert_idx] + + # Find the corresponding word_box to place the IPA after. + # The headword is at insert_idx - 1 in the new tokens, which corresponds + # to insert_idx - 1 in the old tokens (and thus in word_boxes). + ref_idx = insert_idx - 1 + if ref_idx < 0 or ref_idx >= len(word_boxes): + return + + ref_box = word_boxes[ref_idx] + # Create synthetic box: same height/top, placed right after the headword + ipa_box = { + 'text': ipa_token, + 'left': ref_box['left'] + ref_box['width'] + 2, + 'top': ref_box['top'], + 'width': ref_box['width'], # approximate same width + 'height': ref_box['height'], + 'conf': ref_box.get('conf', 90), + } + word_boxes.insert(insert_idx, ipa_box) + + def _assign_row_words_to_columns( row: RowGeometry, columns: List[PageRegion],