fix: IPA-Einfuegung fuer column_text mit word_boxes Synchronisation

Fuer column_text werden fehlende IPA-Lautschriften (challenge, profit, film, badge) wieder eingefuegt; gleichzeitig wird eine synthetische word_box erzeugt, damit die 1:1-Token-zu-Box-Zuordnung im Overlay erhalten bleibt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 23:15:26 +01:00
parent aa7db43f02
commit cd13eca290
1 changed files with 72 additions and 3 deletions
@@ -991,10 +991,16 @@ def fix_cell_phonetics(
            if new_text == text:
                new_text = _insert_missing_ipa(text, pronunciation)
        else:
-            # column_text: only replace garbled IPA brackets, nothing else.
-            # No orphan stripping (would remove German parentheticals).
-            # No IPA insertion (would add tokens, breaking overlay positioning).
+            # column_text: replace garbled IPA, no orphan stripping
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
+            # Insert missing IPA AND sync word_boxes so overlay positioning
+            # stays consistent (1:1 token-to-box mapping).
+            if new_text == text:
+                inserted = _insert_missing_ipa(text, pronunciation)
+                if inserted != text:
+                    new_text = inserted
+                    # Sync word_boxes: insert a synthetic box for the IPA token
+                    _sync_word_boxes_after_ipa_insert(cell, text, new_text)

        if new_text != text:
            logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
@@ -1006,6 +1012,69 @@ def fix_cell_phonetics(
    return cells


+def _sync_word_boxes_after_ipa_insert(
+    cell: Dict[str, Any],
+    old_text: str,
+    new_text: str,
+) -> None:
+    """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.
+
+    _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
+    This adds a new word_box right after the headword's box so the 1:1
+    token-to-box mapping in the frontend overlay stays consistent.
+
+    The token may also land mid-text when further tokens follow the
+    headword ("challenge noun" → "challenge [tʃælɪndʒ] noun"); the boxes
+    of those later tokens then move one position back in the list.
+
+    Args:
+        cell: Cell dict; its 'word_boxes' list (if any) is modified in place.
+        old_text: Cell text before the IPA insertion.
+        new_text: Cell text after the IPA insertion.
+
+    Nothing happens when the cell has no word_boxes, when new_text is not
+    old_text plus exactly one whitespace-separated token, when that token
+    comes first (no headword before it), or when no word_box exists at the
+    headword's index.
+    """
+    word_boxes = cell.get('word_boxes')
+    if not word_boxes:
+        return
+
+    old_tokens = old_text.split()
+    new_tokens = new_text.split()
+    if len(new_tokens) != len(old_tokens) + 1:
+        return  # not a single-token insertion, skip
+
+    # First index where the token lists diverge = the inserted token.
+    # No divergence means it was appended after the last old token.
+    insert_idx = next(
+        (i for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)) if ot != nt),
+        len(old_tokens),
+    )
+    # The rest must be the old tail shifted by one; otherwise an existing
+    # token was modified and the token/box indices can't be trusted.
+    if old_tokens[insert_idx:] != new_tokens[insert_idx + 1:]:
+        return
+
+    # The headword sits right before the inserted token and keeps its old
+    # index; word_boxes is assumed to be aligned 1:1 with old_tokens.
+    ref_idx = insert_idx - 1
+    if ref_idx < 0 or ref_idx >= len(word_boxes):
+        return
+
+    ref_box = word_boxes[ref_idx]
+    # Synthetic box right of the headword (geometry is approximate).
+    word_boxes.insert(insert_idx, {
+        'text': new_tokens[insert_idx],
+        'left': ref_box['left'] + ref_box['width'] + 2,
+        'top': ref_box['top'],
+        'width': ref_box['width'],  # approximate: reuse the headword's width
+        'height': ref_box['height'],
+        'conf': ref_box.get('conf', 90),
+    })
+
+
 def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],