fix: IPA-Einfuegung fuer column_text mit word_boxes Synchronisation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s

Fuer column_text werden fehlende IPA-Lautschriften (challenge, profit,
film, badge) wieder eingefuegt, aber gleichzeitig eine synthetische
word_box erzeugt, damit die 1:1 Token-zu-Box Zuordnung im Overlay
erhalten bleibt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-11 23:15:26 +01:00
parent aa7db43f02
commit cd13eca290

View File

@@ -991,10 +991,16 @@ def fix_cell_phonetics(
if new_text == text:
new_text = _insert_missing_ipa(text, pronunciation)
else:
# column_text: only replace garbled IPA brackets, nothing else.
# No orphan stripping (would remove German parentheticals).
# No IPA insertion (would add tokens, breaking overlay positioning).
# column_text: replace garbled IPA, no orphan stripping
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
# Insert missing IPA AND sync word_boxes so overlay positioning
# stays consistent (1:1 token-to-box mapping).
if new_text == text:
inserted = _insert_missing_ipa(text, pronunciation)
if inserted != text:
new_text = inserted
# Sync word_boxes: insert a synthetic box for the IPA token
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
if new_text != text:
logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'")
@@ -1006,6 +1012,69 @@ def fix_cell_phonetics(
return cells
def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.

    _insert_missing_ipa changes e.g. "challenge" -> "challenge [tʃælɪndʒ]".
    This adds a new word_box right after the headword's box so the 1:1
    token-to-box mapping in the frontend overlay stays consistent.

    The insertion may happen at the end of the text or in the middle
    (e.g. "challenge n." -> "challenge [tʃælɪndʒ] n."); both are handled.

    Args:
        cell: Cell dict; its ``'word_boxes'`` list (if present and non-empty)
            is modified in place.
        old_text: Cell text before the IPA insertion.
        new_text: Cell text after the IPA insertion.

    Returns:
        None. The function is a silent no-op when the cell has no word
        boxes, when ``new_text`` is not exactly ``old_text`` plus one
        whitespace-separated token, when the new token is the very first
        token (no headword box to anchor to), or when the headword index
        lies outside ``word_boxes``.
    """
    word_boxes = cell.get('word_boxes')
    if not word_boxes:
        return

    old_tokens = old_text.split()
    new_tokens = new_text.split()

    # Only a single inserted token is supported; anything else is skipped.
    if len(new_tokens) != len(old_tokens) + 1:
        return

    # The inserted token sits at the first position where the two token
    # lists diverge. If old_tokens is a full prefix of new_tokens, the token
    # was appended at the end.
    insert_idx = next(
        (i for i, (old_tok, new_tok) in enumerate(zip(old_tokens, new_tokens))
         if old_tok != new_tok),
        len(old_tokens),
    )

    # Everything after the inserted token must match the rest of the old
    # tokens. Otherwise an existing token was modified rather than a new one
    # inserted, and the box list must not be touched.
    if new_tokens[insert_idx + 1:] != old_tokens[insert_idx:]:
        return

    ipa_token = new_tokens[insert_idx]

    # The headword precedes the inserted token. Its index is identical in the
    # old tokens and is used as the index into word_boxes.
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]

    # Synthetic box: same top/height as the headword, placed 2 units to the
    # right of it. The width is only an approximation (copied from headword).
    ipa_box = {
        'text': ipa_token,
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
        'width': ref_box['width'],
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    }
    word_boxes.insert(insert_idx, ipa_box)
def _assign_row_words_to_columns(
row: RowGeometry,
columns: List[PageRegion],