fix: IPA-Einfuegung fuer column_text mit word_boxes Synchronisation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Fuer column_text werden fehlende IPA-Lautschriften (challenge, profit, film, badge) wieder eingefuegt, aber gleichzeitig eine synthetische word_box erzeugt, damit die 1:1 Token-zu-Box Zuordnung im Overlay erhalten bleibt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -991,10 +991,16 @@ def fix_cell_phonetics(
|
||||
if new_text == text:
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
else:
|
||||
# column_text: only replace garbled IPA brackets, nothing else.
|
||||
# No orphan stripping (would remove German parentheticals).
|
||||
# No IPA insertion (would add tokens, breaking overlay positioning).
|
||||
# column_text: replace garbled IPA, no orphan stripping
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
# Insert missing IPA AND sync word_boxes so overlay positioning
|
||||
# stays consistent (1:1 token-to-box mapping).
|
||||
if new_text == text:
|
||||
inserted = _insert_missing_ipa(text, pronunciation)
|
||||
if inserted != text:
|
||||
new_text = inserted
|
||||
# Sync word_boxes: insert a synthetic box for the IPA token
|
||||
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
|
||||
|
||||
if new_text != text:
|
||||
logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
|
||||
@@ -1006,6 +1012,69 @@ def fix_cell_phonetics(
|
||||
return cells
|
||||
|
||||
|
||||
def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.

    _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
    This adds a new word_box right after the headword's box so the 1:1
    token-to-box mapping in the frontend overlay stays consistent.

    The insertion may be at the end of the text or in the middle
    (e.g. "challenge noun" → "challenge [tʃælɪndʒ] noun"); in both cases the
    synthetic box is placed at the index of the new token.

    Args:
        cell: Cell dict. Its ``'word_boxes'`` list, if present and non-empty,
            is mutated in place.
        old_text: Cell text before the IPA insertion.
        new_text: Cell text after the IPA insertion.

    Returns:
        None. The function is a no-op when the cell has no word_boxes, when
        ``new_text`` is not exactly ``old_text`` plus one whitespace-separated
        token, when the new token would be the first token (no headword box
        to anchor to), or when the headword index is outside ``word_boxes``.
    """
    word_boxes = cell.get('word_boxes')
    if not word_boxes:
        return

    old_tokens = old_text.split()
    new_tokens = new_text.split()

    # Exactly one token must have been added; anything else is unexpected.
    if len(new_tokens) != len(old_tokens) + 1:
        return

    # Index of the first divergence between old and new tokens. If every
    # old token matches, the IPA token was appended at the end.
    insert_idx = next(
        (
            idx
            for idx, (old_tok, new_tok) in enumerate(zip(old_tokens, new_tokens))
            if old_tok != new_tok
        ),
        len(old_tokens),
    )

    # A pure insertion leaves the old tail intact right after the new token.
    # If it does not, an existing token was modified: skip rather than guess.
    if new_tokens[insert_idx + 1:] != old_tokens[insert_idx:]:
        return

    ipa_token = new_tokens[insert_idx]

    # The headword precedes the inserted token; its index is the same in the
    # old tokens and therefore in word_boxes.
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]
    # Synthetic box: same top/height as the headword, placed just to its right.
    ipa_box = {
        'text': ipa_token,
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
        'width': ref_box['width'],  # approximate same width
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    }
    word_boxes.insert(insert_idx, ipa_box)
|
||||
|
||||
|
||||
def _assign_row_words_to_columns(
|
||||
row: RowGeometry,
|
||||
columns: List[PageRegion],
|
||||
|
||||
Reference in New Issue
Block a user