fix: Headword-IPA auch in langen column_text Zeilen einfuegen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 53s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m14s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 23s

_insert_missing_ipa ueberspringt Texte mit >6 Woertern oder Klammern.
Neue _insert_headword_ipa fuer column_text: prueft nur das erste Wort
der Zeile, unabhaengig von Textlaenge oder vorhandenen Klammern.

Ausserdem _sync_word_boxes_after_ipa_insert gefixt: Token-Vergleich
nutzt jetzt paralleles Durchlaufen statt zip (verschobene Positionen).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-11 23:25:38 +01:00
parent cd13eca290
commit d98dba9098

View File

@@ -952,6 +952,49 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
return ' '.join(words)
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA after the first English headword of a mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length. Only the FIRST real word is considered;
    up to three leading tokens are scanned so that number/symbol prefixes
    such as "».55" or "0.56" can be skipped.

    IPA is inserted only if the headword:
      - is not already followed by a bracketed token
      - has an IPA entry in the dictionary (via _lookup_ipa)

    Args:
        text: The line text to process.
        pronunciation: Pronunciation variant passed through to _lookup_ipa.

    Returns:
        The text with "[ipa]" inserted after the headword (tokens re-joined
        with single spaces), or the original text unchanged.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    if not words:
        return text

    brackets = ('[', '{', '(')

    # Second token is already a bracket: treat the line as annotated.
    if len(words) > 1 and words[1].startswith(brackets):
        return text

    # Try the first few tokens (skip numeric prefixes like "».55", "0.56").
    for i, w in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        # The headword may sit behind a skipped prefix, so the bracket check
        # must look at the token after *this* word, not only at words[1];
        # otherwise "0.56 challenge [..]" would receive a second bracket.
        if i + 1 < len(words) and words[i + 1].startswith(brackets):
            return text

        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)

        # Stop at the first real word even if no IPA was found.
        break

    return text
def fix_cell_phonetics(
cells: List[Dict[str, Any]],
pronunciation: str = 'british',
@@ -993,13 +1036,12 @@ def fix_cell_phonetics(
else:
# column_text: replace garbled IPA, no orphan stripping
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
# Insert missing IPA AND sync word_boxes so overlay positioning
# stays consistent (1:1 token-to-box mapping).
# Insert headword IPA for long mixed-language lines AND sync
# word_boxes so overlay positioning stays consistent.
if new_text == text:
inserted = _insert_missing_ipa(text, pronunciation)
inserted = _insert_headword_ipa(text, pronunciation)
if inserted != text:
new_text = inserted
# Sync word_boxes: insert a synthetic box for the IPA token
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
if new_text != text:
@@ -1017,10 +1059,10 @@ def _sync_word_boxes_after_ipa_insert(
old_text: str,
new_text: str,
) -> None:
"""Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
_insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
This adds a new word_box right after the headword's box so the 1:1
E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
Adds a new word_box right after the headword's box so the 1:1
token-to-box mapping in the frontend overlay stays consistent.
"""
word_boxes = cell.get('word_boxes')
@@ -1030,23 +1072,17 @@ def _sync_word_boxes_after_ipa_insert(
old_tokens = old_text.split()
new_tokens = new_text.split()
# Find the inserted IPA token (the one that's new)
if len(new_tokens) != len(old_tokens) + 1:
return # unexpected change, skip
# Find the inserted token by walking both lists in parallel.
# One token in new_tokens won't match — that's the inserted IPA.
insert_idx = -1
for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)):
if ot != nt:
# old token was modified (shouldn't happen with _insert_missing_ipa)
return
# The extra token is at the position where old and new diverge
# _insert_missing_ipa inserts "[ipa]" right after the word, so
# new_tokens has one extra element.
j = 0 # index into old_tokens
for i in range(len(new_tokens)):
if i >= len(old_tokens):
insert_idx = i
break
if old_tokens[i] != new_tokens[i]:
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
j += 1
else:
insert_idx = i
break
@@ -1055,20 +1091,17 @@ def _sync_word_boxes_after_ipa_insert(
ipa_token = new_tokens[insert_idx]
# Find the corresponding word_box to place the IPA after.
# The headword is at insert_idx - 1 in the new tokens, which corresponds
# to insert_idx - 1 in the old tokens (and thus in word_boxes).
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
ref_idx = insert_idx - 1
if ref_idx < 0 or ref_idx >= len(word_boxes):
return
ref_box = word_boxes[ref_idx]
# Create synthetic box: same height/top, placed right after the headword
ipa_box = {
'text': ipa_token,
'left': ref_box['left'] + ref_box['width'] + 2,
'top': ref_box['top'],
'width': ref_box['width'], # approximate same width
'width': ref_box['width'],
'height': ref_box['height'],
'conf': ref_box.get('conf', 90),
}