fix: Headword-IPA auch in langen column_text Zeilen einfuegen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 53s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m14s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 53s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m14s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 23s
_insert_missing_ipa ueberspringt Texte mit >6 Woertern oder Klammern. Neue Funktion _insert_headword_ipa fuer column_text: prueft nur das erste Wort der Zeile, unabhaengig von Textlaenge oder vorhandenen Klammern. Ausserdem _sync_word_boxes_after_ipa_insert gefixt: Token-Vergleich nutzt jetzt paralleles Durchlaufen statt zip (verschobene Positionen). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -952,6 +952,49 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
return ' '.join(words)
|
||||
|
||||
|
||||
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA after the first headword of a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length. Only the first usable token among the
    first three whitespace-separated tokens is considered as the headword.
    A token is skipped when fewer than two characters remain after removing
    everything except letters, apostrophes and hyphens (e.g. prefixes like
    "».55" or "0.56"), or when it is listed in _GRAMMAR_BRACKET_WORDS.

    The text is returned unchanged when:
    - IPA support is unavailable (IPA_AVAILABLE is falsy)
    - the text is empty or whitespace-only
    - the second token, or the token directly after the headword, already
      starts with an opening bracket ("[", "{" or "(")
    - _lookup_ipa returns nothing for the headword

    Args:
        text: The line to process.
        pronunciation: Variant passed through to _lookup_ipa.

    Returns:
        The tokens re-joined with single spaces and "[ipa]" inserted right
        after the headword, or the original text if nothing was inserted.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # A whitespace-only text was rejected above, so this list is non-empty.
    words = text.split()
    opening_brackets = ('[', '{', '(')

    # A bracket right after the very first token means the line is already
    # annotated; leave it alone.
    if len(words) > 1 and words[1].startswith(opening_brackets):
        return text

    # Look at the first few tokens only, skipping numeric/symbol prefixes
    # and grammar words until the first real word is found.
    for i, w in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        # The headword may sit behind a skipped prefix, so the bracket check
        # must be repeated relative to its actual position; otherwise
        # "0.56 challenge [ipa] ..." would receive a second "[ipa]" token.
        if i + 1 < len(words) and words[i + 1].startswith(opening_brackets):
            return text

        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)
        # Stop at the first real word even if no IPA was found.
        break

    return text
|
||||
|
||||
|
||||
def fix_cell_phonetics(
|
||||
cells: List[Dict[str, Any]],
|
||||
pronunciation: str = 'british',
|
||||
@@ -993,13 +1036,12 @@ def fix_cell_phonetics(
|
||||
else:
|
||||
# column_text: replace garbled IPA, no orphan stripping
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
# Insert missing IPA AND sync word_boxes so overlay positioning
|
||||
# stays consistent (1:1 token-to-box mapping).
|
||||
# Insert headword IPA for long mixed-language lines AND sync
|
||||
# word_boxes so overlay positioning stays consistent.
|
||||
if new_text == text:
|
||||
inserted = _insert_missing_ipa(text, pronunciation)
|
||||
inserted = _insert_headword_ipa(text, pronunciation)
|
||||
if inserted != text:
|
||||
new_text = inserted
|
||||
# Sync word_boxes: insert a synthetic box for the IPA token
|
||||
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
|
||||
|
||||
if new_text != text:
|
||||
@@ -1017,10 +1059,10 @@ def _sync_word_boxes_after_ipa_insert(
|
||||
old_text: str,
|
||||
new_text: str,
|
||||
) -> None:
|
||||
"""Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.
|
||||
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
|
||||
|
||||
_insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
|
||||
This adds a new word_box right after the headword's box so the 1:1
|
||||
E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
|
||||
Adds a new word_box right after the headword's box so the 1:1
|
||||
token-to-box mapping in the frontend overlay stays consistent.
|
||||
"""
|
||||
word_boxes = cell.get('word_boxes')
|
||||
@@ -1030,23 +1072,17 @@ def _sync_word_boxes_after_ipa_insert(
|
||||
old_tokens = old_text.split()
|
||||
new_tokens = new_text.split()
|
||||
|
||||
# Find the inserted IPA token (the one that's new)
|
||||
if len(new_tokens) != len(old_tokens) + 1:
|
||||
return # unexpected change, skip
|
||||
|
||||
# Find the inserted token by walking both lists in parallel.
|
||||
# One token in new_tokens won't match — that's the inserted IPA.
|
||||
insert_idx = -1
|
||||
for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)):
|
||||
if ot != nt:
|
||||
# old token was modified (shouldn't happen with _insert_missing_ipa)
|
||||
return
|
||||
# The extra token is at the position where old and new diverge
|
||||
# _insert_missing_ipa inserts "[ipa]" right after the word, so
|
||||
# new_tokens has one extra element.
|
||||
j = 0 # index into old_tokens
|
||||
for i in range(len(new_tokens)):
|
||||
if i >= len(old_tokens):
|
||||
insert_idx = i
|
||||
break
|
||||
if old_tokens[i] != new_tokens[i]:
|
||||
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
|
||||
j += 1
|
||||
else:
|
||||
insert_idx = i
|
||||
break
|
||||
|
||||
@@ -1055,20 +1091,17 @@ def _sync_word_boxes_after_ipa_insert(
|
||||
|
||||
ipa_token = new_tokens[insert_idx]
|
||||
|
||||
# Find the corresponding word_box to place the IPA after.
|
||||
# The headword is at insert_idx - 1 in the new tokens, which corresponds
|
||||
# to insert_idx - 1 in the old tokens (and thus in word_boxes).
|
||||
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
|
||||
ref_idx = insert_idx - 1
|
||||
if ref_idx < 0 or ref_idx >= len(word_boxes):
|
||||
return
|
||||
|
||||
ref_box = word_boxes[ref_idx]
|
||||
# Create synthetic box: same height/top, placed right after the headword
|
||||
ipa_box = {
|
||||
'text': ipa_token,
|
||||
'left': ref_box['left'] + ref_box['width'] + 2,
|
||||
'top': ref_box['top'],
|
||||
'width': ref_box['width'], # approximate same width
|
||||
'width': ref_box['width'],
|
||||
'height': ref_box['height'],
|
||||
'conf': ref_box.get('conf', 90),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user