fix: Headword-IPA auch in langen column_text-Zeilen einfuegen

_insert_missing_ipa ueberspringt Texte mit >6 Woertern oder Klammern. Neue Funktion _insert_headword_ipa fuer column_text: prueft nur das erste echte Wort der Zeile (numerische Praefixe werden uebersprungen), unabhaengig von Textlaenge oder weiter hinten vorhandenen Klammern. Ausserdem _sync_word_boxes_after_ipa_insert gefixt: Der Token-Vergleich durchlaeuft jetzt beide Listen parallel statt positionsweise per zip, das bei den nach der Einfuegung verschobenen Positionen abbrach.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 23:25:38 +01:00
parent cd13eca290
commit d98dba9098
1 changed files with 57 additions and 24 deletions
@@ -952,6 +952,49 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    return ' '.join(words)


+def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA after the first English headword of a mixed-language line.
+
+    Unlike _insert_missing_ipa (for short column_en cells), this handles
+    column_text lines of any length.  Only the first three tokens are
+    scanned; tokens with fewer than two letters (e.g. "».55", "0.56") and
+    entries of _GRAMMAR_BRACKET_WORDS are skipped.  The first remaining
+    token is the headword; IPA is inserted only if that word:
+    - is not already followed by a bracket token ("[", "{" or "(")
+    - has an IPA entry according to _lookup_ipa
+
+    Returns the text with [ipa] inserted after the headword, or unchanged.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    words = text.strip().split()
+    brackets = ('[', '{', '(')
+
+    # A bracket directly after the first token: IPA is already present.
+    if len(words) > 1 and words[1].startswith(brackets):
+        return text
+
+    # Try the first few words (skip numeric prefixes like "».55", "0.56")
+    for i, w in enumerate(words[:3]):
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if len(clean) < 2 or clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        # The headword may sit behind a skipped prefix, so the bracket check
+        # must look at the token after *this* word, not only at words[1].
+        if i + 1 < len(words) and words[i + 1].startswith(brackets):
+            return text
+        ipa = _lookup_ipa(clean, pronunciation)
+        if ipa:
+            words[i] = f"{w} [{ipa}]"
+            return ' '.join(words)
+        break  # stop at the first real word even if no IPA was found
+
+    return text
+
+
 def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
@@ -993,13 +1036,12 @@ def fix_cell_phonetics(
        else:
            # column_text: replace garbled IPA, no orphan stripping
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
-            # Insert missing IPA AND sync word_boxes so overlay positioning
-            # stays consistent (1:1 token-to-box mapping).
+            # Insert headword IPA for long mixed-language lines AND sync
+            # word_boxes so overlay positioning stays consistent.
            if new_text == text:
-                inserted = _insert_missing_ipa(text, pronunciation)
+                inserted = _insert_headword_ipa(text, pronunciation)
                if inserted != text:
                    new_text = inserted
-                    # Sync word_boxes: insert a synthetic box for the IPA token
                    _sync_word_boxes_after_ipa_insert(cell, text, new_text)

        if new_text != text:
@@ -1017,10 +1059,10 @@ def _sync_word_boxes_after_ipa_insert(
    old_text: str,
    new_text: str,
 ) -> None:
-    """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.
+    """Insert a synthetic word_box for an IPA token added by IPA insertion.

-    _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
-    This adds a new word_box right after the headword's box so the 1:1
+    E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
+    Adds a new word_box right after the headword's box so the 1:1
    token-to-box mapping in the frontend overlay stays consistent.
    """
    word_boxes = cell.get('word_boxes')
@@ -1030,23 +1072,17 @@ def _sync_word_boxes_after_ipa_insert(
    old_tokens = old_text.split()
    new_tokens = new_text.split()

-    # Find the inserted IPA token (the one that's new)
    if len(new_tokens) != len(old_tokens) + 1:
        return  # unexpected change, skip

+    # Find the inserted token by walking both lists in parallel.
+    # One token in new_tokens won't match — that's the inserted IPA.
    insert_idx = -1
-    for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)):
-        if ot != nt:
-            # old token was modified (shouldn't happen with _insert_missing_ipa)
-            return
-    # The extra token is at the position where old and new diverge
-    # _insert_missing_ipa inserts "[ipa]" right after the word, so
-    # new_tokens has one extra element.
+    j = 0  # index into old_tokens
    for i in range(len(new_tokens)):
-        if i >= len(old_tokens):
-            insert_idx = i
-            break
-        if old_tokens[i] != new_tokens[i]:
+        if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
+            j += 1
+        else:
            insert_idx = i
            break

@@ -1055,20 +1091,17 @@ def _sync_word_boxes_after_ipa_insert(

    ipa_token = new_tokens[insert_idx]

-    # Find the corresponding word_box to place the IPA after.
-    # The headword is at insert_idx - 1 in the new tokens, which corresponds
-    # to insert_idx - 1 in the old tokens (and thus in word_boxes).
+    # The headword is at insert_idx - 1 in old_tokens (and word_boxes)
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]
-    # Create synthetic box: same height/top, placed right after the headword
    ipa_box = {
        'text': ipa_token,
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
-        'width': ref_box['width'],  # approximate same width
+        'width': ref_box['width'],
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    }