From d98dba90986d109a98886e006a18ad48e1a248d2 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 11 Mar 2026 23:25:38 +0100
Subject: [PATCH] fix: Headword-IPA auch in langen column_text Zeilen einfuegen

_insert_missing_ipa ueberspringt Texte mit >6 Woertern oder Klammern.
Neue Funktion _insert_headword_ipa fuer column_text: prueft nur das erste
echte Wort der Zeile (Zahlen-/Symbol-Praefixe werden uebersprungen),
unabhaengig von Textlaenge oder Klammern an anderer Stelle der Zeile.

Ausserdem _sync_word_boxes_after_ipa_insert gefixt: Token-Vergleich
nutzt jetzt paralleles Durchlaufen statt zip (verschobene Positionen).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 81 ++++++++++++++++-------
 1 file changed, 57 insertions(+), 24 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index d2e603c..ee7fad5 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -952,6 +952,49 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
     return ' '.join(words)
 
 
+def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA for the first English headword in a long mixed-language line.
+
+    Unlike _insert_missing_ipa (for short column_en cells), this handles
+    column_text lines of any length.  It only inserts IPA for the FIRST word
+    if that word:
+    - has no bracket following it already
+    - has an IPA entry in the dictionary
+    - is not a number/symbol prefix like "».55"
+
+    Returns the text with [ipa] inserted after the first word, or unchanged.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    words = text.strip().split()
+    if not words:
+        return text
+
+    # Skip if the first token is already followed by a bracket (IPA present)
+    if len(words) > 1 and words[1].startswith(('[', '{', '(')):
+        return text
+
+    # Try the first few words (skip numeric prefixes like "».55", "0.56")
+    for i in range(min(3, len(words))):
+        w = words[i]
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if not clean or len(clean) < 2:
+            continue
+        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        ipa = _lookup_ipa(clean, pronunciation)
+        if ipa:
+            words[i] = f"{w} [{ipa}]"
+            return ' '.join(words)
+        # Stop at first real word even if no IPA found
+        break
+
+    return text
+
+
 def fix_cell_phonetics(
     cells: List[Dict[str, Any]],
     pronunciation: str = 'british',
@@ -993,13 +1036,12 @@ def fix_cell_phonetics(
         else:
             # column_text: replace garbled IPA, no orphan stripping
             new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
-            # Insert missing IPA AND sync word_boxes so overlay positioning
-            # stays consistent (1:1 token-to-box mapping).
+            # Insert headword IPA for long mixed-language lines AND sync
+            # word_boxes so overlay positioning stays consistent.
             if new_text == text:
-                inserted = _insert_missing_ipa(text, pronunciation)
+                inserted = _insert_headword_ipa(text, pronunciation)
                 if inserted != text:
                     new_text = inserted
-                    # Sync word_boxes: insert a synthetic box for the IPA token
                     _sync_word_boxes_after_ipa_insert(cell, text, new_text)
 
         if new_text != text:
@@ -1017,10 +1059,10 @@ def _sync_word_boxes_after_ipa_insert(
     old_text: str,
     new_text: str,
 ) -> None:
-    """Insert a synthetic word_box for an IPA token added by _insert_missing_ipa.
+    """Insert a synthetic word_box for an IPA token added by IPA insertion.
 
-    _insert_missing_ipa changes e.g. "challenge" → "challenge [tʃælɪndʒ]".
-    This adds a new word_box right after the headword's box so the 1:1
+    E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
+    Adds a new word_box right after the headword's box so the 1:1
     token-to-box mapping in the frontend overlay stays consistent.
     """
     word_boxes = cell.get('word_boxes')
@@ -1030,23 +1072,17 @@ def _sync_word_boxes_after_ipa_insert(
     old_tokens = old_text.split()
     new_tokens = new_text.split()
 
-    # Find the inserted IPA token (the one that's new)
     if len(new_tokens) != len(old_tokens) + 1:
         return  # unexpected change, skip
 
+    # Find the inserted token by walking both lists in parallel.
+    # One token in new_tokens won't match — that's the inserted IPA.
     insert_idx = -1
-    for i, (ot, nt) in enumerate(zip(old_tokens, new_tokens)):
-        if ot != nt:
-            # old token was modified (shouldn't happen with _insert_missing_ipa)
-            return
-    # The extra token is at the position where old and new diverge
-    # _insert_missing_ipa inserts "[ipa]" right after the word, so
-    # new_tokens has one extra element.
+    j = 0  # index into old_tokens
     for i in range(len(new_tokens)):
-        if i >= len(old_tokens):
-            insert_idx = i
-            break
-        if old_tokens[i] != new_tokens[i]:
+        if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
+            j += 1
+        else:
             insert_idx = i
             break
 
@@ -1055,20 +1091,17 @@ def _sync_word_boxes_after_ipa_insert(
 
     ipa_token = new_tokens[insert_idx]
 
-    # Find the corresponding word_box to place the IPA after.
-    # The headword is at insert_idx - 1 in the new tokens, which corresponds
-    # to insert_idx - 1 in the old tokens (and thus in word_boxes).
+    # The headword is at insert_idx - 1 in old_tokens (and word_boxes)
     ref_idx = insert_idx - 1
     if ref_idx < 0 or ref_idx >= len(word_boxes):
         return
 
     ref_box = word_boxes[ref_idx]
-    # Create synthetic box: same height/top, placed right after the headword
     ipa_box = {
         'text': ipa_token,
         'left': ref_box['left'] + ref_box['width'] + 2,
         'top': ref_box['top'],
-        'width': ref_box['width'],  # approximate same width
+        'width': ref_box['width'],
         'height': ref_box['height'],
         'conf': ref_box.get('conf', 90),
     }