From 2e21a4b6d07ce8edcb37b3781f325e4066d22b25 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 11 Mar 2026 23:40:18 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20IPA=20nur=20einf=C3=BCgen=20wenn=20word?=
 =?UTF-8?q?=5Fboxes=20Gap=20>80px=20zeigen=20(kein=20falsches=20IPA)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_has_ipa_gap() prüft anhand des physischen Abstands zwischen Headword und
nächstem Wort, ob Tesseract eine IPA-Klammer übersehen hat. Ohne Gap
(z.B. "be good at sth.", "Focus on language") wird kein IPA eingefügt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 55 ++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index ee7fad5..9081cd9 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1036,13 +1036,16 @@ def fix_cell_phonetics(
         else:
             # column_text: replace garbled IPA, no orphan stripping
             new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
-            # Insert headword IPA for long mixed-language lines AND sync
-            # word_boxes so overlay positioning stays consistent.
+            # Insert headword IPA ONLY if there's a gap in word_boxes
+            # suggesting Tesseract missed an IPA bracket on the page.
+            # Without gap evidence, the original page had no IPA.
             if new_text == text:
-                inserted = _insert_headword_ipa(text, pronunciation)
-                if inserted != text:
-                    new_text = inserted
-                    _sync_word_boxes_after_ipa_insert(cell, text, new_text)
+                wb = cell.get('word_boxes', [])
+                if _has_ipa_gap(text, wb):
+                    inserted = _insert_headword_ipa(text, pronunciation)
+                    if inserted != text:
+                        new_text = inserted
+                        _sync_word_boxes_after_ipa_insert(cell, text, new_text)
 
         if new_text != text:
             logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
@@ -1054,6 +1057,46 @@ def fix_cell_phonetics(
     return cells
 
 
+def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
+    """Report whether the box after the headword starts over 80 units away.
+
+    Assumed page layout (a missed "[ipa]" is expected to leave a wide gap):
+        headword [ipa]              German translation
+
+    The headword is the first word box whose text keeps at least two
+    letters (a-z, A-Z, äöüÄÖÜß) once all other characters are stripped;
+    if none qualifies, the first box is used. The gap is the next box's
+    'left' minus the headword box's 'left' + 'width' (presumably pixels).
+
+    Returns False for fewer than two boxes, blank text, or a trailing headword.
+    """
+    if not word_boxes or len(word_boxes) < 2:
+        return False
+    # NOTE(review): tokens only rejects empty/whitespace-only text below.
+    tokens = text.split()
+    if not tokens:
+        return False
+
+    # Find the headword index: skip numeric prefixes like "».55", "0.56"
+    hw_box_idx = 0
+    for i, wb in enumerate(word_boxes):
+        wt = wb.get('text', '')
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
+        if len(clean) >= 2:
+            hw_box_idx = i
+            break
+
+    if hw_box_idx >= len(word_boxes) - 1:
+        return False
+
+    # Distance from the headword box's right edge to the next box's left edge
+    hw = word_boxes[hw_box_idx]
+    next_wb = word_boxes[hw_box_idx + 1]
+    gap = next_wb['left'] - (hw['left'] + hw['width'])
+
+    return gap > 80
+
+
 def _sync_word_boxes_after_ipa_insert(
     cell: Dict[str, Any],
     old_text: str,