From 038eaf783c74010496665944f60f58ed391eaa76 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 19 Mar 2026 09:59:21 +0100
Subject: [PATCH] Only insert IPA when garbled phonetics exist in OCR text

_insert_missing_ipa was adding dictionary IPA to cells that had NO
phonetic transcription on the original page (e.g. the "scissors" heading,
or "scarf - scarves" without IPA). The call is now guarded by
_text_has_garbled_ipa(), which checks for OCR-mangled phonetic markers
(stress marks, length marks, IPA special chars) before allowing insertion.

Rule: if a line has no phonetics, don't add any. Where garbled IPA
exists, replace it with correct IPA notation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 50 ++++++++++++++++++++---
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 67648c3..8348b2f 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -984,12 +984,48 @@ def _replace_phonetics_in_text(
     return text
 
 
+def _text_has_garbled_ipa(text: str) -> bool:
+    """Check if text contains garbled IPA-like fragments from OCR.
+
+    Returns True if any whitespace-separated token shows evidence of
+    OCR-mangled phonetics: a leading stress mark (``'``, ``ˈ``, ``ˌ``), a
+    ``:`` length mark in a short token with ASCII letters, or an IPA char.
+    This is used to decide whether ``_insert_missing_ipa`` should run:
+    it must only *replace* garbled phonetics already in the text — never
+    ADD phonetics where none existed on the page.
+    """
+    for w in text.split():
+        # Single characters (lone delimiters included) carry no evidence.
+        if len(w) <= 1:
+            continue
+        # Starts with stress mark (OCR read IPA stress ' as apostrophe);
+        # a title-cased remainder is not treated as phonetics.
+        if w.startswith("'") and not w[1:].istitle():
+            return True
+        if w.startswith(("\u02c8", "\u02cc")):  # ˈ ˌ
+            return True
+        # ':' as IPA length mark in a short fragment.  Tokens without any
+        # ASCII letter (e.g. the time "3:00") are not evidence, but must
+        # still reach the IPA-character check below (e.g. "ɔ:").
+        if ':' in w and len(w) < 12 and re.search(r'[a-zA-Z]', w):
+            return True
+        # Contains IPA special characters
+        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
+            return True
+    return False
+
+
 def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
     """Insert IPA pronunciation for English words that have no brackets at all.
 
-    OCR sometimes drops the phonetic transcription entirely (e.g. "challenge"
-    instead of "challenge [ˈtʃælɪndʒ]").  This scans the text for lone English
-    words that have a dictionary IPA entry and appends [ipa] after them.
+    OCR sometimes garbles the phonetic transcription into plain-text fragments
+    (e.g. "scare skea" where "skea" is garbled /skɛə/).  This scans the text
+    for the headword, inserts correct [IPA], and strips the garbled fragments.
+
+    IMPORTANT: This function must only be called when ``_text_has_garbled_ipa``
+    confirms that the text actually contains garbled phonetics.  If the text
+    is clean (e.g. just "scissors"), IPA must NOT be inserted — the original
+    page had no phonetics on that line.
 
     Only inserts for words that:
     - are standalone (not already followed by a bracket)
@@ -1136,10 +1172,12 @@ def fix_cell_phonetics(
             continue
 
         if col_type == 'column_en':
-            # Full processing: replace garbled IPA, strip orphan brackets,
-            # insert missing IPA
+            # Full processing: replace garbled IPA, strip orphan brackets.
             new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
-            if new_text == text:
+            if new_text == text and _text_has_garbled_ipa(text):
+                # Only insert IPA when there IS garbled phonetics in the
+                # text — never add IPA to clean text that had none on the
+                # original page.
                 new_text = _insert_missing_ipa(text, pronunciation)
         else:
             # column_text: replace garbled IPA, no orphan stripping