From b98ea33a3a1a10ee2e5d679951d01f47c161fb12 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 11:15:14 +0100
Subject: [PATCH] Strip garbled OCR phonetics after IPA insertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

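_insert_missing_ipa now removes garbled phonetic text (e.g. "skea",
"sku:l", "'sizaz") that follows the inserted IPA bracket. Stripping
stops at the first delimiter (e.g. –, -, /), capitalized word (likely
German), or known English word of at least two letters; that token and
everything after it are kept unchanged.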

Fixes: "scare [skˈɛə] skea" → "scare [skˈɛə]"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index e5a5a51..51d4d54 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1028,8 +1028,30 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
         ipa = _lookup_ipa(clean, pronunciation)
         if ipa:
             words[i] = f"{w} [{ipa}]"
-            # Only insert for the FIRST word that has IPA
-            # (headword in English column)
+            # Strip garbled OCR phonetics after the IPA bracket.
+            # On scanned vocab pages, printed IPA is read as garbled
+            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
+            # After inserting correct IPA, remove remaining words that
+            # aren't real English words, delimiters, or German text.
+            kept = words[:i + 1]
+            for j in range(i + 1, len(words)):
+                wj = words[j]
+                # Delimiter — keep this and everything after
+                if wj in ('–', '—', '-', '/', '|', ',', ';'):
+                    kept.extend(words[j:])
+                    break
+                # Starts with uppercase — likely German or proper noun
+                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
+                if clean_j and clean_j[0].isupper():
+                    kept.extend(words[j:])
+                    break
+                # Known English word (≥2 chars) — keep it and rest
+                if clean_j and len(clean_j) >= 2:
+                    if _lookup_ipa(clean_j, pronunciation):
+                        kept.extend(words[j:])
+                        break
+                # Otherwise — likely garbled phonetics, skip
+            words = kept
             break
 
     return ' '.join(words)