From 038eaf783c74010496665944f60f58ed391eaa76 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 09:59:21 +0100 Subject: [PATCH] Only insert IPA when garbled phonetics exist in OCR text _insert_missing_ipa was adding dictionary IPA to cells that had NO phonetic transcription on the original page (e.g. "scissors" heading, "scarf - scarves" without IPA). Now guarded by _text_has_garbled_ipa() which checks for OCR-mangled phonetic markers (stress marks, length marks, IPA special chars) before allowing insertion. Rule: if a line has no phonetics, don't add any. Where garbled IPA exists, replace it with correct IPA notation. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 50 ++++++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 67648c3..8348b2f 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -984,12 +984,48 @@ def _replace_phonetics_in_text( return text +def _text_has_garbled_ipa(text: str) -> bool: + """Check if text contains garbled IPA-like fragments from OCR. + + Returns True if there is evidence of OCR-mangled phonetic + transcription, e.g. stress marks, length marks, or IPA special chars. + This is used to decide whether ``_insert_missing_ipa`` should run: + it must only insert IPA to *replace* garbled phonetics that are already + in the text — never to ADD phonetics where none existed on the page. + """ + for w in text.strip().split(): + # Skip delimiters and very short tokens + if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'): + continue + # Starts with stress mark (OCR read IPA stress ' as apostrophe) + if w.startswith("'") and len(w) > 1 and not w[1:].istitle(): + return True + if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ + return True + # Contains IPA length mark ':' in a short non-word fragment + if ':' in w and len(w) < 12: + # But not things like "3:00" (time) or common words + stripped = re.sub(r'[^a-zA-Z:]', '', w) + if ':' in stripped and not stripped.replace(':', '').isalpha(): + continue + return True + # Contains IPA special characters + if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'): + return True + return False + + def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: """Insert IPA pronunciation for English words that have no brackets at all. - OCR sometimes drops the phonetic transcription entirely (e.g. "challenge" - instead of "challenge [ˈtʃælɪndʒ]"). This scans the text for lone English - words that have a dictionary IPA entry and appends [ipa] after them. + OCR sometimes garbles the phonetic transcription into plain-text fragments + (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text + for the headword, inserts correct [IPA], and strips the garbled fragments. + + IMPORTANT: This function must only be called when ``_text_has_garbled_ipa`` + confirms that the text actually contains garbled phonetics. If the text + is clean (e.g. just "scissors"), IPA must NOT be inserted — the original + page had no phonetics on that line. Only inserts for words that: - are standalone (not already followed by a bracket) @@ -1136,10 +1172,12 @@ def fix_cell_phonetics( continue if col_type == 'column_en': - # Full processing: replace garbled IPA, strip orphan brackets, - # insert missing IPA + # Full processing: replace garbled IPA, strip orphan brackets. new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True) - if new_text == text: + if new_text == text and _text_has_garbled_ipa(text): + # Only insert IPA when there IS garbled phonetics in the + # text — never add IPA to clean text that had none on the + # original page. new_text = _insert_missing_ipa(text, pronunciation) else: # column_text: replace garbled IPA, no orphan stripping