Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]

- Detect bracketed text without real IPA symbols as garbled OCR phonetics
- Allow IPA continuation fix even when other columns have content (for rows where EN cell is clearly garbled bracketed IPA)
- Strip parenthetical grammar annotations like (no pl) from headword before IPA lookup in fix_ipa_continuation_cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 23:28:00 +01:00
parent 7750b2a05f
commit 6bfa9eed86
3 changed files with 77 additions and 12 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -993,6 +993,18 @@ def _text_has_garbled_ipa(text: str) -> bool:
    it must only insert IPA to *replace* garbled phonetics that are already
    in the text — never to ADD phonetics where none existed on the page.
    """
+    # Bracketed text that doesn't contain valid IPA symbols is garbled OCR
+    # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
+    stripped = text.strip()
+    if stripped.startswith('[') and stripped.endswith(']'):
+        inner = stripped[1:-1]
+        # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
+        if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
+            # Not a valid dictionary-style bracket like "(no pl)" — those
+            # use parentheses, not square brackets.  Square brackets with
+            # no IPA chars are garbled phonetics.
+            return True
+
    for w in text.strip().split():
        # Skip delimiters and very short tokens
        if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
@@ -1238,8 +1250,10 @@ def fix_ipa_continuation_cell(
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

-    # Strip existing IPA brackets from headword text
-    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
+    # Strip existing IPA brackets and parenthetical grammar annotations
+    # like "(no pl)", "(sth)", "(sb)" from headword text
+    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
+    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text