Fix IPA correction for dictionary pages (WIP)

- Fix Step 5h: restrict slash-IPA conversion to the English headword column
  only. This prevents converting "der/die/das" to "der [dər]das" in German
  columns (confirmed working).
- Fix _text_has_garbled_ipa: detect embedded apostrophes in merged tokens like
  "Scotland'skotland", where OCR reads ˈ as '.
- Fix _insert_missing_ipa: detect a dictionary-word prefix in merged trailing
  tokens like "fictionsalans'fIkfn" → extract "fiction" with IPA.
- Move en_col_type to a wider scope so that Step 5h can access it.

Note: Fixes 1+2 are confirmed working in unit tests but do not yet take effect
in the full build-grid pipeline — needs further debugging.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 23:54:14 +01:00
parent 4feec7c7b7
commit 9ea217bdfc
2 changed files with 25 additions and 1 deletion
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1030,6 +1030,15 @@ def _text_has_garbled_ipa(text: str) -> bool:
        # Contains IPA special characters
        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
            return True
+        # Embedded apostrophe suggesting merged garbled IPA with stress mark.
+        # E.g. "Scotland'skotland" — OCR reads ˈ as '.
+        # Guard: apostrophe must follow ≥3 chars and precede ≥3 chars, the
+        # first of them lowercase, to avoid contractions (don't, won't, o'clock).
+        if "'" in w and not w.startswith("'"):
+            apos_idx = w.index("'")
+            after = w[apos_idx + 1:]
+            if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
+                return True
    return False


@@ -1183,6 +1192,17 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
+                # Merged token: dictionary word + garbled IPA stuck together.
+                # E.g. "fictionsalans'fIkfn" starts with "fiction".
+                # Extract the dictionary prefix and add it with IPA.
+                if clean_j and len(clean_j) >= 5:
+                    for pend in range(min(len(clean_j), 15), 2, -1):
+                        prefix_j = clean_j[:pend]
+                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
+                        if prefix_ipa:
+                            kept.append(f"{prefix_j} [{prefix_ipa}]")
+                            break
+                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break