Fix garbled IPA in continuation rows using headword lookup

IPA continuation rows (phonetic transcription that wraps onto the line below the headword) now get proper IPA by looking up the headwords from the row above, e.g. "ska:f – ska:vz" → "[skˈɑːf] – [skˈɑːvz]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 10:28:14 +01:00
parent 050d410ba0
commit fc0ab84e40
2 changed files with 123 additions and 4 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1096,6 +1096,74 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    return ' '.join(words)


+def fix_ipa_continuation_cell(
+    garbled_text: str,
+    headword_text: str,
+    pronunciation: str = 'british',
+) -> str:
+    """Replace garbled IPA in a continuation row with proper IPA.
+
+    Continuation rows appear below the headword and contain only the
+    printed phonetic transcription, which OCR garbles into fragments
+    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).
+
+    Args:
+        garbled_text: The OCR-garbled IPA text from the continuation row.
+        headword_text: The headword text from the previous row
+            (e.g. ``scarf – scarves``).
+        pronunciation: ``'british'`` or ``'american'``.
+
+    Returns:
+        Corrected IPA text, or the original if no fix could be applied.
+    """
+    if not IPA_AVAILABLE or not garbled_text or not headword_text:
+        return garbled_text
+
+    # Remove every [...] span from the headword text so only the plain words remain
+    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
+    if not clean_hw:
+        return garbled_text
+
+    # Split on an en/em dash (spaces optional) or on a hyphen with whitespace on both sides
+    # "scarf – scarves" → ["scarf", "scarves"]
+    # "see - saw - seen" → ["see", "saw", "seen"]
+    parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
+    parts = [p.strip() for p in parts if p.strip()]
+
+    if not parts:
+        return garbled_text
+
+    # Look up IPA for each headword part; a part with no IPA hit at all is dropped
+    ipa_parts: List[str] = []
+    for part in parts:
+        # A part may be multi-word like "secondary school"
+        words = part.split()
+        word_ipas: List[str] = []
+        for w in words:
+            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)  # keep only ASCII letters, apostrophes, hyphens
+            if not clean_w or len(clean_w) < 2:  # ignore tokens shorter than two characters
+                continue
+            # Skip grammar words (original note: like "to"); applies at any position in the part
+            if clean_w.lower() in _GRAMMAR_BRACKET_WORDS:
+                continue
+            ipa = _lookup_ipa(clean_w, pronunciation)
+            if ipa:
+                word_ipas.append(ipa)
+        if word_ipas:
+            ipa_parts.append('[' + ' '.join(word_ipas) + ']')
+
+    if not ipa_parts:
+        return garbled_text
+
+    # Always join with an en dash; garbled_text is never parsed, only replaced wholesale
+    result = ' – '.join(ipa_parts)
+    logger.debug(
+        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
+        garbled_text, result, headword_text,
+    )
+    return result
+
+
 def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.