Step 5h: convert slash-delimited IPA to bracket notation with dict lookup

Dictionary-style pages print IPA between slashes (e.g. tiger /'taiga/). Step 5h detects this pattern, looks up the headword in the IPA dictionary to get proper Unicode IPA, and falls back to the OCR text when the headword is not found. It converts /ipa/ to [ipa] bracket notation so the output matches the rest of the pipeline. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:36:08 +01:00
parent 7ac09b5941
commit 7fafd297e7
2 changed files with 148 additions and 1 deletion
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_vocab_types import PageZone
 from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
+from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa
 from cv_words_first import _cluster_rows, _build_cells
 from ocr_pipeline_session_store import (
    get_session_db,
@@ -2002,6 +2002,61 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        if footer_rows:
            z["footer"] = footer_rows

+    # 5h. Convert slash-delimited IPA to bracket notation.
+    # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
+    # Detect the pattern <headword> /ocr_ipa/ and replace with [dict_ipa]
+    # using the IPA dictionary when available, falling back to the OCR text.
+    # The regex requires a headword (letters, optional ¹ ² ³) before the opening
+    # slash and a closing slash after it, so a lone "/" as in "sb/sth" is skipped.
+    _SLASH_IPA_RE = re.compile(
+        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
+        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
+    )
+    # Standalone slash IPA at start of text (headword on previous line)
+    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
+    slash_ipa_fixed = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text", "")
+            if "/" not in text:
+                continue
+
+            def _replace_slash_ipa(m: re.Match) -> str:
+                nonlocal slash_ipa_fixed
+                headword = m.group(1)
+                ocr_ipa = m.group(2)  # includes slashes
+                # Strip superscript digits for lookup
+                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
+                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
+                if ipa:
+                    slash_ipa_fixed += 1
+                    return f"{headword} [{ipa}]"
+                # Fallback: keep OCR IPA but convert slashes to brackets
+                inner = ocr_ipa.strip("/").strip()
+                # Strip leading ' (OCR stress marker)
+                inner = inner.lstrip("'").strip()
+                if inner:
+                    slash_ipa_fixed += 1
+                    return f"{headword} [{inner}]"
+                return m.group(0)
+
+            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
+
+            # Handle standalone /ipa/ at start (no headword in this cell)
+            if new_text == text:
+                m = _STANDALONE_SLASH_IPA_RE.match(text)
+                if m:
+                    inner = m.group(1).strip().lstrip("'").strip()
+                    if inner:
+                        new_text = "[" + inner + "]" + text[m.end():]
+                        slash_ipa_fixed += 1
+
+            if new_text != text:
+                cell["text"] = new_text
+
+    if slash_ipa_fixed:
+        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
+
    duration = time.time() - t0

    # 6. Build result