feat(ocr-pipeline): British/American IPA pronunciation choice

- Integrate Britfone dictionary (MIT, 15k British English IPA entries)
- Add pronunciation parameter: 'british' (default) or 'american'
- British uses Britfone (Received Pronunciation), falls back to CMU
- American uses eng_to_ipa/CMU, falls back to Britfone
- Frontend: dropdown to switch pronunciation, default = British
- API: ?pronunciation=british|american query parameter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 11:08:52 +01:00
parent 954d21e469
commit f2521d2b9e
4 changed files with 102 additions and 23 deletions
@@ -34,6 +34,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
  const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
  const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
  const [usedEngine, setUsedEngine] = useState<string>('')
+  const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')

  const enRef = useRef<HTMLInputElement>(null)

@@ -73,7 +74,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
    setDetecting(true)
    setError(null)
    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}`, {
+      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
        method: 'POST',
      })
      if (!res.ok) {
@@ -538,6 +539,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
              <option value="tesseract">Tesseract</option>
            </select>

+            {/* Pronunciation selector */}
+            <select
+              value={pronunciation}
+              onChange={(e) => setPronunciation(e.target.value as 'british' | 'american')}
+              className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
+            >
+              <option value="british">Britisch (RP)</option>
+              <option value="american">Amerikanisch</option>
+            </select>
+
            <button
              onClick={() => runAutoDetection()}
              disabled={detecting}
@@ -49,18 +49,34 @@ CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE

 # --- IPA Dictionary ---

+import json
+import os
+import re
+
 IPA_AVAILABLE = False
-_ipa_convert = None
+_ipa_convert_american = None
+_britfone_dict: Dict[str, str] = {}

 try:
    import eng_to_ipa as _eng_to_ipa
-    _ipa_convert = _eng_to_ipa.convert
+    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
-    logger.info("eng_to_ipa available — IPA dictionary lookup enabled")
+    logger.info("eng_to_ipa available — American IPA lookup enabled")
 except ImportError:
-    logger.info("eng_to_ipa not installed — IPA replacement disabled")
+    logger.info("eng_to_ipa not installed — American IPA disabled")

-import re
+# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
+_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
+if os.path.exists(_britfone_path):
+    try:
+        with open(_britfone_path, 'r', encoding='utf-8') as f:
+            _britfone_dict = json.load(f)
+        IPA_AVAILABLE = True
+        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
+    except Exception as e:
+        logger.warning(f"Failed to load Britfone: {e}")
+else:
+    logger.info("Britfone not found — British IPA disabled")

 # --- Language Detection Constants ---

@@ -2595,17 +2611,68 @@ _PHONETIC_BRACKET_RE = re.compile(
 )


-def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
+    """Look up IPA for a word using the selected pronunciation dictionary.
+
+    Args:
+        word: English word to look up.
+        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
+
+    Returns:
+        IPA string or None if not found.
+    """
+    word_lower = word.lower().strip()
+    if not word_lower:
+        return None
+
+    if pronunciation == 'british' and _britfone_dict:
+        ipa = _britfone_dict.get(word_lower)
+        if ipa:
+            return ipa
+        # Fallback to American if not in Britfone
+        if _ipa_convert_american:
+            result = _ipa_convert_american(word_lower)
+            if result and '*' not in result:
+                return result
+        return None
+
+    if pronunciation == 'american' and _ipa_convert_american:
+        result = _ipa_convert_american(word_lower)
+        if result and '*' not in result:
+            return result
+        # Fallback to Britfone if not in CMU
+        if _britfone_dict:
+            ipa = _britfone_dict.get(word_lower)
+            if ipa:
+                return ipa
+        return None
+
+    # Try any available source
+    if _britfone_dict:
+        ipa = _britfone_dict.get(word_lower)
+        if ipa:
+            return ipa
+    if _ipa_convert_american:
+        result = _ipa_convert_american(word_lower)
+        if result and '*' not in result:
+            return result
+
+    return None
+
+
+def _fix_phonetic_brackets(
+    entries: List[Dict[str, Any]],
+    pronunciation: str = 'british',
+) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

-    Detects patterns like "dance [du:ns]" and replaces with "dance [dæns]"
-    using eng_to_ipa dictionary lookup.
+    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
+    - British: "dance [dˈɑːns]"  (Britfone, MIT)
+    - American: "dance [dæns]"    (eng_to_ipa/CMU, MIT)

-    Only replaces if:
-    - The word before brackets is found in the IPA dictionary
-    - The bracket content looks like phonetics (not regular text)
+    Only replaces if the word before brackets is found in the dictionary.
    """
-    if not IPA_AVAILABLE or _ipa_convert is None:
+    if not IPA_AVAILABLE:
        return entries

    for entry in entries:
@@ -2613,14 +2680,14 @@ def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]
            text = entry.get(field, '') or ''
            if '[' not in text:
                continue
-            entry[field] = _replace_phonetics_in_text(text)
+            entry[field] = _replace_phonetics_in_text(text, pronunciation)

    return entries


-def _replace_phonetics_in_text(text: str) -> str:
+def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] after words with dictionary IPA."""
-    if not IPA_AVAILABLE or _ipa_convert is None:
+    if not IPA_AVAILABLE:
        return text

    def replacer(match):
@@ -2632,13 +2699,10 @@ def _replace_phonetics_in_text(text: str) -> str:
            return match.group(0)  # Keep original

        # Look up in IPA dictionary
-        ipa = _ipa_convert(word.lower())
-
-        # eng_to_ipa returns word with * if not found
-        if '*' in ipa or not ipa:
+        ipa = _lookup_ipa(word, pronunciation)
+        if not ipa:
            return match.group(0)  # Keep original

-        # Clean up: eng_to_ipa returns bare IPA, we add brackets
        return f"{word} [{ipa}]"

    return _PHONETIC_BRACKET_RE.sub(replacer, text)
@@ -2766,6 +2830,7 @@ def build_word_grid(
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
+    pronunciation: str = "british",
 ) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

@@ -2923,7 +2988,7 @@ def build_word_grid(
    entries = _fix_character_confusion(entries)

    # 3. Replace OCR'd phonetics with dictionary IPA
-    entries = _fix_phonetic_brackets(entries)
+    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 4. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)
@@ -1007,11 +1007,12 @@ async def get_row_ground_truth(session_id: str):
 # ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
-async def detect_words(session_id: str, engine: str = "auto"):
+async def detect_words(session_id: str, engine: str = "auto", pronunciation: str = "british"):
    """Build word grid from columns × rows, OCR each cell.

    Query params:
        engine: 'auto' (default), 'tesseract', or 'rapid'
+        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
@@ -1068,6 +1069,7 @@ async def detect_words(session_id: str, engine: str = "auto"):
    entries = build_word_grid(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
+        pronunciation=pronunciation,
    )
    duration = time.time() - t0