feat(ocr-pipeline): British/American IPA pronunciation choice

- Integrate Britfone dictionary (MIT, 15k British English IPA entries)
- Add pronunciation parameter: 'british' (default) or 'american'
- British uses Britfone (Received Pronunciation), falls back to eng_to_ipa/CMU (American)
- American uses eng_to_ipa/CMU, falls back to Britfone
- Frontend: dropdown to switch pronunciation, default = British
- API: ?pronunciation=british|american query parameter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 11:08:52 +01:00
parent 954d21e469
commit f2521d2b9e
4 changed files with 102 additions and 23 deletions

View File

@@ -34,6 +34,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
const [usedEngine, setUsedEngine] = useState<string>('')
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
const enRef = useRef<HTMLInputElement>(null)
@@ -73,7 +74,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
setDetecting(true)
setError(null)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}`, {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
method: 'POST',
})
if (!res.ok) {
@@ -538,6 +539,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
<option value="tesseract">Tesseract</option>
</select>
{/* Pronunciation selector */}
<select
value={pronunciation}
onChange={(e) => setPronunciation(e.target.value as 'british' | 'american')}
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
>
<option value="british">Britisch (RP)</option>
<option value="american">Amerikanisch</option>
</select>
<button
onClick={() => runAutoDetection()}
disabled={detecting}

View File

@@ -49,18 +49,34 @@ CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
# --- IPA Dictionary ---
import json
import os
import re
IPA_AVAILABLE = False
_ipa_convert = None
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}
try:
import eng_to_ipa as _eng_to_ipa
_ipa_convert = _eng_to_ipa.convert
_ipa_convert_american = _eng_to_ipa.convert
IPA_AVAILABLE = True
logger.info("eng_to_ipa available — IPA dictionary lookup enabled")
logger.info("eng_to_ipa available — American IPA lookup enabled")
except ImportError:
logger.info("eng_to_ipa not installed — IPA replacement disabled")
logger.info("eng_to_ipa not installed — American IPA disabled")
import re
# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
try:
with open(_britfone_path, 'r', encoding='utf-8') as f:
_britfone_dict = json.load(f)
IPA_AVAILABLE = True
logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
except Exception as e:
logger.warning(f"Failed to load Britfone: {e}")
else:
logger.info("Britfone not found — British IPA disabled")
# --- Language Detection Constants ---
@@ -2595,17 +2611,68 @@ _PHONETIC_BRACKET_RE = re.compile(
)
def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up the IPA transcription of a single English word.

    Two module-level sources are consulted, in an order chosen by
    ``pronunciation``:

    - ``_britfone_dict``: mapping of lowercase word -> IPA string.
    - ``_ipa_convert_american``: callable (``eng_to_ipa.convert``) that
      returns the IPA string, or text containing ``*`` when the word is
      unknown.

    The preferred source is tried first and the other one serves as a
    fallback. A source that is not loaded (empty dict / ``None`` callable)
    is skipped.

    Args:
        word: English word to look up. Case and surrounding whitespace are
            ignored.
        pronunciation: ``'american'`` tries eng_to_ipa first, then Britfone.
            Any other value (including the default ``'british'``) tries
            Britfone first, then eng_to_ipa. The value is matched
            case-insensitively and ignoring surrounding whitespace, so
            ``'American'`` from a query string is honoured instead of
            silently producing British output.

    Returns:
        The IPA string, or ``None`` if the word is empty or neither source
        knows it.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    def from_britfone() -> Optional[str]:
        # Empty dict means Britfone was not loaded; empty values count as misses.
        if not _britfone_dict:
            return None
        return _britfone_dict.get(word_lower) or None

    def from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        result = _ipa_convert_american(word_lower)
        # A '*' in the result marks a word the converter could not resolve.
        if result and '*' not in result:
            return result
        return None

    # Normalise the selector so 'American' / ' american ' are not treated as
    # unknown values (which would silently fall back to British-first order).
    prefer_american = (pronunciation or '').strip().lower() == 'american'
    if prefer_american:
        sources = (from_american, from_britfone)
    else:
        sources = (from_britfone, from_american)

    for source in sources:
        ipa = source()
        if ipa:
            return ipa
    return None
def _fix_phonetic_brackets(
entries: List[Dict[str, Any]],
pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
"""Replace OCR'd phonetic transcriptions with dictionary IPA.
Detects patterns like "dance [du:ns]" and replaces with "dance [dæns]"
using eng_to_ipa dictionary lookup.
Detects patterns like "dance [du:ns]" and replaces with correct IPA:
- British: "dance [dˈɑːns]" (Britfone, MIT)
- American: "dance [dæns]" (eng_to_ipa/CMU, MIT)
Only replaces if:
- The word before brackets is found in the IPA dictionary
- The bracket content looks like phonetics (not regular text)
Only replaces if the word before brackets is found in the dictionary.
"""
if not IPA_AVAILABLE or _ipa_convert is None:
if not IPA_AVAILABLE:
return entries
for entry in entries:
@@ -2613,14 +2680,14 @@ def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]
text = entry.get(field, '') or ''
if '[' not in text:
continue
entry[field] = _replace_phonetics_in_text(text)
entry[field] = _replace_phonetics_in_text(text, pronunciation)
return entries
def _replace_phonetics_in_text(text: str) -> str:
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
"""Replace [phonetic] after words with dictionary IPA."""
if not IPA_AVAILABLE or _ipa_convert is None:
if not IPA_AVAILABLE:
return text
def replacer(match):
@@ -2632,13 +2699,10 @@ def _replace_phonetics_in_text(text: str) -> str:
return match.group(0) # Keep original
# Look up in IPA dictionary
ipa = _ipa_convert(word.lower())
# eng_to_ipa returns word with * if not found
if '*' in ipa or not ipa:
ipa = _lookup_ipa(word, pronunciation)
if not ipa:
return match.group(0) # Keep original
# Clean up: eng_to_ipa returns bare IPA, we add brackets
return f"{word} [{ipa}]"
return _PHONETIC_BRACKET_RE.sub(replacer, text)
@@ -2766,6 +2830,7 @@ def build_word_grid(
lang: str = "eng+deu",
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
pronunciation: str = "british",
) -> List[Dict[str, Any]]:
"""Build a word grid by intersecting columns and rows, then OCR each cell.
@@ -2923,7 +2988,7 @@ def build_word_grid(
entries = _fix_character_confusion(entries)
# 3. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
entries = _split_comma_entries(entries)

File diff suppressed because one or more lines are too long

View File

@@ -1007,11 +1007,12 @@ async def get_row_ground_truth(session_id: str):
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
async def detect_words(session_id: str, engine: str = "auto"):
async def detect_words(session_id: str, engine: str = "auto", pronunciation: str = "british"):
"""Build word grid from columns × rows, OCR each cell.
Query params:
engine: 'auto' (default), 'tesseract', or 'rapid'
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
"""
if session_id not in _cache:
await _load_session_to_cache(session_id)
@@ -1068,6 +1069,7 @@ async def detect_words(session_id: str, engine: str = "auto"):
entries = build_word_grid(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=engine, img_bgr=dewarped_bgr,
pronunciation=pronunciation,
)
duration = time.time() - t0