feat(ocr-pipeline): British/American IPA pronunciation choice
- Integrate Britfone dictionary (MIT, 15k British English IPA entries)
- Add pronunciation parameter: 'british' (default) or 'american'
- British uses Britfone (Received Pronunciation), falls back to CMU
- American uses eng_to_ipa/CMU, falls back to Britfone
- Frontend: dropdown to switch pronunciation, default = British
- API: ?pronunciation=british|american query parameter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
||||
const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
|
||||
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||
|
||||
const enRef = useRef<HTMLInputElement>(null)
|
||||
|
||||
@@ -73,7 +74,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
||||
setDetecting(true)
|
||||
setError(null)
|
||||
try {
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}`, {
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
|
||||
method: 'POST',
|
||||
})
|
||||
if (!res.ok) {
|
||||
@@ -538,6 +539,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
||||
<option value="tesseract">Tesseract</option>
|
||||
</select>
|
||||
|
||||
{/* Pronunciation selector */}
|
||||
<select
|
||||
value={pronunciation}
|
||||
onChange={(e) => setPronunciation(e.target.value as 'british' | 'american')}
|
||||
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
||||
>
|
||||
<option value="british">Britisch (RP)</option>
|
||||
<option value="american">Amerikanisch</option>
|
||||
</select>
|
||||
|
||||
<button
|
||||
onClick={() => runAutoDetection()}
|
||||
disabled={detecting}
|
||||
|
||||
@@ -49,18 +49,34 @@ CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||||
|
||||
# --- IPA Dictionary ---
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
IPA_AVAILABLE = False
|
||||
_ipa_convert = None
|
||||
_ipa_convert_american = None
|
||||
_britfone_dict: Dict[str, str] = {}
|
||||
|
||||
try:
|
||||
import eng_to_ipa as _eng_to_ipa
|
||||
_ipa_convert = _eng_to_ipa.convert
|
||||
_ipa_convert_american = _eng_to_ipa.convert
|
||||
IPA_AVAILABLE = True
|
||||
logger.info("eng_to_ipa available — IPA dictionary lookup enabled")
|
||||
logger.info("eng_to_ipa available — American IPA lookup enabled")
|
||||
except ImportError:
|
||||
logger.info("eng_to_ipa not installed — IPA replacement disabled")
|
||||
logger.info("eng_to_ipa not installed — American IPA disabled")
|
||||
|
||||
import re
|
||||
# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
|
||||
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
|
||||
if os.path.exists(_britfone_path):
|
||||
try:
|
||||
with open(_britfone_path, 'r', encoding='utf-8') as f:
|
||||
_britfone_dict = json.load(f)
|
||||
IPA_AVAILABLE = True
|
||||
logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load Britfone: {e}")
|
||||
else:
|
||||
logger.info("Britfone not found — British IPA disabled")
|
||||
|
||||
# --- Language Detection Constants ---
|
||||
|
||||
@@ -2595,17 +2611,68 @@ _PHONETIC_BRACKET_RE = re.compile(
|
||||
)
|
||||
|
||||
|
||||
def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up the IPA transcription of a word, preferring the requested accent.

    The preferred source is tried first and the other source is used as a
    fallback, so a word is only reported as unknown when neither source has it.
    A source that is not available (empty Britfone dictionary, or eng_to_ipa
    not importable) is simply skipped.

    Args:
        word: English word to look up. Case and surrounding whitespace are
            ignored.
        pronunciation: 'american' prefers eng_to_ipa and falls back to
            Britfone. Any other value (including the default 'british')
            prefers Britfone and falls back to eng_to_ipa. The value is
            matched case-insensitively and with surrounding whitespace
            removed, because it arrives unvalidated from an API query string.

    Returns:
        The IPA string, or None if the word is empty or no available source
        knows it.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    def _from_britfone() -> Optional[str]:
        # An unloaded dictionary is empty, so .get() doubles as the
        # availability check; empty values are treated as a miss.
        return _britfone_dict.get(word_lower) or None

    def _from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        result = _ipa_convert_american(word_lower)
        # eng_to_ipa echoes unknown words back with a '*' marker.
        if result and '*' not in result:
            return result
        return None

    # Normalize so that e.g. "American" or " american " is not silently
    # treated as an unknown value and answered with British IPA.
    accent = (pronunciation or '').strip().lower()
    if accent == 'american':
        sources = (_from_american, _from_britfone)
    else:
        sources = (_from_britfone, _from_american)

    for source in sources:
        ipa = source()
        if ipa:
            return ipa

    return None
|
||||
|
||||
|
||||
def _fix_phonetic_brackets(
|
||||
entries: List[Dict[str, Any]],
|
||||
pronunciation: str = 'british',
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Replace OCR'd phonetic transcriptions with dictionary IPA.
|
||||
|
||||
Detects patterns like "dance [du:ns]" and replaces with "dance [dæns]"
|
||||
using eng_to_ipa dictionary lookup.
|
||||
Detects patterns like "dance [du:ns]" and replaces with correct IPA:
|
||||
- British: "dance [dˈɑːns]" (Britfone, MIT)
|
||||
- American: "dance [dæns]" (eng_to_ipa/CMU, MIT)
|
||||
|
||||
Only replaces if:
|
||||
- The word before brackets is found in the IPA dictionary
|
||||
- The bracket content looks like phonetics (not regular text)
|
||||
Only replaces if the word before brackets is found in the dictionary.
|
||||
"""
|
||||
if not IPA_AVAILABLE or _ipa_convert is None:
|
||||
if not IPA_AVAILABLE:
|
||||
return entries
|
||||
|
||||
for entry in entries:
|
||||
@@ -2613,14 +2680,14 @@ def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]
|
||||
text = entry.get(field, '') or ''
|
||||
if '[' not in text:
|
||||
continue
|
||||
entry[field] = _replace_phonetics_in_text(text)
|
||||
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str) -> str:
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
"""Replace [phonetic] after words with dictionary IPA."""
|
||||
if not IPA_AVAILABLE or _ipa_convert is None:
|
||||
if not IPA_AVAILABLE:
|
||||
return text
|
||||
|
||||
def replacer(match):
|
||||
@@ -2632,13 +2699,10 @@ def _replace_phonetics_in_text(text: str) -> str:
|
||||
return match.group(0) # Keep original
|
||||
|
||||
# Look up in IPA dictionary
|
||||
ipa = _ipa_convert(word.lower())
|
||||
|
||||
# eng_to_ipa returns word with * if not found
|
||||
if '*' in ipa or not ipa:
|
||||
ipa = _lookup_ipa(word, pronunciation)
|
||||
if not ipa:
|
||||
return match.group(0) # Keep original
|
||||
|
||||
# Clean up: eng_to_ipa returns bare IPA, we add brackets
|
||||
return f"{word} [{ipa}]"
|
||||
|
||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
@@ -2766,6 +2830,7 @@ def build_word_grid(
|
||||
lang: str = "eng+deu",
|
||||
ocr_engine: str = "auto",
|
||||
img_bgr: Optional[np.ndarray] = None,
|
||||
pronunciation: str = "british",
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Build a word grid by intersecting columns and rows, then OCR each cell.
|
||||
|
||||
@@ -2923,7 +2988,7 @@ def build_word_grid(
|
||||
entries = _fix_character_confusion(entries)
|
||||
|
||||
# 3. Replace OCR'd phonetics with dictionary IPA
|
||||
entries = _fix_phonetic_brackets(entries)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||
|
||||
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
|
||||
entries = _split_comma_entries(entries)
|
||||
|
||||
1
klausur-service/backend/data/britfone_ipa.json
Normal file
1
klausur-service/backend/data/britfone_ipa.json
Normal file
File diff suppressed because one or more lines are too long
@@ -1007,11 +1007,12 @@ async def get_row_ground_truth(session_id: str):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/words")
|
||||
async def detect_words(session_id: str, engine: str = "auto"):
|
||||
async def detect_words(session_id: str, engine: str = "auto", pronunciation: str = "british"):
|
||||
"""Build word grid from columns × rows, OCR each cell.
|
||||
|
||||
Query params:
|
||||
engine: 'auto' (default), 'tesseract', or 'rapid'
|
||||
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
||||
"""
|
||||
if session_id not in _cache:
|
||||
await _load_session_to_cache(session_id)
|
||||
@@ -1068,6 +1069,7 @@ async def detect_words(session_id: str, engine: str = "auto"):
|
||||
entries = build_word_grid(
|
||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||
pronunciation=pronunciation,
|
||||
)
|
||||
duration = time.time() - t0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user