feat: IPA-Lautschrift in Cell-Texte einfuegen (fuer Overlay-Modus)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 22s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 22s
fix_cell_phonetics() ersetzt fehlerhafte IPA-Klammern UND fuegt fehlende Lautschrift fuer englische Woerter ein (z.B. badge, film, challenge, profit). Wird auf alle Zellen mit col_type column_en/column_text angewandt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -885,6 +885,105 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
||||
return text
|
||||
|
||||
|
||||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Append ``[ipa]`` to the first word of *text* that has a dictionary entry.

    OCR sometimes drops the phonetic transcription entirely (e.g. "challenge"
    instead of "challenge [ˈtʃælɪndʒ]"). This scans the whitespace-separated
    tokens of a short text fragment from left to right and appends the
    bracketed IPA directly after the first token for which ``_lookup_ipa``
    yields a result. At most one insertion is made per call.

    The text is returned completely unchanged (including its original
    whitespace) when any of the following holds:

    - IPA support is unavailable (``IPA_AVAILABLE`` is falsy)
    - the text is empty or whitespace-only
    - the text already contains ``[``, ``{`` or ``(`` (bracket replacement
      is handled by a different code path)
    - the text has more than 6 whitespace-separated tokens (treated as a
      sentence rather than a vocabulary cell)
    - no token produced an IPA entry

    When an insertion is made, the tokens are re-joined with single spaces.

    Args:
        text: Raw cell text.
        pronunciation: Pronunciation variant, passed through to
            ``_lookup_ipa`` unchanged.

    Returns:
        The text with one ``[ipa]`` group inserted, or the original text.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.split()
    if len(words) > 6:
        return text

    for i, w in enumerate(words):
        # Strip punctuation/digits so the lookup sees only the bare word
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if len(clean) < 2:
            continue
        # Skip entries listed in _GRAMMAR_BRACKET_WORDS (grammar particles)
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            # Only the FIRST word with an IPA entry is annotated
            # (the headword in an English column).
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)

    # Nothing was inserted: hand back the original string untouched so that
    # callers comparing input and output do not see a whitespace-only
    # difference and mistake it for an IPA fix.
    return text
|
||||
|
||||
|
||||
def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes to cell texts for overlay mode.

    In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
    (entry['english']). But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.

    For every cell whose ``col_type`` is ``column_en`` or ``column_text`` and
    whose ``text`` is non-blank, this function:

    1. Runs ``_replace_phonetics_in_text`` to replace garbled IPA brackets.
    2. If step 1 left the text unchanged, runs ``_insert_missing_ipa`` to add
       IPA for a headword that has no brackets.

    Cells of any other column type are skipped (German columns contain
    meaningful parentheses). Cells with missing, ``None`` or blank text are
    skipped as well.

    Args:
        cells: Cell dicts; ``cell['text']`` is updated IN PLACE when a fix
            changes it.
        pronunciation: Pronunciation variant forwarded to the helper
            functions.

    Returns:
        The same ``cells`` list object that was passed in.
    """
    if not IPA_AVAILABLE:
        return cells

    # Column types where IPA processing makes sense
    ipa_col_types = {'column_en', 'column_text'}
    replaced = 0

    for cell in cells:
        if cell.get('col_type', '') not in ipa_col_types:
            continue
        # `or ''` also covers an explicit None stored under 'text'
        text = cell.get('text', '') or ''
        if not text.strip():
            continue

        # Step 1: replace garbled IPA brackets
        new_text = _replace_phonetics_in_text(text, pronunciation)
        # Step 2: insert missing IPA only if step 1 changed nothing
        if new_text == text:
            new_text = _insert_missing_ipa(text, pronunciation)

        if new_text != text:
            # Lazy %-formatting: the message is only built when DEBUG is on,
            # which matters because this runs once per cell.
            logger.debug("fix_cell_phonetics: '%s' → '%s'", text, new_text)
            cell['text'] = new_text
            replaced += 1

    if replaced:
        logger.info(
            "fix_cell_phonetics: %d IPA fixes in %d cells",
            replaced,
            len(cells),
        )
    return cells
|
||||
|
||||
|
||||
def _assign_row_words_to_columns(
|
||||
row: RowGeometry,
|
||||
columns: List[PageRegion],
|
||||
|
||||
@@ -43,6 +43,7 @@ from cv_vocab_pipeline import (
|
||||
_detect_sub_columns,
|
||||
_fix_character_confusion,
|
||||
_fix_phonetic_brackets,
|
||||
fix_cell_phonetics,
|
||||
analyze_layout,
|
||||
analyze_layout_by_words,
|
||||
build_cell_grid,
|
||||
@@ -2030,6 +2031,9 @@ async def detect_words(
|
||||
# Determine which engine was actually used
|
||||
used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine
|
||||
|
||||
# Apply IPA phonetic fixes directly to cell texts (for overlay mode)
|
||||
fix_cell_phonetics(cells, pronunciation=pronunciation)
|
||||
|
||||
# Grid result (always generic)
|
||||
word_result = {
|
||||
"cells": cells,
|
||||
@@ -2169,11 +2173,14 @@ async def _word_batch_stream_generator(
|
||||
logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
|
||||
return
|
||||
|
||||
# 4. Send columns meta
|
||||
# 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
|
||||
fix_cell_phonetics(cells, pronunciation=pronunciation)
|
||||
|
||||
# 5. Send columns meta
|
||||
if columns_meta:
|
||||
yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"
|
||||
|
||||
# 5. Stream all cells
|
||||
# 6. Stream all cells
|
||||
for idx, cell in enumerate(cells):
|
||||
cell_event = {
|
||||
"type": "cell",
|
||||
@@ -2323,6 +2330,9 @@ async def _word_stream_generator(
|
||||
|
||||
used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine
|
||||
|
||||
# Apply IPA phonetic fixes directly to cell texts (for overlay mode)
|
||||
fix_cell_phonetics(all_cells, pronunciation=pronunciation)
|
||||
|
||||
word_result = {
|
||||
"cells": all_cells,
|
||||
"grid_shape": {
|
||||
@@ -3996,6 +4006,9 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
|
||||
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
||||
used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine
|
||||
|
||||
# Apply IPA phonetic fixes directly to cell texts
|
||||
fix_cell_phonetics(cells, pronunciation=req.pronunciation)
|
||||
|
||||
word_result_data = {
|
||||
"cells": cells,
|
||||
"grid_shape": {
|
||||
|
||||
117
klausur-service/backend/tests/test_cell_phonetics.py
Normal file
117
klausur-service/backend/tests/test_cell_phonetics.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""Tests for fix_cell_phonetics and _insert_missing_ipa."""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
|
||||
|
||||
class TestInsertMissingIpa:
    """Tests for _insert_missing_ipa function."""

    def test_single_headword_gets_ipa(self):
        """Single English headword should get IPA inserted."""
        from cv_ocr_engines import _insert_missing_ipa
        result = _insert_missing_ipa("badge", "british")
        assert "[" in result and "]" in result
        assert result.startswith("badge [")

    def test_short_phrase_first_word_gets_ipa(self):
        """A short cell consisting of one real word gets IPA."""
        from cv_ocr_engines import _insert_missing_ipa
        result = _insert_missing_ipa("film", "british")
        assert "[" in result

    def test_long_sentence_unchanged(self):
        """Sentences with >6 words should not get IPA."""
        from cv_ocr_engines import _insert_missing_ipa
        text = "Can I borrow your CD player from you please"
        result = _insert_missing_ipa(text, "british")
        assert result == text

    def test_existing_brackets_unchanged(self):
        """Text with existing brackets should not get double IPA."""
        from cv_ocr_engines import _insert_missing_ipa
        text = "dance [dˈɑːns]"
        result = _insert_missing_ipa(text, "british")
        assert result == text

    def test_empty_text_unchanged(self):
        """Empty and whitespace-only text is returned exactly as given."""
        from cv_ocr_engines import _insert_missing_ipa
        assert _insert_missing_ipa("", "british") == ""
        # Whitespace-only input is returned as-is by the early-exit guard,
        # it is NOT stripped to an empty string.
        assert _insert_missing_ipa(" ", "british") == " "

    def test_grammar_words_skipped(self):
        """Grammar particles should not get IPA."""
        from cv_ocr_engines import _insert_missing_ipa
        # "sth" is in _GRAMMAR_BRACKET_WORDS
        result = _insert_missing_ipa("sth", "british")
        assert "[" not in result

    def test_german_word_no_ipa(self):
        """German words (no IPA entry) stay unchanged."""
        from cv_ocr_engines import _insert_missing_ipa
        result = _insert_missing_ipa("Anstecknadel", "british")
        assert result == "Anstecknadel"
|
||||
|
||||
|
||||
class TestFixCellPhonetics:
    """Tests for fix_cell_phonetics function."""

    def test_english_column_cells_processed(self):
        """Cells with col_type column_en should be processed."""
        from cv_ocr_engines import fix_cell_phonetics
        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
        ]
        fix_cell_phonetics(cells, pronunciation="british")
        # English cell should have IPA
        assert "[" in cells[0]["text"]
        # German cell should be unchanged
        assert cells[1]["text"] == "Anstecknadel"

    def test_column_text_cells_processed(self):
        """Cells with col_type column_text should be processed."""
        from cv_ocr_engines import fix_cell_phonetics
        cells = [
            {"cell_id": "c1", "col_type": "column_text", "text": "challenge"},
        ]
        fix_cell_phonetics(cells, pronunciation="british")
        assert "[" in cells[0]["text"]

    def test_garbled_ipa_replaced(self):
        """Garbled IPA brackets should be replaced with correct IPA."""
        from cv_ocr_engines import fix_cell_phonetics
        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"},
        ]
        fix_cell_phonetics(cells, pronunciation="british")
        # Should have proper IPA now
        text = cells[0]["text"]
        assert "dance [" in text
        assert "{'tfatno]" not in text

    def test_empty_cells_unchanged(self):
        """Empty and None-text cells should be left as they are, without errors."""
        from cv_ocr_engines import fix_cell_phonetics
        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": ""},
            {"cell_id": "c2", "col_type": "column_en", "text": None},
        ]
        fix_cell_phonetics(cells, pronunciation="british")
        assert cells[0]["text"] == ""
        # A None text is skipped, not coerced to an empty string
        assert cells[1]["text"] is None

    def test_non_english_col_types_skipped(self):
        """Cells with column_de, column_example etc. should not be processed."""
        from cv_ocr_engines import fix_cell_phonetics
        cells = [
            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
        ]
        fix_cell_phonetics(cells, pronunciation="british")
        assert cells[0]["text"] == "Eis (gefrorenes Wasser)"
        assert cells[1]["text"] == "(sich beschweren)"
|
||||
Reference in New Issue
Block a user