feat: IPA-Lautschrift in Cell-Texte einfuegen (fuer Overlay-Modus)

fix_cell_phonetics() ersetzt fehlerhafte IPA-Klammern UND fuegt fehlende Lautschrift fuer englische Woerter ein (z.B. badge, film, challenge, profit). Wird auf alle Zellen mit col_type column_en/column_text angewandt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 15:47:26 +01:00
parent 8a5f2aa188
commit 2f51ac617f
3 changed files with 231 additions and 2 deletions
@@ -885,6 +885,105 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    return text


+def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA pronunciation for English words that have no brackets at all.
+
+    OCR sometimes drops the phonetic transcription entirely (e.g. "challenge"
+    instead of "challenge [ˈtʃælɪndʒ]").  This scans the text for lone English
+    words that have a dictionary IPA entry and appends [ipa] after them.
+
+    Only inserts for words that:
+    - are standalone (not already followed by a bracket)
+    - have an IPA entry in the dictionary
+    - appear to be English headwords (at the start of text or after common
+      separators like ",", ";", "•")
+
+    This is intentionally conservative: it only inserts at the END of each
+    whitespace-separated token group to avoid breaking phrases.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    # Skip if already has brackets (IPA replacement handles those)
+    if any(ch in text for ch in '[{('):
+        return text
+
+    # Only process short text fragments (typical vocab cells).
+    # Long sentences / paragraphs should not get IPA insertions.
+    words = text.strip().split()
+    if len(words) > 6:
+        return text
+
+    # Try to insert IPA for the first alphanumeric word
+    # Typical patterns: "challenge", "profit", "film", "badge"
+    for i, w in enumerate(words):
+        # Clean punctuation for lookup
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if not clean or len(clean) < 2:
+            continue
+        # Skip German/grammar words
+        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        ipa = _lookup_ipa(clean, pronunciation)
+        if ipa:
+            words[i] = f"{w} [{ipa}]"
+            # Only insert for the FIRST word that has IPA
+            # (headword in English column)
+            break
+
+    return ' '.join(words)
+
+
+def fix_cell_phonetics(
+    cells: List[Dict[str, Any]],
+    pronunciation: str = 'british',
+) -> List[Dict[str, Any]]:
+    """Apply IPA phonetic fixes to cell texts for overlay mode.
+
+    In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
+    (entry['english']).  But the overlay reads cell['text'] directly, so
+    phonetic fixes must be applied to cells too.
+
+    This function:
+    1. Replaces garbled IPA brackets with correct dictionary IPA
+    2. Inserts missing IPA for English headwords that have no brackets
+
+    Only processes cells in English-like columns (column_en, column_text).
+    German columns are never processed (they contain meaningful parentheses).
+    """
+    if not IPA_AVAILABLE:
+        return cells
+
+    # Column types where IPA processing makes sense
+    ipa_col_types = {'column_en', 'column_text'}
+    replaced = 0
+
+    for cell in cells:
+        col_type = cell.get('col_type', '')
+        if col_type not in ipa_col_types:
+            continue
+        text = cell.get('text', '') or ''
+        if not text.strip():
+            continue
+
+        # Step 1: replace garbled IPA brackets
+        new_text = _replace_phonetics_in_text(text, pronunciation)
+        # Step 2: insert missing IPA if no brackets were present
+        if new_text == text:
+            new_text = _insert_missing_ipa(text, pronunciation)
+
+        if new_text != text:
+            logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
+            cell['text'] = new_text
+            replaced += 1
+
+    if replaced:
+        logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells")
+    return cells
+
+
 def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
@@ -43,6 +43,7 @@ from cv_vocab_pipeline import (
    _detect_sub_columns,
    _fix_character_confusion,
    _fix_phonetic_brackets,
+    fix_cell_phonetics,
    analyze_layout,
    analyze_layout_by_words,
    build_cell_grid,
@@ -2030,6 +2031,9 @@ async def detect_words(
    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

+    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(cells, pronunciation=pronunciation)
+
    # Grid result (always generic)
    word_result = {
        "cells": cells,
@@ -2169,11 +2173,14 @@ async def _word_batch_stream_generator(
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

-    # 4. Send columns meta
+    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(cells, pronunciation=pronunciation)
+
+    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

-    # 5. Stream all cells
+    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
@@ -2323,6 +2330,9 @@ async def _word_stream_generator(

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

+    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(all_cells, pronunciation=pronunciation)
+
    word_result = {
        "cells": all_cells,
        "grid_shape": {
@@ -3996,6 +4006,9 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
                n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
                used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine

+                # Apply IPA phonetic fixes directly to cell texts
+                fix_cell_phonetics(cells, pronunciation=req.pronunciation)
+
                word_result_data = {
                    "cells": cells,
                    "grid_shape": {
@@ -0,0 +1,117 @@
+"""Tests for fix_cell_phonetics and _insert_missing_ipa."""
+
+import pytest
+from unittest.mock import patch, MagicMock
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+
+class TestInsertMissingIpa:
+    """Tests for _insert_missing_ipa function."""
+
+    def test_single_headword_gets_ipa(self):
+        """Single English headword should get IPA inserted."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("badge", "british")
+        assert "[" in result and "]" in result
+        assert result.startswith("badge [")
+
+    def test_short_phrase_first_word_gets_ipa(self):
+        """First real word in short phrase gets IPA."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("film", "british")
+        assert "[" in result
+
+    def test_long_sentence_unchanged(self):
+        """Sentences with >6 words should not get IPA."""
+        from cv_ocr_engines import _insert_missing_ipa
+        text = "Can I borrow your CD player from you please"
+        result = _insert_missing_ipa(text, "british")
+        assert result == text
+
+    def test_existing_brackets_unchanged(self):
+        """Text with existing brackets should not get double IPA."""
+        from cv_ocr_engines import _insert_missing_ipa
+        text = "dance [dˈɑːns]"
+        result = _insert_missing_ipa(text, "british")
+        assert result == text
+
+    def test_empty_text_unchanged(self):
+        """Empty text returns empty."""
+        from cv_ocr_engines import _insert_missing_ipa
+        assert _insert_missing_ipa("", "british") == ""
+        assert _insert_missing_ipa("  ", "british") == ""
+
+    def test_grammar_words_skipped(self):
+        """Grammar particles should not get IPA."""
+        from cv_ocr_engines import _insert_missing_ipa
+        # "sth" is in _GRAMMAR_BRACKET_WORDS
+        result = _insert_missing_ipa("sth", "british")
+        assert "[" not in result
+
+    def test_german_word_no_ipa(self):
+        """German words (no IPA entry) stay unchanged."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("Anstecknadel", "british")
+        assert result == "Anstecknadel"
+
+
+class TestFixCellPhonetics:
+    """Tests for fix_cell_phonetics function."""
+
+    def test_english_column_cells_processed(self):
+        """Cells with col_type column_en should be processed."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
+            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        # English cell should have IPA
+        assert "[" in cells[0]["text"]
+        # German cell should be unchanged
+        assert cells[1]["text"] == "Anstecknadel"
+
+    def test_column_text_cells_processed(self):
+        """Cells with col_type column_text should be processed."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_text", "text": "challenge"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert "[" in cells[0]["text"]
+
+    def test_garbled_ipa_replaced(self):
+        """Garbled IPA brackets should be replaced with correct IPA."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        # Should have proper IPA now
+        text = cells[0]["text"]
+        assert "dance [" in text
+        assert "{'tfatno]" not in text
+
+    def test_empty_cells_unchanged(self):
+        """Empty cells should not cause errors."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": ""},
+            {"cell_id": "c2", "col_type": "column_en", "text": None},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert cells[0]["text"] == ""
+
+    def test_non_english_col_types_skipped(self):
+        """Cells with column_de, column_example etc. should not be processed."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
+            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert cells[0]["text"] == "Eis (gefrorenes Wasser)"
+        assert cells[1]["text"] == "(sich beschweren)"