From 2f51ac617fe6a416d0b10de141c0d64f246c3526 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 15:47:26 +0100 Subject: [PATCH] feat: IPA-Lautschrift in Cell-Texte einfuegen (fuer Overlay-Modus) fix_cell_phonetics() ersetzt fehlerhafte IPA-Klammern UND fuegt fehlende Lautschrift fuer englische Woerter ein (z.B. badge, film, challenge, profit). Wird auf alle Zellen mit col_type column_en/column_text angewandt. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 99 +++++++++++++++ klausur-service/backend/ocr_pipeline_api.py | 17 ++- .../backend/tests/test_cell_phonetics.py | 117 ++++++++++++++++++ 3 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 klausur-service/backend/tests/test_cell_phonetics.py diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 831dc45..3d1f89e 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -885,6 +885,105 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str return text +def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: + """Insert IPA pronunciation for English words that have no brackets at all. + + OCR sometimes drops the phonetic transcription entirely (e.g. "challenge" + instead of "challenge [ˈtʃælɪndʒ]"). This scans the text for the first + word that has a dictionary IPA entry and appends [ipa] right after it. + + IPA is only inserted when: + - the text contains no bracket character at all ("[", "{" or "(") + - the text has at most 6 whitespace-separated words + - some word (2+ chars after stripping non-letters, not a grammar word) + has an IPA entry in the dictionary + + This is intentionally conservative: at most ONE word per text is + annotated: the first one with a dictionary entry.
+ """ + if not IPA_AVAILABLE: + return text + if not text or not text.strip(): + return text + + # Skip if already has brackets (IPA replacement handles those) + if any(ch in text for ch in '[{('): + return text + + # Only process short text fragments (typical vocab cells). + # Long sentences / paragraphs should not get IPA insertions. + words = text.strip().split() + if len(words) > 6: + return text + + # Try to insert IPA for the first alphanumeric word + # Typical patterns: "challenge", "profit", "film", "badge" + for i, w in enumerate(words): + # Clean punctuation for lookup + clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w) + if not clean or len(clean) < 2: + continue + # Skip German/grammar words + if clean.lower() in _GRAMMAR_BRACKET_WORDS: + continue + ipa = _lookup_ipa(clean, pronunciation) + if ipa: + words[i] = f"{w} [{ipa}]" + # Only insert for the FIRST word that has IPA + # (headword in English column) + break + + return ' '.join(words) + + +def fix_cell_phonetics( + cells: List[Dict[str, Any]], + pronunciation: str = 'british', +) -> List[Dict[str, Any]]: + """Apply IPA phonetic fixes to cell texts for overlay mode. + + In the normal pipeline, _fix_phonetic_brackets operates on vocab entries + (entry['english']). But the overlay reads cell['text'] directly, so + phonetic fixes must be applied to cells too. + + This function: + 1. Replaces garbled IPA brackets with correct dictionary IPA + 2. Inserts missing IPA for English headwords that have no brackets + + Only processes cells in English-like columns (column_en, column_text). + German columns are never processed (they contain meaningful parentheses). 
+ """ + if not IPA_AVAILABLE: + return cells + + # Column types where IPA processing makes sense + ipa_col_types = {'column_en', 'column_text'} + replaced = 0 + + for cell in cells: + col_type = cell.get('col_type', '') + if col_type not in ipa_col_types: + continue + text = cell.get('text', '') or '' + if not text.strip(): + continue + + # Step 1: replace garbled IPA brackets + new_text = _replace_phonetics_in_text(text, pronunciation) + # Step 2: insert missing IPA if no brackets were present + if new_text == text: + new_text = _insert_missing_ipa(text, pronunciation) + + if new_text != text: + logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'") + cell['text'] = new_text + replaced += 1 + + if replaced: + logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells") + return cells + + def _assign_row_words_to_columns( row: RowGeometry, columns: List[PageRegion], diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 0b528c9..c7b2b18 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -43,6 +43,7 @@ from cv_vocab_pipeline import ( _detect_sub_columns, _fix_character_confusion, _fix_phonetic_brackets, + fix_cell_phonetics, analyze_layout, analyze_layout_by_words, build_cell_grid, @@ -2030,6 +2031,9 @@ async def detect_words( # Determine which engine was actually used used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine + # Apply IPA phonetic fixes directly to cell texts (for overlay mode) + fix_cell_phonetics(cells, pronunciation=pronunciation) + # Grid result (always generic) word_result = { "cells": cells, @@ -2169,11 +2173,14 @@ async def _word_batch_stream_generator( logger.info(f"SSE batch: client disconnected after OCR for {session_id}") return - # 4. Send columns meta + # 4. 
Apply IPA phonetic fixes directly to cell texts (for overlay mode) + fix_cell_phonetics(cells, pronunciation=pronunciation) + + # 5. Send columns meta if columns_meta: yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n" - # 5. Stream all cells + # 6. Stream all cells for idx, cell in enumerate(cells): cell_event = { "type": "cell", @@ -2323,6 +2330,9 @@ async def _word_stream_generator( used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine + # Apply IPA phonetic fixes directly to cell texts (for overlay mode) + fix_cell_phonetics(all_cells, pronunciation=pronunciation) + word_result = { "cells": all_cells, "grid_shape": { @@ -3996,6 +4006,9 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): n_content_rows = len([r for r in row_geoms if r.row_type == 'content']) used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine + # Apply IPA phonetic fixes directly to cell texts + fix_cell_phonetics(cells, pronunciation=req.pronunciation) + word_result_data = { "cells": cells, "grid_shape": { diff --git a/klausur-service/backend/tests/test_cell_phonetics.py b/klausur-service/backend/tests/test_cell_phonetics.py new file mode 100644 index 0000000..4918497 --- /dev/null +++ b/klausur-service/backend/tests/test_cell_phonetics.py @@ -0,0 +1,117 @@ +"""Tests for fix_cell_phonetics and _insert_missing_ipa.""" + +import pytest +from unittest.mock import patch, MagicMock + +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + + +class TestInsertMissingIpa: + """Tests for _insert_missing_ipa function.""" + + def test_single_headword_gets_ipa(self): + """Single English headword should get IPA inserted.""" + from cv_ocr_engines import _insert_missing_ipa + result = _insert_missing_ipa("badge", "british") + assert "[" in result and "]" in result + assert result.startswith("badge [") + + def test_short_phrase_first_word_gets_ipa(self): + 
"""First real word in short phrase gets IPA.""" + from cv_ocr_engines import _insert_missing_ipa + result = _insert_missing_ipa("film", "british") + assert "[" in result + + def test_long_sentence_unchanged(self): + """Sentences with >6 words should not get IPA.""" + from cv_ocr_engines import _insert_missing_ipa + text = "Can I borrow your CD player from you please" + result = _insert_missing_ipa(text, "british") + assert result == text + + def test_existing_brackets_unchanged(self): + """Text with existing brackets should not get double IPA.""" + from cv_ocr_engines import _insert_missing_ipa + text = "dance [dˈɑːns]" + result = _insert_missing_ipa(text, "british") + assert result == text + + def test_empty_text_unchanged(self): + """Empty text returns empty.""" + from cv_ocr_engines import _insert_missing_ipa + assert _insert_missing_ipa("", "british") == "" + assert _insert_missing_ipa(" ", "british") == "" + + def test_grammar_words_skipped(self): + """Grammar particles should not get IPA.""" + from cv_ocr_engines import _insert_missing_ipa + # "sth" is in _GRAMMAR_BRACKET_WORDS + result = _insert_missing_ipa("sth", "british") + assert "[" not in result + + def test_german_word_no_ipa(self): + """German words (no IPA entry) stay unchanged.""" + from cv_ocr_engines import _insert_missing_ipa + result = _insert_missing_ipa("Anstecknadel", "british") + assert result == "Anstecknadel" + + +class TestFixCellPhonetics: + """Tests for fix_cell_phonetics function.""" + + def test_english_column_cells_processed(self): + """Cells with col_type column_en should be processed.""" + from cv_ocr_engines import fix_cell_phonetics + cells = [ + {"cell_id": "c1", "col_type": "column_en", "text": "badge"}, + {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"}, + ] + fix_cell_phonetics(cells, pronunciation="british") + # English cell should have IPA + assert "[" in cells[0]["text"] + # German cell should be unchanged + assert cells[1]["text"] == "Anstecknadel" + + 
def test_column_text_cells_processed(self): + """Cells with col_type column_text should be processed.""" + from cv_ocr_engines import fix_cell_phonetics + cells = [ + {"cell_id": "c1", "col_type": "column_text", "text": "challenge"}, + ] + fix_cell_phonetics(cells, pronunciation="british") + assert "[" in cells[0]["text"] + + def test_garbled_ipa_replaced(self): + """Garbled IPA brackets should be replaced with correct IPA.""" + from cv_ocr_engines import fix_cell_phonetics + cells = [ + {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"}, + ] + fix_cell_phonetics(cells, pronunciation="british") + # Should have proper IPA now + text = cells[0]["text"] + assert "dance [" in text + assert "{'tfatno]" not in text + + def test_empty_cells_unchanged(self): + """Empty cells should not cause errors.""" + from cv_ocr_engines import fix_cell_phonetics + cells = [ + {"cell_id": "c1", "col_type": "column_en", "text": ""}, + {"cell_id": "c2", "col_type": "column_en", "text": None}, + ] + fix_cell_phonetics(cells, pronunciation="british") + assert cells[0]["text"] == "" + + def test_non_english_col_types_skipped(self): + """Cells with column_de, column_example etc. should not be processed.""" + from cv_ocr_engines import fix_cell_phonetics + cells = [ + {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"}, + {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"}, + ] + fix_cell_phonetics(cells, pronunciation="british") + assert cells[0]["text"] == "Eis (gefrorenes Wasser)" + assert cells[1]["text"] == "(sich beschweren)"