Files
breakpilot-lehrer/klausur-service/backend/tests/test_cell_phonetics.py
Benjamin Admin 5f2ed44654
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 39s
Cleanup: Delete ALL 242 shims, update ALL consumer imports
klausur-service: 183 shims deleted, 26 test files + 8 source files updated
backend-lehrer: 59 shims deleted, main.py + 8 source files updated

All imports now use the new package paths directly.
Zero shims remaining in the entire codebase.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-26 00:11:33 +02:00

175 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for fix_cell_phonetics and _insert_missing_ipa."""
import pytest
from unittest.mock import patch, MagicMock
import sys
import os
# Put the parent of this file's directory at the front of sys.path,
# presumably so the function-level ``from ocr.engines.engines import ...``
# statements in the tests below resolve without installing the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
class TestInsertMissingIpa:
    """Exercise ``_insert_missing_ipa`` and its ``_decompose_compound`` helper.

    Every test imports the function under test locally, so an import
    failure is reported per test rather than as a collection error.
    """

    def test_single_headword_gets_ipa(self):
        """A lone English headword is followed by a bracketed transcription."""
        from ocr.engines.engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("badge", "british")

        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("badge [")

    def test_short_phrase_first_word_gets_ipa(self):
        """The headword 'film' receives an opening IPA bracket.

        NOTE(review): the test name refers to a short phrase, but the input
        is a single word - confirm whether a multi-word input was intended.
        """
        from ocr.engines.engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("film", "british")

        assert "[" in annotated

    def test_long_sentence_unchanged(self):
        """A nine-word sentence is returned verbatim.

        The original author's note cites a cutoff of more than six words;
        that threshold lives in the implementation and is not checked here.
        """
        from ocr.engines.engines import _insert_missing_ipa

        sentence = "Can I borrow your CD player from you please"

        assert _insert_missing_ipa(sentence, "british") == sentence

    def test_existing_brackets_unchanged(self):
        """Text that already carries an IPA bracket is not annotated twice."""
        from ocr.engines.engines import _insert_missing_ipa

        already_annotated = "dance [dˈɑːns]"

        assert _insert_missing_ipa(already_annotated, "british") == already_annotated

    def test_empty_text_unchanged(self):
        """Empty and whitespace-only input both yield the empty string."""
        from ocr.engines.engines import _insert_missing_ipa

        assert _insert_missing_ipa("", "british") == ""
        assert _insert_missing_ipa(" ", "british") == ""

    def test_grammar_words_skipped(self):
        """The grammar particle 'sth' must not receive a transcription."""
        from ocr.engines.engines import _insert_missing_ipa

        # Per the original author's note, "sth" is listed in
        # _GRAMMAR_BRACKET_WORDS inside the implementation.
        assert "[" not in _insert_missing_ipa("sth", "british")

    def test_german_word_no_ipa(self):
        """A German word without an IPA entry is passed through untouched."""
        from ocr.engines.engines import _insert_missing_ipa

        assert _insert_missing_ipa("Anstecknadel", "british") == "Anstecknadel"

    def test_compound_word_schoolbag_gets_ipa(self):
        """R07: the compound 'schoolbag' (school + bag) gets bracketed IPA."""
        from ocr.engines.engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("schoolbag", "british")

        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("schoolbag [")

    def test_compound_word_blackbird(self):
        """The compound 'blackbird' gets a bracketed transcription."""
        from ocr.engines.engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("blackbird", "british")

        assert "[" in annotated
        assert "]" in annotated

    def test_compound_word_too_short(self):
        """The three-letter word 'bag' is not decomposed (returns ``None``).

        The original author's note gives the minimum length as six
        characters; only this one short input is checked here.
        """
        from ocr.engines.engines import _decompose_compound

        assert _decompose_compound("bag", "british") is None

    def test_decompose_compound_direct(self):
        """Calling ``_decompose_compound`` on 'schoolbag' yields a result."""
        from ocr.engines.engines import _decompose_compound

        # schoolbag = school + bag; both parts are expected to be in the
        # pronunciation dictionary used by the implementation.
        assert _decompose_compound("schoolbag", "british") is not None
class TestStripPostBracketGarbled:
    """Exercise ``_strip_post_bracket_garbled``.

    The function is expected to drop garbled phonetic residue that follows
    an IPA bracket while keeping genuine words.
    """

    def test_simple_trailing_garbled(self):
        """R21-simple: 'sea [sˈiː] si:' loses the trailing 'si:' marker."""
        from ocr.engines.engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("sea [sˈiː] si:")

        assert "si:" not in cleaned
        assert cleaned.startswith("sea [sˈiː]")

    def test_multi_word_trailing_garbled(self):
        """R21: 'seat [sˈiːt] belt si:t belt' keeps one 'belt', drops the rest."""
        from ocr.engines.engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt")

        # The real word survives ...
        assert "belt" in cleaned
        # ... the garbled phonetic residue does not ...
        assert "si:t" not in cleaned
        # ... and the duplicated trailing "belt" is removed with it.
        assert cleaned.count("belt") == 1

    def test_delimiter_after_bracket_kept(self):
        """Text following the IPA bracket is preserved.

        NOTE(review): the test name mentions a delimiter, but the input as
        seen here contains none - confirm the literal against the repository
        in case a dash character was lost when the file was rendered.
        """
        from ocr.engines.engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("dance [dˈɑːns] tanzen")

        assert " tanzen" in cleaned

    def test_german_after_bracket_kept(self):
        """A capitalised German word after the IPA bracket is preserved."""
        from ocr.engines.engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("badge [bædʒ] Abzeichen")

        assert "Abzeichen" in cleaned
class TestFixCellPhonetics:
    """Exercise ``fix_cell_phonetics``.

    Each test passes a list of cell dicts (``cell_id``, ``col_type``,
    ``text``) and inspects the same dicts after the call; the return value
    of ``fix_cell_phonetics`` is not used.
    """

    def test_english_column_cells_processed(self):
        """A ``column_en`` cell gains IPA; a ``column_de`` cell is untouched."""
        from ocr.engines.engines import fix_cell_phonetics

        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
        ]

        fix_cell_phonetics(cells, pronunciation="british")

        # English cell should have IPA.
        assert "[" in cells[0]["text"]
        # German cell should be unchanged.
        assert cells[1]["text"] == "Anstecknadel"

    def test_column_text_cells_processed(self):
        """A ``column_text`` cell is processed and gains an IPA bracket."""
        from ocr.engines.engines import fix_cell_phonetics

        cells = [
            {"cell_id": "c1", "col_type": "column_text", "text": "challenge"},
        ]

        fix_cell_phonetics(cells, pronunciation="british")

        assert "[" in cells[0]["text"]

    def test_garbled_ipa_replaced(self):
        """A garbled pseudo-IPA fragment is replaced by a proper bracket."""
        from ocr.engines.engines import fix_cell_phonetics

        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"},
        ]

        fix_cell_phonetics(cells, pronunciation="british")

        # The headword must now be followed by a proper bracket, and the
        # garbled fragment must be gone.
        text = cells[0]["text"]
        assert "dance [" in text
        assert "{'tfatno]" not in text

    def test_empty_cells_unchanged(self):
        """Cells with empty or ``None`` text cause no error and gain no content."""
        from ocr.engines.engines import fix_cell_phonetics

        cells = [
            {"cell_id": "c1", "col_type": "column_en", "text": ""},
            {"cell_id": "c2", "col_type": "column_en", "text": None},
        ]

        fix_cell_phonetics(cells, pronunciation="british")

        assert cells[0]["text"] == ""
        # The None cell must not pick up any content (for example a
        # stringified "None" with IPA appended). Falsiness is asserted
        # rather than identity so the check holds whether the
        # implementation leaves None in place or normalises it to "".
        assert not cells[1]["text"]

    def test_non_english_col_types_skipped(self):
        """``column_de`` and ``column_example`` cells are left untouched."""
        from ocr.engines.engines import fix_cell_phonetics

        cells = [
            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
        ]

        fix_cell_phonetics(cells, pronunciation="british")

        assert cells[0]["text"] == "Eis (gefrorenes Wasser)"
        assert cells[1]["text"] == "(sich beschweren)"