Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Track A (Backend):
- Compound word IPA decomposition (schoolbag→school+bag)
- Trailing garbled IPA fragment removal after brackets (R21 fix)
- Regression runner with DB persistence, history endpoints
- Page crop determinism verified with tests

Track B (Frontend):
- OCR Regression dashboard (/ai/ocr-regression)
- Ground Truth Review workflow (/ai/ocr-ground-truth) with split-view, confidence highlighting, inline edit, batch mark, progress tracking

Track C (Docs):
- OCR-Pipeline.md v5.0 (Steps 5e-5h)
- Regression testing guide
- mkdocs.yml nav update

Track D (Infra):
- TrOCR baseline benchmark script
- run-regression.sh shell script
- Migration 008: regression_runs table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
175 lines
7.3 KiB
Python
175 lines
7.3 KiB
Python
"""Tests for fix_cell_phonetics and _insert_missing_ipa."""
|
||
|
||
import pytest
|
||
from unittest.mock import patch, MagicMock
|
||
|
||
import sys
|
||
import os
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||
|
||
|
||
class TestInsertMissingIpa:
    """Checks for _insert_missing_ipa and the _decompose_compound helper.

    Each test imports the function under test locally from cv_ocr_engines
    and calls it with the "british" pronunciation.
    """

    def test_single_headword_gets_ipa(self):
        """A lone English headword comes back as 'badge [...]'."""
        from cv_ocr_engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("badge", "british")

        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("badge [")

    def test_short_phrase_first_word_gets_ipa(self):
        """The input "film" must come back containing an opening bracket.

        NOTE(review): despite the test name, the input is a single word,
        not a multi-word phrase.
        """
        from cv_ocr_engines import _insert_missing_ipa

        assert "[" in _insert_missing_ipa("film", "british")

    def test_long_sentence_unchanged(self):
        """A nine-word sentence (more than 6 words) is returned verbatim."""
        from cv_ocr_engines import _insert_missing_ipa

        sentence = "Can I borrow your CD player from you please"

        assert _insert_missing_ipa(sentence, "british") == sentence

    def test_existing_brackets_unchanged(self):
        """Text that already carries a bracketed IPA is not annotated twice."""
        from cv_ocr_engines import _insert_missing_ipa

        already_annotated = "dance [dˈɑːns]"

        assert _insert_missing_ipa(already_annotated, "british") == already_annotated

    def test_empty_text_unchanged(self):
        """Both an empty string and a single space yield an empty string."""
        from cv_ocr_engines import _insert_missing_ipa

        for blank in ("", " "):
            assert _insert_missing_ipa(blank, "british") == ""

    def test_grammar_words_skipped(self):
        """A grammar particle must not receive an IPA bracket."""
        from cv_ocr_engines import _insert_missing_ipa

        # "sth" is in _GRAMMAR_BRACKET_WORDS
        assert "[" not in _insert_missing_ipa("sth", "british")

    def test_german_word_no_ipa(self):
        """A German word (no IPA entry) is returned unchanged."""
        from cv_ocr_engines import _insert_missing_ipa

        assert _insert_missing_ipa("Anstecknadel", "british") == "Anstecknadel"

    def test_compound_word_schoolbag_gets_ipa(self):
        """R07: 'schoolbag' gets a bracketed IPA via school+bag decomposition."""
        from cv_ocr_engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("schoolbag", "british")

        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("schoolbag [")

    def test_compound_word_blackbird(self):
        """The compound 'blackbird' comes back with a bracketed IPA."""
        from cv_ocr_engines import _insert_missing_ipa

        annotated = _insert_missing_ipa("blackbird", "british")

        assert "[" in annotated
        assert "]" in annotated

    def test_compound_word_too_short(self):
        """_decompose_compound returns None for the three-letter word "bag".

        Per the original test note, words shorter than 6 characters should
        not be considered for compound decomposition.
        """
        from cv_ocr_engines import _decompose_compound

        assert _decompose_compound("bag", "british") is None

    def test_decompose_compound_direct(self):
        """Calling _decompose_compound directly on a known compound succeeds."""
        from cv_ocr_engines import _decompose_compound

        # schoolbag = school + bag — both should be in dictionary
        decomposed = _decompose_compound("schoolbag", "british")

        assert decomposed is not None
|
||
|
||
|
||
class TestStripPostBracketGarbled:
    """Checks for _strip_post_bracket_garbled.

    The function is expected to drop garbled IPA fragments that trail a
    bracketed IPA while keeping legitimate text that follows it.
    """

    def test_simple_trailing_garbled(self):
        """R21-simple: the trailing 'si:' after 'sea [sˈiː]' is removed."""
        from cv_ocr_engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("sea [sˈiː] si:")

        assert "si:" not in cleaned
        assert cleaned.startswith("sea [sˈiː]")

    def test_multi_word_trailing_garbled(self):
        """R21: in 'seat [sˈiːt] belt si:t belt' only the garbled tail goes."""
        from cv_ocr_engines import _strip_post_bracket_garbled

        cleaned = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt")

        # The real word survives ...
        assert "belt" in cleaned
        # ... the garbled fragment does not ...
        assert "si:t" not in cleaned
        # ... and the duplicated "belt" from the garbled tail is gone too,
        # leaving "seat [sˈiːt] belt".
        assert cleaned.count("belt") == 1

    def test_delimiter_after_bracket_kept(self):
        """A dash delimiter and the text after it stay in place."""
        from cv_ocr_engines import _strip_post_bracket_garbled

        assert "– tanzen" in _strip_post_bracket_garbled("dance [dˈɑːns] – tanzen")

    def test_german_after_bracket_kept(self):
        """A capitalised German word following the IPA bracket is kept."""
        from cv_ocr_engines import _strip_post_bracket_garbled

        assert "Abzeichen" in _strip_post_bracket_garbled("badge [bædʒ] Abzeichen")
|
||
|
||
|
||
class TestFixCellPhonetics:
    """Checks for fix_cell_phonetics.

    Every test passes a list of cell dicts and then reads the outcome back
    from that same list; the function's return value is not used.
    """

    def test_english_column_cells_processed(self):
        """A column_en cell gains IPA; a column_de cell is left alone."""
        from cv_ocr_engines import fix_cell_phonetics

        rows = [
            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
        ]

        fix_cell_phonetics(rows, pronunciation="british")

        # English cell should have IPA
        assert "[" in rows[0]["text"]
        # German cell should be unchanged
        assert rows[1]["text"] == "Anstecknadel"

    def test_column_text_cells_processed(self):
        """A column_text cell is processed and gains an IPA bracket."""
        from cv_ocr_engines import fix_cell_phonetics

        rows = [{"cell_id": "c1", "col_type": "column_text", "text": "challenge"}]

        fix_cell_phonetics(rows, pronunciation="british")

        assert "[" in rows[0]["text"]

    def test_garbled_ipa_replaced(self):
        """A garbled bracket after the headword is swapped for a proper one."""
        from cv_ocr_engines import fix_cell_phonetics

        rows = [{"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"}]

        fix_cell_phonetics(rows, pronunciation="british")

        fixed = rows[0]["text"]
        # The headword is now followed by a real bracket ...
        assert "dance [" in fixed
        # ... and the garbled fragment is gone.
        assert "{'tfatno]" not in fixed

    def test_empty_cells_unchanged(self):
        """Cells whose text is "" or None must not raise.

        Only the empty-string cell is asserted on afterwards; the None cell
        is included purely to confirm the call completes.
        """
        from cv_ocr_engines import fix_cell_phonetics

        rows = [
            {"cell_id": "c1", "col_type": "column_en", "text": ""},
            {"cell_id": "c2", "col_type": "column_en", "text": None},
        ]

        fix_cell_phonetics(rows, pronunciation="british")

        assert rows[0]["text"] == ""

    def test_non_english_col_types_skipped(self):
        """column_de and column_example cells keep their text verbatim."""
        from cv_ocr_engines import fix_cell_phonetics

        rows = [
            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
        ]

        fix_cell_phonetics(rows, pronunciation="british")

        untouched = ("Eis (gefrorenes Wasser)", "(sich beschweren)")
        for position, expected_text in enumerate(untouched):
            assert rows[position]["text"] == expected_text
|