feat: Sprint 1 — IPA hardening, regression framework, ground-truth review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s

Track A (Backend):
- Compound word IPA decomposition (schoolbag→school+bag)
- Trailing garbled IPA fragment removal after brackets (R21 fix)
- Regression runner with DB persistence, history endpoints
- Page crop determinism verified with tests

Track B (Frontend):
- OCR Regression dashboard (/ai/ocr-regression)
- Ground Truth Review workflow (/ai/ocr-ground-truth)
  with split-view, confidence highlighting, inline edit,
  batch mark, progress tracking

Track C (Docs):
- OCR-Pipeline.md v5.0 (Steps 5e-5h)
- Regression testing guide
- mkdocs.yml nav update

Track D (Infra):
- TrOCR baseline benchmark script
- run-regression.sh shell script
- Migration 008: regression_runs table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-23 09:21:27 +01:00
parent f5d5d6c59c
commit a1e079b911
13 changed files with 1796 additions and 15 deletions

View File

@@ -57,6 +57,63 @@ class TestInsertMissingIpa:
result = _insert_missing_ipa("Anstecknadel", "british")
assert result == "Anstecknadel"
def test_compound_word_schoolbag_gets_ipa(self):
"""R07: Compound word 'schoolbag' should get decomposed IPA (school+bag)."""
from cv_ocr_engines import _insert_missing_ipa
result = _insert_missing_ipa("schoolbag", "british")
assert "[" in result and "]" in result
assert result.startswith("schoolbag [")
def test_compound_word_blackbird(self):
"""Compound word 'blackbird' should get decomposed IPA."""
from cv_ocr_engines import _insert_missing_ipa
result = _insert_missing_ipa("blackbird", "british")
assert "[" in result and "]" in result
def test_compound_word_too_short(self):
"""Words shorter than 6 chars should not attempt compound decomposition."""
from cv_ocr_engines import _decompose_compound
assert _decompose_compound("bag", "british") is None
def test_decompose_compound_direct(self):
"""Direct test of _decompose_compound for known compounds."""
from cv_ocr_engines import _decompose_compound
# schoolbag = school + bag — both should be in dictionary
result = _decompose_compound("schoolbag", "british")
assert result is not None
class TestStripPostBracketGarbled:
"""Tests for _strip_post_bracket_garbled — trailing garbled IPA removal."""
def test_simple_trailing_garbled(self):
"""R21-simple: 'sea [sˈiː] si:' → trailing IPA marker removed."""
from cv_ocr_engines import _strip_post_bracket_garbled
result = _strip_post_bracket_garbled("sea [sˈiː] si:")
assert "si:" not in result
assert result.startswith("sea [sˈiː]")
def test_multi_word_trailing_garbled(self):
"""R21: 'seat [sˈiːt] belt si:t belt' → keep 'belt', remove garbled."""
from cv_ocr_engines import _strip_post_bracket_garbled
result = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt")
assert "belt" in result # real word kept
assert "si:t" not in result # garbled removed
# Should contain "seat [sˈiːt] belt" but not the garbled duplication
assert result.count("belt") == 1
def test_delimiter_after_bracket_kept(self):
"""Delimiters after IPA bracket are kept."""
from cv_ocr_engines import _strip_post_bracket_garbled
result = _strip_post_bracket_garbled("dance [dˈɑːns] tanzen")
assert " tanzen" in result
def test_german_after_bracket_kept(self):
"""German words (uppercase) after IPA bracket are kept."""
from cv_ocr_engines import _strip_post_bracket_garbled
result = _strip_post_bracket_garbled("badge [bædʒ] Abzeichen")
assert "Abzeichen" in result
class TestFixCellPhonetics:
"""Tests for fix_cell_phonetics function."""