feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s
(duplicate CI status block removed — identical to the listing above)
1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection,
cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps
(lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on
column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field
highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from
any step, with reprocess button on completed pipeline steps
Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline
and updates dewarp tests to match current (image, info) return signature.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,9 @@ Tests cover:
|
||||
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
|
||||
- Stage 6: Multi-pass OCR region handling
|
||||
- Stage 7: Line grouping and vocabulary matching
|
||||
- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
|
||||
- Phonetic detection (_is_phonetic_only_text)
|
||||
- Phonetic & continuation row merging
|
||||
- Orchestrator (run_cv_pipeline)
|
||||
|
||||
DSGVO Note: All tests run locally with synthetic data. No external API calls.
|
||||
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
|
||||
CV2_AVAILABLE,
|
||||
TESSERACT_AVAILABLE,
|
||||
CV_PIPELINE_AVAILABLE,
|
||||
_is_noise_tail_token,
|
||||
_clean_cell_text,
|
||||
_is_phonetic_only_text,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_continuation_rows,
|
||||
)
|
||||
|
||||
|
||||
@@ -202,16 +210,28 @@ class TestDeskew:
|
||||
|
||||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
    """Test dewarp stage (returns (image, info) tuple)."""

    def test_dewarp_returns_tuple(self, white_image):
        """dewarp_image must return (image, dewarp_info) tuple."""
        result = dewarp_image(white_image)
        assert isinstance(result, tuple)
        assert len(result) == 2
        img_out, info = result
        assert isinstance(img_out, np.ndarray)
        assert isinstance(info, dict)
        assert "shear_degrees" in info

    def test_dewarp_preserves_shape(self, text_like_image):
        """Output image should have same shape as input."""
        img_out, _ = dewarp_image(text_like_image)
        assert img_out.shape == text_like_image.shape

    def test_dewarp_white_image_no_correction(self, white_image):
        """A uniform white image should get no shear correction."""
        img_out, info = dewarp_image(white_image)
        assert abs(info["shear_degrees"]) < 0.5
        assert img_out.shape == white_image.shape
||||
# =============================================
|
||||
@@ -561,6 +581,268 @@ class TestStageIntegration:
|
||||
assert layout_img.shape[:2] == corrected.shape[:2]
|
||||
|
||||
|
||||
# =============================================
|
||||
# NOISE FILTER TESTS
|
||||
# =============================================
|
||||
|
||||
class TestNoiseFilter:
    """Test _is_noise_tail_token for trailing OCR noise detection."""

    # --- Tokens that should be KEPT (return False) ---

    @pytest.mark.parametrize("token", [
        # Compound words with hyphens
        "money-saver",
        "under-",
        "well-known",
        # Words with parenthesized parts (dictionary entries)
        "Schild(chen)",
        "(Salat-)Gurke",
        "(auf)",
        "(on)",
        "selbst)",
        "(wir",
        "Tanz(veranstaltung)",
        "(zer)brechen",
        # Phonetic brackets
        "serva]",
        "['mani",
        "[eg]",
        "[maus]",
        # Words with trailing punctuation
        "cupcakes.",
        "sister.",
        "mice",
        # Abbreviations
        "e.g.",
        "sth.",
        "usw.",
        "adj.",
        # Ellipsis
        "...",
        "\u2026",
        # Regular words
        "the",
        "cat",
        "big",
        "run",
        "set",
        "ago",
    ])
    def test_keep_real_tokens(self, token):
        """Real words, dictionary punctuation, and phonetic brackets are kept."""
        assert _is_noise_tail_token(token) is False, f"Should keep {token!r}"

    # --- Tokens that should be FILTERED (return True) ---

    @pytest.mark.parametrize("token", [
        # Pure non-alpha
        "B|",
        "3d",
        "x7",
        ")",
        "|",
        "@",
        "3",
        # Very short non-dictionary fragments
        "ee",
        "k",
        "zz",
        "qq",
        # Empty
        "",
        " ",
    ])
    def test_filter_noise_tokens(self, token):
        """OCR noise fragments are filtered."""
        assert _is_noise_tail_token(token) is True, f"Should filter {token!r}"
|
||||
|
||||
|
||||
class TestCleanCellText:
    """Test _clean_cell_text integration (full text → cleaned text)."""

    def test_empty_returns_empty(self):
        assert _clean_cell_text("") == ""
        assert _clean_cell_text(" ") == ""

    def test_real_word_unchanged(self):
        assert _clean_cell_text("cupcakes") == "cupcakes"

    def test_strips_trailing_noise(self):
        """Trailing noise tokens should be removed."""
        result = _clean_cell_text("cupcakes B|")
        assert result == "cupcakes"

    def test_keeps_trailing_real_word(self):
        """Trailing real words should be kept."""
        result = _clean_cell_text("big cat")
        assert result == "big cat"

    def test_abbreviation_kept(self):
        """Known abbreviations should not be cleared."""
        result = _clean_cell_text("e.g.")
        assert result == "e.g."

    def test_pure_garbage_cleared(self):
        """OCR garbage without real words should be cleared."""
        result = _clean_cell_text("3d |x")
        assert result == ""

    def test_compound_word_preserved(self):
        """Compound words with hyphens should be preserved."""
        result = _clean_cell_text("money-saver")
        assert result == "money-saver"

    def test_parenthesized_word_preserved(self):
        result = _clean_cell_text("(Salat-)Gurke")
        assert result == "(Salat-)Gurke"

    def test_multiple_trailing_noise(self):
        """Multiple trailing noise tokens should all be removed."""
        result = _clean_cell_text("achieve 3 |")
        assert result == "achieve"
|
||||
|
||||
|
||||
class TestPhoneticOnlyText:
    """Test _is_phonetic_only_text for phonetic transcription detection."""

    @pytest.mark.parametrize("text,expected", [
        # Phonetic-only patterns → True
        ("['mani serva]", True),
        ("[dɑːns]", True),
        ("[\"a:mand]", True),
        ("['wɜːkʃɒp]", True),
        # serva] has 5 alpha chars after bracket removal → NOT phonetic-only
        ("serva]", False),
        # NOT phonetic-only → False
        ("almond ['a:mand]", False),
        ("Mandel", False),
        ("cupcakes", False),
        ("", False),
        ("achieve", False),
        ("money-saver ['mani]", False),
    ])
    def test_phonetic_detection(self, text, expected):
        assert _is_phonetic_only_text(text) is expected, \
            f"_is_phonetic_only_text({text!r}) should be {expected}"
|
||||
|
||||
|
||||
class TestMergePhoneticContinuationRows:
    """Test _merge_phonetic_continuation_rows for phonetic row merging."""

    def test_empty_list(self):
        assert _merge_phonetic_continuation_rows([]) == []

    def test_single_entry(self):
        entries = [{"english": "cat", "german": "Katze", "example": ""}]
        result = _merge_phonetic_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "cat"

    def test_merges_phonetic_row(self):
        """Phonetic-only row should merge into previous entry."""
        entries = [
            {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0},
            {"english": "['mani serva]", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_phonetic_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "money-saver ['mani serva]"
        assert result[0]["german"] == "Sparfuchs"

    def test_no_merge_when_de_present(self):
        """Row with DE text should NOT be merged even if EN looks phonetic."""
        entries = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "[kæt]", "german": "some text", "example": ""},
        ]
        result = _merge_phonetic_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_regular_rows(self):
        """Normal vocab rows should not be merged."""
        entries = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "dog", "german": "Hund", "example": ""},
        ]
        result = _merge_phonetic_continuation_rows(entries)
        assert len(result) == 2

    def test_merges_example_too(self):
        """If phonetic row has example text, it should merge into previous."""
        entries = [
            {"english": "dance", "german": "tanzen", "example": "", "row_index": 0},
            {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1},
        ]
        result = _merge_phonetic_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "dance [dɑːns]"
        assert result[0]["example"] == "Let's dance."
|
||||
|
||||
|
||||
class TestMergeContinuationRows:
    """Test _merge_continuation_rows for multi-line entry merging."""

    def test_empty_list(self):
        assert _merge_continuation_rows([]) == []

    def test_no_merge_independent_rows(self):
        """Rows with both EN and DE should not be merged."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_merge_lowercase_continuation(self):
        """Lowercase EN with empty DE should merge into previous."""
        entries = [
            {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0},
            {"english": "with sth.", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "to put up with sth."
        assert result[0]["german"] == "aufstellen"

    def test_no_merge_uppercase_start(self):
        """EN starting with uppercase and empty DE is likely its own entry, not a continuation."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "Dog", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """If previous entry ends with sentence terminator, next is not continuation."""
        entries = [
            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
            {"english": "really nice", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_long_text(self):
        """Text with 4+ words is likely an example sentence, not continuation."""
        entries = [
            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_first_entry_not_merged(self):
        """First entry with empty DE should not crash (no previous)."""
        entries = [
            {"english": "something", "german": "", "example": "", "row_index": 0},
            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user