feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s

1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection,
   cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps
   (lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on
   column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field
   highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from
   any step, with reprocess button on completed pipeline steps

Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline
and updates dewarp tests to match current (image, info) return signature.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 14:46:38 +01:00
parent c3a924a620
commit e718353d9f
6 changed files with 775 additions and 79 deletions

View File

@@ -3503,6 +3503,21 @@ def _ocr_single_cell(
)
used_engine = 'cell_ocr_fallback'
# --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
if not text.strip() and _run_fallback and not use_rapid:
cell_lang = lang_map.get(col.type, lang)
psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)
if psm7_words:
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
if psm7_words:
p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
if p7_text.strip():
text = p7_text
avg_conf = round(
sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
)
used_engine = 'cell_ocr_psm7'
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
if text.strip():
text = _clean_cell_text(text)
@@ -3628,6 +3643,79 @@ def build_cell_grid(
)
cells.append(cell)
# --- BATCH FALLBACK: re-OCR empty cells by column strip ---
# Collect cells that are still empty but have visible pixels.
# Instead of calling Tesseract once per cell (expensive), crop an entire
# column strip and run OCR once, then assign words to cells by Y position.
empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices]
for ci, cell in enumerate(cells):
if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
bpx = cell['bbox_px']
x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
if w > 0 and h > 0 and ocr_img is not None:
crop = ocr_img[y:y + h, x:x + w]
if crop.size > 0:
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
if dark_ratio > 0.005:
empty_by_col.setdefault(cell['col_index'], []).append(ci)
for col_idx, cell_indices in empty_by_col.items():
if len(cell_indices) < 3:
continue # Not worth batching for < 3 cells
# Find the column strip bounding box (union of all empty cell bboxes)
min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
col_x = cells[cell_indices[0]]['bbox_px']['x']
col_w = cells[cell_indices[0]]['bbox_px']['w']
strip_region = PageRegion(
type=relevant_cols[col_idx].type,
x=col_x, y=min_y,
width=col_w, height=max_y_h - min_y,
)
strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
if use_rapid and img_bgr is not None:
strip_words = ocr_region_rapid(img_bgr, strip_region)
else:
strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
if not strip_words:
continue
strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
if not strip_words:
continue
# Assign words to cells by Y overlap
for ci in cell_indices:
cell_y = cells[ci]['bbox_px']['y']
cell_h = cells[ci]['bbox_px']['h']
cell_mid_y = cell_y + cell_h / 2
matched_words = [
w for w in strip_words
if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
]
if matched_words:
matched_words.sort(key=lambda w: w['left'])
batch_text = ' '.join(w['text'] for w in matched_words)
batch_text = _clean_cell_text(batch_text)
if batch_text.strip():
cells[ci]['text'] = batch_text
cells[ci]['confidence'] = round(
sum(w['conf'] for w in matched_words) / len(matched_words), 1
)
cells[ci]['ocr_engine'] = 'batch_column_ocr'
batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
if batch_filled > 0:
logger.info(
f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
f"empty cells in column {col_idx}"
)
logger.info(f"build_cell_grid: {len(cells)} cells from "
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
f"engine={engine_name}")
@@ -3869,6 +3957,69 @@ def _merge_phonetic_continuation_rows(
return merged
def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries where text wraps to the next row.

    A row is treated as a continuation of the previous output entry when all
    of the following hold:
    - EN has text, but DE is empty
    - EN is not phonetic-only (per ``_is_phonetic_only_text``)
    - the first alphabetic character of EN is lowercase
    - EN has fewer than 4 whitespace-separated words (not an example sentence)
    - the previous entry's EN does NOT end with a sentence terminator (.!?)

    On a merge, the continuation's EN (and its example text, if any) is
    appended to the previous entry separated by a single space; the other
    fields of the continuation row are dropped.

    Example:
        Row 5: EN="to put up"  DE="aufstellen"
        Row 6: EN="with sth."  DE=""
        → Merged: EN="to put up with sth."  DE="aufstellen"

    Args:
        entries: Entry dicts. Only the ``english``, ``german``, ``example``
            and ``row_index`` keys are read here.

    Returns:
        The input list itself when it has fewer than two entries; otherwise a
        new list. An entry that absorbs a continuation is a shallow copy, so
        the dicts passed in by the caller are never modified.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()

        # Only an EN-only row that follows another row can be a continuation.
        if not (merged and en and not de):
            merged.append(entry)
            continue

        # Phonetic-only rows are kept as their own entry by this function.
        if _is_phonetic_only_text(en):
            merged.append(entry)
            continue

        # Leading punctuation such as "(" is skipped when checking the case.
        first_alpha = next((c for c in en if c.isalpha()), '')
        starts_lower = bool(first_alpha) and first_alpha.islower()
        is_short = len(en.split()) < 4

        prev_en = (merged[-1].get('english') or '').strip()
        prev_ends_sentence = prev_en.endswith(('.', '!', '?'))

        if not (starts_lower and is_short and not prev_ends_sentence):
            merged.append(entry)
            continue

        # Copy before writing: merged[-1] may still be the caller's own dict.
        prev = dict(merged[-1])
        merged[-1] = prev
        prev['english'] = (prev_en + ' ' + en).strip()

        ex = (entry.get('example') or '').strip()
        if ex:
            prev_ex = (prev.get('example') or '').strip()
            prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex

        logger.debug(
            f"Merged continuation row {entry.get('row_index')} "
            f"into previous entry: {prev['english']!r}"
        )

    return merged
def build_word_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -3920,9 +4071,12 @@ def build_word_grid(
# --- Post-processing pipeline (deterministic, no LLM) ---
n_raw = len(entries)
# 0. Merge phonetic-only continuation rows into previous entry
# 0a. Merge phonetic-only continuation rows into previous entry
entries = _merge_phonetic_continuation_rows(entries)
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
entries = _merge_continuation_rows(entries)
# 1. Fix character confusion (I/1/l based on context)
entries = _fix_character_confusion(entries)
@@ -4361,7 +4515,7 @@ async def run_cv_pipeline(
# Stage 3: Dewarp
if enable_dewarp:
t = time.time()
img = dewarp_image(img)
img, _dewarp_info = dewarp_image(img)
result.stages['dewarp'] = round(time.time() - t, 2)
# Stage 4: Dual image preparation

View File

@@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request):
}
@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
    """Reset a session to an earlier pipeline step, clearing downstream data.

    This handler only resets stored state (database row and in-memory cache);
    it does not execute any pipeline step itself.

    Body: {"from_step": 5}  (1-indexed step number, 1..7; defaults to 1)

    Data cleared per ``from_step``:
    - 1: deskew_result, dewarp_result, column_result, row_result, word_result
    - 2: dewarp_result, column_result, row_result, word_result
    - 3: column_result, row_result, word_result
    - 4: row_result, word_result
    - 5: word_result
    - 6: only the ``llm_review`` / ``llm_corrections`` keys of word_result
    - 7: nothing (only ``current_step`` is updated)

    Returns:
        Dict with ``session_id``, ``from_step`` and ``cleared`` (the names of
        the result fields that were written; for step 6 this lists
        ``word_result`` even though it is only trimmed).

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not a JSON object or ``from_step`` is not an integer in 1..7.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A malformed or empty body is a client error, not a server error.
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    from_step = body.get("from_step", 1)
    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    if (
        isinstance(from_step, bool)
        or not isinstance(from_step, int)
        or not 1 <= from_step <= 7
    ):
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")

    update_kwargs: Dict[str, Any] = {"current_step": from_step}

    # Clear downstream data based on from_step
    if from_step <= 5:
        update_kwargs["word_result"] = None
    elif from_step == 6:
        # Keep the OCR output, drop only the LLM review artefacts.
        word_result = session.get("word_result")
        if word_result:
            word_result.pop("llm_review", None)
            word_result.pop("llm_corrections", None)
            update_kwargs["word_result"] = word_result

    if from_step <= 4:
        update_kwargs["row_result"] = None
    if from_step <= 3:
        update_kwargs["column_result"] = None
    if from_step <= 2:
        update_kwargs["dewarp_result"] = None
    if from_step <= 1:
        update_kwargs["deskew_result"] = None

    await update_session_db(session_id, **update_kwargs)

    # Mirror the same changes (including current_step) into the cache entry.
    if session_id in _cache:
        for key, value in update_kwargs.items():
            _cache[session_id][key] = value

    logger.info(f"Session {session_id} reprocessing from step {from_step}")

    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [k for k in update_kwargs if k != "current_step"],
    }
async def _get_rows_overlay(session_id: str) -> Response:
"""Generate dewarped image with row bands drawn on it."""
session = await get_session_db(session_id)

View File

@@ -9,6 +9,9 @@ Tests cover:
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
- Stage 6: Multi-pass OCR region handling
- Stage 7: Line grouping and vocabulary matching
- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
- Phonetic detection (_is_phonetic_only_text)
- Phonetic & continuation row merging
- Orchestrator (run_cv_pipeline)
DSGVO Note: All tests run locally with synthetic data. No external API calls.
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
CV_PIPELINE_AVAILABLE,
_is_noise_tail_token,
_clean_cell_text,
_is_phonetic_only_text,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
)
@@ -202,16 +210,28 @@ class TestDeskew:
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
    """Test dewarp stage (returns (image, info) tuple)."""

    def test_dewarp_returns_tuple(self, white_image):
        """dewarp_image must return (image, dewarp_info) tuple."""
        result = dewarp_image(white_image)
        assert isinstance(result, tuple)
        assert len(result) == 2
        img_out, info = result
        assert isinstance(img_out, np.ndarray)
        assert isinstance(info, dict)
        assert "shear_degrees" in info

    def test_dewarp_preserves_shape(self, text_like_image):
        """Output image should have same shape as input."""
        # Unpack the tuple; the image is the first element.
        img_out, _ = dewarp_image(text_like_image)
        assert img_out.shape == text_like_image.shape

    def test_dewarp_white_image_no_correction(self, white_image):
        """A uniform white image should get no shear correction."""
        img_out, info = dewarp_image(white_image)
        assert abs(info["shear_degrees"]) < 0.5
        assert img_out.shape == white_image.shape
# =============================================
@@ -561,6 +581,268 @@ class TestStageIntegration:
assert layout_img.shape[:2] == corrected.shape[:2]
# =============================================
# NOISE FILTER TESTS
# =============================================
class TestNoiseFilter:
    """Checks which trailing tokens _is_noise_tail_token classifies as noise."""

    # Tokens the filter is expected to keep (function returns False).
    KEEP_TOKENS = [
        # hyphenated compounds and hyphenated prefixes
        "money-saver",
        "under-",
        "well-known",
        # dictionary-style parentheses, balanced or cut off
        "Schild(chen)",
        "(Salat-)Gurke",
        "(auf)",
        "(on)",
        "selbst)",
        "(wir",
        "Tanz(veranstaltung)",
        "(zer)brechen",
        # pieces of phonetic transcriptions in square brackets
        "serva]",
        "['mani",
        "[eg]",
        "[maus]",
        # ordinary words, some with a trailing period
        "cupcakes.",
        "sister.",
        "mice",
        # dotted abbreviations
        "e.g.",
        "sth.",
        "usw.",
        "adj.",
        # ellipsis, ASCII and single-character form
        "...",
        "\u2026",
        # short everyday words
        "the",
        "cat",
        "big",
        "run",
        "set",
        "ago",
    ]

    # Tokens the filter is expected to drop (function returns True).
    NOISE_TOKENS = [
        # symbol/digit mixes and lone symbols
        "B|",
        "3d",
        "x7",
        ")",
        "|",
        "@",
        "3",
        # very short letter fragments
        "ee",
        "k",
        "zz",
        "qq",
        # empty and whitespace-only input
        "",
        " ",
    ]

    @pytest.mark.parametrize("token", KEEP_TOKENS)
    def test_keep_real_tokens(self, token):
        """Words, dictionary punctuation and phonetic brackets must survive."""
        is_noise = _is_noise_tail_token(token)
        assert is_noise is False, f"Should keep {token!r}"

    @pytest.mark.parametrize("token", NOISE_TOKENS)
    def test_filter_noise_tokens(self, token):
        """OCR artifact fragments must be flagged as noise."""
        is_noise = _is_noise_tail_token(token)
        assert is_noise is True, f"Should filter {token!r}"
class TestCleanCellText:
    """End-to-end checks for _clean_cell_text (raw cell text in, cleaned text out)."""

    def test_empty_returns_empty(self):
        """Empty and whitespace-only input both clean to the empty string."""
        for raw in ("", " "):
            cleaned = _clean_cell_text(raw)
            assert cleaned == ""

    def test_real_word_unchanged(self):
        """A single real word passes through untouched."""
        cleaned = _clean_cell_text("cupcakes")
        assert cleaned == "cupcakes"

    def test_strips_trailing_noise(self):
        """A noise token after a real word is cut off."""
        assert _clean_cell_text("cupcakes B|") == "cupcakes"

    def test_keeps_trailing_real_word(self):
        """A real word in trailing position is retained."""
        assert _clean_cell_text("big cat") == "big cat"

    def test_abbreviation_kept(self):
        """A dotted abbreviation on its own is not wiped."""
        assert _clean_cell_text("e.g.") == "e.g."

    def test_pure_garbage_cleared(self):
        """Text made only of OCR garbage cleans to the empty string."""
        assert _clean_cell_text("3d |x") == ""

    def test_compound_word_preserved(self):
        """A hyphenated compound is returned as-is."""
        assert _clean_cell_text("money-saver") == "money-saver"

    def test_parenthesized_word_preserved(self):
        """A word with a parenthesized prefix is returned as-is."""
        assert _clean_cell_text("(Salat-)Gurke") == "(Salat-)Gurke"

    def test_multiple_trailing_noise(self):
        """Several noise tokens in a row at the end are all removed."""
        assert _clean_cell_text("achieve 3 |") == "achieve"
class TestPhoneticOnlyText:
    """Checks _is_phonetic_only_text against bracketed transcriptions and plain words."""

    # (input text, expected return value)
    CASES = [
        # bracketed transcription with nothing else: expected True
        ("['mani serva]", True),
        ("[dɑːns]", True),
        ("[\"a:mand]", True),
        ("['ːkʃɒp]", True),
        # plain word with only a stray closing bracket: expected False
        ("serva]", False),
        # real words, alone or followed by a transcription: expected False
        ("almond ['a:mand]", False),
        ("Mandel", False),
        ("cupcakes", False),
        ("", False),
        ("achieve", False),
        ("money-saver ['mani]", False),
    ]

    @pytest.mark.parametrize("text,expected", CASES)
    def test_phonetic_detection(self, text, expected):
        """The detector must return exactly the expected boolean for each case."""
        actual = _is_phonetic_only_text(text)
        assert actual is expected, \
            f"_is_phonetic_only_text({text!r}) should be {expected}"
class TestMergePhoneticContinuationRows:
    """Checks _merge_phonetic_continuation_rows on phonetic-only follow-up rows."""

    @staticmethod
    def _row(english, german, example="", **extra):
        """Build one entry dict; extra keys (e.g. row_index) are appended last."""
        return {"english": english, "german": german, "example": example, **extra}

    def test_empty_list(self):
        """An empty input yields an empty output."""
        merged = _merge_phonetic_continuation_rows([])
        assert merged == []

    def test_single_entry(self):
        """A lone entry comes back unchanged."""
        rows = [self._row("cat", "Katze")]
        merged = _merge_phonetic_continuation_rows(rows)
        assert len(merged) == 1
        assert merged[0]["english"] == "cat"

    def test_merges_phonetic_row(self):
        """A phonetic-only row is folded into the entry before it."""
        rows = [
            self._row("money-saver", "Sparfuchs", row_index=0),
            self._row("['mani serva]", "", row_index=1),
        ]
        merged = _merge_phonetic_continuation_rows(rows)
        assert len(merged) == 1
        head = merged[0]
        assert head["english"] == "money-saver ['mani serva]"
        assert head["german"] == "Sparfuchs"

    def test_no_merge_when_de_present(self):
        """A phonetic-looking EN with DE text stays a separate row."""
        rows = [
            self._row("cat", "Katze"),
            self._row("[kæt]", "some text"),
        ]
        merged = _merge_phonetic_continuation_rows(rows)
        assert len(merged) == 2

    def test_no_merge_regular_rows(self):
        """Two ordinary vocab rows are left as two rows."""
        rows = [
            self._row("cat", "Katze"),
            self._row("dog", "Hund"),
        ]
        merged = _merge_phonetic_continuation_rows(rows)
        assert len(merged) == 2

    def test_merges_example_too(self):
        """Example text on the phonetic row is carried into the merged entry."""
        rows = [
            self._row("dance", "tanzen", row_index=0),
            self._row("[dɑːns]", "", "Let's dance.", row_index=1),
        ]
        merged = _merge_phonetic_continuation_rows(rows)
        assert len(merged) == 1
        head = merged[0]
        assert head["english"] == "dance [dɑːns]"
        assert head["example"] == "Let's dance."
class TestMergeContinuationRows:
    """Checks _merge_continuation_rows on wrapped multi-line vocabulary entries."""

    @staticmethod
    def _row(english, german, row_index):
        """Build one entry dict with an empty example field."""
        return {
            "english": english,
            "german": german,
            "example": "",
            "row_index": row_index,
        }

    def test_empty_list(self):
        """An empty input yields an empty output."""
        merged = _merge_continuation_rows([])
        assert merged == []

    def test_no_merge_independent_rows(self):
        """Rows that carry both EN and DE are never merged."""
        rows = [
            self._row("cat", "Katze", 0),
            self._row("dog", "Hund", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 2

    def test_merge_lowercase_continuation(self):
        """A lowercase EN row without DE is appended to the previous entry."""
        rows = [
            self._row("to put up", "aufstellen", 0),
            self._row("with sth.", "", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 1
        head = merged[0]
        assert head["english"] == "to put up with sth."
        assert head["german"] == "aufstellen"

    def test_no_merge_uppercase_start(self):
        """An uppercase EN row without DE is kept as its own entry."""
        rows = [
            self._row("cat", "Katze", 0),
            self._row("Dog", "", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """No merge when the previous EN already ends a sentence."""
        rows = [
            self._row("That's great.", "Das ist toll.", 0),
            self._row("really nice", "", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 2

    def test_no_merge_long_text(self):
        """A row of four or more words is treated as a sentence, not a continuation."""
        rows = [
            self._row("achieve", "erreichen", 0),
            self._row("she achieved her goals", "", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 2

    def test_first_entry_not_merged(self):
        """A leading row without DE has no predecessor and must not crash."""
        rows = [
            self._row("something", "", 0),
            self._row("cat", "Katze", 1),
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 2
# =============================================
# RUN TESTS
# =============================================