feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation,
eliminating neighbour bleeding (e.g. "to", "ps" in marker columns).
Uses ThreadPoolExecutor for parallel Tesseract calls.

Document type detection: Classifies pages as vocab_table, full_text,
or generic_table using projection profiles (<2s, no OCR needed).
Frontend dynamically skips columns/rows steps for full-text pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions

View File

@@ -25,7 +25,9 @@ from dataclasses import asdict
# Import module under test
from cv_vocab_pipeline import (
ColumnGeometry,
DocumentTypeResult,
PageRegion,
RowGeometry,
VocabRow,
PipelineResult,
deskew_image,
@@ -48,9 +50,12 @@ from cv_vocab_pipeline import (
CV_PIPELINE_AVAILABLE,
_is_noise_tail_token,
_clean_cell_text,
_clean_cell_text_lite,
_is_phonetic_only_text,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
_ocr_cell_crop,
detect_document_type,
)
@@ -1566,6 +1571,167 @@ class TestCellsToVocabEntriesPageRef:
assert entries[0]['source_page'] == 'p.59'
# =============================================
# CELL-FIRST OCR (v2) TESTS
# =============================================
class TestCleanCellTextLite:
    """Exercise _clean_cell_text_lite(), the reduced noise filter."""

    def test_empty_string(self):
        """An empty input comes back as an empty string."""
        cleaned = _clean_cell_text_lite('')
        assert cleaned == ''

    def test_whitespace_only(self):
        """Input made only of whitespace is reduced to an empty string."""
        cleaned = _clean_cell_text_lite(' ')
        assert cleaned == ''

    def test_real_word_passes(self):
        """A single ordinary word is returned unchanged."""
        word = 'hello'
        assert _clean_cell_text_lite(word) == word

    def test_sentence_passes(self):
        """A multi-word phrase is returned unchanged."""
        phrase = 'to have dinner'
        assert _clean_cell_text_lite(phrase) == phrase

    def test_garbage_text_cleared(self):
        """Garbage text (no dictionary words) should be cleared."""
        cleaned = _clean_cell_text_lite('xqzjk')
        assert cleaned == ''

    def test_no_real_word_cleared(self):
        """Single characters without any real word (2+ letters) are cleared."""
        for token in ('3', '|'):
            assert _clean_cell_text_lite(token) == ''

    def test_known_abbreviation_kept(self):
        """Known abbreviations should pass through untouched."""
        for abbreviation in ('sth', 'eg'):
            assert _clean_cell_text_lite(abbreviation) == abbreviation

    def test_no_trailing_noise_stripping(self):
        """The lite variant must keep trailing tokens.

        Unlike _clean_cell_text, no trailing tokens are stripped: cells are
        OCR'd in isolation, so every token is treated as legitimate.
        """
        phrase = 'apple tree'
        cleaned = _clean_cell_text_lite(phrase)
        assert cleaned == phrase

    def test_page_reference(self):
        """Page references like p.60 should not be wiped out."""
        # 'p' is a known abbreviation, so the cell must survive the filter.
        cleaned = _clean_cell_text_lite('p.60')
        assert cleaned != ''
class TestOcrCellCrop:
    """Exercise _ocr_cell_crop(), the OCR of a single isolated cell."""

    def test_empty_cell_pixel_density(self):
        """Cells with very few dark pixels should return empty text."""
        # Entirely white page, so the cropped cell contains no ink at all.
        blank_page = np.full((400, 600), 255, dtype=np.uint8)
        text_row = RowGeometry(
            index=0, x=0, y=50, width=600, height=30,
            word_count=1, words=[{'text': 'a'}],
        )
        en_column = PageRegion(
            type='column_en', x=50, y=0, width=200, height=400,
        )
        lang_by_column = {'column_en': 'eng'}

        cell = _ocr_cell_crop(
            0, 0, text_row, en_column, blank_page, None, 600, 400,
            'tesseract', 'eng+deu', lang_by_column,
        )

        assert cell['text'] == ''
        assert cell['cell_id'] == 'R00_C0'
        assert cell['col_type'] == 'column_en'

    def test_zero_width_cell(self):
        """Zero-width cells should return empty text."""
        blank_page = np.full((400, 600), 255, dtype=np.uint8)
        text_row = RowGeometry(
            index=0, x=0, y=50, width=600, height=30,
            word_count=1, words=[],
        )
        # width=0 makes the crop region degenerate.
        collapsed_column = PageRegion(
            type='column_en', x=50, y=0, width=0, height=400,
        )

        cell = _ocr_cell_crop(
            0, 0, text_row, collapsed_column, blank_page, None, 600, 400,
            'tesseract', 'eng+deu', {},
        )

        assert cell['text'] == ''

    def test_bbox_calculation(self):
        """Check that bbox_px and bbox_pct are reported correctly."""
        blank_page = np.full((1000, 2000), 255, dtype=np.uint8)
        text_row = RowGeometry(
            index=0, x=0, y=100, width=2000, height=50,
            word_count=1, words=[{'text': 'test'}],
        )
        de_column = PageRegion(
            type='column_de', x=400, y=0, width=600, height=1000,
        )
        lang_by_column = {'column_de': 'deu'}

        cell = _ocr_cell_crop(
            0, 0, text_row, de_column, blank_page, None, 2000, 1000,
            'tesseract', 'eng+deu', lang_by_column,
        )

        # Pixel box: x/width come from the column, y/height from the row.
        expected_px = {'x': 400, 'y': 100, 'w': 600, 'h': 50}
        assert cell['bbox_px'] == expected_px

        percent_box = cell['bbox_pct']
        assert percent_box['x'] == 20.0  # 400/2000*100
        assert percent_box['y'] == 10.0  # 100/1000*100
class TestDetectDocumentType:
    """Exercise detect_document_type(), the image-based page classifier."""

    def test_empty_image(self):
        """An empty image should fall back to full_text."""
        no_pixels = np.empty((0, 0), dtype=np.uint8)

        verdict = detect_document_type(no_pixels, no_pixels)

        assert verdict.doc_type == 'full_text'
        assert verdict.pipeline == 'full_page'

    def test_table_image_detected(self):
        """Image with clear column gaps and row gaps is seen as a table."""
        # 600 px wide, 400 px tall white page with three dark columns.
        page = np.full((400, 600), 255, dtype=np.uint8)
        # Column x-extents; x=170..210 and x=370..410 stay white as gaps.
        column_spans = ((20, 170), (210, 370), (410, 580))
        for x_start, x_end in column_spans:
            # One 10 px dark bar every 20 px leaves white gaps between rows.
            for top in range(30, 370, 20):
                page[top:top + 10, x_start:x_end] = 0
        page_bgr = np.stack([page] * 3, axis=-1)

        verdict = detect_document_type(page, page_bgr)

        assert verdict.doc_type in ('vocab_table', 'generic_table')
        assert verdict.pipeline == 'cell_first'
        assert verdict.confidence >= 0.5

    def test_fulltext_image_detected(self):
        """Uniform text without column gaps is seen as full_text."""
        page = np.full((400, 600), 255, dtype=np.uint8)
        # Text lines span nearly the whole width, so no column gaps exist.
        for top in range(30, 370, 15):
            page[top:top + 8, 30:570] = 0
        page_bgr = np.stack([page] * 3, axis=-1)

        verdict = detect_document_type(page, page_bgr)

        assert verdict.doc_type == 'full_text'
        assert verdict.pipeline == 'full_page'
        assert 'columns' in verdict.skip_steps
        assert 'rows' in verdict.skip_steps

    def test_result_has_features(self):
        """The result should expose the debug features."""
        page = np.full((200, 300), 255, dtype=np.uint8)
        page_bgr = np.stack([page] * 3, axis=-1)

        verdict = detect_document_type(page, page_bgr)

        expected_keys = (
            'vertical_gaps',
            'row_gaps',
            'density_mean',
            'density_std',
        )
        for feature_name in expected_keys:
            assert feature_name in verdict.features

    def test_document_type_result_dataclass(self):
        """DocumentTypeResult should initialise with empty defaults."""
        outcome = DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.9,
            pipeline='cell_first',
        )

        assert outcome.doc_type == 'vocab_table'
        # Fields that were not passed in come back as empty containers.
        assert outcome.skip_steps == []
        assert outcome.features == {}
# =============================================
# RUN TESTS
# =============================================