feat: cell-first OCR + document type detection + dynamic pipeline steps
Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation, eliminating neighbour bleeding (e.g. "to", "ps" in marker columns). Uses ThreadPoolExecutor for parallel Tesseract calls. Document type detection: Classifies pages as vocab_table, full_text, or generic_table using projection profiles (<2s, no OCR needed). Frontend dynamically skips columns/rows steps for full-text pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,7 +25,9 @@ from dataclasses import asdict
|
||||
# Import module under test
|
||||
from cv_vocab_pipeline import (
|
||||
ColumnGeometry,
|
||||
DocumentTypeResult,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
VocabRow,
|
||||
PipelineResult,
|
||||
deskew_image,
|
||||
@@ -48,9 +50,12 @@ from cv_vocab_pipeline import (
|
||||
CV_PIPELINE_AVAILABLE,
|
||||
_is_noise_tail_token,
|
||||
_clean_cell_text,
|
||||
_clean_cell_text_lite,
|
||||
_is_phonetic_only_text,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_continuation_rows,
|
||||
_ocr_cell_crop,
|
||||
detect_document_type,
|
||||
)
|
||||
|
||||
|
||||
@@ -1566,6 +1571,167 @@ class TestCellsToVocabEntriesPageRef:
|
||||
assert entries[0]['source_page'] == 'p.59'
|
||||
|
||||
|
||||
# =============================================
|
||||
# CELL-FIRST OCR (v2) TESTS
|
||||
# =============================================
|
||||
|
||||
class TestCleanCellTextLite:
    """Checks for _clean_cell_text_lite(), the simplified cell noise filter."""

    def test_empty_string(self):
        """An empty input comes back as an empty string."""
        cleaned = _clean_cell_text_lite('')
        assert cleaned == ''

    def test_whitespace_only(self):
        """Whitespace-only input is reduced to an empty string."""
        cleaned = _clean_cell_text_lite(' ')
        assert cleaned == ''

    def test_real_word_passes(self):
        """A single ordinary word is returned unchanged."""
        cleaned = _clean_cell_text_lite('hello')
        assert cleaned == 'hello'

    def test_sentence_passes(self):
        """A multi-word phrase is returned unchanged."""
        phrase = 'to have dinner'
        assert _clean_cell_text_lite(phrase) == phrase

    def test_garbage_text_cleared(self):
        """Garbage text (no dictionary words) should be cleared."""
        cleaned = _clean_cell_text_lite('xqzjk')
        assert cleaned == ''

    def test_no_real_word_cleared(self):
        """Single characters without any real word (2+ letters) are cleared."""
        for lone_char in ('3', '|'):
            assert _clean_cell_text_lite(lone_char) == ''

    def test_known_abbreviation_kept(self):
        """Known abbreviations should pass through untouched."""
        for abbreviation in ('sth', 'eg'):
            assert _clean_cell_text_lite(abbreviation) == abbreviation

    def test_no_trailing_noise_stripping(self):
        """Unlike _clean_cell_text, the lite variant does NOT strip trailing tokens.

        Cells are OCR'd in isolation, so every token is treated as legitimate.
        """
        phrase = 'apple tree'
        assert _clean_cell_text_lite(phrase) == phrase

    def test_page_reference(self):
        """Page references such as p.60 must not be cleared."""
        # 'p' is a known abbreviation
        cleaned = _clean_cell_text_lite('p.60')
        assert cleaned != ''
|
||||
|
||||
|
||||
class TestOcrCellCrop:
    """Checks for _ocr_cell_crop(), which OCRs a single isolated cell."""

    def test_empty_cell_pixel_density(self):
        """A cell with very few dark pixels should yield empty text."""
        # Entirely white page: there is nothing to recognise.
        blank_page = np.full((400, 600), 255, dtype=np.uint8)
        row_geom = RowGeometry(
            index=0, x=0, y=50, width=600, height=30,
            word_count=1, words=[{'text': 'a'}],
        )
        column = PageRegion(type='column_en', x=50, y=0, width=200, height=400)

        cell = _ocr_cell_crop(
            0, 0, row_geom, column, blank_page, None, 600, 400,
            'tesseract', 'eng+deu', {'column_en': 'eng'},
        )

        assert cell['text'] == ''
        assert cell['cell_id'] == 'R00_C0'
        assert cell['col_type'] == 'column_en'

    def test_zero_width_cell(self):
        """A zero-width column should yield empty text."""
        blank_page = np.full((400, 600), 255, dtype=np.uint8)
        row_geom = RowGeometry(
            index=0, x=0, y=50, width=600, height=30,
            word_count=1, words=[],
        )
        column = PageRegion(type='column_en', x=50, y=0, width=0, height=400)

        cell = _ocr_cell_crop(
            0, 0, row_geom, column, blank_page, None, 600, 400,
            'tesseract', 'eng+deu', {},
        )

        assert cell['text'] == ''

    def test_bbox_calculation(self):
        """bbox_px and bbox_pct must reflect the row/column geometry."""
        blank_page = np.full((1000, 2000), 255, dtype=np.uint8)
        row_geom = RowGeometry(
            index=0, x=0, y=100, width=2000, height=50,
            word_count=1, words=[{'text': 'test'}],
        )
        column = PageRegion(type='column_de', x=400, y=0, width=600, height=1000)

        cell = _ocr_cell_crop(
            0, 0, row_geom, column, blank_page, None, 2000, 1000,
            'tesseract', 'eng+deu', {'column_de': 'deu'},
        )

        expected_px = {'x': 400, 'y': 100, 'w': 600, 'h': 50}
        assert cell['bbox_px'] == expected_px

        percent_box = cell['bbox_pct']
        assert percent_box['x'] == 20.0  # 400/2000*100
        assert percent_box['y'] == 10.0  # 100/1000*100
|
||||
|
||||
|
||||
class TestDetectDocumentType:
    """Checks for detect_document_type(), the image-based page classifier."""

    def test_empty_image(self):
        """A zero-sized image should default to full_text / full_page."""
        nothing = np.zeros((0, 0), dtype=np.uint8)

        outcome = detect_document_type(nothing, nothing)

        assert outcome.doc_type == 'full_text'
        assert outcome.pipeline == 'full_page'

    def test_table_image_detected(self):
        """An image with clear column gaps and row gaps is classified as a table."""
        # 600x400 binary page holding three ink columns split by white gaps:
        #   column 1: x=20..170, gap 170..210,
        #   column 2: x=210..370, gap 370..410,
        #   column 3: x=410..580.
        page = np.full((400, 600), 255, dtype=np.uint8)
        column_spans = ((20, 170), (210, 370), (410, 580))
        for x_start, x_end in column_spans:
            for top in range(30, 370, 20):
                page[top:top + 10, x_start:x_end] = 0

        colour = np.stack((page,) * 3, axis=-1)
        outcome = detect_document_type(page, colour)

        assert outcome.doc_type in ('vocab_table', 'generic_table')
        assert outcome.pipeline == 'cell_first'
        assert outcome.confidence >= 0.5

    def test_fulltext_image_detected(self):
        """Uniform text lines with no column gaps are classified as full_text."""
        page = np.full((400, 600), 255, dtype=np.uint8)
        # Text lines spanning (almost) the whole width, so no column gaps exist.
        for top in range(30, 370, 15):
            page[top:top + 8, 30:570] = 0

        colour = np.stack((page,) * 3, axis=-1)
        outcome = detect_document_type(page, colour)

        assert outcome.doc_type == 'full_text'
        assert outcome.pipeline == 'full_page'
        assert 'columns' in outcome.skip_steps
        assert 'rows' in outcome.skip_steps

    def test_result_has_features(self):
        """The result should expose the debug feature entries."""
        page = np.full((200, 300), 255, dtype=np.uint8)
        colour = np.stack((page,) * 3, axis=-1)

        outcome = detect_document_type(page, colour)

        for feature_name in ('vertical_gaps', 'row_gaps',
                             'density_mean', 'density_std'):
            assert feature_name in outcome.features

    def test_document_type_result_dataclass(self):
        """DocumentTypeResult should fill skip_steps/features with empty defaults."""
        record = DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.9,
            pipeline='cell_first',
        )

        assert record.doc_type == 'vocab_table'
        assert record.skip_steps == []
        assert record.features == {}
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user