feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation, eliminating neighbour bleeding (e.g. "to", "ps" in marker columns). Uses ThreadPoolExecutor for parallel Tesseract calls. Document type detection: Classifies pages as vocab_table, full_text, or generic_table using projection profiles (<2s, no OCR needed). Frontend dynamically skips columns/rows steps for full-text pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions
@@ -25,7 +25,9 @@ from dataclasses import asdict
 # Import module under test
 from cv_vocab_pipeline import (
    ColumnGeometry,
+    DocumentTypeResult,
    PageRegion,
+    RowGeometry,
    VocabRow,
    PipelineResult,
    deskew_image,
@@ -48,9 +50,12 @@ from cv_vocab_pipeline import (
    CV_PIPELINE_AVAILABLE,
    _is_noise_tail_token,
    _clean_cell_text,
+    _clean_cell_text_lite,
    _is_phonetic_only_text,
    _merge_phonetic_continuation_rows,
    _merge_continuation_rows,
+    _ocr_cell_crop,
+    detect_document_type,
 )


@@ -1566,6 +1571,167 @@ class TestCellsToVocabEntriesPageRef:
        assert entries[0]['source_page'] == 'p.59'


+# =============================================
+# CELL-FIRST OCR (v2) TESTS
+# =============================================
+
+class TestCleanCellTextLite:
+    """Tests for _clean_cell_text_lite() — simplified noise filter for isolated-cell OCR text."""
+
+    def test_empty_string(self):  # empty input yields empty output
+        assert _clean_cell_text_lite('') == ''
+
+    def test_whitespace_only(self):  # whitespace-only input is reduced to ''
+        assert _clean_cell_text_lite('   ') == ''
+
+    def test_real_word_passes(self):  # a single ordinary word is returned unchanged
+        assert _clean_cell_text_lite('hello') == 'hello'
+
+    def test_sentence_passes(self):  # multi-word text is returned unchanged
+        assert _clean_cell_text_lite('to have dinner') == 'to have dinner'
+
+    def test_garbage_text_cleared(self):
+        """Garbage text (no dictionary words) should be cleared."""
+        assert _clean_cell_text_lite('xqzjk') == ''
+
+    def test_no_real_word_cleared(self):
+        """Input without any real word (2+ letters), e.g. a lone digit or pipe, is cleared."""
+        assert _clean_cell_text_lite('3') == ''
+        assert _clean_cell_text_lite('|') == ''
+
+    def test_known_abbreviation_kept(self):
+        """Known abbreviations ('sth', 'eg') should pass through unchanged."""
+        assert _clean_cell_text_lite('sth') == 'sth'
+        assert _clean_cell_text_lite('eg') == 'eg'
+
+    def test_no_trailing_noise_stripping(self):
+        """Unlike _clean_cell_text, lite does NOT strip trailing tokens.
+        Since cells are isolated, all tokens are legitimate."""
+        result = _clean_cell_text_lite('apple tree')
+        assert result == 'apple tree'
+
+    def test_page_reference(self):
+        """Page references like p.60 should not be cleared."""
+        # 'p' is a known abbreviation; only non-emptiness is checked, not the exact text
+        assert _clean_cell_text_lite('p.60') != ''
+
+
+class TestOcrCellCrop:
+    """Tests for _ocr_cell_crop() — OCR of a single row/column cell in isolation."""
+
+    def test_empty_cell_pixel_density(self):
+        """Cells with very few dark pixels should return empty text."""
+        # All-white image, array shape (400, 600) → no dark pixels, no text
+        ocr_img = np.ones((400, 600), dtype=np.uint8) * 255
+        row = RowGeometry(index=0, x=0, y=50, width=600, height=30,
+                          word_count=1, words=[{'text': 'a'}])
+        col = PageRegion(type='column_en', x=50, y=0, width=200, height=400)
+
+        result = _ocr_cell_crop(
+            0, 0, row, col, ocr_img, None, 600, 400,  # presumably row_idx, col_idx, ..., img_w, img_h — confirm against signature
+            'tesseract', 'eng+deu', {'column_en': 'eng'},
+        )
+        assert result['text'] == ''
+        assert result['cell_id'] == 'R00_C0'  # presumably built from the two leading indices — confirm format
+        assert result['col_type'] == 'column_en'  # same value as the type given to PageRegion above
+
+    def test_zero_width_cell(self):
+        """Zero-width cells (col.width == 0) should return empty text."""
+        ocr_img = np.ones((400, 600), dtype=np.uint8) * 255
+        row = RowGeometry(index=0, x=0, y=50, width=600, height=30,
+                          word_count=1, words=[])
+        col = PageRegion(type='column_en', x=50, y=0, width=0, height=400)  # width=0 is the case under test
+
+        result = _ocr_cell_crop(
+            0, 0, row, col, ocr_img, None, 600, 400,
+            'tesseract', 'eng+deu', {},
+        )
+        assert result['text'] == ''
+
+    def test_bbox_calculation(self):
+        """bbox_px should match the col/row geometry; bbox_pct is that geometry as a percentage."""
+        ocr_img = np.ones((1000, 2000), dtype=np.uint8) * 255  # array shape (1000, 2000): 2000 px wide, 1000 px high
+        row = RowGeometry(index=0, x=0, y=100, width=2000, height=50,
+                          word_count=1, words=[{'text': 'test'}])
+        col = PageRegion(type='column_de', x=400, y=0, width=600, height=1000)
+
+        result = _ocr_cell_crop(
+            0, 0, row, col, ocr_img, None, 2000, 1000,  # 2000 and 1000 are the percentage denominators expected below
+            'tesseract', 'eng+deu', {'column_de': 'deu'},
+        )
+        assert result['bbox_px'] == {'x': 400, 'y': 100, 'w': 600, 'h': 50}  # x/w equal col.x/col.width, y/h equal row.y/row.height
+        assert result['bbox_pct']['x'] == 20.0  # 400/2000*100
+        assert result['bbox_pct']['y'] == 10.0  # 100/1000*100
+
+
+class TestDetectDocumentType:
+    """Tests for detect_document_type() — image-based classification."""
+
+    def test_empty_image(self):
+        """A 0x0 image should default to full_text with the full_page pipeline."""
+        empty = np.array([], dtype=np.uint8).reshape(0, 0)
+        result = detect_document_type(empty, empty)  # the same empty array is passed for both image arguments
+        assert result.doc_type == 'full_text'
+        assert result.pipeline == 'full_page'
+
+    def test_table_image_detected(self):
+        """Image with clear column gaps and row gaps → table."""
+        # Create 600x400 (width x height) black/white image with 3 columns separated by white gaps
+        img = np.ones((400, 600), dtype=np.uint8) * 255
+        # Column 1: x=20..170
+        for y in range(30, 370, 20):  # 10 px dark bar every 20 px → 10 px white gap between rows
+            img[y:y+10, 20:170] = 0
+        # Gap: x=170..210 (white)
+        # Column 2: x=210..370
+        for y in range(30, 370, 20):
+            img[y:y+10, 210:370] = 0
+        # Gap: x=370..410 (white)
+        # Column 3: x=410..580
+        for y in range(30, 370, 20):
+            img[y:y+10, 410:580] = 0
+
+        bgr = np.stack([img, img, img], axis=-1)  # replicate the single channel into a 3-channel image
+        result = detect_document_type(img, bgr)
+        assert result.doc_type in ('vocab_table', 'generic_table')
+        assert result.pipeline == 'cell_first'
+        assert result.confidence >= 0.5
+
+    def test_fulltext_image_detected(self):
+        """Uniform text without column gaps → full_text."""
+        img = np.ones((400, 600), dtype=np.uint8) * 255
+        # 8 px dark lines every 15 px spanning x=30..570 (no column gaps)
+        for y in range(30, 370, 15):
+            img[y:y+8, 30:570] = 0
+
+        bgr = np.stack([img, img, img], axis=-1)
+        result = detect_document_type(img, bgr)
+        assert result.doc_type == 'full_text'
+        assert result.pipeline == 'full_page'
+        assert 'columns' in result.skip_steps
+        assert 'rows' in result.skip_steps
+
+    def test_result_has_features(self):
+        """Result for a blank (all-white) image should still expose the debug features."""
+        img = np.ones((200, 300), dtype=np.uint8) * 255
+        bgr = np.stack([img, img, img], axis=-1)
+        result = detect_document_type(img, bgr)
+        assert 'vertical_gaps' in result.features
+        assert 'row_gaps' in result.features
+        assert 'density_mean' in result.features
+        assert 'density_std' in result.features
+
+    def test_document_type_result_dataclass(self):
+        """DocumentTypeResult should default skip_steps to [] and features to {} when omitted."""
+        r = DocumentTypeResult(
+            doc_type='vocab_table',
+            confidence=0.9,
+            pipeline='cell_first',
+        )
+        assert r.doc_type == 'vocab_table'
+        assert r.skip_steps == []
+        assert r.features == {}
+
+
 # =============================================
 # RUN TESTS
 # =============================================