feat(ocr-pipeline): generic sub-column detection via left-edge clustering
Detects hidden sub-columns (e.g. page references like "p.59") within already-recognized columns by clustering word left-edge positions and splitting when a clear minority cluster exists. The sub-column is then classified as page_ref and mapped to VocabRow.source_page.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,7 @@ from dataclasses import asdict
|
||||
|
||||
# Import module under test
|
||||
from cv_vocab_pipeline import (
|
||||
ColumnGeometry,
|
||||
PageRegion,
|
||||
VocabRow,
|
||||
PipelineResult,
|
||||
@@ -35,6 +36,7 @@ from cv_vocab_pipeline import (
|
||||
_filter_narrow_runs,
|
||||
_build_margin_regions,
|
||||
_detect_header_footer_gaps,
|
||||
_detect_sub_columns,
|
||||
_region_has_content,
|
||||
_add_header_footer,
|
||||
analyze_layout,
|
||||
@@ -1170,6 +1172,192 @@ class TestRegionContentCheck:
|
||||
assert bottom_regions[0].type == 'footer'
|
||||
|
||||
|
||||
# =============================================
|
||||
# Sub-Column Detection Tests
|
||||
# =============================================
|
||||
|
||||
class TestSubColumnDetection:
    """Tests for _detect_sub_columns() left-edge clustering.

    _detect_sub_columns() is expected to split a recognized column into two
    when its words form two distinct left-edge clusters and the smaller
    cluster is a clear minority (e.g. a hidden page-reference sub-column).
    """

    def _make_word(self, left: int, text: str = "word", conf: int = 90) -> dict:
        """Build a minimal OCR word dict positioned at the given left edge."""
        return {'left': left, 'top': 100, 'width': 50, 'height': 20,
                'text': text, 'conf': conf}

    def _make_geo(self, x: int, width: int, words: list, content_w: int = 1000) -> ColumnGeometry:
        """Build a single ColumnGeometry wrapping *words*, with width_ratio derived from content_w."""
        return ColumnGeometry(
            index=0, x=x, y=50, width=width, height=500,
            word_count=len(words), words=words,
            width_ratio=width / content_w,
        )

    def test_sub_column_split_page_refs(self):
        """Column with 3 'p.XX' left + 20 EN words right → split into 2."""
        content_w = 1000
        # 3 page-ref words at left=100, 20 vocab words at left=250
        page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)]
        vocab_words = [self._make_word(250, f"word{i}") for i in range(20)]
        all_words = page_words + vocab_words
        geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 2, f"Expected 2 columns, got {len(result)}"
        # Left sub-column should be narrower with fewer words
        left_col = result[0]
        right_col = result[1]
        assert left_col.x < right_col.x
        assert left_col.word_count == 3
        assert right_col.word_count == 20
        # Indices should be 0, 1
        assert left_col.index == 0
        assert right_col.index == 1

    def test_no_split_uniform_alignment(self):
        """All words aligned at same position → no change."""
        content_w = 1000
        words = [self._make_word(200, f"word{i}") for i in range(15)]
        geo = self._make_geo(x=180, width=300, words=words, content_w=content_w)

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 1
        assert result[0].word_count == 15

    def test_no_split_narrow_column(self):
        """Narrow column (width_ratio < 0.15) → no split attempted."""
        content_w = 1000
        # Use comprehensions rather than list-multiplication so each word is a
        # distinct dict: `[make_word(...)] * 3` would alias one shared object,
        # and any mutation by the code under test would corrupt all copies.
        words = ([self._make_word(50, "a") for _ in range(3)]
                 + [self._make_word(120, "b") for _ in range(10)])
        geo = self._make_geo(x=40, width=140, words=words, content_w=content_w)
        # width_ratio = 140/1000 = 0.14 < 0.15

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 1

    def test_no_split_balanced_clusters(self):
        """Both clusters similarly sized (ratio >= 0.35) → no split."""
        content_w = 1000
        left_words = [self._make_word(100, f"a{i}") for i in range(8)]
        right_words = [self._make_word(300, f"b{i}") for i in range(12)]
        all_words = left_words + right_words
        geo = self._make_geo(x=80, width=400, words=all_words, content_w=content_w)
        # 8/20 = 0.4 >= 0.35 → no split

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 1

    def test_sub_column_reindexing(self):
        """After split, indices are correctly 0, 1, 2 across all columns."""
        content_w = 1000
        # First column: no split
        words1 = [self._make_word(50, f"de{i}") for i in range(10)]
        geo1 = ColumnGeometry(index=0, x=30, y=50, width=200, height=500,
                              word_count=10, words=words1, width_ratio=0.2)
        # Second column: will split
        page_words = [self._make_word(400, f"p.{i}") for i in range(3)]
        en_words = [self._make_word(550, f"en{i}") for i in range(15)]
        geo2 = ColumnGeometry(index=1, x=380, y=50, width=300, height=500,
                              word_count=18, words=page_words + en_words, width_ratio=0.3)

        result = _detect_sub_columns([geo1, geo2], content_w)

        assert len(result) == 3
        assert [g.index for g in result] == [0, 1, 2]
        # First column unchanged
        assert result[0].word_count == 10
        # Sub-column (page refs)
        assert result[1].word_count == 3
        # Main column (EN words)
        assert result[2].word_count == 15

    def test_no_split_too_few_words(self):
        """Column with fewer than 5 words → no split attempted."""
        content_w = 1000
        words = [self._make_word(100, "a"), self._make_word(300, "b"),
                 self._make_word(300, "c"), self._make_word(300, "d")]
        geo = self._make_geo(x=80, width=300, words=words, content_w=content_w)

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 1

    def test_no_split_single_minority_word(self):
        """Only 1 word in minority cluster → no split (need >= 2)."""
        content_w = 1000
        minority = [self._make_word(100, "p.59")]
        majority = [self._make_word(300, f"w{i}") for i in range(20)]
        geo = self._make_geo(x=80, width=350, words=minority + majority, content_w=content_w)

        result = _detect_sub_columns([geo], content_w)

        assert len(result) == 1
||||
class TestCellsToVocabEntriesPageRef:
    """Test that page_ref cells are mapped to source_page field."""

    def test_page_ref_mapped_to_source_page(self):
        """Cell with col_type='page_ref' → source_page field populated."""
        from cv_vocab_pipeline import _cells_to_vocab_entries

        # One row made of three cells: EN text, DE text, and a page reference.
        en_cell = {
            'row_index': 0,
            'col_type': 'column_en',
            'text': 'hello',
            'bbox_pct': [10, 10, 30, 5],
            'confidence': 95.0,
            'ocr_engine': 'tesseract',
        }
        de_cell = {
            'row_index': 0,
            'col_type': 'column_de',
            'text': 'hallo',
            'bbox_pct': [40, 10, 30, 5],
            'confidence': 90.0,
            'ocr_engine': 'tesseract',
        }
        ref_cell = {
            'row_index': 0,
            'col_type': 'page_ref',
            'text': 'p.59',
            'bbox_pct': [5, 10, 5, 5],
            'confidence': 80.0,
            'ocr_engine': 'tesseract',
        }

        entries = _cells_to_vocab_entries([en_cell, de_cell, ref_cell])

        # All three cells belong to row 0, so they collapse into one entry.
        assert len(entries) == 1
        entry = entries[0]
        assert entry['english'] == 'hello'
        assert entry['german'] == 'hallo'
        assert entry['source_page'] == 'p.59'
        assert entry['bbox_ref'] == [5, 10, 5, 5]

    def test_no_page_ref_defaults_empty(self):
        """Without page_ref cell, source_page defaults to empty string."""
        from cv_vocab_pipeline import _cells_to_vocab_entries

        lone_cell = {
            'row_index': 0,
            'col_type': 'column_en',
            'text': 'world',
            'bbox_pct': [10, 10, 30, 5],
            'confidence': 95.0,
            'ocr_engine': 'tesseract',
        }

        entries = _cells_to_vocab_entries([lone_cell])

        assert len(entries) == 1
        entry = entries[0]
        assert entry['source_page'] == ''
        assert entry['bbox_ref'] is None
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user