refactor(ocr-pipeline): use left-edge alignment approach for sub-column detection

Replace gap-based splitting with alignment-bin approach: cluster word
left-edges within 8px tolerance, find the leftmost bin with >= 10% of
words as the true column start, split off any words to its left as a
sub-column. This correctly handles both page references ("p.59") and
misread exclamation marks ("!" → "I") even when the pixel gap is small.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 18:56:38 +01:00
parent f13116345b
commit 7252f9a956
2 changed files with 87 additions and 97 deletions

View File

@@ -1177,7 +1177,7 @@ class TestRegionContentCheck:
# =============================================
class TestSubColumnDetection:
"""Tests for _detect_sub_columns() left-edge clustering."""
"""Tests for _detect_sub_columns() left-edge alignment detection."""
def _make_word(self, left: int, text: str = "word", conf: int = 90) -> dict:
return {'left': left, 'top': 100, 'width': 50, 'height': 20,
@@ -1191,27 +1191,46 @@ class TestSubColumnDetection:
)
def test_sub_column_split_page_refs(self):
"""Column with 3 'p.XX' left + 20 EN words right → split into 2."""
"""3 page-refs left + 30 vocab words right → split into 2.
The leftmost bin with >= 10% of words (i.e. >= 4) is the vocab bin
at left=250, so the 3 page-refs are outliers.
"""
content_w = 1000
# 3 page-ref words at left=100, 20 vocab words at left=250
page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)]
vocab_words = [self._make_word(250, f"word{i}") for i in range(20)]
vocab_words = [self._make_word(250, f"word{i}") for i in range(30)]
all_words = page_words + vocab_words
geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)
result = _detect_sub_columns([geo], content_w)
assert len(result) == 2, f"Expected 2 columns, got {len(result)}"
# Left sub-column should be narrower with fewer words
left_col = result[0]
right_col = result[1]
assert left_col.x < right_col.x
assert left_col.word_count == 3
assert right_col.word_count == 20
# Indices should be 0, 1
assert right_col.word_count == 30
assert left_col.index == 0
assert right_col.index == 1
def test_sub_column_split_exclamation_marks(self):
    """5 '!' (misread as I/|) left + 80 example words → split into 2.

    Mirrors the real-world case where red ! marks are OCR'd as I, |, B, 1
    at a position slightly left of the example sentence start.
    """
    content_w = 1500
    # Low-confidence single-char words whose left edges sit just left of
    # the example-sentence alignment bin (950..954 vs. 975+).  The split
    # is alignment-based, so the glyph text itself should not matter;
    # use the plain literal "I" (was chr(ord('I')), an identity round-trip).
    bang_words = [self._make_word(950 + i, "I", conf=60) for i in range(5)]
    example_words = [self._make_word(975 + (i * 3), f"word{i}") for i in range(80)]
    all_words = bang_words + example_words
    geo = self._make_geo(x=940, width=530, words=all_words, content_w=content_w)
    result = _detect_sub_columns([geo], content_w)
    assert len(result) == 2
    # Left sub-column collects the 5 misread '!' words; the 80 example
    # words stay together in the main column.
    assert result[0].word_count == 5
    assert result[1].word_count == 80
def test_no_split_uniform_alignment(self):
"""All words aligned at same position → no change."""
content_w = 1000
@@ -1228,7 +1247,6 @@ class TestSubColumnDetection:
content_w = 1000
words = [self._make_word(50, "a")] * 3 + [self._make_word(120, "b")] * 10
geo = self._make_geo(x=40, width=140, words=words, content_w=content_w)
# width_ratio = 140/1000 = 0.14 < 0.15
result = _detect_sub_columns([geo], content_w)
@@ -1241,7 +1259,6 @@ class TestSubColumnDetection:
right_words = [self._make_word(300, f"b{i}") for i in range(12)]
all_words = left_words + right_words
geo = self._make_geo(x=80, width=400, words=all_words, content_w=content_w)
# 8/20 = 0.4 >= 0.35 → no split
result = _detect_sub_columns([geo], content_w)
@@ -1250,26 +1267,23 @@ class TestSubColumnDetection:
def test_sub_column_reindexing(self):
"""After split, indices are correctly 0, 1, 2 across all columns."""
content_w = 1000
# First column: no split
# First column: no split (all words at same alignment)
words1 = [self._make_word(50, f"de{i}") for i in range(10)]
geo1 = ColumnGeometry(index=0, x=30, y=50, width=200, height=500,
word_count=10, words=words1, width_ratio=0.2)
# Second column: will split
# Second column: will split (3 outliers + 30 main)
page_words = [self._make_word(400, f"p.{i}") for i in range(3)]
en_words = [self._make_word(550, f"en{i}") for i in range(15)]
en_words = [self._make_word(550, f"en{i}") for i in range(30)]
geo2 = ColumnGeometry(index=1, x=380, y=50, width=300, height=500,
word_count=18, words=page_words + en_words, width_ratio=0.3)
word_count=33, words=page_words + en_words, width_ratio=0.3)
result = _detect_sub_columns([geo1, geo2], content_w)
assert len(result) == 3
assert [g.index for g in result] == [0, 1, 2]
# First column unchanged
assert result[0].word_count == 10
# Sub-column (page refs)
assert result[1].word_count == 3
# Main column (EN words)
assert result[2].word_count == 15
assert result[2].word_count == 30
def test_no_split_too_few_words(self):
"""Column with fewer than 5 words → no split attempted."""
@@ -1283,10 +1297,10 @@ class TestSubColumnDetection:
assert len(result) == 1
def test_no_split_single_minority_word(self):
"""Only 1 word in minority cluster → no split (need >= 2)."""
"""Only 1 word left of column start → no split (need >= 2)."""
content_w = 1000
minority = [self._make_word(100, "p.59")]
majority = [self._make_word(300, f"w{i}") for i in range(20)]
majority = [self._make_word(300, f"w{i}") for i in range(30)]
geo = self._make_geo(x=80, width=350, words=minority + majority, content_w=content_w)
result = _detect_sub_columns([geo], content_w)