fix: border strip pre-filter + 3-column detection for vocabulary tables

The border strip filter (Step 4e) used the LARGEST x-gap, which incorrectly removed base words along with edge artifacts. It now uses a two-stage approach:

1. _filter_border_strip_words() pre-filters raw words BEFORE column detection, scanning from the page edge inward to find the FIRST significant gap (>30px).
2. Step 4e runs as a fallback only when the pre-filter didn't apply.

Session 4233 now correctly detects 3 columns (base word | oder | synonyms) instead of 2. The threshold was raised from 15% to 20% to handle pages with many edge artifacts. All 4 ground-truth sessions pass regression.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 21:01:43 +01:00
parent 4000110501
commit 46c8c28d34
2 changed files with 212 additions and 117 deletions
@@ -1109,64 +1109,56 @@ class TestBorderStripFilter:

    def test_left_border_strip_removed(self):
        """Word_boxes at x<120 with 45px gap to content at x>=179 are removed."""
-        # Simulate border strip (11 wbs) + real content (20 wbs)
+        # Simulate border strip (3 wbs) + base words (7 wbs) + oder (7 wbs)
+        # + synonyms (20 wbs). The old "largest gap" algorithm would pick
+        # the 67px gap between base words and "oder", removing base words.
+        # The new "first gap from edge" algorithm picks the 45px gap between
+        # border artifacts and base words.
        border_wbs = [
-            self._make_wb("M", 49, 436, 46, 44),
-            self._make_wb("x", 113, 610, 21, 38),
-            self._make_wb("Er", 45, 998, 62, 37),
+            self._make_wb("M", 49, 436, 46, 44),    # right=95
+            self._make_wb("x", 113, 610, 21, 38),    # right=134
+            self._make_wb("Er", 45, 998, 62, 37),     # right=107
        ]
-        content_wbs = []
-        for i in range(20):
-            # Place content words at x=179 and x=280 (gap=1px between them,
-            # much smaller than the 45px border-to-content gap)
-            content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40))
-        # Build zone with cells
-        cells = []
-        # Border-only cells
-        for i, wb in enumerate(border_wbs):
-            cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
-                          "word_boxes": [wb], "text": wb["text"]})
-        # Content cells
-        for i, wb in enumerate(content_wbs):
-            ri = len(border_wbs) + i
-            cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
-                          "word_boxes": [wb], "text": wb["text"]})
-        zone = {"zone_index": 0, "zone_type": "content", "cells": cells,
-                "columns": [], "rows": []}
-        # The filter runs inside _build_grid_core, but we can test the
-        # pattern detection logic: 3 border wbs + 20 content wbs,
-        # border right edge = 113+21=134, content left = 179, gap = 45px
-        # 3/23 = 13% < 15% threshold
-        from cv_ocr_engines import _group_words_into_lines
-        all_left = sorted(
-            [(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])],
-            key=lambda t: t[0]
-        )
-        # Find largest gap
-        best_gap = 0
-        best_idx = -1
-        for gi in range(len(all_left) - 1):
-            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
-            gap = all_left[gi + 1][0] - right_edge
-            if gap > best_gap:
-                best_gap = gap
-                best_idx = gi
-        assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
-        left_count = best_idx + 1
+        base_wbs = [self._make_wb(f"base{i}", 179, 100 + i * 60, 100, 20) for i in range(7)]
+        oder_wbs = [self._make_wb("oder", 379, 100 + i * 60, 68, 20) for i in range(7)]
+        synonym_wbs = [self._make_wb(f"syn{i}", 452 + (i % 5) * 30, 100 + (i // 5) * 60, 80, 20) for i in range(20)]
+
+        all_wbs = border_wbs + base_wbs + oder_wbs + synonym_wbs
+        all_left = sorted([(wb["left"], wb) for wb in all_wbs], key=lambda t: t[0])
        total = len(all_left)
-        assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
+
+        # New algorithm: scan from left edge, find FIRST gap >30px
+        running_right = 0
+        left_strip_count = 0
+        for gi in range(total - 1):
+            running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
+            gap = all_left[gi + 1][0] - running_right
+            if gap > 30:
+                left_strip_count = gi + 1
+                break
+
+        # Should find the 45px gap between border (right=134) and base (left=179)
+        assert left_strip_count == len(border_wbs), (
+            f"Expected {len(border_wbs)} border wbs, got {left_strip_count}"
+        )
+        assert left_strip_count / total < 0.20, (
+            f"Border ratio {left_strip_count}/{total} should be <20%"
+        )

    def test_no_removal_when_no_gap(self):
        """No gap > 30px between word_boxes → nothing removed."""
+        # Words spaced 20px apart with width 50 → overlap, no gap >30px
        wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
        all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0])
-        best_gap = 0
+        running_right = 0
+        found_gap = False
        for gi in range(len(all_left) - 1):
-            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
-            gap = all_left[gi + 1][0] - right_edge
-            if gap > best_gap:
-                best_gap = gap
-        assert best_gap < 30, f"No significant gap expected, got {best_gap}"
+            running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
+            gap = all_left[gi + 1][0] - running_right
+            if gap > 30:
+                found_gap = True
+                break
+        assert not found_gap, "No significant gap expected"

    def test_equal_sides_not_removed(self):
        """Two roughly equal groups (50/50) are NOT treated as border strip."""
@@ -1176,15 +1168,17 @@ class TestBorderStripFilter:
            [(wb["left"], wb) for wb in left_wbs + right_wbs],
            key=lambda t: t[0]
        )
-        best_gap = 0
-        best_idx = -1
-        for gi in range(len(all_left) - 1):
-            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
-            gap = all_left[gi + 1][0] - right_edge
-            if gap > best_gap:
-                best_gap = gap
-                best_idx = gi
-        left_count = best_idx + 1
        total = len(all_left)
+        # Left scan: first gap >30px from left
+        running_right = 0
+        left_strip_count = 0
+        for gi in range(total - 1):
+            running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
+            gap = all_left[gi + 1][0] - running_right
+            if gap > 30:
+                left_strip_count = gi + 1
+                break
        # 10/20 = 50% — NOT below 15% threshold, so no removal
-        assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal"
+        assert left_strip_count == 0 or left_strip_count / total >= 0.20, (
+            "Equal groups should NOT trigger border removal"
+        )