Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when it reads the physical vertical divider lines between columns. It also strips stray pipe characters from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:09:50 +01:00
parent 1f7989cfc2
commit 7ac09b5941
2 changed files with 128 additions and 0 deletions
@@ -418,6 +418,98 @@ class TestFilterBorderGhosts:
        assert len(filtered) == 0


+# ---------------------------------------------------------------------------
+# Step 4d: Pipe-character divider filter
+# ---------------------------------------------------------------------------
+
+class TestPipeDividerFilter:
+    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers."""
+
+    def test_pipe_word_boxes_removed(self):
+        """Word boxes with text '|' or '||' are removed from cells."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "hello | world",
+                    "word_boxes": [
+                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
+                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
+                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
+                    ],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        # Simulate Step 4d inline: the filter is re-implemented here, not imported from production code
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")  # matches a string made only of one or more pipe characters
+        for cell in zone["cells"]:
+            wbs = cell.get("word_boxes") or []  # tolerate a missing or None word_boxes entry
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):  # only touch cells that actually lost a pipe box
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(  # rebuild the cell text from the surviving boxes
+                    wb.get("text", "").strip()  # NOTE(review): raises AttributeError if text is None; the filter above guards with `or`
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()  # skip boxes whose text is blank after stripping
+                )
+        assert len(zone["cells"][0]["word_boxes"]) == 2
+        assert zone["cells"][0]["text"] == "hello world"
+
+    def test_pipe_only_cell_removed(self):
+        """A cell containing only '|' word_boxes becomes empty and is removed."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "hello",
+                    "word_boxes": [
+                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
+                    ],
+                },
+                {
+                    "cell_id": "Z0_R0_C1",
+                    "text": "|",
+                    "word_boxes": [
+                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
+                    ],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")  # pipe-only pattern, same as in the test above
+        removed = 0  # running count of pipe word_boxes dropped across all cells
+        for cell in zone["cells"]:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(  # text is recomputed in (top, left) order of the remaining boxes
+                    wb.get("text", "").strip()
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()
+                )
+        if removed:  # prune only when something was filtered; keeps cells that still have boxes or non-blank text
+            zone["cells"] = [c for c in zone["cells"] if (c.get("word_boxes") or c.get("text", "").strip())]
+        assert removed == 1
+        assert len(zone["cells"]) == 1
+        assert zone["cells"][0]["text"] == "hello"
+
+    def test_double_pipe_removed(self):
+        """'||' is also treated as a divider artifact."""
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")
+        assert _PIPE_RE.match("||") is not None
+        assert _PIPE_RE.match("|") is not None
+        assert _PIPE_RE.match("hello") is None  # ordinary words never match
+        assert _PIPE_RE.match("|word") is None  # a pipe fused to other characters is not a pure divider
+
+
 # ---------------------------------------------------------------------------
 # _detect_header_rows (Fix 3: skip_first_row_header)
 # ---------------------------------------------------------------------------