Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when reading physical vertical divider lines between columns. It also strips stray pipe chars from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:09:50 +01:00
parent 1f7989cfc2
commit 7ac09b5941
2 changed files with 128 additions and 0 deletions
@@ -1709,6 +1709,42 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                removed_oversized, oversized_threshold, z.get("zone_index", 0),
            )

+    # 4d. Remove pipe-character word_boxes (column divider artifacts).
+    # OCR reads physical vertical divider lines as "|" or "||" characters.
+    # These sit at consistent x positions near column boundaries and pollute
+    # cell text.  Remove them from word_boxes and rebuild cell text.
+    _PIPE_RE = re.compile(r"^\|+$")
+    for z in zones_data:
+        removed_pipes = 0
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed_pipes += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(
+                    wb.get("text", "").strip()
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()
+                )
+        # Remove cells that became empty after pipe removal
+        if removed_pipes:
+            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
+            logger.info(
+                "build-grid: removed %d pipe-divider word_boxes from zone %d",
+                removed_pipes, z.get("zone_index", 0),
+            )
+
+    # Also strip any pipe chars that remain in cell text (every "|" is removed,
+    # not only leading/trailing ones), e.g. from word_boxes like "word|" or "|word".
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text", "")
+            if "|" in text:
+                cleaned = text.replace("|", "").strip()
+                if cleaned != text:
+                    cell["text"] = cleaned
+
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
@@ -418,6 +418,98 @@ class TestFilterBorderGhosts:
        assert len(filtered) == 0


+# ---------------------------------------------------------------------------
+# Step 4d: Pipe-character divider filter
+# ---------------------------------------------------------------------------
+
+class TestPipeDividerFilter:
+    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers."""
+
+    def test_pipe_word_boxes_removed(self):
+        """Word boxes with text '|' or '||' are removed from cells."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "hello | world",
+                    "word_boxes": [
+                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
+                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
+                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
+                    ],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        # Simulate Step 4d inline
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")
+        for cell in zone["cells"]:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(
+                    wb.get("text", "").strip()
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()
+                )
+        assert len(zone["cells"][0]["word_boxes"]) == 2
+        assert zone["cells"][0]["text"] == "hello world"
+
+    def test_pipe_only_cell_removed(self):
+        """A cell containing only '|' word_boxes becomes empty and is removed."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "hello",
+                    "word_boxes": [
+                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
+                    ],
+                },
+                {
+                    "cell_id": "Z0_R0_C1",
+                    "text": "|",
+                    "word_boxes": [
+                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
+                    ],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")
+        removed = 0
+        for cell in zone["cells"]:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(
+                    wb.get("text", "").strip()
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()
+                )
+        if removed:
+            zone["cells"] = [c for c in zone["cells"] if (c.get("word_boxes") or c.get("text", "").strip())]
+        assert removed == 1
+        assert len(zone["cells"]) == 1
+        assert zone["cells"][0]["text"] == "hello"
+
+    def test_double_pipe_removed(self):
+        """'||' is also treated as a divider artifact."""
+        import re
+        _PIPE_RE = re.compile(r"^\|+$")
+        assert _PIPE_RE.match("||") is not None
+        assert _PIPE_RE.match("|") is not None
+        assert _PIPE_RE.match("hello") is None
+        assert _PIPE_RE.match("|word") is None
+
+
 # ---------------------------------------------------------------------------
 # _detect_header_rows (Fix 3: skip_first_row_header)
 # ---------------------------------------------------------------------------