Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when reading physical vertical divider lines between columns. It also strips stray pipe characters from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:09:50 +01:00
parent 1f7989cfc2
commit 7ac09b5941
2 changed files with 128 additions and 0 deletions
@@ -1709,6 +1709,42 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                removed_oversized, oversized_threshold, z.get("zone_index", 0),
            )
    # 4d. Remove pipe-character word_boxes (column divider artifacts).
    # OCR reads physical vertical divider lines as "|" or "||" characters.
    # These sit at consistent x positions near column boundaries and pollute
    # cell text.  Remove them from word_boxes and rebuild cell text.
    # A missing "text" key and an explicit None value are both treated as an
    # empty string everywhere in this step, so a word_box or cell without
    # text can never raise while the pipes around it are being cleaned up.
    _PIPE_RE = re.compile(r"^\|+$")
    for z in zones_data:
        removed_pipes = 0
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
            if len(filtered) < len(wbs):
                removed_pipes += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                # Rebuild the text from the surviving boxes, ordered by
                # (top, left); boxes whose text is blank contribute nothing.
                ordered = sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
                words = ((wb.get("text") or "").strip() for wb in ordered)
                cell["text"] = " ".join(word for word in words if word)
        # Remove cells that are empty after pipe removal.  This only runs for
        # zones that lost at least one pipe word_box, and it drops every cell
        # of that zone that has no word_boxes and blank text.
        if removed_pipes:
            z["cells"] = [
                c for c in z.get("cells", [])
                if c.get("word_boxes") or (c.get("text") or "").strip()
            ]
            logger.info(
                "build-grid: removed %d pipe-divider word_boxes from zone %d",
                removed_pipes, z.get("zone_index", 0),
            )
    # Also remove pipe chars that remain in cell text, e.g. from word_boxes
    # that contained mixed text like "word|" or "|word".  Every "|" in the
    # text is dropped (interior ones too, not only leading/trailing) and the
    # result is trimmed.  Only cell["text"] is cleaned here; the word_boxes
    # keep their original text.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "").strip()
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
@@ -418,6 +418,98 @@ class TestFilterBorderGhosts:
        assert len(filtered) == 0
 # ---------------------------------------------------------------------------
 # Step 4d: Pipe-character divider filter
 # ---------------------------------------------------------------------------
 class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    The tests do not call the grid builder; they run a local simulation of
    Step 4d (``_apply_step_4d``) against hand-built zone dicts.  The
    simulation is kept in one place so both tests exercise the same logic.
    """

    @staticmethod
    def _apply_step_4d(zone):
        """Simulate Step 4d on a single zone dict, mutating it in place.

        Word boxes whose stripped text consists only of '|' characters are
        dropped, the text of every affected cell is rebuilt from the
        remaining boxes ordered by (top, left), and, if anything was
        removed, cells left with neither word_boxes nor text are pruned.

        Returns the number of word boxes that were removed.
        """
        import re

        pipe_re = re.compile(r"^\|+$")
        removed = 0
        for cell in zone["cells"]:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if not pipe_re.match((wb.get("text") or "").strip())]
            if len(filtered) < len(wbs):
                removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = " ".join(
                    wb.get("text", "").strip()
                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
                    if wb.get("text", "").strip()
                )
        if removed:
            zone["cells"] = [c for c in zone["cells"] if (c.get("word_boxes") or c.get("text", "").strip())]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        self._apply_step_4d(zone)
        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._apply_step_4d(zone)
        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        import re

        _PIPE_RE = re.compile(r"^\|+$")
        # Pure pipe runs match ...
        assert _PIPE_RE.match("||") is not None
        assert _PIPE_RE.match("|") is not None
        # ... while ordinary words and mixed tokens do not.
        assert _PIPE_RE.match("hello") is None
        assert _PIPE_RE.match("|word") is None
 # ---------------------------------------------------------------------------
 # _detect_header_rows (Fix 3: skip_first_row_header)
 # ---------------------------------------------------------------------------