Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when reading
physical vertical divider lines between columns. Also strips stray pipe
chars from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 12:09:50 +01:00
parent 1f7989cfc2
commit 7ac09b5941
2 changed files with 128 additions and 0 deletions

View File

@@ -418,6 +418,98 @@ class TestFilterBorderGhosts:
assert len(filtered) == 0
# ---------------------------------------------------------------------------
# Step 4d: Pipe-character divider filter
# ---------------------------------------------------------------------------
class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers."""

    @staticmethod
    def _pipe_pattern():
        """Return the compiled regex matching text made solely of '|' characters."""
        # Local import keeps this class self-contained; the module-level
        # imports of this file are not guaranteed to include ``re``.
        import re

        return re.compile(r"^\|+$")

    @classmethod
    def _apply_pipe_filter(cls, zone):
        """Simulate Step 4d inline on *zone* and return the removed word_box count.

        For every cell, word_boxes whose stripped text is only pipe characters
        are dropped. When a cell lost at least one word_box, its ``text`` is
        rebuilt from the remaining boxes in (top, left) order. If anything was
        removed, cells left with neither word_boxes nor text are dropped from
        ``zone["cells"]``. The zone is mutated in place.

        NOTE(review): this mirrors the Step 4d logic rather than calling the
        production code — confirm it stays in sync with the implementation.
        """
        pipe_re = cls._pipe_pattern()
        removed = 0
        for cell in zone["cells"]:
            wbs = cell.get("word_boxes") or []
            kept = [
                wb for wb in wbs
                if not pipe_re.match((wb.get("text") or "").strip())
            ]
            if len(kept) == len(wbs):
                continue  # nothing filtered: leave the cell untouched
            removed += len(wbs) - len(kept)
            cell["word_boxes"] = kept
            ordered = sorted(kept, key=lambda w: (w.get("top", 0), w.get("left", 0)))
            # ``or ""`` guards against an explicit ``"text": None`` entry, the
            # same way the filter above already does.
            texts = ((wb.get("text") or "").strip() for wb in ordered)
            cell["text"] = " ".join(t for t in texts if t)
        if removed:
            zone["cells"] = [
                c for c in zone["cells"]
                if (c.get("word_boxes") or (c.get("text") or "").strip())
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }

        self._apply_pipe_filter(zone)

        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }

        removed = self._apply_pipe_filter(zone)

        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        pipe_re = self._pipe_pattern()
        assert pipe_re.match("||") is not None
        assert pipe_re.match("|") is not None
        assert pipe_re.match("hello") is None
        assert pipe_re.match("|word") is None
# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------