Filter pipe-character word_boxes from OCR column divider artifacts
Step 4d removes "|" and "||" word_boxes that OCR produces when reading physical vertical divider lines between columns. Also strips stray pipe chars from cell text. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1709,6 +1709,42 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 4d. Remove pipe-character word_boxes (column divider artifacts).
|
||||||
|
# OCR reads physical vertical divider lines as "|" or "||" characters.
|
||||||
|
# These sit at consistent x positions near column boundaries and pollute
|
||||||
|
# cell text. Remove them from word_boxes and rebuild cell text.
|
||||||
|
_PIPE_RE = re.compile(r"^\|+$")
|
||||||
|
for z in zones_data:
|
||||||
|
removed_pipes = 0
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
wbs = cell.get("word_boxes") or []
|
||||||
|
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
|
||||||
|
if len(filtered) < len(wbs):
|
||||||
|
removed_pipes += len(wbs) - len(filtered)
|
||||||
|
cell["word_boxes"] = filtered
|
||||||
|
cell["text"] = " ".join(
|
||||||
|
wb.get("text", "").strip()
|
||||||
|
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||||
|
if wb.get("text", "").strip()
|
||||||
|
)
|
||||||
|
# Remove cells that became empty after pipe removal
|
||||||
|
if removed_pipes:
|
||||||
|
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||||
|
logger.info(
|
||||||
|
"build-grid: removed %d pipe-divider word_boxes from zone %d",
|
||||||
|
removed_pipes, z.get("zone_index", 0),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Also strip every remaining pipe char (anywhere, not only leading/trailing) from cell text; these remain
|
||||||
|
# from word_boxes that contained mixed text like "word|" or "|word".
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if "|" in text:
|
||||||
|
cleaned = text.replace("|", "").strip()
|
||||||
|
if cleaned != text:
|
||||||
|
cell["text"] = cleaned
|
||||||
|
|
||||||
# 5. Color annotation on final word_boxes in cells
|
# 5. Color annotation on final word_boxes in cells
|
||||||
if img_bgr is not None:
|
if img_bgr is not None:
|
||||||
all_wb: List[Dict] = []
|
all_wb: List[Dict] = []
|
||||||
|
|||||||
@@ -418,6 +418,98 @@ class TestFilterBorderGhosts:
|
|||||||
assert len(filtered) == 0
|
assert len(filtered) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 4d: Pipe-character divider filter
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    The production filter lives inline in the grid builder, so these tests
    exercise a local mirror of that logic (``_apply_pipe_filter``) instead of
    copy-pasting the loop into every test.
    """

    @staticmethod
    def _apply_pipe_filter(zone):
        """Apply a mirror of the Step 4d pipe filter to ``zone`` in place.

        Word boxes whose stripped text consists solely of '|' characters are
        dropped, the text of every affected cell is rebuilt from the surviving
        word boxes (ordered by ``top`` then ``left``), and cells left with
        neither word boxes nor text are removed from the zone.

        Args:
            zone: dict with a ``"cells"`` list; each cell may carry
                ``"word_boxes"`` (list of dicts with ``"text"``, ``"top"``,
                ``"left"``) and ``"text"``.

        Returns:
            The number of word boxes that were removed.
        """
        # Local import: the tests originally imported ``re`` per test, so a
        # module-level import cannot be assumed here.
        import re

        pipe_re = re.compile(r"^\|+$")
        removed = 0
        for cell in zone.get("cells", []):
            wbs = cell.get("word_boxes") or []
            filtered = [
                wb for wb in wbs
                if not pipe_re.match((wb.get("text") or "").strip())
            ]
            if len(filtered) < len(wbs):
                removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                # Rebuild the text in reading order from what is left.
                cell["text"] = " ".join(
                    wb.get("text", "").strip()
                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
                    if wb.get("text", "").strip()
                )
        # Only prune cells when something was actually removed.
        if removed:
            zone["cells"] = [
                c for c in zone.get("cells", [])
                if (c.get("word_boxes") or c.get("text", "").strip())
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        self._apply_pipe_filter(zone)
        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._apply_pipe_filter(zone)
        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        import re

        _PIPE_RE = re.compile(r"^\|+$")
        assert _PIPE_RE.match("||") is not None
        assert _PIPE_RE.match("|") is not None
        assert _PIPE_RE.match("hello") is None
        assert _PIPE_RE.match("|word") is None

    def test_whitespace_padded_pipe_removed(self):
        """Pipe boxes padded with whitespace are matched after stripping."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "a || b",
                    "word_boxes": [
                        {"text": "a", "top": 10, "left": 10, "height": 15, "width": 10},
                        {"text": " || ", "top": 10, "left": 25, "height": 15, "width": 8},
                        {"text": "b", "top": 10, "left": 40, "height": 15, "width": 10},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._apply_pipe_filter(zone)
        assert removed == 1
        assert zone["cells"][0]["text"] == "a b"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _detect_header_rows (Fix 3: skip_first_row_header)
|
# _detect_header_rows (Fix 3: skip_first_row_header)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user