Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when reading
physical vertical divider lines between columns. Also strips stray pipe
chars from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 12:09:50 +01:00
parent 1f7989cfc2
commit 7ac09b5941
2 changed files with 128 additions and 0 deletions

View File

@@ -1709,6 +1709,42 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
removed_oversized, oversized_threshold, z.get("zone_index", 0),
)
# 4d. Remove pipe-character word_boxes (column divider artifacts).
# OCR reads physical vertical divider lines as "|" or "||" characters.
# These sit at consistent x positions near column boundaries and pollute
# cell text. Remove them from word_boxes and rebuild cell text.
_PIPE_RE = re.compile(r"^\|+$")
for z in zones_data:
removed_pipes = 0
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = " ".join(
wb.get("text", "").strip()
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
if wb.get("text", "").strip()
)
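# Drop cells that have neither word_boxes nor non-blank text. The filter
# checks every cell in the zone, so cells that were already empty before
# pipe removal are dropped as well.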
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"build-grid: removed %d pipe-divider word_boxes from zone %d",
removed_pipes, z.get("zone_index", 0),
)
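# Also remove every remaining pipe char from cell text (str.replace drops
# all "|" occurrences, not only leading/trailing ones). Such chars can remain
# from word_boxes that contained mixed text like "word|" or "|word". Only
# cell["text"] is cleaned here; the word_boxes themselves are left unchanged.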
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if "|" in text:
cleaned = text.replace("|", "").strip()
if cleaned != text:
cell["text"] = cleaned
# 5. Color annotation on final word_boxes in cells
if img_bgr is not None:
all_wb: List[Dict] = []