Filter pipe-character word_boxes from OCR column divider artifacts
Step 4d removes word_boxes that consist solely of pipe characters (e.g. "|" or "||"), which OCR produces when reading physical vertical divider lines between columns. It also strips stray pipe chars from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1709,6 +1709,42 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# 4d. Remove pipe-character word_boxes (column divider artifacts).
|
||||
# OCR reads physical vertical divider lines as "|" or "||" characters.
|
||||
# These sit at consistent x positions near column boundaries and pollute
|
||||
# cell text. Remove them from word_boxes and rebuild cell text.
|
||||
_PIPE_RE = re.compile(r"^\|+$")
|
||||
for z in zones_data:
|
||||
removed_pipes = 0
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
|
||||
if len(filtered) < len(wbs):
|
||||
removed_pipes += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = " ".join(
|
||||
wb.get("text", "").strip()
|
||||
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||
if wb.get("text", "").strip()
|
||||
)
|
||||
# Remove cells that became empty after pipe removal
|
||||
if removed_pipes:
|
||||
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info(
|
||||
"build-grid: removed %d pipe-divider word_boxes from zone %d",
|
||||
removed_pipes, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# Also remove all pipe chars (anywhere in the text) that may remain in cell text
|
||||
# from word_boxes that contained mixed text like "word|" or "|word".
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if "|" in text:
|
||||
cleaned = text.replace("|", "").strip()
|
||||
if cleaned != text:
|
||||
cell["text"] = cleaned
|
||||
|
||||
# 5. Color annotation on final word_boxes in cells
|
||||
if img_bgr is not None:
|
||||
all_wb: List[Dict] = []
|
||||
|
||||
Reference in New Issue
Block a user