From 7ac09b59413ad73db9c20f940a02ec133ed2daec Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 20 Mar 2026 12:09:50 +0100
Subject: [PATCH] Filter pipe-character word_boxes from OCR column divider artifacts

Step 4d removes "|" and "||" word_boxes that OCR produces when reading
physical vertical divider lines between columns. Also strips stray pipe
chars from cell text.

Co-Authored-By: Claude Opus 4.6
---
Review notes (not part of the commit message): the hunks below were revised
during review, so the blob ids on the "index" lines still name the originally
submitted revision. Context-line indentation was reconstructed and should be
checked against the target files before applying.

 klausur-service/backend/grid_editor_api.py    |  40 +++++
 .../backend/tests/test_grid_editor_api.py     | 137 ++++++++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index eb3406a..4daf05c 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1709,6 +1709,46 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                     removed_oversized, oversized_threshold, z.get("zone_index", 0),
                 )
 
+    # 4d. Remove pipe-character word_boxes (column divider artifacts).
+    # OCR reads physical vertical divider lines as "|" or "||" characters.
+    # These sit at consistent x positions near column boundaries and pollute
+    # cell text. Remove them from word_boxes and rebuild cell text.
+    _PIPE_RE = re.compile(r"^\|+$")
+    for z in zones_data:
+        removed_pipes = 0
+        kept_cells = []
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed_pipes += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                # "text" may be present but None, so normalise it the same
+                # way the filter above does before stripping.
+                ordered = sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                words = [(wb.get("text") or "").strip() for wb in ordered]
+                cell["text"] = " ".join(w for w in words if w)
+                # Drop only cells emptied by this step; cells that were
+                # already empty before it are left untouched.
+                if not filtered:
+                    continue
+            kept_cells.append(cell)
+        if removed_pipes:
+            z["cells"] = kept_cells
+            logger.info(
+                "build-grid: removed %d pipe-divider word_boxes from zone %d",
+                removed_pipes, z.get("zone_index", 0),
+            )
+
+    # Word_boxes with mixed text like "word|" or "|word" survive the filter
+    # above, so remove every remaining pipe character from the cell text.
+    # Only cell["text"] is cleaned here; the word_boxes keep their OCR text.
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text") or ""
+            if "|" in text:
+                cell["text"] = text.replace("|", "").strip()
+
     # 5. Color annotation on final word_boxes in cells
     if img_bgr is not None:
         all_wb: List[Dict] = []
diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py
index 708bf57..a268c65 100644
--- a/klausur-service/backend/tests/test_grid_editor_api.py
+++ b/klausur-service/backend/tests/test_grid_editor_api.py
@@ -418,6 +418,143 @@ class TestFilterBorderGhosts:
         assert len(filtered) == 0
 
 
+# ---------------------------------------------------------------------------
+# Step 4d: Pipe-character divider filter
+# ---------------------------------------------------------------------------
+
+class TestPipeDividerFilter:
+    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.
+
+    Step 4d is inline code in _build_grid_core, so these tests run a mirror of
+    it (_apply_step_4d). Keep the mirror in sync with grid_editor_api.py.
+    """
+
+    _PIPE_PATTERN = r"^\|+$"
+
+    @classmethod
+    def _apply_step_4d(cls, zone):
+        """Run the Step 4d mirror on one zone; return the removed box count."""
+        import re
+
+        pipe_re = re.compile(cls._PIPE_PATTERN)
+        removed = 0
+        kept_cells = []
+        for cell in zone["cells"]:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if not pipe_re.match((wb.get("text") or "").strip())]
+            if len(filtered) < len(wbs):
+                removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                ordered = sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                words = [(wb.get("text") or "").strip() for wb in ordered]
+                cell["text"] = " ".join(w for w in words if w)
+                if not filtered:
+                    continue
+            kept_cells.append(cell)
+        if removed:
+            zone["cells"] = kept_cells
+        for cell in zone["cells"]:
+            text = cell.get("text") or ""
+            if "|" in text:
+                cell["text"] = text.replace("|", "").strip()
+        return removed
+
+    @staticmethod
+    def _wb(text, left, width=40):
+        """Build a word_box on one fixed text line at x position `left`."""
+        return {"text": text, "top": 10, "left": left, "height": 15, "width": width}
+
+    def test_pipe_word_boxes_removed(self):
+        """Word boxes with text '|' or '||' are removed from cells."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "hello | world",
+                    "word_boxes": [
+                        self._wb("hello", 10),
+                        self._wb("|", 55, width=5),
+                        self._wb("world", 65),
+                    ],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        assert self._apply_step_4d(zone) == 1
+        assert len(zone["cells"][0]["word_boxes"]) == 2
+        assert zone["cells"][0]["text"] == "hello world"
+
+    def test_pipe_only_cell_removed(self):
+        """A cell containing only '|' word_boxes becomes empty and is removed."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {"cell_id": "Z0_R0_C0", "text": "hello", "word_boxes": [self._wb("hello", 10)]},
+                {"cell_id": "Z0_R0_C1", "text": "|", "word_boxes": [self._wb("|", 740, width=5)]},
+            ],
+            "rows": [{"index": 0}],
+        }
+        assert self._apply_step_4d(zone) == 1
+        assert len(zone["cells"]) == 1
+        assert zone["cells"][0]["text"] == "hello"
+
+    def test_unrelated_empty_cell_kept(self):
+        """Only cells emptied by the filter are dropped, not cells that were already empty."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {"cell_id": "Z0_R0_C0", "text": "", "word_boxes": []},
+                {"cell_id": "Z0_R0_C1", "text": "|", "word_boxes": [self._wb("|", 740, width=5)]},
+            ],
+            "rows": [{"index": 0}],
+        }
+        assert self._apply_step_4d(zone) == 1
+        assert [c["cell_id"] for c in zone["cells"]] == ["Z0_R0_C0"]
+
+    def test_none_text_word_box_tolerated(self):
+        """A word_box whose text is None must not break the text rebuild."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "| ok",
+                    "word_boxes": [self._wb("|", 5, width=5), self._wb(None, 20), self._wb("ok", 65)],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        assert self._apply_step_4d(zone) == 1
+        assert zone["cells"][0]["text"] == "ok"
+
+    def test_mixed_text_pipes_stripped(self):
+        """Pipes glued to a word ('word|') are stripped from the cell text."""
+        zone = {
+            "zone_index": 0,
+            "cells": [
+                {
+                    "cell_id": "Z0_R0_C0",
+                    "text": "word| next",
+                    "word_boxes": [self._wb("word|", 10), self._wb("next", 65)],
+                },
+            ],
+            "rows": [{"index": 0}],
+        }
+        assert self._apply_step_4d(zone) == 0
+        assert zone["cells"][0]["text"] == "word next"
+
+    def test_double_pipe_removed(self):
+        """'||' is also treated as a divider artifact."""
+        import re
+
+        pipe_re = re.compile(self._PIPE_PATTERN)
+        assert pipe_re.match("||") is not None
+        assert pipe_re.match("|") is not None
+        assert pipe_re.match("hello") is None
+        assert pipe_re.match("|word") is None
+
+
 # ---------------------------------------------------------------------------
 # _detect_header_rows (Fix 3: skip_first_row_header)
 # ---------------------------------------------------------------------------