From 7750b2a05f93255acaf9c5d5eec1318c211c34af Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 23:04:00 +0100 Subject: [PATCH] Fix ghost filter for borderless boxes + remove oversized graphic artifacts 1. Skip ghost filtering for boxes with border_thickness=0 (images/graphics have no border lines to produce OCR artifacts like |, I) 2. Remove individual word_boxes with height > 3x zone median (OCR from graphics like a huge "N" from a map image below text) Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 49 +++++++++++++++++-- .../backend/tests/test_grid_editor_api.py | 15 +++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index c5dbf4b..2a2d479 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -290,15 +290,18 @@ def _filter_border_ghosts( x_bands: List[tuple] = [] y_bands: List[tuple] = [] for b in boxes: - bx = b.x if hasattr(b, "x") else b.get("x", 0) - by = b.y if hasattr(b, "y") else b.get("y", 0) - bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0)) - bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0)) bt = ( b.border_thickness if hasattr(b, "border_thickness") else b.get("border_thickness", 3) ) + # Skip borderless boxes (images/graphics) — no border line to produce ghosts + if bt == 0: + continue + bx = b.x if hasattr(b, "x") else b.get("x", 0) + by = b.y if hasattr(b, "y") else b.get("y", 0) + bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0)) + bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0)) margin = max(bt * 2, 10) + 6 x_bands.append((bx - margin, bx + margin)) x_bands.append((bx + bw - margin, bx + bw + margin)) @@ -1518,6 +1521,44 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: sorted(junk_row_indices), ) + # 4c. Remove oversized word_boxes from individual cells. + # OCR artifacts from graphics/images (e.g. a huge "N" from a map image) + # have word heights 3-5x the median. Remove them per-word so they don't + # pollute cells that also contain valid text in other columns. + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wh = [ + wb["height"] + for cell in cells + for wb in cell.get("word_boxes") or [] + if wb.get("height", 0) > 0 + ] + if not all_wh: + continue + med_h = sorted(all_wh)[len(all_wh) // 2] + oversized_threshold = med_h * 3 + removed_oversized = 0 + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] + if len(filtered) < len(wbs): + removed_oversized += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = " ".join( + wb.get("text", "").strip() + for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0))) + if wb.get("text", "").strip() + ) + if removed_oversized: + # Remove cells that became empty after oversized removal + z["cells"] = [c for c in cells if c.get("word_boxes")] + logger.info( + "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", + removed_oversized, oversized_threshold, z.get("zone_index", 0), + ) + # 5. Color annotation on final word_boxes in cells if img_bgr is not None: all_wb: List[Dict] = [] diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 6e6e342..c9c1284 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -384,8 +384,8 @@ class TestFilterBorderGhosts: assert filtered[0]["text"] == "hello" def test_multi_char_ghost_kept(self): - """Multi-char '(=' on a box border → NOT filtered (real content).""" - box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0) + """Multi-char '(=' on a bordered box → NOT filtered (real content).""" + box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=3) words = [ {"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17}, {"text": "I", "left": 665, "top": 294, "width": 9, "height": 18}, @@ -394,6 +394,17 @@ class TestFilterBorderGhosts: assert count == 0 assert len(filtered) == 2 + def test_borderless_box_no_ghost_filter(self): + """Borderless box (border_thickness=0) → no ghost filtering at all.""" + box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0) + words = [ + {"text": "I", "left": 643, "top": 272, "width": 6, "height": 19}, # near box edge + {"text": "|", "left": 647, "top": 200, "width": 3, "height": 10}, # even pipe on edge + ] + filtered, count = _filter_border_ghosts(words, [box]) + assert count == 0 # nothing filtered — borderless box + assert len(filtered) == 2 + def test_single_paren_on_border_removed(self): """Single ')' on border → filtered.""" box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)