diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index c7f25f3..3523964 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1894,6 +1894,74 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 if cleaned != text:
                     cell["text"] = cleaned
 
+    # 4e. Detect and remove page-border decoration strips.
+    # Some textbooks have decorative alphabet strips along the page edge
+    # (coloured letters, illustrations). OCR picks up scattered letters
+    # from these as artifacts. Detection: find a significant x-gap
+    # (>=30 px) between a small cluster of word_boxes near the page edge
+    # and the main content block.
+    border_strip_removed = 0  # running total across all zones
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        # Collect all word_boxes with their cell reference
+        all_wbs_with_cell: List[tuple] = []  # (left, wb, cell)
+        for cell in cells:
+            for wb in cell.get("word_boxes") or []:
+                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
+        if len(all_wbs_with_cell) < 10:
+            continue
+        # Sort by x and find the largest gap.  Each gap is measured from the
+        # furthest right edge seen so far, not just the previous box: a wide
+        # box that starts early can still cover the space in front of the
+        # next box and must not be mistaken for empty margin.
+        all_wbs_with_cell.sort(key=lambda t: t[0])
+        best_gap = 0
+        best_gap_idx = -1
+        max_right = all_wbs_with_cell[0][0]
+        for gi in range(len(all_wbs_with_cell) - 1):
+            wb_left, wb = all_wbs_with_cell[gi][:2]
+            max_right = max(max_right, wb_left + wb.get("width", 0))
+            gap = all_wbs_with_cell[gi + 1][0] - max_right
+            if gap > best_gap:
+                best_gap = gap
+                best_gap_idx = gi
+        if best_gap < 30 or best_gap_idx < 0:
+            continue
+        left_count = best_gap_idx + 1
+        right_count = len(all_wbs_with_cell) - left_count
+        total = len(all_wbs_with_cell)
+        # The border strip is the SMALLER side with < 15% of total
+        if left_count < right_count and left_count / total < 0.15:
+            strip_side = "left"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
+        elif right_count < left_count and right_count / total < 0.15:
+            strip_side = "right"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
+        else:
+            continue
+        # Remove strip word_boxes from cells
+        zone_removed = 0
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
+            if len(filtered) < len(wbs):
+                zone_removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = _words_to_reading_order_text(filtered)
+        border_strip_removed += zone_removed
+        # Remove cells that became empty (`or ""` tolerates a None "text")
+        z["cells"] = [c for c in cells
+                      if (c.get("word_boxes") or (c.get("text") or "").strip())]
+        # Log the per-zone count: the message describes this zone only.
+        logger.info(
+            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
+            "(gap=%dpx, strip=%d/%d wbs)",
+            zone_removed, strip_side, z.get("zone_index", 0),
+            best_gap, left_count if strip_side == "left" else right_count, total,
+        )
+
     # 5. Color annotation on final word_boxes in cells
     if img_bgr is not None:
         all_wb: List[Dict] = []
diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py
index f32a8d2..d66111e 100644
--- a/klausur-service/backend/tests/test_grid_editor_api.py
+++ b/klausur-service/backend/tests/test_grid_editor_api.py
@@ -1093,3 +1093,100 @@ class TestWordBoxReadingOrder:
         assert [w["text"] for w in sorted_wbs] == ["tie", "sb/sth", "up"]
         # Same objects, same order
         assert [id(w) for w in sorted_wbs] == [id(w) for w in wbs]
+
+
+# ---------------------------------------------------------------------------
+# Border strip detection (Step 4e)
+# ---------------------------------------------------------------------------
+
+class TestBorderStripFilter:
+    """Verify the gap/ratio heuristic that detects page-border strips.
+
+    Step 4e runs inline in ``_build_grid_core``, so these tests mirror its
+    detection arithmetic (largest x-gap, share of word_boxes left of it)
+    instead of calling the production code.
+    """
+
+    @staticmethod
+    def _make_wb(text, left, top, width=50, height=20, conf=95):
+        return {"text": text, "left": left, "top": top,
+                "width": width, "height": height, "conf": conf}
+
+    @staticmethod
+    def _largest_gap(wbs):
+        """Return ``(best_gap, left_count, total)`` for *wbs*.
+
+        Mirrors Step 4e: sort by ``left`` and measure each gap from the
+        furthest right edge seen so far.  ``left_count`` is the number of
+        word_boxes left of the largest gap (0 when there is no positive gap).
+        """
+        ordered = sorted(wbs, key=lambda wb: wb["left"])
+        best_gap = 0
+        best_idx = -1
+        max_right = ordered[0]["left"]
+        for gi in range(len(ordered) - 1):
+            max_right = max(max_right, ordered[gi]["left"] + ordered[gi]["width"])
+            gap = ordered[gi + 1]["left"] - max_right
+            if gap > best_gap:
+                best_gap = gap
+                best_idx = gi
+        return best_gap, best_idx + 1, len(ordered)
+
+    def test_left_border_strip_removed(self):
+        """Three strip word_boxes left of a 45px gap qualify as border strip."""
+        border_wbs = [
+            self._make_wb("M", 49, 436, 46, 44),
+            self._make_wb("x", 113, 610, 21, 38),
+            self._make_wb("Er", 45, 998, 62, 37),
+        ]
+        # Three content columns at x=179/279/379.  width=80 keeps the
+        # inter-column gaps (20px) below the 45px strip gap; with the
+        # default width of 50 they would be 50px and win the gap search.
+        content_wbs = [
+            self._make_wb(f"word{i}", 179 + (i % 3) * 100, 100 + i * 40, width=80)
+            for i in range(20)
+        ]
+        # Wrap every word_box in its own cell (border cells first)
+        cells = []
+        for i, wb in enumerate(border_wbs):
+            cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
+                          "word_boxes": [wb], "text": wb["text"]})
+        for i, wb in enumerate(content_wbs):
+            ri = len(border_wbs) + i
+            cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
+                          "word_boxes": [wb], "text": wb["text"]})
+        all_wbs = [wb for cell in cells for wb in cell.get("word_boxes", [])]
+        # Strip right edge = 113+21 = 134, content left = 179 -> gap = 45px
+        best_gap, left_count, total = self._largest_gap(all_wbs)
+        assert best_gap == 45, f"Gap should be 45, got {best_gap}"
+        assert left_count == len(border_wbs)
+        # 3/23 = 13% < 15% threshold
+        assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
+
+    def test_no_removal_when_no_gap(self):
+        """No gap >= 30px between word_boxes → nothing removed."""
+        wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
+        best_gap, _, _ = self._largest_gap(wbs)
+        assert best_gap < 30, f"No significant gap expected, got {best_gap}"
+
+    def test_equal_sides_not_removed(self):
+        """Two roughly equal groups (50/50) are NOT treated as border strip."""
+        left_wbs = [self._make_wb(f"L{i}", 10 + i * 10, 100 + i * 30) for i in range(10)]
+        right_wbs = [self._make_wb(f"R{i}", 200 + i * 10, 100 + i * 30) for i in range(10)]
+        best_gap, left_count, total = self._largest_gap(left_wbs + right_wbs)
+        assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
+        # 10/20 = 50% — NOT below 15% threshold, so no removal
+        assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal"
+
+    def test_wide_box_covers_apparent_gap(self):
+        """A wide box that starts early must not produce a false gap."""
+        wbs = [
+            self._make_wb("wide", 0, 100, width=500),
+            self._make_wb("a", 10, 140, width=10),
+            self._make_wb("b", 200, 180, width=50),
+        ]
+        # Pairwise, "a" (right edge 20) and "b" (left 200) look 180px apart,
+        # but "wide" spans x=0..500 and covers that space.
+        best_gap, left_count, _ = self._largest_gap(wbs)
+        assert best_gap == 0
+        assert left_count == 0