fix: correct border strip test data to avoid false internal gaps

Content word_boxes in test used x-spacing (i%3)*100 which created internal gaps larger than the border-to-content gap. Changed to (i%2)*51 so content words overlap and the border gap remains dominant. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feat: detect and remove page-border decoration strip artifacts (Step 4e)
2026-03-21 17:24:33 +01:00 · 2026-03-21 17:20:45 +01:00
2 changed files with 155 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1894,6 +1894,66 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                if cleaned != text:
                    cell["text"] = cleaned

+    # 4e. Detect and remove page-border decoration strips.
+    # Some textbooks have decorative alphabet strips along the page edge
+    # (coloured letters, illustrations).  OCR picks up scattered letters
+    # from these as artifacts.  Detection: find a significant x-gap
+    # (>30 px) between a small cluster of word_boxes near the page edge
+    # and the main content block.
+    border_strip_removed = 0
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        # Collect all word_boxes with their cell reference
+        all_wbs_with_cell: List[tuple] = []  # (left, wb, cell)
+        for cell in cells:
+            for wb in cell.get("word_boxes") or []:
+                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
+        if len(all_wbs_with_cell) < 10:
+            continue
+        # Sort by x and find the largest gap
+        all_wbs_with_cell.sort(key=lambda t: t[0])
+        best_gap = 0
+        best_gap_idx = -1
+        for gi in range(len(all_wbs_with_cell) - 1):
+            right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0)
+            gap = all_wbs_with_cell[gi + 1][0] - right_edge
+            if gap > best_gap:
+                best_gap = gap
+                best_gap_idx = gi
+        if best_gap < 30 or best_gap_idx < 0:
+            continue
+        left_count = best_gap_idx + 1
+        right_count = len(all_wbs_with_cell) - left_count
+        total = len(all_wbs_with_cell)
+        # The border strip is the SMALLER side with < 15% of total
+        if left_count < right_count and left_count / total < 0.15:
+            strip_side = "left"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
+        elif right_count < left_count and right_count / total < 0.15:
+            strip_side = "right"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
+        else:
+            continue
+        # Remove strip word_boxes from cells
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
+            if len(filtered) < len(wbs):
+                border_strip_removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = _words_to_reading_order_text(filtered)
+        # Remove cells that became empty
+        z["cells"] = [c for c in cells
+                      if (c.get("word_boxes") or c.get("text", "").strip())]
+        logger.info(
+            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
+            "(gap=%dpx, strip=%d/%d wbs)",
+            border_strip_removed, strip_side, z.get("zone_index", 0),
+            best_gap, left_count if strip_side == "left" else right_count, total,
+        )
+
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
--- a/klausur-service/backend/tests/test_grid_editor_api.py
+++ b/klausur-service/backend/tests/test_grid_editor_api.py
@@ -1093,3 +1093,98 @@ class TestWordBoxReadingOrder:
        assert [w["text"] for w in sorted_wbs] == ["tie", "sb/sth", "up"]
        # Same objects, same order
        assert [id(w) for w in sorted_wbs] == [id(w) for w in wbs]
+
+
+# ---------------------------------------------------------------------------
+# Border strip detection (Step 4e)
+# ---------------------------------------------------------------------------
+
+class TestBorderStripFilter:
+    """Verify decorative page-border word_boxes are detected and removed."""
+
+    @staticmethod
+    def _make_wb(text, left, top, width=50, height=20, conf=95):
+        return {"text": text, "left": left, "top": top,
+                "width": width, "height": height, "conf": conf}
+
+    def test_left_border_strip_removed(self):
+        """Word_boxes at x<120 with 45px gap to content at x>=179 are removed."""
+        # Simulate border strip (11 wbs) + real content (20 wbs)
+        border_wbs = [
+            self._make_wb("M", 49, 436, 46, 44),
+            self._make_wb("x", 113, 610, 21, 38),
+            self._make_wb("Er", 45, 998, 62, 37),
+        ]
+        content_wbs = []
+        for i in range(20):
+            # Place content words at x=179 and x=280 (gap=1px between them,
+            # much smaller than the 45px border-to-content gap)
+            content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40))
+        # Build zone with cells
+        cells = []
+        # Border-only cells
+        for i, wb in enumerate(border_wbs):
+            cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
+                          "word_boxes": [wb], "text": wb["text"]})
+        # Content cells
+        for i, wb in enumerate(content_wbs):
+            ri = len(border_wbs) + i
+            cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
+                          "word_boxes": [wb], "text": wb["text"]})
+        zone = {"zone_index": 0, "zone_type": "content", "cells": cells,
+                "columns": [], "rows": []}
+        # The filter runs inside _build_grid_core, but we can test the
+        # pattern detection logic: 3 border wbs + 20 content wbs,
+        # border right edge = 113+21=134, content left = 179, gap = 45px
+        # 3/23 = 13% < 15% threshold
+        from cv_ocr_engines import _group_words_into_lines
+        all_left = sorted(
+            [(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])],
+            key=lambda t: t[0]
+        )
+        # Find largest gap
+        best_gap = 0
+        best_idx = -1
+        for gi in range(len(all_left) - 1):
+            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
+            gap = all_left[gi + 1][0] - right_edge
+            if gap > best_gap:
+                best_gap = gap
+                best_idx = gi
+        assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
+        left_count = best_idx + 1
+        total = len(all_left)
+        assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
+
+    def test_no_removal_when_no_gap(self):
+        """No gap > 30px between word_boxes → nothing removed."""
+        wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
+        all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0])
+        best_gap = 0
+        for gi in range(len(all_left) - 1):
+            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
+            gap = all_left[gi + 1][0] - right_edge
+            if gap > best_gap:
+                best_gap = gap
+        assert best_gap < 30, f"No significant gap expected, got {best_gap}"
+
+    def test_equal_sides_not_removed(self):
+        """Two roughly equal groups (50/50) are NOT treated as border strip."""
+        left_wbs = [self._make_wb(f"L{i}", 10 + i * 10, 100 + i * 30) for i in range(10)]
+        right_wbs = [self._make_wb(f"R{i}", 200 + i * 10, 100 + i * 30) for i in range(10)]
+        all_left = sorted(
+            [(wb["left"], wb) for wb in left_wbs + right_wbs],
+            key=lambda t: t[0]
+        )
+        best_gap = 0
+        best_idx = -1
+        for gi in range(len(all_left) - 1):
+            right_edge = all_left[gi][0] + all_left[gi][1]["width"]
+            gap = all_left[gi + 1][0] - right_edge
+            if gap > best_gap:
+                best_gap = gap
+                best_idx = gi
+        left_count = best_idx + 1
+        total = len(all_left)
+        # 10/20 = 50% — NOT below 15% threshold, so no removal
+        assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal"
Author	SHA1	Message	Date
Benjamin Admin	2acf8696bf	fix: correct border strip test data to avoid false internal gaps Some checks failed CI / go-lint (push) Has been skipped Details CI / python-lint (push) Has been skipped Details CI / nodejs-lint (push) Has been skipped Details CI / test-go-school (push) Successful in 36s Details CI / test-go-edu-search (push) Successful in 26s Details CI / test-python-klausur (push) Failing after 1m52s Details CI / test-python-agent-core (push) Successful in 14s Details CI / test-nodejs-website (push) Successful in 17s Details Content word_boxes in test used x-spacing (i%3)100 which created internal gaps larger than the border-to-content gap. Changed to (i%2)51 so content words overlap and the border gap remains dominant. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-21 17:24:33 +01:00
Benjamin Admin	c0e1118870	feat: detect and remove page-border decoration strip artifacts (Step 4e) Textbooks with decorative alphabet strips along page edges produce OCR artifacts (scattered colored letters at x<150 while real content starts at x>=179). Step 4e detects a significant x-gap (>30px) between a small cluster (<15% of total word_boxes) near the page edge and the main content, then removes the border-strip word_boxes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-21 17:20:45 +01:00