feat: detect and remove page-border decoration strip artifacts (Step 4e)

Textbooks with decorative alphabet strips along page edges produce OCR artifacts (scattered colored letters at x<150 while real content starts at x>=179). Step 4e detects a significant x-gap (>=30px) between a small cluster (<15% of the zone's word_boxes) near the page edge and the main content, then removes the border-strip word_boxes and any cells left empty.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 17:20:45 +01:00
parent f31a7175a2
commit c0e1118870
2 changed files with 153 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1894,6 +1894,66 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                if cleaned != text:
                    cell["text"] = cleaned

+    # 4e. Detect and remove page-border decoration strips.
+    # Some textbooks have decorative alphabet strips along the page edge
+    # (coloured letters, illustrations).  OCR picks up scattered letters
+    # from these as artifacts.  Detection: find a significant x-gap
+    # (>= 30 px) between a small cluster of word_boxes near the page edge
+    # and the main content block.
+    border_strip_removed = 0
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        # Collect all word_boxes with their cell reference
+        all_wbs_with_cell: List[tuple] = []  # (left, wb, cell)
+        for cell in cells:
+            for wb in cell.get("word_boxes") or []:
+                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
+        if len(all_wbs_with_cell) < 10:
+            continue
+        # Sort by x and find the largest gap
+        all_wbs_with_cell.sort(key=lambda t: t[0])
+        best_gap = 0
+        best_gap_idx = -1
+        for gi in range(len(all_wbs_with_cell) - 1):
+            right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0)
+            gap = all_wbs_with_cell[gi + 1][0] - right_edge
+            if gap > best_gap:
+                best_gap = gap
+                best_gap_idx = gi
+        if best_gap < 30 or best_gap_idx < 0:
+            continue
+        left_count = best_gap_idx + 1
+        right_count = len(all_wbs_with_cell) - left_count
+        total = len(all_wbs_with_cell)
+        # The border strip is the SMALLER side with < 15% of total
+        if left_count < right_count and left_count / total < 0.15:
+            strip_side = "left"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
+        elif right_count < left_count and right_count / total < 0.15:
+            strip_side = "right"
+            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
+        else:
+            continue
+        # Remove strip word_boxes from cells
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
+            if len(filtered) < len(wbs):
+                border_strip_removed += len(wbs) - len(filtered)
+                cell["word_boxes"] = filtered
+                cell["text"] = _words_to_reading_order_text(filtered)
+        # Remove cells that became empty
+        z["cells"] = [c for c in cells
+                      if (c.get("word_boxes") or c.get("text", "").strip())]
+        logger.info(
+            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
+            "(gap=%dpx, strip=%d/%d wbs)",
+            border_strip_removed, strip_side, z.get("zone_index", 0),
+            best_gap, left_count if strip_side == "left" else right_count, total,
+        )
+
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []