diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 9787053..b2dc95c 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. import logging import re import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import cv2 import numpy as np @@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) # Helpers # --------------------------------------------------------------------------- +def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]: + """Remove page-border decoration strip words BEFORE column detection. + + Scans from each page edge inward to find the first significant x-gap + (>30 px). If the edge cluster contains <20 % of total words, those + words are removed as border-strip artifacts (alphabet letters, + illustration fragments). + + Must run BEFORE ``_build_zone_grid`` so that column detection only + sees real content words and doesn't produce inflated row counts. 
+ """ + if len(words) < 10: + return words, 0 + + sorted_words = sorted(words, key=lambda w: w.get("left", 0)) + total = len(sorted_words) + + # -- Left-edge scan (running max right-edge) -- + left_count = 0 + running_right = 0 + for gi in range(total - 1): + running_right = max( + running_right, + sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0), + ) + if sorted_words[gi + 1].get("left", 0) - running_right > 30: + left_count = gi + 1 + break + + # -- Right-edge scan (running min left) -- + right_count = 0 + running_left = sorted_words[-1].get("left", 0) + for gi in range(total - 1, 0, -1): + running_left = min(running_left, sorted_words[gi].get("left", 0)) + prev_right = ( + sorted_words[gi - 1].get("left", 0) + + sorted_words[gi - 1].get("width", 0) + ) + if running_left - prev_right > 30: + right_count = total - gi + break + + strip_ids: set = set() + if left_count > 0 and left_count / total < 0.20: + strip_ids = {id(w) for w in sorted_words[:left_count]} + elif right_count > 0 and right_count / total < 0.20: + strip_ids = {id(w) for w in sorted_words[total - right_count :]} + + if not strip_ids: + return words, 0 + + return [w for w in words if id(w) not in strip_ids], len(strip_ids) + + def _cluster_columns_by_alignment( words: List[Dict], zone_w: int, @@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: zones_data: List[Dict[str, Any]] = [] boxes_detected = 0 recovered_count = 0 + border_prefiltered = False img_bgr = None content_x, content_y, content_w, content_h = _get_content_bounds(all_words) @@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: "build-grid: filtered %d words inside image overlays from zone %d", ov_removed, pz.index, ) + zone_words, bs_removed = _filter_border_strip_words(zone_words) + if bs_removed: + border_prefiltered = True + logger.info( + "build-grid: pre-filtered %d border-strip words from zone %d", + bs_removed, pz.index, + ) grid = 
_build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, @@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: "build-grid session %s: filtered %d recovered artifacts (fallback zone)", session_id, removed, ) + # Pre-filter border-strip words so column detection is not + # confused by edge artifacts. When this removes words, Step 4e + # is skipped (it would otherwise re-detect content as a "strip"). + filtered_words, bs_removed = _filter_border_strip_words(filtered_words) + if bs_removed: + border_prefiltered = True + logger.info( + "build-grid session %s: pre-filtered %d border-strip words", + session_id, bs_removed, + ) grid = _build_zone_grid( filtered_words, content_x, content_y, content_w, content_h, 0, img_w, img_h, @@ -1895,64 +1967,93 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: cell["text"] = cleaned # 4e. Detect and remove page-border decoration strips. - # Some textbooks have decorative alphabet strips along the page edge - # (coloured letters, illustrations). OCR picks up scattered letters - # from these as artifacts. Detection: find a significant x-gap - # (>30 px) between a small cluster of word_boxes near the page edge - # and the main content block. + # Skipped when the pre-filter already removed border words BEFORE + # column detection — re-running would incorrectly detect the + # leftmost content column as a "strip". 
border_strip_removed = 0 - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - # Collect all word_boxes with their cell reference - all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) - for cell in cells: - for wb in cell.get("word_boxes") or []: - all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) - if len(all_wbs_with_cell) < 10: - continue - # Sort by x and find the largest gap - all_wbs_with_cell.sort(key=lambda t: t[0]) - best_gap = 0 - best_gap_idx = -1 - for gi in range(len(all_wbs_with_cell) - 1): - right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0) - gap = all_wbs_with_cell[gi + 1][0] - right_edge - if gap > best_gap: - best_gap = gap - best_gap_idx = gi - if best_gap < 30 or best_gap_idx < 0: - continue - left_count = best_gap_idx + 1 - right_count = len(all_wbs_with_cell) - left_count - total = len(all_wbs_with_cell) - # The border strip is the SMALLER side with < 15% of total - if left_count < right_count and left_count / total < 0.15: - strip_side = "left" - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]} - elif right_count < left_count and right_count / total < 0.15: - strip_side = "right" - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]} - else: - continue - # Remove strip word_boxes from cells - for cell in cells: - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if id(wb) not in strip_wbs] - if len(filtered) < len(wbs): - border_strip_removed += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - # Remove cells that became empty - z["cells"] = [c for c in cells - if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info( - "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " - "(gap=%dpx, strip=%d/%d wbs)", - border_strip_removed, strip_side, z.get("zone_index", 0), - best_gap, left_count if strip_side == "left" else right_count, total, - ) + if 
border_prefiltered: + logger.info("Step 4e: skipped (border pre-filter already applied)") + else: + # Some textbooks have decorative alphabet strips along the page + # edge. OCR picks up scattered letters from these as artifacts. + # Detection: find the first significant x-gap (>30 px) from each + # page edge between a small cluster (<20 %) and the main content. + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) + for cell in cells: + for wb in cell.get("word_boxes") or []: + all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) + if len(all_wbs_with_cell) < 10: + continue + all_wbs_with_cell.sort(key=lambda t: t[0]) + total = len(all_wbs_with_cell) + + # -- Left-edge scan -- + left_strip_count = 0 + left_gap = 0 + running_right = 0 + for gi in range(total - 1): + running_right = max( + running_right, + all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), + ) + gap = all_wbs_with_cell[gi + 1][0] - running_right + if gap > 30: + left_strip_count = gi + 1 + left_gap = gap + break + + # -- Right-edge scan -- + right_strip_count = 0 + right_gap = 0 + running_left = all_wbs_with_cell[-1][0] + for gi in range(total - 1, 0, -1): + running_left = min(running_left, all_wbs_with_cell[gi][0]) + prev_right = ( + all_wbs_with_cell[gi - 1][0] + + all_wbs_with_cell[gi - 1][1].get("width", 0) + ) + gap = running_left - prev_right + if gap > 30: + right_strip_count = total - gi + right_gap = gap + break + + strip_wbs: set = set() + strip_side = "" + strip_gap = 0 + strip_count = 0 + if left_strip_count > 0 and left_strip_count / total < 0.20: + strip_side = "left" + strip_count = left_strip_count + strip_gap = left_gap + strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} + elif right_strip_count > 0 and right_strip_count / total < 0.20: + strip_side = "right" + strip_count = right_strip_count + strip_gap = right_gap + strip_wbs = {id(t[1]) for t in 
all_wbs_with_cell[total - right_strip_count:]} + + if not strip_wbs: + continue + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if id(wb) not in strip_wbs] + if len(filtered) < len(wbs): + border_strip_removed += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + z["cells"] = [c for c in cells + if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info( + "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " + "(gap=%dpx, strip=%d/%d wbs)", + border_strip_removed, strip_side, z.get("zone_index", 0), + strip_gap, strip_count, total, + ) # 5. Color annotation on final word_boxes in cells if img_bgr is not None: diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index b94273c..0b7a89b 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -1109,64 +1109,56 @@ class TestBorderStripFilter: def test_left_border_strip_removed(self): """Word_boxes at x<120 with 45px gap to content at x>=179 are removed.""" - # Simulate border strip (11 wbs) + real content (20 wbs) + # Simulate border strip (3 wbs) + base words (7 wbs) + oder (7 wbs) + # + synonyms (20 wbs). The old "largest gap" algorithm would pick + # the 67px gap between base words and "oder", removing base words. + # The new "first gap from edge" algorithm picks the 45px gap between + # border artifacts and base words. 
border_wbs = [ - self._make_wb("M", 49, 436, 46, 44), - self._make_wb("x", 113, 610, 21, 38), - self._make_wb("Er", 45, 998, 62, 37), + self._make_wb("M", 49, 436, 46, 44), # right=95 + self._make_wb("x", 113, 610, 21, 38), # right=134 + self._make_wb("Er", 45, 998, 62, 37), # right=107 ] - content_wbs = [] - for i in range(20): - # Place content words at x=179 and x=280 (gap=1px between them, - # much smaller than the 45px border-to-content gap) - content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40)) - # Build zone with cells - cells = [] - # Border-only cells - for i, wb in enumerate(border_wbs): - cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i, - "word_boxes": [wb], "text": wb["text"]}) - # Content cells - for i, wb in enumerate(content_wbs): - ri = len(border_wbs) + i - cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri, - "word_boxes": [wb], "text": wb["text"]}) - zone = {"zone_index": 0, "zone_type": "content", "cells": cells, - "columns": [], "rows": []} - # The filter runs inside _build_grid_core, but we can test the - # pattern detection logic: 3 border wbs + 20 content wbs, - # border right edge = 113+21=134, content left = 179, gap = 45px - # 3/23 = 13% < 15% threshold - from cv_ocr_engines import _group_words_into_lines - all_left = sorted( - [(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])], - key=lambda t: t[0] - ) - # Find largest gap - best_gap = 0 - best_idx = -1 - for gi in range(len(all_left) - 1): - right_edge = all_left[gi][0] + all_left[gi][1]["width"] - gap = all_left[gi + 1][0] - right_edge - if gap > best_gap: - best_gap = gap - best_idx = gi - assert best_gap >= 30, f"Gap should be >=30, got {best_gap}" - left_count = best_idx + 1 + base_wbs = [self._make_wb(f"base{i}", 179, 100 + i * 60, 100, 20) for i in range(7)] + oder_wbs = [self._make_wb("oder", 379, 100 + i * 60, 68, 20) for i in range(7)] + synonym_wbs = [self._make_wb(f"syn{i}", 452 + (i % 5) 
* 30, 100 + (i // 5) * 60, 80, 20) for i in range(20)] + + all_wbs = border_wbs + base_wbs + oder_wbs + synonym_wbs + all_left = sorted([(wb["left"], wb) for wb in all_wbs], key=lambda t: t[0]) total = len(all_left) - assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%" + + # New algorithm: scan from left edge, find FIRST gap >30px + running_right = 0 + left_strip_count = 0 + for gi in range(total - 1): + running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"]) + gap = all_left[gi + 1][0] - running_right + if gap > 30: + left_strip_count = gi + 1 + break + + # Should find the 45px gap between border (right=134) and base (left=179) + assert left_strip_count == len(border_wbs), ( + f"Expected {len(border_wbs)} border wbs, got {left_strip_count}" + ) + assert left_strip_count / total < 0.20, ( + f"Border ratio {left_strip_count}/{total} should be <20%" + ) def test_no_removal_when_no_gap(self): """No gap > 30px between word_boxes → nothing removed.""" + # Words spaced 20px apart with width 50 → overlap, no gap >30px wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)] all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0]) - best_gap = 0 + running_right = 0 + found_gap = False for gi in range(len(all_left) - 1): - right_edge = all_left[gi][0] + all_left[gi][1]["width"] - gap = all_left[gi + 1][0] - right_edge - if gap > best_gap: - best_gap = gap - assert best_gap < 30, f"No significant gap expected, got {best_gap}" + running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"]) + gap = all_left[gi + 1][0] - running_right + if gap > 30: + found_gap = True + break + assert not found_gap, "No significant gap expected" def test_equal_sides_not_removed(self): """Two roughly equal groups (50/50) are NOT treated as border strip.""" @@ -1176,15 +1168,17 @@ class TestBorderStripFilter: [(wb["left"], wb) for wb in left_wbs + right_wbs], key=lambda t: t[0] ) - best_gap = 0 - 
best_idx = -1 - for gi in range(len(all_left) - 1): - right_edge = all_left[gi][0] + all_left[gi][1]["width"] - gap = all_left[gi + 1][0] - right_edge - if gap > best_gap: - best_gap = gap - best_idx = gi - left_count = best_idx + 1 total = len(all_left) + # Left scan: first gap >30px from left + running_right = 0 + left_strip_count = 0 + for gi in range(total - 1): + running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"]) + gap = all_left[gi + 1][0] - running_right + if gap > 30: + left_strip_count = gi + 1 + break # 10/20 = 50% — NOT below 15% threshold, so no removal - assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal" + assert left_strip_count == 0 or left_strip_count / total >= 0.20, ( + "Equal groups should NOT trigger border removal" + )