From 00cbf266cbec407c8f2e6ba9402330600396a280 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 09:05:07 +0100 Subject: [PATCH] Add oversized-stub filter for large page numbers/marks in grid rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rows with ≤2 words, total text ≤3 chars, and word height >1.8x median are removed as non-content elements (e.g. red page number "( 9"). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 47 +++++++++++++++++----- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 5e3561b..e153b31 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -976,6 +976,9 @@ async def build_grid(session_id: str): # 4b. Remove junk rows: rows where ALL cells contain only short, # low-confidence text (OCR noise, stray marks). Real vocabulary rows # have at least one word with conf >= 50 or meaningful text length. + # Also remove "oversized stub" rows: rows with ≤2 very short words + # whose word-boxes are significantly taller than the median (e.g. + # large red page numbers like "( 9" that are not real text content). 
_JUNK_CONF_THRESHOLD = 50 _JUNK_MAX_TEXT_LEN = 3 for z in zones_data: @@ -983,25 +986,49 @@ async def build_grid(session_id: str): rows = z.get("rows", []) if not cells or not rows: continue + + # Compute median word height across the zone for oversized detection + all_wb_heights = [ + wb["height"] + for cell in cells + for wb in cell.get("word_boxes") or [] + if wb.get("height", 0) > 0 + ] + median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28 + junk_row_indices = set() for row in rows: ri = row["index"] row_cells = [c for c in cells if c.get("row_index") == ri] if not row_cells: continue - # Check if ALL word_boxes in ALL cells of this row are junk + + row_wbs = [ + wb for cell in row_cells + for wb in cell.get("word_boxes") or [] + ] + + # Rule 1: ALL word_boxes are low-conf AND short text all_junk = True - for cell in row_cells: - for wb in cell.get("word_boxes") or []: - text = (wb.get("text") or "").strip() - conf = wb.get("conf", 0) - if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: - all_junk = False - break - if not all_junk: + for wb in row_wbs: + text = (wb.get("text") or "").strip() + conf = wb.get("conf", 0) + if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: + all_junk = False break - if all_junk: + if all_junk and row_wbs: junk_row_indices.add(ri) + continue + + # Rule 2: oversized stub — ≤2 words, combined text ≤3 chars, + # and word height > 1.8× median (page numbers, stray marks) + if len(row_wbs) <= 2: + total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) + max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) + if len(total_text) <= 3 and max_h > median_wb_h * 1.8: + junk_row_indices.add(ri) + continue + if junk_row_indices: z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]