From 00cbf266cbec407c8f2e6ba9402330600396a280 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 09:05:07 +0100
Subject: [PATCH] Add oversized-stub filter for large page numbers/marks in
 grid rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rows with ≤2 words, total text ≤3 chars, and word height >1.8x median
are removed as non-content elements (e.g. red page number "( 9").

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 47 +++++++++++++++++-----
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 5e3561b..e153b31 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -976,6 +976,9 @@ async def build_grid(session_id: str):
     # 4b. Remove junk rows: rows where ALL cells contain only short,
     # low-confidence text (OCR noise, stray marks).  Real vocabulary rows
     # have at least one word with conf >= 50 or meaningful text length.
+    # Also remove "oversized stub" rows: rows with ≤2 word boxes totalling
+    # ≤3 characters whose tallest box exceeds 1.8× the zone's median height
+    # (e.g. large red page numbers like "( 9" that are not real text content).
     _JUNK_CONF_THRESHOLD = 50
     _JUNK_MAX_TEXT_LEN = 3
     for z in zones_data:
@@ -983,25 +986,49 @@ async def build_grid(session_id: str):
         rows = z.get("rows", [])
         if not cells or not rows:
             continue
+
+        # Compute median word height across the zone for oversized detection
+        all_wb_heights = [
+            wb["height"]
+            for cell in cells
+            for wb in cell.get("word_boxes") or []
+            if wb.get("height", 0) > 0
+        ]
+        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
+
         junk_row_indices = set()
         for row in rows:
             ri = row["index"]
             row_cells = [c for c in cells if c.get("row_index") == ri]
             if not row_cells:
                 continue
-            # Check if ALL word_boxes in ALL cells of this row are junk
+
+            row_wbs = [
+                wb for cell in row_cells
+                for wb in cell.get("word_boxes") or []
+            ]
+
+            # Rule 1: ALL word_boxes are low-conf AND short text
             all_junk = True
-            for cell in row_cells:
-                for wb in cell.get("word_boxes") or []:
-                    text = (wb.get("text") or "").strip()
-                    conf = wb.get("conf", 0)
-                    if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
-                        all_junk = False
-                        break
-                if not all_junk:
+            for wb in row_wbs:
+                text = (wb.get("text") or "").strip()
+                conf = wb.get("conf", 0)
+                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
+                    all_junk = False
                     break
-            if all_junk:
+            if all_junk and row_wbs:
                 junk_row_indices.add(ri)
+                continue
+
+            # Rule 2: oversized stub — ≤2 words, combined text ≤3 chars,
+            # and tallest word > 1.8× zone median (page numbers, stray marks)
+            if len(row_wbs) <= 2:
+                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
+                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
+                if len(total_text) <= 3 and max_h > median_wb_h * 1.8:
+                    junk_row_indices.add(ri)
+                    continue
+
         if junk_row_indices:
             z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
             z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]