diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 2cc04e0..d4744ec 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -615,10 +615,15 @@ async def _build_grid_core( # Rule 2: oversized stub — ≤3 words, short total text, # and word height > 1.8× median (page numbers, stray marks, # OCR from illustration labels like "SEA &") + # Skip if any word looks like a page reference (p.55, S.12). if len(row_wbs) <= 3: total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) - if len(total_text) <= 5 and max_h > median_wb_h * 1.8: + has_page_ref = any( + re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) + for wb in row_wbs + ) + if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: junk_row_indices.add(ri) continue