From 9ceee4e07cef94c09740365269b14ec86b857065 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 22:40:37 +0200 Subject: [PATCH] Protect page references from junk-row removal Rows containing only a page reference (p.55, S.12) were removed as "oversized stubs" (Rule 2) when their word-box height exceeded the median. Now skips Rule 2 if any word matches the page-ref pattern. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 2cc04e0..d4744ec 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -615,10 +615,15 @@ async def _build_grid_core( # Rule 2: oversized stub — ≤3 words, short total text, # and word height > 1.8× median (page numbers, stray marks, # OCR from illustration labels like "SEA &") + # Skip if any word looks like a page reference (p.55, S.12). if len(row_wbs) <= 3: total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) - if len(total_text) <= 5 and max_h > median_wb_h * 1.8: + has_page_ref = any( + re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) + for wb in row_wbs + ) + if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: junk_row_indices.add(ri) continue