From 278067fe206c5b30501bc4bcfcd461b3060c19c6 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 08:55:55 +0100 Subject: [PATCH] Fix page_ref extraction: only extract cells matching page-ref pattern Column_1 cells like "to" (infinitive markers) were incorrectly extracted as page_refs. Now only cells matching p.70, ,.65, or bare digits are treated as page references. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index d91cb9d..6d657bb 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1884,6 +1884,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # = page number at bottom) are standalone rows that should be # removed from the table entirely. _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") + # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70" + _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') for z in zones_data: if z.get("zone_type") != "content": continue @@ -1892,7 +1894,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if not rows: continue - # Extract column_1 (page_ref) cells → zone metadata + # Extract column_1 cells that look like page references page_refs = [] page_ref_cell_ids = set() for cell in cells: @@ -1901,6 +1903,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: text = (cell.get("text") or "").strip() if not text: continue + if not _PAGE_REF_RE.match(text): + continue page_refs.append({ "row_index": cell.get("row_index"), "text": text,