diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index d91cb9d..6d657bb 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1884,6 +1884,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # = page number at bottom) are standalone rows that should be # removed from the table entirely. _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") + # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70" + _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') for z in zones_data: if z.get("zone_type") != "content": continue @@ -1892,7 +1894,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if not rows: continue - # Extract column_1 (page_ref) cells → zone metadata + # Extract column_1 cells that look like page references page_refs = [] page_ref_cell_ids = set() for cell in cells: @@ -1901,6 +1903,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: text = (cell.get("text") or "").strip() if not text: continue + if not _PAGE_REF_RE.match(text): + continue page_refs.append({ "row_index": cell.get("row_index"), "text": text,