From d76fb2a9c83b5377334a4cdc3d672eb81e37cb29 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 08:47:39 +0100 Subject: [PATCH] Fix page_ref + footer extraction: extract individual cells, skip IPA footers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 5g now extracts column_1 cells individually as page_refs (instead of requiring the whole row to be column_1-only), and footer detection skips rows containing real IPA Unicode symbols to avoid false positives on IPA continuation rows like [sˈiː] – [sˈɔː] – [sˈiːn]. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 92 ++++++++++++---------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 664f877..d91cb9d 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1877,11 +1877,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if stripped and stripped != text: cell["text"] = stripped - # 5g. Extract page_ref rows and footer rows from content zones. - # Page references (column_1 cells like "p.70") and footer lines - # (e.g. "two hundred and twelve" = page number) should not be part - # of the vocabulary table. Move them to zone-level metadata so the - # frontend can display them separately. + # 5g. Extract page_ref cells and footer rows from content zones. + # Page references (column_1 cells like "p.70") sit in rows that + # also contain vocabulary — extract them as zone metadata without + # removing the row. Footer lines (e.g. "two hundred and twelve" + # = page number at bottom) are standalone rows that should be + # removed from the table entirely. + _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") for z in zones_data: if z.get("zone_type") != "content": continue @@ -1890,53 +1892,57 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if not rows: continue + # Extract column_1 (page_ref) cells → zone metadata page_refs = [] - footer_rows = [] - - # Detect page_ref rows: rows where the ONLY cell is column_1 - # (just a page number like "p.65", "p.70") - for row in rows: - if row.get("is_header"): + page_ref_cell_ids = set() + for cell in cells: + if cell.get("col_type") != "column_1": continue - ri = row["index"] - row_cells = [c for c in cells if c.get("row_index") == ri] - if (len(row_cells) == 1 - and row_cells[0].get("col_type") == "column_1"): - page_refs.append({ - "row_index": ri, - "text": (row_cells[0].get("text") or "").strip(), - "bbox_pct": row_cells[0].get("bbox_pct", {}), - }) + text = (cell.get("text") or "").strip() + if not text: + continue + page_refs.append({ + "row_index": cell.get("row_index"), + "text": text, + "bbox_pct": cell.get("bbox_pct", {}), + }) + page_ref_cell_ids.add(cell.get("cell_id")) - # Detect footer: last non-header row if it has only 1 content - # cell and no column_1 page_ref (standalone text like page num) + # Remove page_ref cells from the table (but keep their rows) + if page_ref_cell_ids: + z["cells"] = [c for c in z["cells"] + if c.get("cell_id") not in page_ref_cell_ids] + + # Detect footer: last non-header row if it has only 1 cell + # and the text is NOT IPA (no real IPA Unicode symbols). + # This catches page numbers like "two hundred and twelve". + footer_rows = [] non_header_rows = [r for r in rows if not r.get("is_header")] if non_header_rows: last_row = non_header_rows[-1] last_ri = last_row["index"] - last_cells = [c for c in cells if c.get("row_index") == last_ri] - content_last = [ - c for c in last_cells - if c.get("col_type", "").startswith("column_") - and c.get("col_type") != "column_1" - ] - if len(content_last) == 1 and len(last_cells) == 1: - footer_rows.append({ - "row_index": last_ri, - "text": (content_last[0].get("text") or "").strip(), - "bbox_pct": content_last[0].get("bbox_pct", {}), - }) + last_cells = [c for c in z["cells"] + if c.get("row_index") == last_ri] + if len(last_cells) == 1: + text = (last_cells[0].get("text") or "").strip() + # Not IPA (no real IPA symbols) and not a heading + has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) + if text and not has_real_ipa and last_cells[0].get("col_type") != "heading": + footer_rows.append({ + "row_index": last_ri, + "text": text, + "bbox_pct": last_cells[0].get("bbox_pct", {}), + }) - # Remove page_ref and footer cells/rows from the table - remove_ris = set() - for pr in page_refs: - remove_ris.add(pr["row_index"]) - for fr in footer_rows: - remove_ris.add(fr["row_index"]) + # Remove footer rows from the table + if footer_rows: + remove_ris = {fr["row_index"] for fr in footer_rows} + z["cells"] = [c for c in z["cells"] + if c.get("row_index") not in remove_ris] + z["rows"] = [r for r in z["rows"] + if r["index"] not in remove_ris] - if remove_ris: - z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris] - z["rows"] = [r for r in rows if r["index"] not in remove_ris] + if page_refs or footer_rows: logger.info( "Extracted %d page_refs + %d footer rows from zone %d", len(page_refs), len(footer_rows), z.get("zone_index", 0),