Fix page_ref + footer extraction: extract individual cells, don't treat IPA rows as footers

Step 5g now extracts column_1 cells individually as page_refs (instead of requiring the whole row to be column_1-only), and footer detection skips rows containing real IPA Unicode symbols to avoid false positives on IPA continuation rows like [sˈiː] – [sˈɔː] – [sˈiːn].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:47:39 +01:00
parent 9681fcbd05
commit d76fb2a9c8
1 changed file with 49 additions and 43 deletions
@@ -1877,11 +1877,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if stripped and stripped != text:
                cell["text"] = stripped

-    # 5g. Extract page_ref rows and footer rows from content zones.
-    # Page references (column_1 cells like "p.70") and footer lines
-    # (e.g. "two hundred and twelve" = page number) should not be part
-    # of the vocabulary table.  Move them to zone-level metadata so the
-    # frontend can display them separately.
+    # 5g. Extract page_ref cells and footer rows from content zones.
+    # Page references (column_1 cells like "p.70") sit in rows that
+    # also contain vocabulary — extract them as zone metadata without
+    # removing the row.  Footer lines (e.g. "two hundred and twelve"
+    # = page number at bottom) are standalone rows that should be
+    # removed from the table entirely.
+    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
@@ -1890,53 +1892,57 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        if not rows:
            continue

+        # Extract column_1 (page_ref) cells → zone metadata
        page_refs = []
-        footer_rows = []
-
-        # Detect page_ref rows: rows where the ONLY cell is column_1
-        # (just a page number like "p.65", "p.70")
-        for row in rows:
-            if row.get("is_header"):
+        page_ref_cell_ids = set()
+        for cell in cells:
+            if cell.get("col_type") != "column_1":
                continue
-            ri = row["index"]
-            row_cells = [c for c in cells if c.get("row_index") == ri]
-            if (len(row_cells) == 1
-                    and row_cells[0].get("col_type") == "column_1"):
-                page_refs.append({
-                    "row_index": ri,
-                    "text": (row_cells[0].get("text") or "").strip(),
-                    "bbox_pct": row_cells[0].get("bbox_pct", {}),
-                })
+            text = (cell.get("text") or "").strip()
+            if not text:
+                continue
+            page_refs.append({
+                "row_index": cell.get("row_index"),
+                "text": text,
+                "bbox_pct": cell.get("bbox_pct", {}),
+            })
+            page_ref_cell_ids.add(cell.get("cell_id"))

-        # Detect footer: last non-header row if it has only 1 content
-        # cell and no column_1 page_ref (standalone text like page num)
+        # Remove page_ref cells from the table (but keep their rows)
+        if page_ref_cell_ids:
+            z["cells"] = [c for c in z["cells"]
+                          if c.get("cell_id") not in page_ref_cell_ids]
+
+        # Detect footer: last non-header row if it has only 1 cell
+        # and the text is NOT IPA (no real IPA Unicode symbols).
+        # This catches page numbers like "two hundred and twelve".
+        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
-            last_cells = [c for c in cells if c.get("row_index") == last_ri]
-            content_last = [
-                c for c in last_cells
-                if c.get("col_type", "").startswith("column_")
-                and c.get("col_type") != "column_1"
-            ]
-            if len(content_last) == 1 and len(last_cells) == 1:
-                footer_rows.append({
-                    "row_index": last_ri,
-                    "text": (content_last[0].get("text") or "").strip(),
-                    "bbox_pct": content_last[0].get("bbox_pct", {}),
-                })
+            last_cells = [c for c in z["cells"]
+                          if c.get("row_index") == last_ri]
+            if len(last_cells) == 1:
+                text = (last_cells[0].get("text") or "").strip()
+                # Not IPA (no real IPA symbols) and not a heading
+                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
+                if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
+                    footer_rows.append({
+                        "row_index": last_ri,
+                        "text": text,
+                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
+                    })

-        # Remove page_ref and footer cells/rows from the table
-        remove_ris = set()
-        for pr in page_refs:
-            remove_ris.add(pr["row_index"])
-        for fr in footer_rows:
-            remove_ris.add(fr["row_index"])
+        # Remove footer rows from the table
+        if footer_rows:
+            remove_ris = {fr["row_index"] for fr in footer_rows}
+            z["cells"] = [c for c in z["cells"]
+                          if c.get("row_index") not in remove_ris]
+            z["rows"] = [r for r in z["rows"]
+                         if r["index"] not in remove_ris]

-        if remove_ris:
-            z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris]
-            z["rows"] = [r for r in rows if r["index"] not in remove_ris]
+        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows from zone %d",
                len(page_refs), len(footer_rows), z.get("zone_index", 0),