Fix page_ref + footer extraction: extract individual cells, never treat IPA rows as footers

Step 5g now extracts column_1 cells individually as page_refs (instead of
requiring the whole row to be column_1-only), and footer detection skips
rows containing real IPA Unicode symbols to avoid false positives on
IPA continuation rows like [sˈiː] – [sˈɔː] – [sˈiːn].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 08:47:39 +01:00
parent 9681fcbd05
commit d76fb2a9c8

View File

@@ -1877,11 +1877,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if stripped and stripped != text:
cell["text"] = stripped
# 5g. Extract page_ref rows and footer rows from content zones.
# Page references (column_1 cells like "p.70") and footer lines
# (e.g. "two hundred and twelve" = page number) should not be part
# of the vocabulary table. Move them to zone-level metadata so the
# frontend can display them separately.
# 5g. Extract page_ref cells and footer rows from content zones.
# Page references (column_1 cells like "p.70") sit in rows that
# also contain vocabulary — extract them as zone metadata without
# removing the row. Footer lines (e.g. "two hundred and twelve"
# = page number at bottom) are standalone rows that should be
# removed from the table entirely.
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
for z in zones_data:
if z.get("zone_type") != "content":
continue
@@ -1890,53 +1892,57 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if not rows:
continue
# Extract column_1 (page_ref) cells → zone metadata
page_refs = []
footer_rows = []
# Detect page_ref rows: rows where the ONLY cell is column_1
# (just a page number like "p.65", "p.70")
for row in rows:
if row.get("is_header"):
page_ref_cell_ids = set()
for cell in cells:
if cell.get("col_type") != "column_1":
continue
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if (len(row_cells) == 1
and row_cells[0].get("col_type") == "column_1"):
page_refs.append({
"row_index": ri,
"text": (row_cells[0].get("text") or "").strip(),
"bbox_pct": row_cells[0].get("bbox_pct", {}),
})
text = (cell.get("text") or "").strip()
if not text:
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,
"bbox_pct": cell.get("bbox_pct", {}),
})
page_ref_cell_ids.add(cell.get("cell_id"))
# Detect footer: last non-header row if it has only 1 content
# cell and no column_1 page_ref (standalone text like page num)
# Remove page_ref cells from the table (but keep their rows)
if page_ref_cell_ids:
z["cells"] = [c for c in z["cells"]
if c.get("cell_id") not in page_ref_cell_ids]
# Detect footer: last non-header row if it has only 1 cell
# and the text is NOT IPA (no real IPA Unicode symbols).
# This catches page numbers like "two hundred and twelve".
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
last_row = non_header_rows[-1]
last_ri = last_row["index"]
last_cells = [c for c in cells if c.get("row_index") == last_ri]
content_last = [
c for c in last_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_last) == 1 and len(last_cells) == 1:
footer_rows.append({
"row_index": last_ri,
"text": (content_last[0].get("text") or "").strip(),
"bbox_pct": content_last[0].get("bbox_pct", {}),
})
last_cells = [c for c in z["cells"]
if c.get("row_index") == last_ri]
if len(last_cells) == 1:
text = (last_cells[0].get("text") or "").strip()
# Not IPA (no real IPA symbols) and not a heading
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
footer_rows.append({
"row_index": last_ri,
"text": text,
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Remove page_ref and footer cells/rows from the table
remove_ris = set()
for pr in page_refs:
remove_ris.add(pr["row_index"])
for fr in footer_rows:
remove_ris.add(fr["row_index"])
# Remove footer rows from the table
if footer_rows:
remove_ris = {fr["row_index"] for fr in footer_rows}
z["cells"] = [c for c in z["cells"]
if c.get("row_index") not in remove_ris]
z["rows"] = [r for r in z["rows"]
if r["index"] not in remove_ris]
if remove_ris:
z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris]
z["rows"] = [r for r in rows if r["index"] not in remove_ris]
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows from zone %d",
len(page_refs), len(footer_rows), z.get("zone_index", 0),