Fix page_ref extraction: only extract cells matching page-ref pattern

Column_1 cells like "to" (infinitive markers) were incorrectly extracted
as page_refs. Now only cells matching "p.70", ",.65" (garbled "p"), or
bare digits are treated as page references.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 08:55:55 +01:00
parent d76fb2a9c8
commit 278067fe20

View File

@@ -1884,6 +1884,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# = page number at bottom) are standalone rows that should be
# removed from the table entirely.
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70";
# the dot and surrounding whitespace are optional ("p70", "p. 70", ".70").
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
for z in zones_data:
if z.get("zone_type") != "content":
continue
@@ -1892,7 +1894,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if not rows:
continue
# Extract column_1 (page_ref) cells → zone metadata
# Extract column_1 cells that look like page references
page_refs = []
page_ref_cell_ids = set()
for cell in cells:
@@ -1901,6 +1903,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
text = (cell.get("text") or "").strip()
if not text:
continue
if not _PAGE_REF_RE.match(text):
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,