Fix page_ref extraction: only extract cells matching page-ref pattern
Column_1 cells such as "to" (infinitive markers) were incorrectly extracted as page_refs. Now only cells matching the page-reference pattern, e.g. "p.70", ",.65" (a garbled "p"), or bare digits such as "70", are treated as page references.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1884,6 +1884,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
# = page number at bottom) are standalone rows that should be
|
||||
# removed from the table entirely.
|
||||
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
||||
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
@@ -1892,7 +1894,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if not rows:
|
||||
continue
|
||||
|
||||
# Extract column_1 (page_ref) cells → zone metadata
|
||||
# Extract column_1 cells that look like page references
|
||||
page_refs = []
|
||||
page_ref_cell_ids = set()
|
||||
for cell in cells:
|
||||
@@ -1901,6 +1903,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
text = (cell.get("text") or "").strip()
|
||||
if not text:
|
||||
continue
|
||||
if not _PAGE_REF_RE.match(text):
|
||||
continue
|
||||
page_refs.append({
|
||||
"row_index": cell.get("row_index"),
|
||||
"text": text,
|
||||
|
||||
Reference in New Issue
Block a user