Fix page_ref extraction: only extract cells matching page-ref pattern
Column_1 cells like "to" (infinitive markers) were incorrectly extracted as page_refs. Now only cells matching "p.70", ",.65" (a garbled "p"), or bare digits are treated as page references.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1884,6 +1884,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
# = page number at bottom) are standalone rows that should be
|
# = page number at bottom) are standalone rows that should be
|
||||||
# removed from the table entirely.
|
# removed from the table entirely.
|
||||||
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||||
|
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
||||||
|
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
if z.get("zone_type") != "content":
|
if z.get("zone_type") != "content":
|
||||||
continue
|
continue
|
||||||
@@ -1892,7 +1894,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if not rows:
|
if not rows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract column_1 (page_ref) cells → zone metadata
|
# Extract column_1 cells that look like page references
|
||||||
page_refs = []
|
page_refs = []
|
||||||
page_ref_cell_ids = set()
|
page_ref_cell_ids = set()
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
@@ -1901,6 +1903,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
text = (cell.get("text") or "").strip()
|
text = (cell.get("text") or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
if not _PAGE_REF_RE.match(text):
|
||||||
|
continue
|
||||||
page_refs.append({
|
page_refs.append({
|
||||||
"row_index": cell.get("row_index"),
|
"row_index": cell.get("row_index"),
|
||||||
"text": text,
|
"text": text,
|
||||||
|
|||||||
Reference in New Issue
Block a user