Remove page-number footers from grid, promote to metadata
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 37s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 37s
Footer rows that are page numbers (plain digits, or written out, e.g. "two hundred and nine") are now removed from the grid entirely and promoted to the page_number metadata field. Footer content that is not a page number stays in the grid as a visible footer row.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1150,6 +1150,15 @@ async def _build_grid_core(
|
||||
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
||||
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
||||
_NUMBER_WORDS = {
|
||||
"one", "two", "three", "four", "five", "six", "seven",
|
||||
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
||||
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
||||
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
||||
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
||||
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
||||
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
||||
}
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
@@ -1199,15 +1208,6 @@ async def _build_grid_core(
|
||||
# Comma-separated text is a content continuation, not a footer
|
||||
has_commas = ',' in text
|
||||
# Written-out page numbers like "two hundred and nine"
|
||||
_NUMBER_WORDS = {
|
||||
"one", "two", "three", "four", "five", "six", "seven",
|
||||
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
||||
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
||||
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
||||
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
||||
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
||||
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
||||
}
|
||||
text_words = set(text.lower().split())
|
||||
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
||||
# Short text or written-out number
|
||||
@@ -1221,9 +1221,41 @@ async def _build_grid_core(
|
||||
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
||||
})
|
||||
|
||||
# Mark footer rows (keep in table, just tag for frontend)
|
||||
if footer_rows:
|
||||
footer_ris = {fr["row_index"] for fr in footer_rows}
|
||||
# Classify footer rows: page numbers are removed from the grid
|
||||
# and promoted to page_number metadata; other footers stay as rows.
|
||||
page_number_footers = []
|
||||
other_footers = []
|
||||
for fr in footer_rows:
|
||||
ft = fr["text"].strip()
|
||||
# Pure digits
|
||||
digits = "".join(c for c in ft if c.isdigit())
|
||||
if digits and re.match(r'^[\d\s.]+$', ft):
|
||||
page_number_footers.append(fr)
|
||||
# Written-out numbers
|
||||
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
|
||||
page_number_footers.append(fr)
|
||||
else:
|
||||
other_footers.append(fr)
|
||||
|
||||
# Remove page-number footer rows from grid entirely
|
||||
if page_number_footers:
|
||||
pn_ris = {fr["row_index"] for fr in page_number_footers}
|
||||
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
|
||||
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
|
||||
# Set page_number metadata (use first one)
|
||||
pn_text = page_number_footers[0]["text"].strip()
|
||||
pn_digits = "".join(c for c in pn_text if c.isdigit())
|
||||
if not page_number_info:
|
||||
page_number_info = {
|
||||
"text": pn_text,
|
||||
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
|
||||
}
|
||||
if pn_digits:
|
||||
page_number_info["number"] = int(pn_digits)
|
||||
|
||||
# Mark remaining footer rows (non-page-number content)
|
||||
if other_footers:
|
||||
footer_ris = {fr["row_index"] for fr in other_footers}
|
||||
for r in z["rows"]:
|
||||
if r["index"] in footer_ris:
|
||||
r["is_footer"] = True
|
||||
@@ -1233,15 +1265,16 @@ async def _build_grid_core(
|
||||
|
||||
if page_refs or footer_rows:
|
||||
logger.info(
|
||||
"Extracted %d page_refs + %d footer rows from zone %d",
|
||||
len(page_refs), len(footer_rows), z.get("zone_index", 0),
|
||||
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
|
||||
len(page_refs), len(footer_rows), len(page_number_footers),
|
||||
z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# Store as zone-level metadata
|
||||
if page_refs:
|
||||
z["page_refs"] = page_refs
|
||||
if footer_rows:
|
||||
z["footer"] = footer_rows
|
||||
if other_footers:
|
||||
z["footer"] = other_footers
|
||||
|
||||
# 5h. Convert slash-delimited IPA to bracket notation.
|
||||
# Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
||||
|
||||
Reference in New Issue
Block a user