Remove page-number footers from grid, promote to metadata
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 37s

Footer rows that contain only a page number (digits, or written out
as words, e.g. "two hundred and nine") are now removed from the grid
entirely and promoted to the page_number metadata field. Footer
content that is not a page number stays as a visible footer row.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-12 08:50:20 +02:00
parent 757c8460c9
commit 656cadbb1e

View File

@@ -1150,6 +1150,15 @@ async def _build_grid_core(
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
for z in zones_data:
if z.get("zone_type") != "content":
continue
@@ -1199,15 +1208,6 @@ async def _build_grid_core(
# Comma-separated text is a content continuation, not a footer
has_commas = ',' in text
# Written-out page numbers like "two hundred and nine"
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
text_words = set(text.lower().split())
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
# Short text or written-out number
@@ -1221,9 +1221,41 @@ async def _build_grid_core(
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Mark footer rows (keep in table, just tag for frontend)
if footer_rows:
footer_ris = {fr["row_index"] for fr in footer_rows}
# Classify footer rows: page numbers are removed from the grid
# and promoted to page_number metadata; other footers stay as rows.
page_number_footers = []
other_footers = []
for fr in footer_rows:
ft = fr["text"].strip()
# At least one digit, with only digits, whitespace or dots allowed (e.g. "209", "2 09", "209.")
digits = "".join(c for c in ft if c.isdigit())
if digits and re.match(r'^[\d\s.]+$', ft):
page_number_footers.append(fr)
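# Written-out numbers: every word must be in _NUMBER_WORDS (e.g. "two hundred and nine")
# NOTE(review): unlike the is_written_number check used for footer detection,
# no minimum word count applies here, so a lone "and"/"und" also matches — confirm intended.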
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
page_number_footers.append(fr)
else:
other_footers.append(fr)
# Remove page-number footer rows from grid entirely
if page_number_footers:
pn_ris = {fr["row_index"] for fr in page_number_footers}
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
# Set page_number metadata (use first one)
pn_text = page_number_footers[0]["text"].strip()
pn_digits = "".join(c for c in pn_text if c.isdigit())
if not page_number_info:
page_number_info = {
"text": pn_text,
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
}
if pn_digits:
page_number_info["number"] = int(pn_digits)
# Mark remaining footer rows (non-page-number content)
if other_footers:
footer_ris = {fr["row_index"] for fr in other_footers}
for r in z["rows"]:
if r["index"] in footer_ris:
r["is_footer"] = True
@@ -1233,15 +1265,16 @@ async def _build_grid_core(
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows from zone %d",
len(page_refs), len(footer_rows), z.get("zone_index", 0),
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
len(page_refs), len(footer_rows), len(page_number_footers),
z.get("zone_index", 0),
)
# Store as zone-level metadata
if page_refs:
z["page_refs"] = page_refs
if footer_rows:
z["footer"] = footer_rows
if other_footers:
z["footer"] = other_footers
# 5h. Convert slash-delimited IPA to bracket notation.
# Dictionary-style pages print IPA between slashes: "tiger /'taiga/"