Remove page-number footers from grid, promote to metadata
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 37s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 37s
Footer rows that consist solely of a page number (either digits or a written-out number such as "two hundred and nine") are now removed from the grid entirely and promoted to the page_number metadata field. Footer content that is not a page number stays in the grid as a visible footer row.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1150,6 +1150,15 @@ async def _build_grid_core(
|
|||||||
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||||
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
||||||
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
||||||
|
_NUMBER_WORDS = {
|
||||||
|
"one", "two", "three", "four", "five", "six", "seven",
|
||||||
|
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
||||||
|
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
||||||
|
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
||||||
|
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
||||||
|
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
||||||
|
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
||||||
|
}
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
if z.get("zone_type") != "content":
|
if z.get("zone_type") != "content":
|
||||||
continue
|
continue
|
||||||
@@ -1199,15 +1208,6 @@ async def _build_grid_core(
|
|||||||
# Comma-separated text is a content continuation, not a footer
|
# Comma-separated text is a content continuation, not a footer
|
||||||
has_commas = ',' in text
|
has_commas = ',' in text
|
||||||
# Written-out page numbers like "two hundred and nine"
|
# Written-out page numbers like "two hundred and nine"
|
||||||
_NUMBER_WORDS = {
|
|
||||||
"one", "two", "three", "four", "five", "six", "seven",
|
|
||||||
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
|
||||||
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
|
||||||
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
|
||||||
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
|
||||||
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
|
||||||
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
|
||||||
}
|
|
||||||
text_words = set(text.lower().split())
|
text_words = set(text.lower().split())
|
||||||
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
||||||
# Short text or written-out number
|
# Short text or written-out number
|
||||||
@@ -1221,9 +1221,41 @@ async def _build_grid_core(
|
|||||||
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Mark footer rows (keep in table, just tag for frontend)
|
# Classify footer rows: page numbers are removed from the grid
|
||||||
if footer_rows:
|
# and promoted to page_number metadata; other footers stay as rows.
|
||||||
footer_ris = {fr["row_index"] for fr in footer_rows}
|
page_number_footers = []
|
||||||
|
other_footers = []
|
||||||
|
for fr in footer_rows:
|
||||||
|
ft = fr["text"].strip()
|
||||||
|
# Numeric footer: only digits, whitespace and dots (at least one digit)
|
||||||
|
digits = "".join(c for c in ft if c.isdigit())
|
||||||
|
if digits and re.match(r'^[\d\s.]+$', ft):
|
||||||
|
page_number_footers.append(fr)
|
||||||
|
# Written-out numbers
|
||||||
|
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
|
||||||
|
page_number_footers.append(fr)
|
||||||
|
else:
|
||||||
|
other_footers.append(fr)
|
||||||
|
|
||||||
|
# Remove page-number footer rows from grid entirely
|
||||||
|
if page_number_footers:
|
||||||
|
pn_ris = {fr["row_index"] for fr in page_number_footers}
|
||||||
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
|
||||||
|
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
|
||||||
|
# Set page_number metadata (use first one)
|
||||||
|
pn_text = page_number_footers[0]["text"].strip()
|
||||||
|
pn_digits = "".join(c for c in pn_text if c.isdigit())
|
||||||
|
if not page_number_info:
|
||||||
|
page_number_info = {
|
||||||
|
"text": pn_text,
|
||||||
|
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
|
||||||
|
}
|
||||||
|
if pn_digits:
|
||||||
|
page_number_info["number"] = int(pn_digits)
|
||||||
|
|
||||||
|
# Mark remaining footer rows (non-page-number content)
|
||||||
|
if other_footers:
|
||||||
|
footer_ris = {fr["row_index"] for fr in other_footers}
|
||||||
for r in z["rows"]:
|
for r in z["rows"]:
|
||||||
if r["index"] in footer_ris:
|
if r["index"] in footer_ris:
|
||||||
r["is_footer"] = True
|
r["is_footer"] = True
|
||||||
@@ -1233,15 +1265,16 @@ async def _build_grid_core(
|
|||||||
|
|
||||||
if page_refs or footer_rows:
|
if page_refs or footer_rows:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Extracted %d page_refs + %d footer rows from zone %d",
|
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
|
||||||
len(page_refs), len(footer_rows), z.get("zone_index", 0),
|
len(page_refs), len(footer_rows), len(page_number_footers),
|
||||||
|
z.get("zone_index", 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Store as zone-level metadata
|
# Store as zone-level metadata
|
||||||
if page_refs:
|
if page_refs:
|
||||||
z["page_refs"] = page_refs
|
z["page_refs"] = page_refs
|
||||||
if footer_rows:
|
if other_footers:
|
||||||
z["footer"] = footer_rows
|
z["footer"] = other_footers
|
||||||
|
|
||||||
# 5h. Convert slash-delimited IPA to bracket notation.
|
# 5h. Convert slash-delimited IPA to bracket notation.
|
||||||
# Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
# Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
||||||
|
|||||||
Reference in New Issue
Block a user