Detect written-out page numbers as footer rows
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 44s
CI / test-python-klausur (push) Failing after 2m46s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 39s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 44s
CI / test-python-klausur (push) Failing after 2m46s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 39s
"two hundred and nine" (22 chars) was kept as a content row because the footer detection only accepted text of ≤20 characters. The footer detection now recognizes written-out number words (English + German) as page numbers regardless of length.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1198,10 +1198,22 @@ async def _build_grid_core(
|
||||
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
|
||||
# Comma-separated text is a content continuation, not a footer
|
||||
has_commas = ',' in text
|
||||
# Long text (>20 chars) is unlikely a page number
|
||||
is_short = len(text) <= 20
|
||||
# Written-out page numbers like "two hundred and nine"
|
||||
_NUMBER_WORDS = {
|
||||
"one", "two", "three", "four", "five", "six", "seven",
|
||||
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
||||
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
||||
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
||||
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
||||
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
||||
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
||||
}
|
||||
text_words = set(text.lower().split())
|
||||
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
||||
# Short text or written-out number
|
||||
is_page_number = len(text) <= 20 or is_written_number
|
||||
if (text and not has_real_ipa and not has_commas
|
||||
and is_short
|
||||
and is_page_number
|
||||
and last_cells[0].get("col_type") != "heading"):
|
||||
footer_rows.append({
|
||||
"row_index": last_ri,
|
||||
|
||||
Reference in New Issue
Block a user