Extract page number as metadata instead of silently removing it
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

_filter_footer_words now returns page number info (text, y_pct, and
number when the footer text contains digits) instead of just removing
footer words. The page number is included in the grid result as
`page_number` and displayed in the frontend summary bar as "S. 233".

This preserves page numbers for later page concatenation in the
customer frontend while still removing them from the grid content.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 08:52:09 +01:00
parent 5af5d821a5
commit e019dde01b
4 changed files with 36 additions and 8 deletions

View File

@@ -1301,29 +1301,42 @@ def _filter_footer_words(
img_h: int,
log: Any,
session_id: str,
) -> None:
) -> Optional[Dict]:
"""Remove isolated words in the bottom 5% of the page (page numbers).
Modifies *words* in place.
Modifies *words* in place and returns a page_number metadata dict
if a page number was extracted, or None.
"""
if not words or img_h <= 0:
return
return None
footer_y = img_h * 0.95
footer_words = [
w for w in words
if w["top"] + w.get("height", 0) / 2 > footer_y
]
if not footer_words:
return
return None
# Only remove if footer has very few words (≤ 3) with short text
total_text = "".join((w.get("text") or "").strip() for w in footer_words)
if len(footer_words) <= 3 and len(total_text) <= 10:
# Extract page number metadata before removing
page_number_info = {
"text": total_text.strip(),
"y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
}
# Try to parse as integer
digits = "".join(c for c in total_text if c.isdigit())
if digits:
page_number_info["number"] = int(digits)
footer_set = set(id(w) for w in footer_words)
words[:] = [w for w in words if id(w) not in footer_set]
log.info(
"build-grid session %s: removed %d footer words ('%s')",
session_id, len(footer_words), total_text,
"build-grid session %s: extracted page number '%s' and removed %d footer words",
session_id, total_text, len(footer_words),
)
return page_number_info
return None
def _filter_header_junk(