From 757c8460c96d617a52b362eccc0c80bcb90c214e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 12 Apr 2026 08:39:43 +0200 Subject: [PATCH] Detect written-out page numbers as footer rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "two hundred and nine" (22 chars) was kept as a content row because the footer detection only accepted text ≤20 chars. Now recognizes written-out number words (English + German) as page numbers regardless of length. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 3f39caa..b739f27 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1198,10 +1198,22 @@ async def _build_grid_core( has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) # Comma-separated text is a content continuation, not a footer has_commas = ',' in text - # Long text (>20 chars) is unlikely a page number - is_short = len(text) <= 20 + # Written-out page numbers like "two hundred and nine" + _NUMBER_WORDS = { + "one", "two", "three", "four", "five", "six", "seven", + "eight", "nine", "ten", "eleven", "twelve", "thirteen", + "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", + "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", + "seventy", "eighty", "ninety", "hundred", "thousand", "and", + "einhundert", "zweihundert", "dreihundert", "vierhundert", + "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", + } + text_words = set(text.lower().split()) + is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) + # Short text or written-out number + is_page_number = len(text) <= 20 or is_written_number if (text and not has_real_ipa and not has_commas - and is_short + and is_page_number and last_cells[0].get("col_type") != "heading"): footer_rows.append({ "row_index": last_ri,