Detect written-out page numbers as footer rows

"two hundred and nine" (22 chars) was kept as a content row because the footer detection only accepted text of at most 20 characters. The footer detection now recognizes written-out number words (English and German) as page numbers regardless of length.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 08:39:43 +02:00
parent 501de4374a
commit 757c8460c9
1 changed file with 15 additions and 3 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1198,10 +1198,22 @@ async def _build_grid_core(
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                # Comma-separated text is a content continuation, not a footer
                has_commas = ',' in text
-                # Long text (>20 chars) is unlikely a page number
-                is_short = len(text) <= 20
+                # Written-out page numbers like "two hundred and nine"
+                _NUMBER_WORDS = {
+                    "one", "two", "three", "four", "five", "six", "seven",
+                    "eight", "nine", "ten", "eleven", "twelve", "thirteen",
+                    "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
+                    "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
+                    "seventy", "eighty", "ninety", "hundred", "thousand", "and",
+                    "einhundert", "zweihundert", "dreihundert", "vierhundert",
+                    "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
+                }
+                text_words = set(text.lower().split())
+                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
+                # Short text or written-out number
+                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
-                        and is_short
+                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,