From 656cadbb1e0a1a0d3c5e7aed8a83b81b428279b6 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 12 Apr 2026 08:50:20 +0200 Subject: [PATCH] Remove page-number footers from grid, promote to metadata Footer rows that are page numbers (digits or written-out like "two hundred and nine") are now removed from the grid entirely and promoted to the page_number metadata field. Non-page-number footer content stays as a visible footer row. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 65 ++++++++++++++++------ 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index b739f27..cc4f214 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1150,6 +1150,15 @@ async def _build_grid_core( _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70" _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') + _NUMBER_WORDS = { + "one", "two", "three", "four", "five", "six", "seven", + "eight", "nine", "ten", "eleven", "twelve", "thirteen", + "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", + "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", + "seventy", "eighty", "ninety", "hundred", "thousand", "and", + "einhundert", "zweihundert", "dreihundert", "vierhundert", + "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", + } for z in zones_data: if z.get("zone_type") != "content": continue @@ -1199,15 +1208,6 @@ async def _build_grid_core( # Comma-separated text is a content continuation, not a footer has_commas = ',' in text # Written-out page numbers like "two hundred and nine" - _NUMBER_WORDS = { - "one", "two", "three", "four", "five", "six", "seven", - "eight", "nine", "ten", "eleven", "twelve", "thirteen", - "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", - "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", - "seventy", "eighty", "ninety", "hundred", "thousand", "and", - "einhundert", "zweihundert", "dreihundert", "vierhundert", - "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", - } text_words = set(text.lower().split()) is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) # Short text or written-out number @@ -1221,9 +1221,41 @@ async def _build_grid_core( "bbox_pct": last_cells[0].get("bbox_pct", {}), }) - # Mark footer rows (keep in table, just tag for frontend) - if footer_rows: - footer_ris = {fr["row_index"] for fr in footer_rows} + # Classify footer rows: page numbers are removed from the grid + # and promoted to page_number metadata; other footers stay as rows. + page_number_footers = [] + other_footers = [] + for fr in footer_rows: + ft = fr["text"].strip() + # Pure digits + digits = "".join(c for c in ft if c.isdigit()) + if digits and re.match(r'^[\d\s.]+$', ft): + page_number_footers.append(fr) + # Written-out numbers + elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): + page_number_footers.append(fr) + else: + other_footers.append(fr) + + # Remove page-number footer rows from grid entirely + if page_number_footers: + pn_ris = {fr["row_index"] for fr in page_number_footers} + z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] + z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] + # Set page_number metadata (use first one) + pn_text = page_number_footers[0]["text"].strip() + pn_digits = "".join(c for c in pn_text if c.isdigit()) + if not page_number_info: + page_number_info = { + "text": pn_text, + "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), + } + if pn_digits: + page_number_info["number"] = int(pn_digits) + + # Mark remaining footer rows (non-page-number content) + if other_footers: + footer_ris = {fr["row_index"] for fr in other_footers} for r in z["rows"]: if r["index"] in footer_ris: r["is_footer"] = True @@ -1233,15 +1265,16 @@ async def _build_grid_core( if page_refs or footer_rows: logger.info( - "Extracted %d page_refs + %d footer rows from zone %d", - len(page_refs), len(footer_rows), z.get("zone_index", 0), + "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", + len(page_refs), len(footer_rows), len(page_number_footers), + z.get("zone_index", 0), ) # Store as zone-level metadata if page_refs: z["page_refs"] = page_refs - if footer_rows: - z["footer"] = footer_rows + if other_footers: + z["footer"] = other_footers # 5h. Convert slash-delimited IPA to bracket notation. # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"