Remove page-number footers from grid, promote to metadata

Footer rows that are page numbers (digits or written out, like "two hundred and nine") are now removed from the grid entirely and promoted to the page_number metadata field. Non-page-number footer content stays as a visible footer row.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 08:50:20 +02:00
parent 757c8460c9
commit 656cadbb1e
1 changed files with 49 additions and 16 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1150,6 +1150,15 @@ async def _build_grid_core(
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    # Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
+    _NUMBER_WORDS = {
+        "one", "two", "three", "four", "five", "six", "seven",
+        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
+        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
+        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
+        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
+        "einhundert", "zweihundert", "dreihundert", "vierhundert",
+        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
+    }
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
@@ -1199,15 +1208,6 @@ async def _build_grid_core(
                # Comma-separated text is a content continuation, not a footer
                has_commas = ',' in text
                # Written-out page numbers like "two hundred and nine"
-                _NUMBER_WORDS = {
-                    "one", "two", "three", "four", "five", "six", "seven",
-                    "eight", "nine", "ten", "eleven", "twelve", "thirteen",
-                    "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
-                    "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
-                    "seventy", "eighty", "ninety", "hundred", "thousand", "and",
-                    "einhundert", "zweihundert", "dreihundert", "vierhundert",
-                    "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
-                }
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                # Short text or written-out number
@@ -1221,9 +1221,41 @@ async def _build_grid_core(
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })

-        # Mark footer rows (keep in table, just tag for frontend)
-        if footer_rows:
-            footer_ris = {fr["row_index"] for fr in footer_rows}
+        # Classify footer rows: page numbers are removed from the grid
+        # and promoted to page_number metadata; other footers stay as rows.
+        page_number_footers = []
+        other_footers = []
+        for fr in footer_rows:
+            ft = fr["text"].strip()
+            # Pure digits
+            digits = "".join(c for c in ft if c.isdigit())
+            if digits and re.match(r'^[\d\s.]+$', ft):
+                page_number_footers.append(fr)
+            # Written-out numbers
+            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
+                page_number_footers.append(fr)
+            else:
+                other_footers.append(fr)
+
+        # Remove page-number footer rows from grid entirely
+        if page_number_footers:
+            pn_ris = {fr["row_index"] for fr in page_number_footers}
+            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
+            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
+            # Set page_number metadata (use first one)
+            pn_text = page_number_footers[0]["text"].strip()
+            pn_digits = "".join(c for c in pn_text if c.isdigit())
+            if not page_number_info:
+                page_number_info = {
+                    "text": pn_text,
+                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
+                }
+                if pn_digits:
+                    page_number_info["number"] = int(pn_digits)
+
+        # Mark remaining footer rows (non-page-number content)
+        if other_footers:
+            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
@@ -1233,15 +1265,16 @@ async def _build_grid_core(

        if page_refs or footer_rows:
            logger.info(
-                "Extracted %d page_refs + %d footer rows from zone %d",
-                len(page_refs), len(footer_rows), z.get("zone_index", 0),
+                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
+                len(page_refs), len(footer_rows), len(page_number_footers),
+                z.get("zone_index", 0),
            )

        # Store as zone-level metadata
        if page_refs:
            z["page_refs"] = page_refs
-        if footer_rows:
-            z["footer"] = footer_rows
+        if other_footers:
+            z["footer"] = other_footers

    # 5h. Convert slash-delimited IPA to bracket notation.
    # Dictionary-style pages print IPA between slashes: "tiger /'taiga/"