From 9681fcbd05c9960e4329d874223a43a41089b379 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Fri, 20 Mar 2026 08:42:53 +0100
Subject: [PATCH] Strip IPA from headings + extract page_refs and footer from
 table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Step 5f: Remove dictionary IPA from headings detected after IPA
  correction (e.g. "Theme [θˈiːm]" → "Theme")
- Step 5g: Extract page_ref rows (column_1 only, e.g. "p.70") and
  footer rows (last single-cell row, e.g. page number "212") from
  the vocabulary table into zone-level metadata (page_refs, footer)
  so the frontend can render them separately

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 85 ++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index b05c344..664f877 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1863,6 +1863,91 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
     if single_heading_count:
         logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
 
+    # 5f. Strip IPA from headings — heading detection (5e) runs AFTER
+    # IPA correction (5c), so heading text may carry appended dictionary
+    # IPA (e.g. "Theme [θˈiːm]" → "Theme").  Headings should show the
+    # original text only.
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type") != "heading":
+                continue
+            text = cell.get("text", "")
+            # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
+            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
+            if stripped and stripped != text:
+                cell["text"] = stripped
+
+    # 5g. Extract page_ref rows and footer rows from content zones.
+    # Page references (column_1 cells like "p.70") and footer lines
+    # (e.g. "two hundred and twelve" = page number) should not be part
+    # of the vocabulary table.  Move them to zone-level metadata so the
+    # frontend can display them separately.
+    for z in zones_data:
+        if z.get("zone_type") != "content":
+            continue
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        if not rows:
+            continue
+
+        page_refs = []
+        footer_rows = []
+
+        # Detect page_ref rows: rows where the ONLY cell is column_1
+        # (just a page number like "p.65", "p.70")
+        for row in rows:
+            if row.get("is_header"):
+                continue
+            ri = row["index"]
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            if (len(row_cells) == 1
+                    and row_cells[0].get("col_type") == "column_1"):
+                page_refs.append({
+                    "row_index": ri,
+                    "text": (row_cells[0].get("text") or "").strip(),
+                    "bbox_pct": row_cells[0].get("bbox_pct", {}),
+                })
+
+        # Detect footer: the last non-header row, if its only cell is a
+        # content cell other than column_1 (standalone text like page num)
+        non_header_rows = [r for r in rows if not r.get("is_header")]
+        if non_header_rows:
+            last_row = non_header_rows[-1]
+            last_ri = last_row["index"]
+            last_cells = [c for c in cells if c.get("row_index") == last_ri]
+            content_last = [
+                c for c in last_cells
+                if c.get("col_type", "").startswith("column_")
+                and c.get("col_type") != "column_1"
+            ]
+            if len(content_last) == 1 and len(last_cells) == 1:
+                footer_rows.append({
+                    "row_index": last_ri,
+                    "text": (content_last[0].get("text") or "").strip(),
+                    "bbox_pct": content_last[0].get("bbox_pct", {}),
+                })
+
+        # Remove page_ref and footer cells/rows from the table
+        remove_ris = set()
+        for pr in page_refs:
+            remove_ris.add(pr["row_index"])
+        for fr in footer_rows:
+            remove_ris.add(fr["row_index"])
+
+        if remove_ris:
+            z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris]
+            z["rows"] = [r for r in rows if r["index"] not in remove_ris]
+            logger.info(
+                "Extracted %d page_refs + %d footer rows from zone %d",
+                len(page_refs), len(footer_rows), z.get("zone_index", 0),
+            )
+
+        # Store as zone-level metadata
+        if page_refs:
+            z["page_refs"] = page_refs
+        if footer_rows:
+            z["footer"] = footer_rows
+
     duration = time.time() - t0
 
     # 6. Build result