From 9681fcbd05c9960e4329d874223a43a41089b379 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 08:42:53 +0100 Subject: [PATCH] Strip IPA from headings + extract page_refs and footer from table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Step 5f: Remove dictionary IPA from headings detected after IPA correction (e.g. "Theme [θˈiːm]" → "Theme") - Step 5g: Extract page_ref rows (column_1 only, e.g. "p.70") and footer rows (last single-cell row, e.g. page number "212") from the vocabulary table into zone-level metadata (page_refs, footer) so the frontend can render them separately Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index b05c344..664f877 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1863,6 +1863,91 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if single_heading_count: logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) + # 5f. Strip IPA from headings — headings detected in 5e ran AFTER + # IPA correction (5c), so they may have dictionary IPA appended + # (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the + # original text only. + for z in zones_data: + for cell in z.get("cells", []): + if cell.get("col_type") != "heading": + continue + text = cell.get("text", "") + # Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme" + stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip() + if stripped and stripped != text: + cell["text"] = stripped + + # 5g. Extract page_ref rows and footer rows from content zones. + # Page references (column_1 cells like "p.70") and footer lines + # (e.g. "two hundred and twelve" = page number) should not be part + # of the vocabulary table. Move them to zone-level metadata so the + # frontend can display them separately. + for z in zones_data: + if z.get("zone_type") != "content": + continue + cells = z.get("cells", []) + rows = z.get("rows", []) + if not rows: + continue + + page_refs = [] + footer_rows = [] + + # Detect page_ref rows: rows where the ONLY cell is column_1 + # (just a page number like "p.65", "p.70") + for row in rows: + if row.get("is_header"): + continue + ri = row["index"] + row_cells = [c for c in cells if c.get("row_index") == ri] + if (len(row_cells) == 1 + and row_cells[0].get("col_type") == "column_1"): + page_refs.append({ + "row_index": ri, + "text": (row_cells[0].get("text") or "").strip(), + "bbox_pct": row_cells[0].get("bbox_pct", {}), + }) + + # Detect footer: last non-header row if it has only 1 content + # cell and no column_1 page_ref (standalone text like page num) + non_header_rows = [r for r in rows if not r.get("is_header")] + if non_header_rows: + last_row = non_header_rows[-1] + last_ri = last_row["index"] + last_cells = [c for c in cells if c.get("row_index") == last_ri] + content_last = [ + c for c in last_cells + if c.get("col_type", "").startswith("column_") + and c.get("col_type") != "column_1" + ] + if len(content_last) == 1 and len(last_cells) == 1: + footer_rows.append({ + "row_index": last_ri, + "text": (content_last[0].get("text") or "").strip(), + "bbox_pct": content_last[0].get("bbox_pct", {}), + }) + + # Remove page_ref and footer cells/rows from the table + remove_ris = set() + for pr in page_refs: + remove_ris.add(pr["row_index"]) + for fr in footer_rows: + remove_ris.add(fr["row_index"]) + + if remove_ris: + z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris] + z["rows"] = [r for r in rows if r["index"] not in remove_ris] + logger.info( + "Extracted %d page_refs + %d footer rows from zone %d", + len(page_refs), len(footer_rows), z.get("zone_index", 0), + ) + + # Store as zone-level metadata + if page_refs: + z["page_refs"] = page_refs + if footer_rows: + z["footer"] = footer_rows + duration = time.time() - t0 # 6. Build result