Strip IPA from headings + extract page_refs and footer from table
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 17s
- Step 5f: Remove dictionary IPA from headings detected after IPA correction (e.g. "Theme [θˈiːm]" → "Theme")
- Step 5g: Extract page_ref rows (column_1 only, e.g. "p.70") and footer rows (last single-cell row, e.g. page number "212") from the vocabulary table into zone-level metadata (page_refs, footer) so the frontend can render them separately

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1863,6 +1863,91 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if single_heading_count:
|
||||
logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
|
||||
|
||||
# 5f. Strip IPA from headings — headings detected in 5e ran AFTER
|
||||
# IPA correction (5c), so they may have dictionary IPA appended
|
||||
# (e.g. "Theme [θˈiːm]" → "Theme"). Headings should show the
|
||||
# original text only.
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
if cell.get("col_type") != "heading":
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
# Strip trailing IPA bracket: "Theme [θˈiːm]" → "Theme"
|
||||
stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
|
||||
if stripped and stripped != text:
|
||||
cell["text"] = stripped
|
||||
|
||||
# 5g. Extract page_ref rows and footer rows from content zones.
|
||||
# Page references (column_1 cells like "p.70") and footer lines
|
||||
# (e.g. "two hundred and twelve" = page number) should not be part
|
||||
# of the vocabulary table. Move them to zone-level metadata so the
|
||||
# frontend can display them separately.
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
cells = z.get("cells", [])
|
||||
rows = z.get("rows", [])
|
||||
if not rows:
|
||||
continue
|
||||
|
||||
page_refs = []
|
||||
footer_rows = []
|
||||
|
||||
# Detect page_ref rows: rows where the ONLY cell is column_1
|
||||
# (just a page number like "p.65", "p.70")
|
||||
for row in rows:
|
||||
if row.get("is_header"):
|
||||
continue
|
||||
ri = row["index"]
|
||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||
if (len(row_cells) == 1
|
||||
and row_cells[0].get("col_type") == "column_1"):
|
||||
page_refs.append({
|
||||
"row_index": ri,
|
||||
"text": (row_cells[0].get("text") or "").strip(),
|
||||
"bbox_pct": row_cells[0].get("bbox_pct", {}),
|
||||
})
|
||||
|
||||
# Detect footer: last non-header row if it has only 1 content
|
||||
# cell and no column_1 page_ref (standalone text like page num)
|
||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||
if non_header_rows:
|
||||
last_row = non_header_rows[-1]
|
||||
last_ri = last_row["index"]
|
||||
last_cells = [c for c in cells if c.get("row_index") == last_ri]
|
||||
content_last = [
|
||||
c for c in last_cells
|
||||
if c.get("col_type", "").startswith("column_")
|
||||
and c.get("col_type") != "column_1"
|
||||
]
|
||||
if len(content_last) == 1 and len(last_cells) == 1:
|
||||
footer_rows.append({
|
||||
"row_index": last_ri,
|
||||
"text": (content_last[0].get("text") or "").strip(),
|
||||
"bbox_pct": content_last[0].get("bbox_pct", {}),
|
||||
})
|
||||
|
||||
# Remove page_ref and footer cells/rows from the table
|
||||
remove_ris = set()
|
||||
for pr in page_refs:
|
||||
remove_ris.add(pr["row_index"])
|
||||
for fr in footer_rows:
|
||||
remove_ris.add(fr["row_index"])
|
||||
|
||||
if remove_ris:
|
||||
z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris]
|
||||
z["rows"] = [r for r in rows if r["index"] not in remove_ris]
|
||||
logger.info(
|
||||
"Extracted %d page_refs + %d footer rows from zone %d",
|
||||
len(page_refs), len(footer_rows), z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# Store as zone-level metadata
|
||||
if page_refs:
|
||||
z["page_refs"] = page_refs
|
||||
if footer_rows:
|
||||
z["footer"] = footer_rows
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
# 6. Build result
|
||||
|
||||
Reference in New Issue
Block a user