Fix colspan text + box row fields for GridTable compatibility
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 42s
CI / test-nodejs-website (push) Successful in 33s

Colspan: use the original word-block text instead of joining the split cell texts.
Prevents broken output such as "euros a nd cents" produced by split_cross_column_words.

Box rows: add the is_header field (it was undefined, causing GridTable
rendering issues). Add y_min_px/y_max_px and y_min_pct/y_max_pct to header_only rows.
These missing fields caused empty rows with only row numbers visible.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-13 12:08:49 +02:00
parent dc25f243a4
commit 868f99f109
2 changed files with 12 additions and 18 deletions

View File

@@ -177,7 +177,12 @@ def build_box_zone_grid(
).strip()
return {
"columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
"rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
"rows": [{"index": 0, "row_index": 0,
"y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
"y_min_px": box_y, "y_max_px": box_y + box_h,
"y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
"y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
"is_header": True}],
"cells": [{
"cell_id": f"Z{zone_index}_R0C0",
"row_index": 0,
@@ -211,11 +216,11 @@ def build_box_zone_grid(
"y_min": y_min,
"y_max": y_max,
"y_center": y_center,
# GridTable expects _px and _pct variants
"y_min_px": y_min,
"y_max_px": y_max,
"y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
"y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
"is_header": False,
}
rows.append(row)

View File

@@ -1296,22 +1296,11 @@ def _detect_colspan_cells(
is_part_of_span = True
# Only emit the merged cell for the FIRST column in the span
if ci == span["cols"][0]:
# Collect all cells in this span
span_cells = [c for c in cells
if c.get("row_index") == ri
and c.get("col_index") in span["cols"]]
# Merge texts (skip if same text repeated)
texts = []
for sc in sorted(span_cells, key=lambda c: c.get("col_index", 0)):
t = sc.get("text", "").strip()
if t and t not in texts:
texts.append(t)
merged_text = " ".join(texts)
# Collect all word_boxes
all_wb = []
for sc in span_cells:
all_wb.extend(sc.get("word_boxes", []))
# Use the ORIGINAL word-block text (not the split cell texts
# which may have broken words like "euros a" + "nd cents")
orig_word = span["word"]
merged_text = orig_word.get("text", "").strip()
all_wb = [orig_word]
# Compute merged bbox
if all_wb: