Fix colspan text + box row fields for GridTable compatibility
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 42s
CI / test-nodejs-website (push) Successful in 33s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 42s
CI / test-nodejs-website (push) Successful in 33s
Colspan: use the original word-block text instead of the split cell texts. This prevents broken words such as "euros a nd cents" produced by split_cross_column_words.

Box rows: add the is_header field (it was undefined, causing GridTable rendering issues) and add y_min_px/y_max_px to header_only rows. These missing fields caused empty rows with only row numbers visible.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -177,7 +177,12 @@ def build_box_zone_grid(
|
||||
).strip()
|
||||
return {
|
||||
"columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
|
||||
"rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
|
||||
"rows": [{"index": 0, "row_index": 0,
|
||||
"y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
|
||||
"y_min_px": box_y, "y_max_px": box_y + box_h,
|
||||
"y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
|
||||
"y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
|
||||
"is_header": True}],
|
||||
"cells": [{
|
||||
"cell_id": f"Z{zone_index}_R0C0",
|
||||
"row_index": 0,
|
||||
@@ -211,11 +216,11 @@ def build_box_zone_grid(
|
||||
"y_min": y_min,
|
||||
"y_max": y_max,
|
||||
"y_center": y_center,
|
||||
# GridTable expects _px and _pct variants
|
||||
"y_min_px": y_min,
|
||||
"y_max_px": y_max,
|
||||
"y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||
"y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
|
||||
"is_header": False,
|
||||
}
|
||||
rows.append(row)
|
||||
|
||||
|
||||
@@ -1296,22 +1296,11 @@ def _detect_colspan_cells(
|
||||
is_part_of_span = True
|
||||
# Only emit the merged cell for the FIRST column in the span
|
||||
if ci == span["cols"][0]:
|
||||
# Collect all cells in this span
|
||||
span_cells = [c for c in cells
|
||||
if c.get("row_index") == ri
|
||||
and c.get("col_index") in span["cols"]]
|
||||
# Merge texts (skip if same text repeated)
|
||||
texts = []
|
||||
for sc in sorted(span_cells, key=lambda c: c.get("col_index", 0)):
|
||||
t = sc.get("text", "").strip()
|
||||
if t and t not in texts:
|
||||
texts.append(t)
|
||||
merged_text = " ".join(texts)
|
||||
|
||||
# Collect all word_boxes
|
||||
all_wb = []
|
||||
for sc in span_cells:
|
||||
all_wb.extend(sc.get("word_boxes", []))
|
||||
# Use the ORIGINAL word-block text (not the split cell texts
|
||||
# which may have broken words like "euros a" + "nd cents")
|
||||
orig_word = span["word"]
|
||||
merged_text = orig_word.get("text", "").strip()
|
||||
all_wb = [orig_word]
|
||||
|
||||
# Compute merged bbox
|
||||
if all_wb:
|
||||
|
||||
Reference in New Issue
Block a user