Fix colspan text + box row fields for GridTable compatibility
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 42s
CI / test-nodejs-website (push) Successful in 33s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 42s
CI / test-nodejs-website (push) Successful in 33s
Colspan: use the original word-block text for merged cells instead of joining the split cell texts. This prevents broken words such as "euros a nd cents", which were produced by split_cross_column_words. Box rows: add the is_header field (it was previously undefined, causing GridTable rendering issues). Add y_min_px/y_max_px (and the matching y_min_pct/y_max_pct) to header_only rows. The missing fields caused empty rows in which only the row numbers were visible. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -177,7 +177,12 @@ def build_box_zone_grid(
|
|||||||
).strip()
|
).strip()
|
||||||
return {
|
return {
|
||||||
"columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
|
"columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
|
||||||
"rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
|
"rows": [{"index": 0, "row_index": 0,
|
||||||
|
"y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
|
||||||
|
"y_min_px": box_y, "y_max_px": box_y + box_h,
|
||||||
|
"y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
|
||||||
|
"y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
|
||||||
|
"is_header": True}],
|
||||||
"cells": [{
|
"cells": [{
|
||||||
"cell_id": f"Z{zone_index}_R0C0",
|
"cell_id": f"Z{zone_index}_R0C0",
|
||||||
"row_index": 0,
|
"row_index": 0,
|
||||||
@@ -211,11 +216,11 @@ def build_box_zone_grid(
|
|||||||
"y_min": y_min,
|
"y_min": y_min,
|
||||||
"y_max": y_max,
|
"y_max": y_max,
|
||||||
"y_center": y_center,
|
"y_center": y_center,
|
||||||
# GridTable expects _px and _pct variants
|
|
||||||
"y_min_px": y_min,
|
"y_min_px": y_min,
|
||||||
"y_max_px": y_max,
|
"y_max_px": y_max,
|
||||||
"y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
|
"y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
"y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
|
"y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
|
||||||
|
"is_header": False,
|
||||||
}
|
}
|
||||||
rows.append(row)
|
rows.append(row)
|
||||||
|
|
||||||
|
|||||||
@@ -1296,22 +1296,11 @@ def _detect_colspan_cells(
|
|||||||
is_part_of_span = True
|
is_part_of_span = True
|
||||||
# Only emit the merged cell for the FIRST column in the span
|
# Only emit the merged cell for the FIRST column in the span
|
||||||
if ci == span["cols"][0]:
|
if ci == span["cols"][0]:
|
||||||
# Collect all cells in this span
|
# Use the ORIGINAL word-block text (not the split cell texts
|
||||||
span_cells = [c for c in cells
|
# which may have broken words like "euros a" + "nd cents")
|
||||||
if c.get("row_index") == ri
|
orig_word = span["word"]
|
||||||
and c.get("col_index") in span["cols"]]
|
merged_text = orig_word.get("text", "").strip()
|
||||||
# Merge texts (skip if same text repeated)
|
all_wb = [orig_word]
|
||||||
texts = []
|
|
||||||
for sc in sorted(span_cells, key=lambda c: c.get("col_index", 0)):
|
|
||||||
t = sc.get("text", "").strip()
|
|
||||||
if t and t not in texts:
|
|
||||||
texts.append(t)
|
|
||||||
merged_text = " ".join(texts)
|
|
||||||
|
|
||||||
# Collect all word_boxes
|
|
||||||
all_wb = []
|
|
||||||
for sc in span_cells:
|
|
||||||
all_wb.extend(sc.get("word_boxes", []))
|
|
||||||
|
|
||||||
# Compute merged bbox
|
# Compute merged bbox
|
||||||
if all_wb:
|
if all_wb:
|
||||||
|
|||||||
Reference in New Issue
Block a user