Generic colspan detection for merged cells in grids and boxes
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 38s
CI / test-python-klausur (push) Failing after 2m45s
CI / test-python-agent-core (push) Successful in 38s
CI / test-nodejs-website (push) Successful in 34s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 38s
CI / test-python-klausur (push) Failing after 2m45s
CI / test-python-agent-core (push) Successful in 38s
CI / test-nodejs-website (push) Successful in 34s
New _detect_colspan_cells() in grid_editor_helpers.py:
- Runs after _build_cells() for every zone (content + box)
- Detects word-blocks that extend across column boundaries
- Merges affected cells into spanning_header with colspan=N
- Uses column midpoints to determine which columns are covered
- Works for full-page scans and box zones equally

Also fixes box flowing/bullet_list row height fields (y_min_px/y_max_px).

Removed duplicate spanning logic from cv_box_layout.py — now uses the generic _detect_colspan_cells from grid_editor_helpers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -257,49 +257,8 @@ def build_box_zone_grid(
|
|||||||
global_columns=None, # detect columns independently
|
global_columns=None, # detect columns independently
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Detect spanning cells ---
|
# Colspan detection is now handled generically by _detect_colspan_cells
|
||||||
# If a row has fewer word-blocks than columns, some cells span multiple
|
# in grid_editor_helpers.py (called inside _build_zone_grid).
|
||||||
# columns. Detect this and mark them as spanning_header so the frontend
|
|
||||||
# renders them correctly (single cell across the row).
|
|
||||||
columns = result.get("columns", [])
|
|
||||||
cells = result.get("cells", [])
|
|
||||||
rows = result.get("rows", [])
|
|
||||||
|
|
||||||
if len(columns) >= 2:
|
|
||||||
# Group original words by row
|
|
||||||
from grid_editor_helpers import _cluster_rows as _cr
|
|
||||||
row_data = _cr(zone_words)
|
|
||||||
row_word_map: Dict[int, List[Dict]] = {}
|
|
||||||
for w in zone_words:
|
|
||||||
yc = w["top"] + w["height"] / 2
|
|
||||||
best = min(row_data, key=lambda r: abs(r["y_center"] - yc))
|
|
||||||
row_word_map.setdefault(best["index"], []).append(w)
|
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
ri = row.get("index", row.get("row_index", -1))
|
|
||||||
rw = row_word_map.get(ri, [])
|
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
||||||
|
|
||||||
# If this row has fewer word-blocks than columns, it's a spanning row
|
|
||||||
if 0 < len(rw) < len(columns):
|
|
||||||
# Merge all cell texts and mark as spanning
|
|
||||||
merged_text = " ".join(
|
|
||||||
c.get("text", "") for c in sorted(row_cells, key=lambda c: c.get("col_index", 0))
|
|
||||||
).strip()
|
|
||||||
# Remove existing cells for this row
|
|
||||||
cells = [c for c in cells if c.get("row_index") != ri]
|
|
||||||
# Add single spanning cell
|
|
||||||
cells.append({
|
|
||||||
"cell_id": f"Z{zone_index}_R{ri:02d}_C0",
|
|
||||||
"row_index": ri,
|
|
||||||
"col_index": 0,
|
|
||||||
"col_type": "spanning_header",
|
|
||||||
"text": merged_text,
|
|
||||||
"word_boxes": rw,
|
|
||||||
})
|
|
||||||
logger.info("Box zone %d row %d: merged %d cells into spanning cell", zone_index, ri, len(row_cells))
|
|
||||||
|
|
||||||
result["cells"] = cells
|
|
||||||
|
|
||||||
result["box_layout_type"] = layout_type
|
result["box_layout_type"] = layout_type
|
||||||
result["box_grid_reviewed"] = False
|
result["box_grid_reviewed"] = False
|
||||||
|
|||||||
@@ -1218,6 +1218,141 @@ def _detect_header_rows(
|
|||||||
return headers
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_colspan_cells(
|
||||||
|
zone_words: List[Dict],
|
||||||
|
columns: List[Dict],
|
||||||
|
rows: List[Dict],
|
||||||
|
cells: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Detect and merge cells that span multiple columns (colspan).
|
||||||
|
|
||||||
|
A word-block (PaddleOCR phrase) that extends significantly past a column
|
||||||
|
boundary into the next column indicates a merged cell. This replaces
|
||||||
|
the incorrectly split cells with a single cell spanning multiple columns.
|
||||||
|
|
||||||
|
Works for both full-page scans and box zones.
|
||||||
|
"""
|
||||||
|
if len(columns) < 2 or not zone_words or not rows:
|
||||||
|
return cells
|
||||||
|
|
||||||
|
from cv_words_first import _assign_word_to_row
|
||||||
|
|
||||||
|
# Column boundaries (midpoints between adjacent columns)
|
||||||
|
col_boundaries = []
|
||||||
|
for ci in range(len(columns) - 1):
|
||||||
|
col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)
|
||||||
|
|
||||||
|
def _cols_covered(w_left: float, w_right: float) -> List[int]:
|
||||||
|
"""Return list of column indices that a word-block covers."""
|
||||||
|
covered = []
|
||||||
|
for col in columns:
|
||||||
|
col_mid = (col["x_min"] + col["x_max"]) / 2
|
||||||
|
# Word covers a column if it extends past the column's midpoint
|
||||||
|
if w_left < col_mid < w_right:
|
||||||
|
covered.append(col["index"])
|
||||||
|
# Also include column if word starts within it
|
||||||
|
elif col["x_min"] <= w_left < col["x_max"]:
|
||||||
|
covered.append(col["index"])
|
||||||
|
return sorted(set(covered))
|
||||||
|
|
||||||
|
# Group original word-blocks by row
|
||||||
|
row_word_blocks: Dict[int, List[Dict]] = {}
|
||||||
|
for w in zone_words:
|
||||||
|
ri = _assign_word_to_row(w, rows)
|
||||||
|
row_word_blocks.setdefault(ri, []).append(w)
|
||||||
|
|
||||||
|
# For each row, check if any word-block spans multiple columns
|
||||||
|
rows_to_merge: Dict[int, List[Dict]] = {} # row_index → list of spanning word-blocks
|
||||||
|
|
||||||
|
for ri, wblocks in row_word_blocks.items():
|
||||||
|
spanning = []
|
||||||
|
for w in wblocks:
|
||||||
|
w_left = w["left"]
|
||||||
|
w_right = w_left + w["width"]
|
||||||
|
covered = _cols_covered(w_left, w_right)
|
||||||
|
if len(covered) >= 2:
|
||||||
|
spanning.append({"word": w, "cols": covered})
|
||||||
|
if spanning:
|
||||||
|
rows_to_merge[ri] = spanning
|
||||||
|
|
||||||
|
if not rows_to_merge:
|
||||||
|
return cells
|
||||||
|
|
||||||
|
# Merge cells for spanning rows
|
||||||
|
new_cells = []
|
||||||
|
for cell in cells:
|
||||||
|
ri = cell.get("row_index", -1)
|
||||||
|
if ri not in rows_to_merge:
|
||||||
|
new_cells.append(cell)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if this cell's column is part of a spanning block
|
||||||
|
ci = cell.get("col_index", -1)
|
||||||
|
is_part_of_span = False
|
||||||
|
for span in rows_to_merge[ri]:
|
||||||
|
if ci in span["cols"]:
|
||||||
|
is_part_of_span = True
|
||||||
|
# Only emit the merged cell for the FIRST column in the span
|
||||||
|
if ci == span["cols"][0]:
|
||||||
|
# Collect all cells in this span
|
||||||
|
span_cells = [c for c in cells
|
||||||
|
if c.get("row_index") == ri
|
||||||
|
and c.get("col_index") in span["cols"]]
|
||||||
|
# Merge texts (skip if same text repeated)
|
||||||
|
texts = []
|
||||||
|
for sc in sorted(span_cells, key=lambda c: c.get("col_index", 0)):
|
||||||
|
t = sc.get("text", "").strip()
|
||||||
|
if t and t not in texts:
|
||||||
|
texts.append(t)
|
||||||
|
merged_text = " ".join(texts)
|
||||||
|
|
||||||
|
# Collect all word_boxes
|
||||||
|
all_wb = []
|
||||||
|
for sc in span_cells:
|
||||||
|
all_wb.extend(sc.get("word_boxes", []))
|
||||||
|
|
||||||
|
# Compute merged bbox
|
||||||
|
if all_wb:
|
||||||
|
x_min = min(wb["left"] for wb in all_wb)
|
||||||
|
y_min = min(wb["top"] for wb in all_wb)
|
||||||
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||||
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||||
|
else:
|
||||||
|
x_min = y_min = x_max = y_max = 0
|
||||||
|
|
||||||
|
new_cells.append({
|
||||||
|
"cell_id": cell["cell_id"],
|
||||||
|
"row_index": ri,
|
||||||
|
"col_index": span["cols"][0],
|
||||||
|
"col_type": "spanning_header",
|
||||||
|
"colspan": len(span["cols"]),
|
||||||
|
"text": merged_text,
|
||||||
|
"confidence": cell.get("confidence", 0),
|
||||||
|
"bbox_px": {"x": x_min, "y": y_min,
|
||||||
|
"w": x_max - x_min, "h": y_max - y_min},
|
||||||
|
"bbox_pct": {
|
||||||
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
||||||
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
||||||
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
||||||
|
},
|
||||||
|
"word_boxes": all_wb,
|
||||||
|
"ocr_engine": cell.get("ocr_engine", ""),
|
||||||
|
"is_bold": cell.get("is_bold", False),
|
||||||
|
})
|
||||||
|
logger.info(
|
||||||
|
"colspan detected: row %d, cols %s → merged %d cells (%r)",
|
||||||
|
ri, span["cols"], len(span_cells), merged_text[:50],
|
||||||
|
)
|
||||||
|
break
|
||||||
|
if not is_part_of_span:
|
||||||
|
new_cells.append(cell)
|
||||||
|
|
||||||
|
return new_cells
|
||||||
|
|
||||||
|
|
||||||
def _build_zone_grid(
|
def _build_zone_grid(
|
||||||
zone_words: List[Dict],
|
zone_words: List[Dict],
|
||||||
zone_x: int,
|
zone_x: int,
|
||||||
@@ -1295,6 +1430,13 @@ def _build_zone_grid(
|
|||||||
# Build cells
|
# Build cells
|
||||||
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
||||||
|
|
||||||
|
# --- Detect colspan (merged cells spanning multiple columns) ---
|
||||||
|
# A word-block that extends across column boundaries indicates a merged
|
||||||
|
# cell (like Excel cell-merge). Detect these and replace the split
|
||||||
|
# cells with a single spanning cell.
|
||||||
|
if len(columns) >= 2:
|
||||||
|
cells = _detect_colspan_cells(zone_words, columns, rows, cells, img_w, img_h)
|
||||||
|
|
||||||
# Prefix cell IDs with zone index
|
# Prefix cell IDs with zone index
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
|
cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
|
||||||
|
|||||||
Reference in New Issue
Block a user