Fix: max_columns now works in OCR Kombi build-grid pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 49s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 27s
CI / test-nodejs-website (push) Successful in 30s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 49s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 27s
CI / test-nodejs-website (push) Successful in 30s
The max_columns parameter was only implemented in cv_words_first.py (the vocab-worksheet path) but NOT in _build_grid_core, which is what the admin OCR Kombi pipeline uses. The Kombi pipeline relies on grid_editor_helpers._cluster_columns_by_alignment(), which has its own column detection.

Fix: a new post-processing step 5k runs after grid building and, whenever a content zone has more columns than max_columns, merges the narrowest columns into a neighboring column (right neighbor if one exists, otherwise left). Cells from a merged column have their text appended to the target column's cell in the same row.

min_conf word filtering was already working (it is applied before the grid build).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1638,6 +1638,89 @@ async def _build_grid_core(
|
|||||||
if wb_reordered:
|
if wb_reordered:
|
||||||
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
||||||
|
|
||||||
|
# 5k. Enforce max_columns by merging narrowest columns
|
||||||
|
if max_columns and max_columns > 0:
|
||||||
|
for z in zones_data:
|
||||||
|
if z.get("zone_type") != "content":
|
||||||
|
continue
|
||||||
|
cols = z.get("columns", [])
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
if len(cols) <= max_columns:
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"max_columns=%d: zone %s has %d columns → merging",
|
||||||
|
max_columns, z.get("zone_index"), len(cols),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sort columns by width (ascending) — merge narrowest first
|
||||||
|
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
|
||||||
|
|
||||||
|
while len(cols) > max_columns:
|
||||||
|
# Find the narrowest column
|
||||||
|
narrowest = cols_by_width.pop(0)
|
||||||
|
ni = narrowest["index"]
|
||||||
|
|
||||||
|
# Find its nearest neighbor (by x-position)
|
||||||
|
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
|
||||||
|
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
|
||||||
|
# Merge into right neighbor if possible, else left
|
||||||
|
if pos + 1 < len(sorted_by_x):
|
||||||
|
merge_target = sorted_by_x[pos + 1]
|
||||||
|
elif pos > 0:
|
||||||
|
merge_target = sorted_by_x[pos - 1]
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
ti = merge_target["index"]
|
||||||
|
|
||||||
|
# Expand target column bounds
|
||||||
|
merge_target["x_min_px"] = min(
|
||||||
|
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
|
||||||
|
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
|
||||||
|
)
|
||||||
|
merge_target["x_max_px"] = max(
|
||||||
|
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
|
||||||
|
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
|
||||||
|
)
|
||||||
|
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
|
||||||
|
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
|
||||||
|
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
|
||||||
|
|
||||||
|
# Reassign cells from narrowest → target
|
||||||
|
for cell in cells:
|
||||||
|
if cell.get("col_index") == ni:
|
||||||
|
cell["col_index"] = ti
|
||||||
|
# Append text to existing cell in same row if it exists
|
||||||
|
existing = next(
|
||||||
|
(c for c in cells if c["col_index"] == ti
|
||||||
|
and c["row_index"] == cell["row_index"]
|
||||||
|
and c is not cell),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
if existing:
|
||||||
|
existing["text"] = (
|
||||||
|
(existing.get("text", "") + " " + cell.get("text", "")).strip()
|
||||||
|
)
|
||||||
|
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
|
||||||
|
cell["_merged"] = True
|
||||||
|
|
||||||
|
# Remove merged cells and column
|
||||||
|
z["cells"] = [c for c in cells if not c.get("_merged")]
|
||||||
|
cells = z["cells"]
|
||||||
|
cols.remove(narrowest)
|
||||||
|
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
|
||||||
|
|
||||||
|
# Re-index columns 0..N-1
|
||||||
|
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
|
||||||
|
old_idx = col["index"]
|
||||||
|
col["index"] = new_idx
|
||||||
|
for cell in cells:
|
||||||
|
if cell.get("col_index") == old_idx:
|
||||||
|
cell["col_index"] = new_idx
|
||||||
|
|
||||||
|
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
# 6. Build result
|
# 6. Build result
|
||||||
|
|||||||
Reference in New Issue
Block a user