Fix: max_columns now works in OCR Kombi build-grid pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 49s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 27s
CI / test-nodejs-website (push) Successful in 30s

The max_columns parameter was only implemented in cv_words_first.py
(vocab-worksheet path) but NOT in _build_grid_core, which is what
the admin OCR Kombi pipeline uses. The Kombi pipeline uses
grid_editor_helpers._cluster_columns_by_alignment(), which has its
own column detection.

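Fix: Post-processing step 5k merges the narrowest columns after grid
building when a content zone has more columns than max_columns. Each
narrowest column is merged into its right neighbor (or its left
neighbor if it is the rightmost column). Cells from the merged column
are reassigned to the target column; if the target already has a cell
in the same row, the text and word_boxes are appended to that cell.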

min_conf word filtering was already working (applied before grid build).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 16:40:39 +02:00
parent 2baad68060
commit 141f69ceaa

View File

@@ -1638,6 +1638,89 @@ async def _build_grid_core(
if wb_reordered:
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
# 5k. Enforce max_columns by merging narrowest columns
if max_columns and max_columns > 0:
for z in zones_data:
if z.get("zone_type") != "content":
continue
cols = z.get("columns", [])
cells = z.get("cells", [])
if len(cols) <= max_columns:
continue
logger.info(
"max_columns=%d: zone %s has %d columns → merging",
max_columns, z.get("zone_index"), len(cols),
)
# Sort columns by width (ascending) — merge narrowest first
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
while len(cols) > max_columns:
# Find the narrowest column
narrowest = cols_by_width.pop(0)
ni = narrowest["index"]
# Find its nearest neighbor (by x-position)
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
# Merge into right neighbor if possible, else left
if pos + 1 < len(sorted_by_x):
merge_target = sorted_by_x[pos + 1]
elif pos > 0:
merge_target = sorted_by_x[pos - 1]
else:
break
ti = merge_target["index"]
# Expand target column bounds
merge_target["x_min_px"] = min(
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
)
merge_target["x_max_px"] = max(
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
)
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
# Reassign cells from narrowest → target
for cell in cells:
if cell.get("col_index") == ni:
cell["col_index"] = ti
# Append text to existing cell in same row if it exists
existing = next(
(c for c in cells if c["col_index"] == ti
and c["row_index"] == cell["row_index"]
and c is not cell),
None,
)
if existing:
existing["text"] = (
(existing.get("text", "") + " " + cell.get("text", "")).strip()
)
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
cell["_merged"] = True
# Remove merged cells and column
z["cells"] = [c for c in cells if not c.get("_merged")]
cells = z["cells"]
cols.remove(narrowest)
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
# Re-index columns 0..N-1
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
old_idx = col["index"]
col["index"] = new_idx
for cell in cells:
if cell.get("col_index") == old_idx:
cell["col_index"] = new_idx
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
duration = time.time() - t0
# 6. Build result