Fix: max_columns now works in OCR Kombi build-grid pipeline

The max_columns parameter was implemented only in cv_words_first.py (the vocab-worksheet path), but NOT in _build_grid_core, which is what the admin OCR Kombi pipeline uses. The Kombi pipeline relies on grid_editor_helpers._cluster_columns_by_alignment(), which has its own column detection. Fix: a new post-processing step 5k runs after grid building and, whenever a content zone has more columns than max_columns, repeatedly merges the narrowest column into its right-hand neighbor (or into its left-hand neighbor if it is the rightmost column). Cells from a merged column get their text appended to the target column's cell in the same row, or are simply moved to the target column if no such cell exists. min_conf word filtering was already working (it is applied before the grid build). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 16:40:39 +02:00
parent 2baad68060
commit 141f69ceaa
1 changed files with 83 additions and 0 deletions
@@ -1638,6 +1638,89 @@ async def _build_grid_core(
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        for z in zones_data:
            if z.get("zone_type") != "content":
                continue
            cols = z.get("columns", [])
            cells = z.get("cells", [])
            if len(cols) <= max_columns:
                continue
            logger.info(
                "max_columns=%d: zone %s has %d columns → merging",
                max_columns, z.get("zone_index"), len(cols),
            )
            # Sort columns by width (ascending) — merge narrowest first
            cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
            while len(cols) > max_columns:
                # Find the narrowest column
                narrowest = cols_by_width.pop(0)
                ni = narrowest["index"]
                # Find its nearest neighbor (by x-position)
                sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
                pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
                # Merge into right neighbor if possible, else left
                if pos + 1 < len(sorted_by_x):
                    merge_target = sorted_by_x[pos + 1]
                elif pos > 0:
                    merge_target = sorted_by_x[pos - 1]
                else:
                    break
                ti = merge_target["index"]
                # Expand target column bounds
                merge_target["x_min_px"] = min(
                    merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                    narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
                )
                merge_target["x_max_px"] = max(
                    merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                    narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
                )
                if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                    merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                    merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
                # Reassign cells from narrowest → target
                for cell in cells:
                    if cell.get("col_index") == ni:
                        cell["col_index"] = ti
                        # Append text to existing cell in same row if it exists
                        existing = next(
                            (c for c in cells if c["col_index"] == ti
                             and c["row_index"] == cell["row_index"]
                             and c is not cell),
                            None,
                        )
                        if existing:
                            existing["text"] = (
                                (existing.get("text", "") + " " + cell.get("text", "")).strip()
                            )
                            existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
                            cell["_merged"] = True
                # Remove merged cells and column
                z["cells"] = [c for c in cells if not c.get("_merged")]
                cells = z["cells"]
                cols.remove(narrowest)
                cols_by_width = [c for c in cols_by_width if c["index"] != ni]
            # Re-index columns 0..N-1
            for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
                old_idx = col["index"]
                col["index"] = new_idx
                for cell in cells:
                    if cell.get("col_index") == old_idx:
                        cell["col_index"] = new_idx
            logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
    duration = time.time() - t0
    # 6. Build result