Fix: max_columns now works in OCR Kombi build-grid pipeline

The max_columns parameter was only implemented in cv_words_first.py (the vocab-worksheet path) but NOT in _build_grid_core, which is what the admin OCR Kombi pipeline uses. The Kombi pipeline relies on grid_editor_helpers._cluster_columns_by_alignment(), which has its own column detection. Fix: a new post-processing step 5k runs after grid building and, whenever a content zone has more columns than max_columns, repeatedly merges the narrowest column into its right-hand neighbor (or its left-hand neighbor if it is the rightmost column) until the limit is met. Cells from a merged column are reassigned to the target column; where the target already has a cell in the same row, their text is appended to that cell. min_conf word filtering was already working (it is applied before the grid build). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 16:40:39 +02:00
parent 2baad68060
commit 141f69ceaa
1 changed files with 83 additions and 0 deletions
@@ -1638,6 +1638,89 @@ async def _build_grid_core(
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)

+    # 5k. Enforce max_columns by merging narrowest columns
+    if max_columns and max_columns > 0:
+        for z in zones_data:
+            if z.get("zone_type") != "content":
+                continue
+            cols = z.get("columns", [])
+            cells = z.get("cells", [])
+            if len(cols) <= max_columns:
+                continue
+
+            logger.info(
+                "max_columns=%d: zone %s has %d columns → merging",
+                max_columns, z.get("zone_index"), len(cols),
+            )
+
+            # Sort columns by width (ascending) — merge narrowest first
+            cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
+
+            while len(cols) > max_columns:
+                # Find the narrowest column
+                narrowest = cols_by_width.pop(0)
+                ni = narrowest["index"]
+
+                # Find its nearest neighbor (by x-position)
+                sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
+                pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
+                # Merge into right neighbor if possible, else left
+                if pos + 1 < len(sorted_by_x):
+                    merge_target = sorted_by_x[pos + 1]
+                elif pos > 0:
+                    merge_target = sorted_by_x[pos - 1]
+                else:
+                    break
+
+                ti = merge_target["index"]
+
+                # Expand target column bounds
+                merge_target["x_min_px"] = min(
+                    merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
+                    narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
+                )
+                merge_target["x_max_px"] = max(
+                    merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
+                    narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
+                )
+                if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
+                    merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
+                    merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
+
+                # Reassign cells from narrowest → target
+                for cell in cells:
+                    if cell.get("col_index") == ni:
+                        cell["col_index"] = ti
+                        # Append text to existing cell in same row if it exists
+                        existing = next(
+                            (c for c in cells if c["col_index"] == ti
+                             and c["row_index"] == cell["row_index"]
+                             and c is not cell),
+                            None,
+                        )
+                        if existing:
+                            existing["text"] = (
+                                (existing.get("text", "") + " " + cell.get("text", "")).strip()
+                            )
+                            existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
+                            cell["_merged"] = True
+
+                # Remove merged cells and column
+                z["cells"] = [c for c in cells if not c.get("_merged")]
+                cells = z["cells"]
+                cols.remove(narrowest)
+                cols_by_width = [c for c in cols_by_width if c["index"] != ni]
+
+            # Re-index columns 0..N-1
+            for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
+                old_idx = col["index"]
+                col["index"] = new_idx
+                for cell in cells:
+                    if cell.get("col_index") == old_idx:
+                        cell["col_index"] = new_idx
+
+            logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
+
    duration = time.time() - t0

    # 6. Build result