fix: paddle_direct groups words per row (matching _build_cells format)

Emit one cell per row, with all words as word_boxes, instead of one cell per word. This gives OverlayReconstruction a row-spanning bbox_pct for correct font sizing and per-word positions for slide/cluster placement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 17:10:10 +01:00
parent c743a38eaf
commit 71a1b5f058
1 changed file with 72 additions and 59 deletions
@@ -2597,10 +2597,10 @@ def _paddle_words_to_grid_cells(
 ) -> tuple:
    """Convert PaddleOCR word dicts into GridCell dicts + columns_meta.

-    1. Sort words by (top, left).
-    2. Cluster into rows by Y-proximity (threshold = 50% of median word height).
-    3. Within each row, sort left→right and assign col_index.
-    4. Each word → 1 GridCell with word_boxes and bbox_pct.
+    Groups words into rows (Y-proximity), then builds ONE cell per row
+    with all words as word_boxes — matching the format of _build_cells()
+    in cv_words_first.py. This gives OverlayReconstruction a row-spanning
+    bbox_pct for correct font sizing and per-word positions for placement.

    Returns (cells, columns_meta) in the same format as build_grid_from_words.
    """
@@ -2632,69 +2632,82 @@ def _paddle_words_to_grid_cells(
    if current_row:
        rows.append(current_row)

-    # Sort each row left→right and build cells
+    # Build ONE cell per row (all words in reading order, word_boxes for positioning)
    cells: List[Dict[str, Any]] = []
-    max_col = 0

    for row_idx, row_words in enumerate(rows):
        row_words.sort(key=lambda w: w["left"])
-        for col_idx, w in enumerate(row_words):
-            left = w["left"]
-            top = w["top"]
-            width = w["width"]
-            height = w["height"]
-            conf = w.get("confidence", 0)
-            if isinstance(conf, float) and conf <= 1.0:
-                conf = conf * 100  # normalize to 0-100

-            cell = {
-                "cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
-                "x": left,
-                "y": top,
-                "width": width,
-                "height": height,
-                "text": w.get("text", ""),
-                "confidence": round(conf, 1),
-                "column_index": col_idx,
-                "row_index": row_idx,
-                "zone_index": 0,
-                "ocr_engine": "paddle_direct",
-                "word_boxes": [{
-                    "text": w.get("text", ""),
-                    "left": left,
-                    "top": top,
-                    "width": width,
-                    "height": height,
-                    "confidence": round(conf, 1),
-                }],
-                "bbox_pct": {
-                    "x": round(left / img_w * 100, 3),
-                    "y": round(top / img_h * 100, 3),
-                    "w": round(width / img_w * 100, 3),
-                    "h": round(height / img_h * 100, 3),
-                },
-            }
-            cells.append(cell)
-            if col_idx > max_col:
-                max_col = col_idx
+        # Tight bbox spanning all words in this row
+        x_min = min(w["left"] for w in row_words)
+        y_min = min(w["top"] for w in row_words)
+        x_max = max(w["left"] + w["width"] for w in row_words)
+        y_max = max(w["top"] + w["height"] for w in row_words)
+        bw = x_max - x_min
+        bh = y_max - y_min

-    # Build columns_meta — one pseudo-column per column index
-    columns_meta = []
-    for ci in range(max_col + 1):
-        col_cells = [c for c in cells if c["column_index"] == ci]
-        if col_cells:
-            min_x = min(c["x"] for c in col_cells)
-            max_right = max(c["x"] + c["width"] for c in col_cells)
-            columns_meta.append({
-                "type": "column_text",
-                "x": min_x,
-                "y": 0,
-                "width": max_right - min_x,
-                "height": img_h,
-                "classification_confidence": 1.0,
-                "classification_method": "paddle_direct",
+        # Text: all words joined by space
+        text = " ".join(w.get("text", "").strip() for w in row_words if w.get("text", "").strip())
+
+        # Average confidence
+        confs = []
+        for w in row_words:
+            c = w.get("confidence", 0)
+            if isinstance(c, float) and c <= 1.0:
+                c = c * 100
+            confs.append(c)
+        avg_conf = sum(confs) / len(confs) if confs else 0.0
+
+        # Per-word boxes with absolute pixel coordinates
+        word_boxes = []
+        for w in row_words:
+            raw_text = w.get("text", "").strip()
+            if not raw_text:
+                continue
+            c = w.get("confidence", 0)
+            if isinstance(c, float) and c <= 1.0:
+                c = c * 100
+            word_boxes.append({
+                "text": raw_text,
+                "left": w["left"],
+                "top": w["top"],
+                "width": w["width"],
+                "height": w["height"],
+                "conf": round(c, 1),
            })

+        cell = {
+            "cell_id": f"PD_R{row_idx:02d}_C0",
+            "row_index": row_idx,
+            "col_index": 0,
+            "col_type": "column_text",
+            "text": text,
+            "confidence": round(avg_conf, 1),
+            "zone_index": 0,
+            "ocr_engine": "paddle_direct",
+            "is_bold": False,
+            "word_boxes": word_boxes,
+            "bbox_px": {"x": x_min, "y": y_min, "w": bw, "h": bh},
+            "bbox_pct": {
+                "x": round(x_min / img_w * 100, 2) if img_w else 0,
+                "y": round(y_min / img_h * 100, 2) if img_h else 0,
+                "w": round(bw / img_w * 100, 2) if img_w else 0,
+                "h": round(bh / img_h * 100, 2) if img_h else 0,
+            },
+        }
+        cells.append(cell)
+
+    # Single full-page pseudo-column (all rows belong to column 0)
+    columns_meta = [{
+        "type": "column_text",
+        "x": 0,
+        "y": 0,
+        "width": img_w,
+        "height": img_h,
+        "classification_confidence": 1.0,
+        "classification_method": "paddle_direct",
+    }]
+
    return cells, columns_meta