fix: paddle_direct groups words per row (matching _build_cells format)

One cell per row with all words as word_boxes instead of one cell per word. This gives OverlayReconstruction a row-spanning bbox_pct for correct font sizing and per-word positions for slide/cluster placement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 17:10:10 +01:00
parent c743a38eaf
commit 71a1b5f058
1 changed file with 72 additions and 59 deletions
@@ -2597,10 +2597,10 @@ def _paddle_words_to_grid_cells(
 ) -> tuple:
    """Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
-    1. Sort words by (top, left).
+    Groups words into rows (Y-proximity), then builds ONE cell per row
-    2. Cluster into rows by Y-proximity (threshold = 50% of median word height).
+    with all words as word_boxes — matching the format of _build_cells()
-    3. Within each row, sort left→right and assign col_index.
+    in cv_words_first.py. This gives OverlayReconstruction a row-spanning
-    4. Each word → 1 GridCell with word_boxes and bbox_pct.
+    bbox_pct for correct font sizing and per-word positions for placement.
    Returns (cells, columns_meta) in the same format as build_grid_from_words.
    """
@@ -2632,69 +2632,82 @@ def _paddle_words_to_grid_cells(
    if current_row:
        rows.append(current_row)
-    # Sort each row left→right and build cells
+    # Build ONE cell per row (all words in reading order, word_boxes for positioning)
    cells: List[Dict[str, Any]] = []
    max_col = 0
    for row_idx, row_words in enumerate(rows):
        row_words.sort(key=lambda w: w["left"])
        for col_idx, w in enumerate(row_words):
            left = w["left"]
            top = w["top"]
            width = w["width"]
            height = w["height"]
            conf = w.get("confidence", 0)
            if isinstance(conf, float) and conf <= 1.0:
                conf = conf * 100  # normalize to 0-100
-            cell = {
+        # Tight bbox spanning all words in this row
-                "cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
+        x_min = min(w["left"] for w in row_words)
-                "x": left,
+        y_min = min(w["top"] for w in row_words)
-                "y": top,
+        x_max = max(w["left"] + w["width"] for w in row_words)
-                "width": width,
+        y_max = max(w["top"] + w["height"] for w in row_words)
-                "height": height,
+        bw = x_max - x_min
-                "text": w.get("text", ""),
+        bh = y_max - y_min
                "confidence": round(conf, 1),
                "column_index": col_idx,
                "row_index": row_idx,
                "zone_index": 0,
                "ocr_engine": "paddle_direct",
                "word_boxes": [{
                    "text": w.get("text", ""),
                    "left": left,
                    "top": top,
                    "width": width,
                    "height": height,
                    "confidence": round(conf, 1),
                }],
                "bbox_pct": {
                    "x": round(left / img_w * 100, 3),
                    "y": round(top / img_h * 100, 3),
                    "w": round(width / img_w * 100, 3),
                    "h": round(height / img_h * 100, 3),
                },
            }
            cells.append(cell)
            if col_idx > max_col:
                max_col = col_idx
-    # Build columns_meta — one pseudo-column per column index
+        # Text: all words joined by space
-    columns_meta = []
+        text = " ".join(w.get("text", "").strip() for w in row_words if w.get("text", "").strip())
-    for ci in range(max_col + 1):
+
-        col_cells = [c for c in cells if c["column_index"] == ci]
+        # Average confidence
-        if col_cells:
+        confs = []
-            min_x = min(c["x"] for c in col_cells)
+        for w in row_words:
-            max_right = max(c["x"] + c["width"] for c in col_cells)
+            c = w.get("confidence", 0)
-            columns_meta.append({
+            if isinstance(c, float) and c <= 1.0:
-                "type": "column_text",
+                c = c * 100
-                "x": min_x,
+            confs.append(c)
-                "y": 0,
+        avg_conf = sum(confs) / len(confs) if confs else 0.0
-                "width": max_right - min_x,
+
-                "height": img_h,
+        # Per-word boxes with absolute pixel coordinates
-                "classification_confidence": 1.0,
+        word_boxes = []
-                "classification_method": "paddle_direct",
+        for w in row_words:
            raw_text = w.get("text", "").strip()
            if not raw_text:
                continue
            c = w.get("confidence", 0)
            if isinstance(c, float) and c <= 1.0:
                c = c * 100
            word_boxes.append({
                "text": raw_text,
                "left": w["left"],
                "top": w["top"],
                "width": w["width"],
                "height": w["height"],
                "conf": round(c, 1),
            })
        cell = {
            "cell_id": f"PD_R{row_idx:02d}_C0",
            "row_index": row_idx,
            "col_index": 0,
            "col_type": "column_text",
            "text": text,
            "confidence": round(avg_conf, 1),
            "zone_index": 0,
            "ocr_engine": "paddle_direct",
            "is_bold": False,
            "word_boxes": word_boxes,
            "bbox_px": {"x": x_min, "y": y_min, "w": bw, "h": bh},
            "bbox_pct": {
                "x": round(x_min / img_w * 100, 2) if img_w else 0,
                "y": round(y_min / img_h * 100, 2) if img_h else 0,
                "w": round(bw / img_w * 100, 2) if img_w else 0,
                "h": round(bh / img_h * 100, 2) if img_h else 0,
            },
        }
        cells.append(cell)
    # Single full-page pseudo-column (all rows belong to column 0)
    columns_meta = [{
        "type": "column_text",
        "x": 0,
        "y": 0,
        "width": img_w,
        "height": img_h,
        "classification_confidence": 1.0,
        "classification_method": "paddle_direct",
    }]
    return cells, columns_meta