From 8349c28f54ba2bd2c529ddb008b36b571cb7cebf Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Thu, 12 Mar 2026 17:19:52 +0100
Subject: [PATCH] fix: paddle_direct reuses build_grid_from_words for correct
 overlay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

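Replace the custom _paddle_words_to_grid_cells helper with the proven
build_grid_from_words from cv_words_first.py — the same function the
regular pipeline uses with PaddleOCR. It handles phrase splitting and
column clustering, and produces cells with word_boxes that the
slide/cluster positioning hooks expect.

The returned cells are still tagged ocr_engine="paddle_direct", and the
result layout is now reported as "vocab" when a column_en or column_de
column is present in columns_meta (otherwise "generic").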

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/ocr_pipeline_api.py | 134 ++------------------
 1 file changed, 11 insertions(+), 123 deletions(-)

diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 39cb199..f19bcf8 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2541,11 +2541,20 @@ async def paddle_direct(session_id: str):
     if not word_dicts:
         raise HTTPException(status_code=400, detail="PaddleOCR returned no words")
 
-    cells, columns_meta = _paddle_words_to_grid_cells(word_dicts, img_w, img_h)
+    # Reuse build_grid_from_words — same function that works in the regular
+    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
+    # Handles phrase splitting, column clustering, and reading order.
+    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
     duration = time.time() - t0
 
+    # Tag cells as paddle_direct
+    for cell in cells:
+        cell["ocr_engine"] = "paddle_direct"
+
     n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
     n_cols = len(columns_meta)
+    col_types = {c.get("type") for c in columns_meta}
+    is_vocab = bool(col_types & {"column_en", "column_de"})
 
     word_result = {
         "cells": cells,
@@ -2555,7 +2564,7 @@ async def paddle_direct(session_id: str):
             "total_cells": len(cells),
         },
         "columns_used": columns_meta,
-        "layout": "generic",
+        "layout": "vocab" if is_vocab else "generic",
         "image_width": img_w,
         "image_height": img_h,
         "duration_seconds": round(duration, 2),
@@ -2590,127 +2599,6 @@ async def paddle_direct(session_id: str):
     return {"session_id": session_id, **word_result}
 
 
-def _paddle_words_to_grid_cells(
-    word_dicts: List[Dict[str, Any]],
-    img_w: int,
-    img_h: int,
-) -> tuple:
-    """Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
-
-    Groups words into rows (Y-proximity), then builds ONE cell per row
-    with all words as word_boxes — matching the format of _build_cells()
-    in cv_words_first.py. This gives OverlayReconstruction a row-spanning
-    bbox_pct for correct font sizing and per-word positions for placement.
-
-    Returns (cells, columns_meta) in the same format as build_grid_from_words.
-    """
-    if not word_dicts:
-        return [], []
-
-    # Sort by top then left
-    sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"]))
-
-    # Compute median word height for row clustering threshold
-    heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
-    median_h = sorted(heights)[len(heights) // 2] if heights else 30
-    row_threshold = max(median_h * 0.5, 8)
-
-    # Cluster into rows
-    rows: List[List[Dict]] = []
-    current_row: List[Dict] = []
-    current_y = -9999.0
-
-    for w in sorted_words:
-        center_y = w["top"] + w["height"] / 2
-        if current_row and abs(center_y - current_y) > row_threshold:
-            rows.append(current_row)
-            current_row = []
-        current_row.append(w)
-        # Running average Y center for the row
-        current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row)
-
-    if current_row:
-        rows.append(current_row)
-
-    # Build ONE cell per row (all words in reading order, word_boxes for positioning)
-    cells: List[Dict[str, Any]] = []
-
-    for row_idx, row_words in enumerate(rows):
-        row_words.sort(key=lambda w: w["left"])
-
-        # Tight bbox spanning all words in this row
-        x_min = min(w["left"] for w in row_words)
-        y_min = min(w["top"] for w in row_words)
-        x_max = max(w["left"] + w["width"] for w in row_words)
-        y_max = max(w["top"] + w["height"] for w in row_words)
-        bw = x_max - x_min
-        bh = y_max - y_min
-
-        # Text: all words joined by space
-        text = " ".join(w.get("text", "").strip() for w in row_words if w.get("text", "").strip())
-
-        # Average confidence
-        confs = []
-        for w in row_words:
-            c = w.get("confidence", 0)
-            if isinstance(c, float) and c <= 1.0:
-                c = c * 100
-            confs.append(c)
-        avg_conf = sum(confs) / len(confs) if confs else 0.0
-
-        # Per-word boxes with absolute pixel coordinates
-        word_boxes = []
-        for w in row_words:
-            raw_text = w.get("text", "").strip()
-            if not raw_text:
-                continue
-            c = w.get("confidence", 0)
-            if isinstance(c, float) and c <= 1.0:
-                c = c * 100
-            word_boxes.append({
-                "text": raw_text,
-                "left": w["left"],
-                "top": w["top"],
-                "width": w["width"],
-                "height": w["height"],
-                "conf": round(c, 1),
-            })
-
-        cell = {
-            "cell_id": f"PD_R{row_idx:02d}_C0",
-            "row_index": row_idx,
-            "col_index": 0,
-            "col_type": "column_text",
-            "text": text,
-            "confidence": round(avg_conf, 1),
-            "zone_index": 0,
-            "ocr_engine": "paddle_direct",
-            "is_bold": False,
-            "word_boxes": word_boxes,
-            "bbox_px": {"x": x_min, "y": y_min, "w": bw, "h": bh},
-            "bbox_pct": {
-                "x": round(x_min / img_w * 100, 2) if img_w else 0,
-                "y": round(y_min / img_h * 100, 2) if img_h else 0,
-                "w": round(bw / img_w * 100, 2) if img_w else 0,
-                "h": round(bh / img_h * 100, 2) if img_h else 0,
-            },
-        }
-        cells.append(cell)
-
-    # Single full-page pseudo-column (all rows belong to column 0)
-    columns_meta = [{
-        "type": "column_text",
-        "x": 0,
-        "y": 0,
-        "width": img_w,
-        "height": img_h,
-        "classification_confidence": 1.0,
-        "classification_method": "paddle_direct",
-    }]
-
-    return cells, columns_meta
-
-
 class WordGroundTruthRequest(BaseModel):
     is_correct: bool
     corrected_entries: Optional[List[Dict[str, Any]]] = None