feat: Paddle Direct — 1-click OCR without deskew/dewarp/crop
Some checks failed
CI / go-lint (push) Has been cancelled
CI / python-lint (push) Has been cancelled
CI / nodejs-lint (push) Has been cancelled
CI / test-go-school (push) Has been cancelled
CI / test-go-edu-search (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been cancelled
CI / python-lint (push) Has been cancelled
CI / nodejs-lint (push) Has been cancelled
CI / test-go-school (push) Has been cancelled
CI / test-go-edu-search (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
New 2-step mode (Upload → PaddleOCR+Overlay) alongside the existing 7-step pipeline. Backend endpoint runs PaddleOCR on the original image and clusters words into rows/cells directly. Frontend adds a mode toggle and PaddleDirectStep component. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2509,6 +2509,189 @@ async def _word_stream_generator(
|
||||
yield f"data: {json.dumps(complete_event)}\n\n"
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """One-shot OCR: feed the untouched upload straight to PaddleOCR.

    Bypasses the deskew/dewarp/crop/row steps of the full pipeline and jumps
    directly from the uploaded image to a word grid.  The untouched original
    is persisted under ``cropped_png`` so the OverlayReconstruction view can
    render it as the background image.

    Raises:
        HTTPException 404: no original image stored for the session.
        HTTPException 400: image cannot be decoded, or OCR found no words.
    """
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=404, detail="No original image found for this session")

    decoded = cv2.imdecode(np.frombuffer(original_png, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")
    img_h, img_w = decoded.shape[:2]

    # Imported lazily so the heavy Paddle stack only loads when this mode is used.
    from cv_ocr_engines import ocr_region_paddle

    started = time.time()
    words = await ocr_region_paddle(decoded, region=None)
    if not words:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    cells, columns_meta = _paddle_words_to_grid_cells(words, img_w, img_h)
    elapsed = time.time() - started

    row_count = len({c["row_index"] for c in cells}) if cells else 0
    col_count = len(columns_meta)
    non_empty = sum(1 for c in cells if c.get("text"))

    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": row_count,
            "cols": col_count,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(elapsed, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": non_empty,
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Storing the original as cropped_png keeps the overlay UI working
    # without a real crop step having run.
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=original_png,
        current_step=8,
    )

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), row_count, col_count, elapsed,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": non_empty,
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(elapsed * 1000))

    return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
def _paddle_words_to_grid_cells(
|
||||
word_dicts: List[Dict[str, Any]],
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
) -> tuple:
|
||||
"""Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
|
||||
|
||||
1. Sort words by (top, left).
|
||||
2. Cluster into rows by Y-proximity (threshold = 50% of median word height).
|
||||
3. Within each row, sort left→right and assign col_index.
|
||||
4. Each word → 1 GridCell with word_boxes and bbox_pct.
|
||||
|
||||
Returns (cells, columns_meta) in the same format as build_grid_from_words.
|
||||
"""
|
||||
if not word_dicts:
|
||||
return [], []
|
||||
|
||||
# Sort by top then left
|
||||
sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"]))
|
||||
|
||||
# Compute median word height for row clustering threshold
|
||||
heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
|
||||
median_h = sorted(heights)[len(heights) // 2] if heights else 30
|
||||
row_threshold = max(median_h * 0.5, 8)
|
||||
|
||||
# Cluster into rows
|
||||
rows: List[List[Dict]] = []
|
||||
current_row: List[Dict] = []
|
||||
current_y = -9999.0
|
||||
|
||||
for w in sorted_words:
|
||||
center_y = w["top"] + w["height"] / 2
|
||||
if current_row and abs(center_y - current_y) > row_threshold:
|
||||
rows.append(current_row)
|
||||
current_row = []
|
||||
current_row.append(w)
|
||||
# Running average Y center for the row
|
||||
current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row)
|
||||
|
||||
if current_row:
|
||||
rows.append(current_row)
|
||||
|
||||
# Sort each row left→right and build cells
|
||||
cells: List[Dict[str, Any]] = []
|
||||
max_col = 0
|
||||
|
||||
for row_idx, row_words in enumerate(rows):
|
||||
row_words.sort(key=lambda w: w["left"])
|
||||
for col_idx, w in enumerate(row_words):
|
||||
left = w["left"]
|
||||
top = w["top"]
|
||||
width = w["width"]
|
||||
height = w["height"]
|
||||
conf = w.get("confidence", 0)
|
||||
if isinstance(conf, float) and conf <= 1.0:
|
||||
conf = conf * 100 # normalize to 0-100
|
||||
|
||||
cell = {
|
||||
"cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
|
||||
"x": left,
|
||||
"y": top,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"text": w.get("text", ""),
|
||||
"confidence": round(conf, 1),
|
||||
"column_index": col_idx,
|
||||
"row_index": row_idx,
|
||||
"zone_index": 0,
|
||||
"ocr_engine": "paddle_direct",
|
||||
"word_boxes": [{
|
||||
"text": w.get("text", ""),
|
||||
"left": left,
|
||||
"top": top,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"confidence": round(conf, 1),
|
||||
}],
|
||||
"bbox_pct": {
|
||||
"x": round(left / img_w * 100, 3),
|
||||
"y": round(top / img_h * 100, 3),
|
||||
"w": round(width / img_w * 100, 3),
|
||||
"h": round(height / img_h * 100, 3),
|
||||
},
|
||||
}
|
||||
cells.append(cell)
|
||||
if col_idx > max_col:
|
||||
max_col = col_idx
|
||||
|
||||
# Build columns_meta — one pseudo-column per column index
|
||||
columns_meta = []
|
||||
for ci in range(max_col + 1):
|
||||
col_cells = [c for c in cells if c["column_index"] == ci]
|
||||
if col_cells:
|
||||
min_x = min(c["x"] for c in col_cells)
|
||||
max_right = max(c["x"] + c["width"] for c in col_cells)
|
||||
columns_meta.append({
|
||||
"type": "column_text",
|
||||
"x": min_x,
|
||||
"y": 0,
|
||||
"width": max_right - min_x,
|
||||
"height": img_h,
|
||||
"classification_confidence": 1.0,
|
||||
"classification_method": "paddle_direct",
|
||||
})
|
||||
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
class WordGroundTruthRequest(BaseModel):
    """Request payload for recording ground truth on a word-grid OCR result."""

    # True when the user confirms the OCR output is correct as-is.
    is_correct: bool
    # Replacement entries supplied when the result was wrong; entry schema is
    # defined by the caller — presumably the frontend editor. TODO confirm.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
Reference in New Issue
Block a user