From 8349c28f54ba2bd2c529ddb008b36b571cb7cebf Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 17:19:52 +0100 Subject: [PATCH] fix: paddle_direct reuses build_grid_from_words for correct overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces custom _paddle_words_to_grid_cells with the proven build_grid_from_words from cv_words_first.py — same function the regular pipeline uses with PaddleOCR. Handles phrase splitting, column clustering, and produces cells with word_boxes that the slide/cluster positioning hooks expect. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/ocr_pipeline_api.py | 134 ++------------------ 1 file changed, 11 insertions(+), 123 deletions(-) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 39cb199..f19bcf8 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -2541,11 +2541,20 @@ async def paddle_direct(session_id: str): if not word_dicts: raise HTTPException(status_code=400, detail="PaddleOCR returned no words") - cells, columns_meta = _paddle_words_to_grid_cells(word_dicts, img_w, img_h) + # Reuse build_grid_from_words — same function that works in the regular + # pipeline with PaddleOCR (engine=paddle, grid_method=words_first). + # Handles phrase splitting, column clustering, and reading order. + cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h) duration = time.time() - t0 + # Tag cells as paddle_direct + for cell in cells: + cell["ocr_engine"] = "paddle_direct" + n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 n_cols = len(columns_meta) + col_types = {c.get("type") for c in columns_meta} + is_vocab = bool(col_types & {"column_en", "column_de"}) word_result = { "cells": cells, @@ -2555,7 +2564,7 @@ async def paddle_direct(session_id: str): "total_cells": len(cells), }, "columns_used": columns_meta, - "layout": "generic", + "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), @@ -2590,127 +2599,6 @@ async def paddle_direct(session_id: str): return {"session_id": session_id, **word_result} -def _paddle_words_to_grid_cells( - word_dicts: List[Dict[str, Any]], - img_w: int, - img_h: int, -) -> tuple: - """Convert PaddleOCR word dicts into GridCell dicts + columns_meta. - - Groups words into rows (Y-proximity), then builds ONE cell per row - with all words as word_boxes — matching the format of _build_cells() - in cv_words_first.py. This gives OverlayReconstruction a row-spanning - bbox_pct for correct font sizing and per-word positions for placement. - - Returns (cells, columns_meta) in the same format as build_grid_from_words. - """ - if not word_dicts: - return [], [] - - # Sort by top then left - sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"])) - - # Compute median word height for row clustering threshold - heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0] - median_h = sorted(heights)[len(heights) // 2] if heights else 30 - row_threshold = max(median_h * 0.5, 8) - - # Cluster into rows - rows: List[List[Dict]] = [] - current_row: List[Dict] = [] - current_y = -9999.0 - - for w in sorted_words: - center_y = w["top"] + w["height"] / 2 - if current_row and abs(center_y - current_y) > row_threshold: - rows.append(current_row) - current_row = [] - current_row.append(w) - # Running average Y center for the row - current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row) - - if current_row: - rows.append(current_row) - - # Build ONE cell per row (all words in reading order, word_boxes for positioning) - cells: List[Dict[str, Any]] = [] - - for row_idx, row_words in enumerate(rows): - row_words.sort(key=lambda w: w["left"]) - - # Tight bbox spanning all words in this row - x_min = min(w["left"] for w in row_words) - y_min = min(w["top"] for w in row_words) - x_max = max(w["left"] + w["width"] for w in row_words) - y_max = max(w["top"] + w["height"] for w in row_words) - bw = x_max - x_min - bh = y_max - y_min - - # Text: all words joined by space - text = " ".join(w.get("text", "").strip() for w in row_words if w.get("text", "").strip()) - - # Average confidence - confs = [] - for w in row_words: - c = w.get("confidence", 0) - if isinstance(c, float) and c <= 1.0: - c = c * 100 - confs.append(c) - avg_conf = sum(confs) / len(confs) if confs else 0.0 - - # Per-word boxes with absolute pixel coordinates - word_boxes = [] - for w in row_words: - raw_text = w.get("text", "").strip() - if not raw_text: - continue - c = w.get("confidence", 0) - if isinstance(c, float) and c <= 1.0: - c = c * 100 - word_boxes.append({ - "text": raw_text, - "left": w["left"], - "top": w["top"], - "width": w["width"], - "height": w["height"], - "conf": round(c, 1), - }) - - cell = { - "cell_id": f"PD_R{row_idx:02d}_C0", - "row_index": row_idx, - "col_index": 0, - "col_type": "column_text", - "text": text, - "confidence": round(avg_conf, 1), - "zone_index": 0, - "ocr_engine": "paddle_direct", - "is_bold": False, - "word_boxes": word_boxes, - "bbox_px": {"x": x_min, "y": y_min, "w": bw, "h": bh}, - "bbox_pct": { - "x": round(x_min / img_w * 100, 2) if img_w else 0, - "y": round(y_min / img_h * 100, 2) if img_h else 0, - "w": round(bw / img_w * 100, 2) if img_w else 0, - "h": round(bh / img_h * 100, 2) if img_h else 0, - }, - } - cells.append(cell) - - # Single full-page pseudo-column (all rows belong to column 0) - columns_meta = [{ - "type": "column_text", - "x": 0, - "y": 0, - "width": img_w, - "height": img_h, - "classification_confidence": 1.0, - "classification_method": "paddle_direct", - }] - - return cells, columns_meta - - class WordGroundTruthRequest(BaseModel): is_correct: bool corrected_entries: Optional[List[Dict[str, Any]]] = None