feat: Paddle Direct — 1-click OCR without deskew/dewarp/crop
Some checks failed
CI / go-lint (push) Has been cancelled
CI / python-lint (push) Has been cancelled
CI / nodejs-lint (push) Has been cancelled
CI / test-go-school (push) Has been cancelled
CI / test-go-edu-search (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been cancelled
CI / python-lint (push) Has been cancelled
CI / nodejs-lint (push) Has been cancelled
CI / test-go-school (push) Has been cancelled
CI / test-go-edu-search (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
New 2-step mode (Upload → PaddleOCR+Overlay) alongside the existing 7-step pipeline. Backend endpoint runs PaddleOCR on the original image and clusters words into rows/cells directly. Frontend adds a mode toggle and PaddleDirectStep component. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2509,6 +2509,189 @@ async def _word_stream_generator(
|
||||
yield f"data: {json.dumps(complete_event)}\n\n"
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """One-shot OCR: feed the untouched upload straight to PaddleOCR.

    Bypasses the deskew/dewarp/crop/row steps of the full pipeline and jumps
    directly from the uploaded image to a word grid.  The untouched original
    is persisted under ``cropped_png`` so the OverlayReconstruction view can
    render it as the background image.

    Raises:
        HTTPException 404: no original image stored for the session.
        HTTPException 400: image cannot be decoded, or OCR found no words.
    """
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=404, detail="No original image found for this session")

    decoded = cv2.imdecode(np.frombuffer(original_png, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")
    img_h, img_w = decoded.shape[:2]

    # Imported lazily so the heavy Paddle stack only loads when this mode is used.
    from cv_ocr_engines import ocr_region_paddle

    started = time.time()
    words = await ocr_region_paddle(decoded, region=None)
    if not words:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    cells, columns_meta = _paddle_words_to_grid_cells(words, img_w, img_h)
    elapsed = time.time() - started

    row_count = len({c["row_index"] for c in cells}) if cells else 0
    col_count = len(columns_meta)
    non_empty = sum(1 for c in cells if c.get("text"))

    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": row_count,
            "cols": col_count,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(elapsed, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": non_empty,
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Storing the original as cropped_png keeps the overlay UI working
    # without a real crop step having run.
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=original_png,
        current_step=8,
    )

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), row_count, col_count, elapsed,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": non_empty,
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(elapsed * 1000))

    return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
def _paddle_words_to_grid_cells(
|
||||
word_dicts: List[Dict[str, Any]],
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
) -> tuple:
|
||||
"""Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
|
||||
|
||||
1. Sort words by (top, left).
|
||||
2. Cluster into rows by Y-proximity (threshold = 50% of median word height).
|
||||
3. Within each row, sort left→right and assign col_index.
|
||||
4. Each word → 1 GridCell with word_boxes and bbox_pct.
|
||||
|
||||
Returns (cells, columns_meta) in the same format as build_grid_from_words.
|
||||
"""
|
||||
if not word_dicts:
|
||||
return [], []
|
||||
|
||||
# Sort by top then left
|
||||
sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"]))
|
||||
|
||||
# Compute median word height for row clustering threshold
|
||||
heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
|
||||
median_h = sorted(heights)[len(heights) // 2] if heights else 30
|
||||
row_threshold = max(median_h * 0.5, 8)
|
||||
|
||||
# Cluster into rows
|
||||
rows: List[List[Dict]] = []
|
||||
current_row: List[Dict] = []
|
||||
current_y = -9999.0
|
||||
|
||||
for w in sorted_words:
|
||||
center_y = w["top"] + w["height"] / 2
|
||||
if current_row and abs(center_y - current_y) > row_threshold:
|
||||
rows.append(current_row)
|
||||
current_row = []
|
||||
current_row.append(w)
|
||||
# Running average Y center for the row
|
||||
current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row)
|
||||
|
||||
if current_row:
|
||||
rows.append(current_row)
|
||||
|
||||
# Sort each row left→right and build cells
|
||||
cells: List[Dict[str, Any]] = []
|
||||
max_col = 0
|
||||
|
||||
for row_idx, row_words in enumerate(rows):
|
||||
row_words.sort(key=lambda w: w["left"])
|
||||
for col_idx, w in enumerate(row_words):
|
||||
left = w["left"]
|
||||
top = w["top"]
|
||||
width = w["width"]
|
||||
height = w["height"]
|
||||
conf = w.get("confidence", 0)
|
||||
if isinstance(conf, float) and conf <= 1.0:
|
||||
conf = conf * 100 # normalize to 0-100
|
||||
|
||||
cell = {
|
||||
"cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
|
||||
"x": left,
|
||||
"y": top,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"text": w.get("text", ""),
|
||||
"confidence": round(conf, 1),
|
||||
"column_index": col_idx,
|
||||
"row_index": row_idx,
|
||||
"zone_index": 0,
|
||||
"ocr_engine": "paddle_direct",
|
||||
"word_boxes": [{
|
||||
"text": w.get("text", ""),
|
||||
"left": left,
|
||||
"top": top,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"confidence": round(conf, 1),
|
||||
}],
|
||||
"bbox_pct": {
|
||||
"x": round(left / img_w * 100, 3),
|
||||
"y": round(top / img_h * 100, 3),
|
||||
"w": round(width / img_w * 100, 3),
|
||||
"h": round(height / img_h * 100, 3),
|
||||
},
|
||||
}
|
||||
cells.append(cell)
|
||||
if col_idx > max_col:
|
||||
max_col = col_idx
|
||||
|
||||
# Build columns_meta — one pseudo-column per column index
|
||||
columns_meta = []
|
||||
for ci in range(max_col + 1):
|
||||
col_cells = [c for c in cells if c["column_index"] == ci]
|
||||
if col_cells:
|
||||
min_x = min(c["x"] for c in col_cells)
|
||||
max_right = max(c["x"] + c["width"] for c in col_cells)
|
||||
columns_meta.append({
|
||||
"type": "column_text",
|
||||
"x": min_x,
|
||||
"y": 0,
|
||||
"width": max_right - min_x,
|
||||
"height": img_h,
|
||||
"classification_confidence": 1.0,
|
||||
"classification_method": "paddle_direct",
|
||||
})
|
||||
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
class WordGroundTruthRequest(BaseModel):
    """Request payload for recording ground truth on a word-grid OCR result."""

    # True when the user confirms the OCR output is correct as-is.
    is_correct: bool
    # Replacement entries supplied when the result was wrong; entry schema is
    # defined by the caller — presumably the frontend editor. TODO confirm.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
Reference in New Issue
Block a user