"""OCR Pipeline Words - word detection and ground truth endpoints.

Extracted from ocr_pipeline_api.py.

Handles:
- POST /sessions/{session_id}/words — main SSE streaming word detection
- POST /sessions/{session_id}/paddle-direct — PaddleOCR direct endpoint
- POST /sessions/{session_id}/ground-truth/words — save ground truth
- GET /sessions/{session_id}/ground-truth/words — get ground truth

License: Apache 2.0
PRIVACY: All processing happens locally.
"""

import asyncio
import json
import logging
import time
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_character_confusion,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    build_cell_grid_v2_streaming,
    create_ocr_image,
    detect_column_geometry,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
    get_session_db,
    get_session_image,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _get_base_image_png,
    _append_pipeline_log,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])

# Region types that are not counted as grid columns when the streaming
# generators pre-compute the grid shape for the initial "meta" event.
_SKIP_COLUMN_TYPES = frozenset({
    'column_ignore', 'header', 'footer',
    'margin_top', 'margin_bottom', 'margin_left', 'margin_right',
})

# A grid containing at least one of these column types is reported as
# layout "vocab"; everything else is "generic".
_VOCAB_COLUMN_TYPES = frozenset({'column_en', 'column_de'})


# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------

class WordGroundTruthRequest(BaseModel):
    """Request body for saving ground truth feedback on the word step."""

    is_correct: bool
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    notes: Optional[str] = None


# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

def _sse(event: Dict[str, Any]) -> str:
    """Serialize *event* as one Server-Sent-Events ``data:`` frame."""
    return f"data: {json.dumps(event)}\n\n"


def _box_rects_from_zones(zones: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the ``box`` dict of every zone whose ``zone_type`` is ``box``."""
    return [
        zone["box"]
        for zone in zones
        if zone.get("zone_type") == "box" and zone.get("box")
    ]


def _ensure_word_dicts(
    cached: Dict[str, Any],
    image_bgr: np.ndarray,
) -> Optional[List[Dict[str, Any]]]:
    """Return cached full-page word boxes, detecting them on a cache miss.

    On a miss ``detect_column_geometry`` is run on the image; when it yields
    a result, the word list, ``inv`` image and content bounds are stored in
    *cached* under ``_word_dicts``, ``_inv`` and ``_content_bounds``.

    Returns:
        The word dict list, or ``None`` when nothing is cached and
        ``detect_column_geometry`` returned ``None``.
    """
    word_dicts = cached.get("_word_dicts")
    if word_dicts is not None:
        return word_dicts

    geo_result = detect_column_geometry(create_ocr_image(image_bgr), image_bgr)
    if geo_result is None:
        return None

    _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
    cached["_word_dicts"] = word_dicts
    cached["_inv"] = inv
    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
    return word_dicts


def _planned_grid(
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
) -> Tuple[int, int, Set[str], bool]:
    """Compute the expected grid shape before any OCR has run.

    Returns:
        ``(n_content_rows, n_cols, col_types, is_vocab)`` where rows are
        those with ``row_type == 'content'`` and columns are the regions
        whose type is not in ``_SKIP_COLUMN_TYPES``.
    """
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    grid_cols = [c for c in col_regions if c.type not in _SKIP_COLUMN_TYPES]
    col_types = {c.type for c in grid_cols}
    is_vocab = bool(col_types & _VOCAB_COLUMN_TYPES)
    return n_content_rows, len(grid_cols), col_types, is_vocab


def _build_word_result(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
    *,
    n_rows: int,
    n_cols: int,
    is_vocab: bool,
    img_w: int,
    img_h: int,
    duration: float,
    used_engine: str,
    grid_method: Optional[str] = None,
) -> Dict[str, Any]:
    """Assemble the ``word_result`` dict that is persisted and returned.

    ``grid_method`` is only included when given; it is inserted between
    ``ocr_engine`` and ``summary`` so the key order matches the payloads
    the endpoints produced before this helper existed.
    """
    word_result: Dict[str, Any] = {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if grid_method is not None:
        word_result["grid_method"] = grid_method
    word_result["summary"] = {
        "total_cells": len(cells),
        "non_empty_cells": sum(1 for c in cells if c.get("text")),
        # confidence 0 is excluded: only cells with 0 < confidence < 50 count.
        "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
    }
    return word_result


def _attach_vocab_entries(
    word_result: Dict[str, Any],
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
    pronunciation: str,
    *,
    fix_confusion: bool = False,
) -> List[Dict[str, Any]]:
    """Convert cells to vocab entries and record them on *word_result*.

    Args:
        fix_confusion: also run ``_fix_character_confusion`` on the entries
            (only the per-cell streaming path does this).

    Returns:
        The entry list that was stored under ``vocab_entries``/``entries``.
    """
    entries = _cells_to_vocab_entries(cells, columns_meta)
    if fix_confusion:
        entries = _fix_character_confusion(entries)
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    word_result["vocab_entries"] = entries
    word_result["entries"] = entries
    word_result["entry_count"] = len(entries)
    summary = word_result["summary"]
    summary["total_entries"] = len(entries)
    summary["with_english"] = sum(1 for e in entries if e.get("english"))
    summary["with_german"] = sum(1 for e in entries if e.get("german"))
    return entries


# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build word grid from columns × rows, OCR each cell.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming.
            Ignored when the words-first path is taken (it always returns JSON).
        skip_heal_gaps: false (default). When true, cells keep exact row geometry
            positions without gap-healing expansion. Better for overlay rendering.
        grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
            'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection needed).

    Raises:
        HTTPException 400: no cropped/dewarped image is cached, row detection
            is missing for grid_method='v2', or no words were detected for
            the words-first grid.
        HTTPException 404: the session does not exist in the DB.
    """
    # PaddleOCR is full-page remote OCR → force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"

    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr")
    if dewarped_bgr is None:
        dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        # No column detection — synthesize a single full-page pseudo-column.
        # This enables the overlay pipeline which skips column detection.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0,
                "y": 0,
                "width": img_w_tmp,
                "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects.
    # row_result may legitimately be missing for grid_method='words_first'
    # (the guard above only enforces it for 'v2'), so tolerate None / a
    # missing "rows" key instead of crashing on the subscript.
    row_dicts = (row_result or {}).get("rows") or []
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in row_dicts
    ]

    # Cell-First OCR (v2): no full-page word re-population needed.
    # Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
    # We still need word_count > 0 for row filtering in build_cell_grid_v2,
    # so populate from cached words if available (just for counting).
    word_dicts = _ensure_word_dicts(cached, dewarped_bgr)
    if word_dicts:
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        for row in row_geoms:
            # Row span relative to top_y; a word belongs to the row that
            # contains its vertical centre.
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones.
    # Use inner box range (shrunk by border_thickness) so that rows at
    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for box in _box_rects_from_zones(zones):
        bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
        box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info("detect_words: excluded %d rows inside box zones", excluded)

    # --- Words-First path: bottom-up grid from word boxes ---
    if grid_method == "words_first":
        return await _detect_words_first(
            session_id, cached, dewarped_bgr, engine, pronunciation, zones,
        )

    if stream:
        # Cell-First OCR v2: use batch-then-stream approach instead of
        # per-cell streaming. The parallel ThreadPoolExecutor in
        # build_cell_grid_v2 is much faster than sequential streaming.
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    t0 = time.time()

    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0

    # Add zone_index to each cell (default 0 for backward compatibility)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    # Layout detection
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & _VOCAB_COLUMN_TYPES)

    # Count content rows and columns for grid_shape
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len(columns_meta)

    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    word_result = _build_word_result(
        cells, columns_meta,
        n_rows=n_content_rows, n_cols=n_cols, is_vocab=is_vocab,
        img_w=img_w, img_h=img_h, duration=duration, used_engine=used_engine,
    )

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    if is_vocab or 'column_text' in col_types:
        _attach_vocab_entries(word_result, cells, columns_meta, pronunciation)

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline: words session %s: layout=%s, %d cells (%.2fs), summary: %s",
        session_id, word_result['layout'], len(cells), duration, word_result['summary'],
    )

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **word_result,
    }


async def _detect_words_first(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Words-first branch of ``detect_words``: cluster word boxes into a grid.

    Word boxes come from remote PaddleOCR when ``engine == "paddle"``,
    otherwise from the cached / freshly detected full-page word list. The
    result is persisted, cached, logged to the pipeline log and returned.

    Raises:
        HTTPException 400: no word boxes are available.
    """
    t0 = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
    if engine == "paddle":
        from cv_ocr_engines import ocr_region_paddle

        wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
        # PaddleOCR returns absolute coordinates, no content_bounds offset needed
        cached["_paddle_word_dicts"] = wf_word_dicts
    else:
        # Get word_dicts from cache or run Tesseract full-page
        wf_word_dicts = _ensure_word_dicts(cached, dewarped_bgr)

    if not wf_word_dicts:
        raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")

    # Convert word coordinates to absolute image coordinates if needed
    # (detect_column_geometry returns words relative to content ROI)
    # PaddleOCR already returns absolute coordinates — skip offset.
    if engine != "paddle":
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            lx, _rx, ty, _by = content_bounds
            # Build new dicts so the cached word list is not mutated.
            wf_word_dicts = [
                {**w, 'left': w['left'] + lx, 'top': w['top'] + ty}
                for w in wf_word_dicts
            ]

    # Extract box rects for box-aware column clustering
    box_rects = _box_rects_from_zones(zones)

    cells, columns_meta = build_grid_from_words(
        wf_word_dicts, img_w, img_h,
        box_rects=box_rects or None,
    )
    duration = time.time() - t0

    # Apply IPA phonetic fixes
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # Add zone_index for backward compat
    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & _VOCAB_COLUMN_TYPES)
    n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    used_engine = "paddle" if engine == "paddle" else "words_first"

    word_result = _build_word_result(
        cells, columns_meta,
        n_rows=n_rows, n_cols=n_cols, is_vocab=is_vocab,
        img_w=img_w, img_h=img_h, duration=duration, used_engine=used_engine,
        grid_method="words_first",
    )

    if is_vocab or 'column_text' in col_types:
        _attach_vocab_entries(word_result, cells, columns_meta, pronunciation)

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline: words-first session %s: %d cells (%.2fs), %d rows, %d cols",
        session_id, len(cells), duration, n_rows, n_cols,
    )

    await _append_pipeline_log(session_id, "words", {
        "grid_method": "words_first",
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}


async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR (parallel) then streams results.

    Unlike the old per-cell streaming, this uses build_cell_grid_v2 with
    ThreadPoolExecutor for parallel OCR, then emits all cells as SSE events.
    The 'preparing' event keeps the connection alive during OCR processing.

    Event order: ``meta``, ``preparing``, zero or more ``keepalive``,
    ``columns`` (if any), one ``cell`` per cell, ``complete``. The generator
    stops early, without persisting, when the client disconnects.
    """
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    n_content_rows, n_cols, col_types, is_vocab = _planned_grid(col_regions, row_geoms)
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    yield _sse({
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    })

    # 2. Send preparing event (keepalive for proxy)
    yield _sse({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})

    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    # connections after 30-60s. Send keepalive every 5s to prevent this.
    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine; get_event_loop() is deprecated for that use.
    loop = asyncio.get_running_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    # Send keepalive events every 5 seconds while OCR runs
    while not ocr_future.done():
        try:
            # shield() keeps the timeout from cancelling the OCR future itself.
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            elapsed = int(time.time() - t0)
            yield _sse({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})
            if await request.is_disconnected():
                logger.info("SSE batch: client disconnected during OCR for %s", session_id)
                ocr_future.cancel()
                return
    else:
        # The future completed between a timeout and the next loop check.
        cells, columns_meta = ocr_future.result()

    if await request.is_disconnected():
        logger.info("SSE batch: client disconnected after OCR for %s", session_id)
        return

    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield _sse({'type': 'columns', 'columns_used': columns_meta})

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        yield _sse({
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        })

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = _build_word_result(
        cells, columns_meta,
        n_rows=n_content_rows, n_cols=n_cols, is_vocab=is_vocab,
        img_w=img_w, img_h=img_h, duration=duration, used_engine=used_engine,
    )

    vocab_entries = None
    if is_vocab or 'column_text' in col_types:
        vocab_entries = _attach_vocab_entries(word_result, cells, columns_meta, pronunciation)

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline SSE batch: words session %s: layout=%s, %d cells (%.2fs)",
        session_id, word_result['layout'], len(cells), duration,
    )

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield _sse(complete_event)


async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Cells come from ``build_cell_grid_v2_streaming``; after the last cell,
    rows without any text are dropped, the result is persisted and a
    ``complete`` event is emitted. Stops early, without persisting, when the
    client disconnects.
    """
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Compute grid shape and layout upfront for the meta event
    n_content_rows, n_cols, col_types, is_vocab = _planned_grid(col_regions, row_geoms)

    # Start streaming — first event: meta
    columns_meta = None  # will be set from first yield
    total_cells = n_content_rows * n_cols
    yield _sse({
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    })

    # Keepalive: send preparing event so proxy doesn't timeout during OCR init
    yield _sse({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})

    # Stream cells one by one
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0

    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info("SSE: client disconnected during streaming for %s", session_id)
            return

        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used once, ahead of the first cell
            yield _sse({
                "type": "columns",
                "columns_used": cols_meta,
            })

        all_cells.append(cell)
        cell_idx += 1

        yield _sse({
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        })

    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info("SSE: removed %d all-empty rows after OCR", empty_rows_removed)

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    # Note: grid_shape.rows keeps the pre-OCR content-row count even when
    # empty rows were removed above; total_cells reflects the filtered list.
    word_result = _build_word_result(
        all_cells, columns_meta,
        n_rows=n_content_rows, n_cols=n_cols, is_vocab=is_vocab,
        img_w=img_w, img_h=img_h, duration=duration, used_engine=used_engine,
    )

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    vocab_entries = None
    if is_vocab or 'column_text' in col_types:
        vocab_entries = _attach_vocab_entries(
            word_result, all_cells, columns_meta, pronunciation,
            fix_confusion=True,
        )

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline SSE: words session %s: layout=%s, %d cells (%.2fs)",
        session_id, word_result['layout'], len(all_cells), duration,
    )

    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield _sse(complete_event)


# ---------------------------------------------------------------------------
# PaddleOCR Direct Endpoint
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly.

    Expects orientation/deskew/dewarp/crop to be done already.
    Uses the cropped image (falls back to dewarped, then original).
    The used image is stored as cropped_png so OverlayReconstruction
    can display it as the background.

    Raises:
        HTTPException 404: no image is stored for the session.
        HTTPException 400: the image cannot be decoded, or PaddleOCR
            returned no words.
    """
    # Try preprocessed images first (crop > dewarp > original)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")

    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_paddle

    t0 = time.time()
    word_dicts = await ocr_region_paddle(img_bgr, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    # Reuse build_grid_from_words — same function that works in the regular
    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
    # Handles phrase splitting, column clustering, and reading order.
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    duration = time.time() - t0

    # Tag cells as paddle_direct
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & _VOCAB_COLUMN_TYPES)

    word_result = _build_word_result(
        cells, columns_meta,
        n_rows=n_rows, n_cols=n_cols, is_vocab=is_vocab,
        img_w=img_w, img_h=img_h, duration=duration,
        used_engine="paddle_direct", grid_method="paddle_direct",
    )

    # Store preprocessed image as cropped_png so OverlayReconstruction shows it
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    # Keep the in-memory cache in step with the DB, as the other word
    # detection paths do; otherwise a cached session keeps a stale result.
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), n_rows, n_cols, duration,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}


# ---------------------------------------------------------------------------
# Ground Truth Words Endpoints
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step.

    The feedback is stored under ``ground_truth["words"]`` together with a
    snapshot of the session's current ``word_result``.

    Raises:
        HTTPException 404: the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        # Naive UTC timestamp in ISO format (no timezone suffix).
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": session.get("word_result"),
    }
    ground_truth["words"] = gt

    await update_session_db(session_id, ground_truth=ground_truth)

    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    return {"session_id": session_id, "ground_truth": gt}


@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition.

    Returns the saved feedback as ``words_gt`` alongside the session's
    current ``word_result`` as ``words_auto``.

    Raises:
        HTTPException 404: the session does not exist, or no word ground
            truth has been saved for it.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    words_gt = ground_truth.get("words")
    if not words_gt:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    return {
        "session_id": session_id,
        "words_gt": words_gt,
        "words_auto": session.get("word_result"),
    }