"""
OCR Pipeline Words Detect — main word detection endpoint (Step 7).

Extracted from ocr_pipeline_words.py. Contains the ``detect_words``
endpoint which handles both v2 and words_first grid methods.

License: Apache 2.0
PRIVACY: All processing happens locally.
"""

import json
import logging
import time
from typing import Any, Dict, List

import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse

from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    create_ocr_image,
    detect_column_geometry,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _append_pipeline_log,
)
from ocr_pipeline_words_stream import (
    _word_batch_stream_generator,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build word grid from columns x rows, OCR each cell.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american'
        stream: false (default) for JSON response, true for SSE streaming
        skip_heal_gaps: false (default). When true, cells keep exact row geometry.
        grid_method: 'v2' (default) or 'words_first'

    Notes:
        - ``engine='paddle'`` always forces ``grid_method='words_first'``.
        - ``stream`` is only evaluated for the v2 grid method; the
          words_first path returns a plain JSON payload.
        - words_first does not require a stored row detection result.

    Raises:
        HTTPException: 400 when no cropped/dewarped image is cached, 404 when
            the session is not found in the DB, 400 when the v2 grid method
            is requested without a row detection result.
    """
    # PaddleOCR is full-page remote OCR -> force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info(
            "detect_words: engine=paddle requires words_first, overriding grid_method=%s",
            grid_method,
        )
        grid_method = "words_first"

    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one. An explicit
    # ``is None`` test is used because the values are image arrays.
    dewarped_bgr = cached.get("cropped_bgr")
    if dewarped_bgr is None:
        dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning(
            "detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
            session_id,
            [k for k in cached.keys() if k.endswith('_bgr')],
        )
        raise HTTPException(
            status_code=400,
            detail="Crop or dewarp must be completed before word detection",
        )

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")

    if not column_result or not column_result.get("columns"):
        # No column detection available: treat the whole page as one text column.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0,
                "y": 0,
                "width": img_w_tmp,
                "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info(
            "detect_words: no column_result -- using full-page pseudo-column %dx%d",
            img_w_tmp,
            img_h_tmp,
        )

    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"],
            y=c["y"],
            width=c["width"],
            height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects.
    # The guard above lets words_first through without a row result, so
    # row_result may be None (or lack "rows") here; use an empty row list in
    # that case instead of failing on the subscript.
    row_dicts = (row_result or {}).get("rows") or []
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"],
            y=r["y"],
            width=r["width"],
            height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in row_dicts
    ]

    # Populate word counts from cached words
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        for row in row_geoms:
            # Row y is shifted by top_y before comparing with word 'top'
            # values; a word belongs to the row containing its vertical centre.
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            # Shrink the box vertically by its border thickness (at least 5).
            bt = max(box.get("border_thickness", 0), 5)
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            """Return True when the row's vertical centre lies inside any box."""
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info("detect_words: excluded %d rows inside box zones", excluded)

    # --- Words-First path ---
    if grid_method == "words_first":
        return await _words_first_path(
            session_id, cached, dewarped_bgr, engine, pronunciation, zones,
        )

    if stream:
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    return await _v2_path(
        session_id, cached, col_regions, row_geoms,
        dewarped_bgr, engine, pronunciation, skip_heal_gaps,
    )


async def _words_first_path(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: list,
) -> dict:
    """Words-first grid construction path.

    Obtains word boxes (from PaddleOCR when ``engine == 'paddle'``, otherwise
    from the cached/recomputed ``_word_dicts``), builds the grid with
    ``build_grid_from_words``, stores the result in the session DB and in
    ``cached['word_result']``, and appends a pipeline log entry.

    Args:
        session_id: Session whose result is persisted.
        cached: Per-session cache dict; read and updated in place.
        dewarped_bgr: Page image; only its first two shape dimensions
            (height, width) are read here before it is passed on to OCR.
        engine: OCR engine name; 'paddle' selects the PaddleOCR branch.
        pronunciation: Passed through to the phonetic fix-up helpers.
        zones: Zone dicts; those with ``zone_type == 'box'`` and a ``box``
            entry are forwarded as ``box_rects``.

    Returns:
        The ``word_result`` dict with ``session_id`` added.

    Raises:
        HTTPException: 400 when no words are available.
    """
    t0 = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    if engine == "paddle":
        # Imported lazily, only when the paddle engine is actually requested.
        from cv_ocr_engines import ocr_region_paddle

        wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
        cached["_paddle_word_dicts"] = wf_word_dicts
    else:
        wf_word_dicts = cached.get("_word_dicts")
        if wf_word_dicts is None:
            ocr_img_tmp = create_ocr_image(dewarped_bgr)
            geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
            if geo_result is not None:
                _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                cached["_word_dicts"] = wf_word_dicts
                cached["_inv"] = inv
                cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if not wf_word_dicts:
        raise HTTPException(
            status_code=400,
            detail="No words detected -- cannot build words-first grid",
        )

    # Convert word coordinates to absolute if needed
    if engine != "paddle":
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            lx, _rx, ty, _by = content_bounds
            # Build new dicts so the cached word dicts are not mutated.
            wf_word_dicts = [
                {**w, 'left': w['left'] + lx, 'top': w['top'] + ty}
                for w in wf_word_dicts
            ]

    box_rects = [
        zone["box"]
        for zone in zones
        if zone.get("zone_type") == "box" and zone.get("box")
    ]

    cells, columns_meta = build_grid_from_words(
        wf_word_dicts, img_w, img_h,
        box_rects=box_rects or None,
    )
    # Duration covers OCR + grid construction, not the post-processing below.
    duration = time.time() - t0

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    n_rows = len({c['row_index'] for c in cells})
    n_cols = len(columns_meta)
    used_engine = "paddle" if engine == "paddle" else "words_first"

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "grid_method": "words_first",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    if is_vocab or 'column_text' in col_types:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline: words-first session %s: %d cells (%.2fs), %d rows, %d cols",
        session_id, len(cells), duration, n_rows, n_cols,
    )

    await _append_pipeline_log(session_id, "words", {
        "grid_method": "words_first",
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}


async def _v2_path(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    skip_heal_gaps: bool,
) -> dict:
    """Cell-First OCR v2 non-streaming path.

    Builds the cell grid with ``build_cell_grid_v2`` from the given columns
    and rows, stores the result in the session DB and in
    ``cached['word_result']``, and appends a pipeline log entry.

    Args:
        session_id: Session whose result is persisted.
        cached: Per-session cache dict; ``word_result`` is written to it.
        col_regions: Column regions forwarded to the grid builder.
        row_geoms: Row geometries forwarded to the grid builder; rows with
            ``row_type == 'content'`` determine the reported row count.
        dewarped_bgr: Page image used for OCR.
        engine: Requested OCR engine, forwarded as ``ocr_engine``.
        pronunciation: Passed through to the phonetic fix-up helpers.
        skip_heal_gaps: Forwarded unchanged to ``build_cell_grid_v2``.

    Returns:
        The ``word_result`` dict with ``session_id`` added.
    """
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    # Duration covers grid construction/OCR only, not the post-processing below.
    duration = time.time() - t0

    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    n_content_rows = sum(1 for r in row_geoms if r.row_type == 'content')
    n_cols = len(columns_meta)
    # Report the engine recorded on the first cell; with no cells, echo the
    # requested engine.
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(
        "OCR Pipeline: words session %s: layout=%s, %d cells (%.2fs), summary: %s",
        session_id, word_result['layout'], len(cells), duration, word_result['summary'],
    )

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}