"""
OCR Merge Kombi Endpoints — paddle-kombi and rapid-kombi endpoints.

Merge helper functions live in ocr_merge_helpers.py.
This module re-exports them for backward compatibility.

License: Apache 2.0
PRIVACY: All processing happens locally.
"""

import logging
import time

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException

from cv_words_first import build_grid_from_words
from ocr_pipeline_common import _cache, _append_pipeline_log
from ocr_pipeline_session_store import get_session_image, update_session_db

# Re-export merge helpers for backward compatibility
from ocr_merge_helpers import (  # noqa: F401
    _split_paddle_multi_words,
    _group_words_into_rows,
    _row_center_y,
    _merge_row_sequences,
    _merge_paddle_tesseract,
    _deduplicate_words,
)

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])

# Tesseract words with a confidence below this value are discarded.
_TESSERACT_MIN_CONF = 20

# Session image variants, tried in this order until one is available.
_IMAGE_VARIANTS = ("cropped", "dewarped", "original")


def _parse_tesseract_conf(raw) -> int:
    """Convert a raw Tesseract confidence value to an int.

    Accepts ints, floats and numeric strings (including fractional ones such
    as ``"96.06"``); fractional values are truncated toward zero.

    Returns:
        The integer confidence, or ``-1`` when *raw* cannot be interpreted
        as a finite number.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None, NaN (ValueError) or infinity (OverflowError).
        return -1


def _run_tesseract_words(img_bgr) -> list:
    """Run Tesseract OCR on an image and return word dicts.

    Args:
        img_bgr: Image array in BGR channel order (it is converted to RGB
            before being handed to Tesseract).

    Returns:
        A list of dicts with the keys ``text``, ``left``, ``top``, ``width``,
        ``height`` and ``conf``. Entries with empty text or a confidence
        below ``_TESSERACT_MIN_CONF`` are skipped.
    """
    from PIL import Image
    import pytesseract

    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img,
        lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )

    # Walk the parallel result columns in lockstep; a missing column yields
    # no rows instead of raising.
    columns = [
        data.get(key, [])
        for key in ("text", "conf", "left", "top", "width", "height")
    ]

    tess_words = []
    for raw_text, raw_conf, left, top, width, height in zip(*columns):
        text = str(raw_text).strip()
        # Confidence may arrive as an int, a float or a (fractional) string;
        # a plain isdigit() check would reject "96.06" and drop the word.
        conf = _parse_tesseract_conf(raw_conf)
        if not text or conf < _TESSERACT_MIN_CONF:
            continue
        tess_words.append({
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "conf": conf,
        })
    return tess_words


def _build_kombi_word_result(
    cells: list,
    columns_meta: list,
    img_w: int,
    img_h: int,
    duration: float,
    engine_name: str,
    raw_engine_words: list,
    raw_engine_words_split: list,
    tess_words: list,
    merged_words: list,
    raw_engine_key: str = "raw_paddle_words",
    raw_split_key: str = "raw_paddle_words_split",
) -> dict:
    """Build the word_result dict for kombi endpoints.

    Args:
        cells: Grid cells; each cell must carry a ``row_index`` key.
        columns_meta: Column descriptors; their ``type`` decides the layout.
        img_w: Image width, stored as ``image_width``.
        img_h: Image height, stored as ``image_height``.
        duration: Processing time in seconds (rounded to two decimals).
        engine_name: Stored as both ``ocr_engine`` and ``grid_method``.
        raw_engine_words: Words as returned by the primary OCR engine.
        raw_engine_words_split: Primary-engine words after splitting.
        tess_words: Words returned by Tesseract.
        merged_words: Words after merging both engines.
        raw_engine_key: Result key under which ``raw_engine_words`` is stored.
        raw_split_key: Result key under which ``raw_engine_words_split`` is
            stored.

    Returns:
        The assembled result dict including a ``summary`` with counts. The
        summary count keys are the raw keys with ``"raw_"`` removed.
    """
    n_rows = len({cell["row_index"] for cell in cells}) if cells else 0
    n_cols = len(columns_meta)

    col_types = {col.get("type") for col in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})

    summary = {
        "total_cells": len(cells),
        "non_empty_cells": sum(1 for cell in cells if cell.get("text")),
        "low_confidence": sum(
            1 for cell in cells if 0 < cell.get("confidence", 0) < 50
        ),
        raw_engine_key.replace("raw_", ""): len(raw_engine_words),
        raw_split_key.replace("raw_", ""): len(raw_engine_words_split),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
    }

    return {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": engine_name,
        "grid_method": engine_name,
        raw_engine_key: raw_engine_words,
        raw_split_key: raw_engine_words_split,
        "raw_tesseract_words": tess_words,
        "summary": summary,
    }


async def _load_session_image(session_id: str):
    """Load preprocessed image for kombi endpoints.

    Tries the "cropped", "dewarped" and "original" session images in that
    order and uses the first one that is available.

    Returns:
        A tuple ``(img_png, img_bgr)``: the stored image bytes and the image
        decoded with ``cv2.IMREAD_COLOR``.

    Raises:
        HTTPException: 404 if no image variant exists for the session,
            400 if the stored bytes cannot be decoded.
    """
    img_png = None
    for variant in _IMAGE_VARIANTS:
        img_png = await get_session_image(session_id, variant)
        if img_png:
            break
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")
    return img_png, img_bgr


async def _merge_store_and_log(
    session_id: str,
    img_png,
    img_bgr,
    engine_words: list,
    started_at: float,
    *,
    endpoint: str,
    engine_label: str,
    engine_name: str,
) -> dict:
    """Merge primary-engine words with Tesseract, persist and log the result.

    Shared tail of the kombi endpoints: runs Tesseract, splits and merges the
    words, builds the grid, stores the result in the session DB (and in the
    in-memory cache when the session is cached) and appends a pipeline log
    entry.

    Args:
        session_id: Session the result belongs to.
        img_png: Image bytes written back via ``cropped_png``.
        img_bgr: Decoded image passed to Tesseract.
        engine_words: Words from the primary OCR engine (may be empty).
        started_at: ``time.perf_counter()`` value taken before the primary
            engine ran; used to compute the total duration.
        endpoint: Name used in log messages and as the pipeline log step.
        engine_label: Short engine name used in log text and result keys
            (``raw_<label>_words``, ``<label>_words``).
        engine_name: Value written to ``ocr_engine`` on cells and the result.

    Returns:
        The word result dict extended with ``session_id``.

    Raises:
        HTTPException: 400 if neither engine produced any words.
    """
    img_h, img_w = img_bgr.shape[:2]

    # NOTE(review): Tesseract runs synchronously inside an async endpoint;
    # consider offloading it to a worker thread if event-loop latency matters.
    tess_words = _run_tesseract_words(img_bgr)

    engine_words_split = _split_paddle_multi_words(engine_words)
    logger.info(
        "%s: split %d %s boxes -> %d individual words",
        endpoint, len(engine_words), engine_label, len(engine_words_split),
    )

    if not engine_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")

    merged_words = _merge_paddle_tesseract(engine_words_split, tess_words)
    merged_words = _deduplicate_words(merged_words)

    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.perf_counter() - started_at

    for cell in cells:
        cell["ocr_engine"] = engine_name

    word_result = _build_kombi_word_result(
        cells, columns_meta, img_w, img_h, duration, engine_name,
        engine_words, engine_words_split, tess_words, merged_words,
        f"raw_{engine_label}_words", f"raw_{engine_label}_words_split",
    )

    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(
        "%s session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[%s=%d, tess=%d, merged=%d]",
        endpoint, session_id, len(cells),
        word_result["grid_shape"]["rows"], word_result["grid_shape"]["cols"],
        duration, engine_label, len(engine_words), len(tess_words),
        len(merged_words),
    )

    await _append_pipeline_log(session_id, endpoint, {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        f"{engine_label}_words": len(engine_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": engine_name,
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}


# ---------------------------------------------------------------------------
# Kombi endpoints
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/paddle-kombi")
async def paddle_kombi(session_id: str):
    """Run PaddleOCR + Tesseract on the preprocessed image and merge results."""
    img_png, img_bgr = await _load_session_image(session_id)

    from cv_ocr_engines import ocr_region_paddle

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    t0 = time.perf_counter()

    paddle_words = (await ocr_region_paddle(img_bgr, region=None)) or []

    return await _merge_store_and_log(
        session_id, img_png, img_bgr, paddle_words, t0,
        endpoint="paddle_kombi",
        engine_label="paddle",
        engine_name="kombi",
    )


@router.post("/sessions/{session_id}/rapid-kombi")
async def rapid_kombi(session_id: str):
    """Run RapidOCR + Tesseract on the preprocessed image and merge results."""
    img_png, img_bgr = await _load_session_image(session_id)
    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_rapid
    from cv_vocab_types import PageRegion

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    t0 = time.perf_counter()

    # RapidOCR is given a region covering the whole image.
    full_region = PageRegion(
        type="full_page", x=0, y=0, width=img_w, height=img_h,
    )
    rapid_words = ocr_region_rapid(img_bgr, full_region) or []

    return await _merge_store_and_log(
        session_id, img_png, img_bgr, rapid_words, t0,
        endpoint="rapid_kombi",
        engine_label="rapid",
        engine_name="rapid_kombi",
    )