""" OCR Merge Helpers and Kombi Endpoints. Contains merge helper functions for combining PaddleOCR/RapidOCR with Tesseract results, plus the paddle-kombi and rapid-kombi endpoints. Extracted from ocr_pipeline_api.py for modularity. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import time from typing import Any, Dict, List import cv2 import httpx import numpy as np from fastapi import APIRouter, HTTPException from cv_words_first import build_grid_from_words from ocr_pipeline_common import _cache, _append_pipeline_log from ocr_pipeline_session_store import get_session_image, update_session_db logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"]) # --------------------------------------------------------------------------- # Merge helper functions # --------------------------------------------------------------------------- def _split_paddle_multi_words(words: list) -> list: """Split PaddleOCR multi-word boxes into individual word boxes. PaddleOCR often returns entire phrases as a single box, e.g. "More than 200 singers took part in the" with one bounding box. This splits them into individual words with proportional widths. Also handles leading "!" (e.g. "!Betonung" → ["!", "Betonung"]) and IPA brackets (e.g. "badge[bxd3]" → ["badge", "[bxd3]"]). """ import re result = [] for w in words: raw_text = w.get("text", "").strip() if not raw_text: continue # Split on whitespace, before "[" (IPA), and after "!" before letter tokens = re.split( r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text ) tokens = [t for t in tokens if t] if len(tokens) <= 1: result.append(w) else: # Split proportionally by character count total_chars = sum(len(t) for t in tokens) if total_chars == 0: continue n_gaps = len(tokens) - 1 gap_px = w["width"] * 0.02 usable_w = w["width"] - gap_px * n_gaps cursor = w["left"] for t in tokens: token_w = max(1, usable_w * len(t) / total_chars) result.append({ "text": t, "left": round(cursor), "top": w["top"], "width": round(token_w), "height": w["height"], "conf": w.get("conf", 0), }) cursor += token_w + gap_px return result def _group_words_into_rows(words: list, row_gap: int = 12) -> list: """Group words into rows by Y-position clustering. Words whose vertical centers are within `row_gap` pixels are on the same row. Returns list of rows, each row is a list of words sorted left-to-right. """ if not words: return [] # Sort by vertical center sorted_words = sorted(words, key=lambda w: w["top"] + w.get("height", 0) / 2) rows: list = [] current_row: list = [sorted_words[0]] current_cy = sorted_words[0]["top"] + sorted_words[0].get("height", 0) / 2 for w in sorted_words[1:]: cy = w["top"] + w.get("height", 0) / 2 if abs(cy - current_cy) <= row_gap: current_row.append(w) else: # Sort current row left-to-right before saving rows.append(sorted(current_row, key=lambda w: w["left"])) current_row = [w] current_cy = cy if current_row: rows.append(sorted(current_row, key=lambda w: w["left"])) return rows def _row_center_y(row: list) -> float: """Average vertical center of a row of words.""" if not row: return 0.0 return sum(w["top"] + w.get("height", 0) / 2 for w in row) / len(row) def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: """Merge two word sequences from the same row using sequence alignment. Both sequences are sorted left-to-right. Walk through both simultaneously: - If words match (same/similar text): take Paddle text with averaged coords - If they don't match: the extra word is unique to one engine, include it This prevents duplicates because both engines produce words in the same order. """ merged = [] pi, ti = 0, 0 while pi < len(paddle_row) and ti < len(tess_row): pw = paddle_row[pi] tw = tess_row[ti] # Check if these are the same word pt = pw.get("text", "").lower().strip() tt = tw.get("text", "").lower().strip() # Same text or one contains the other is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt)) # Spatial overlap check: if words overlap >= 40% horizontally, # they're the same physical word regardless of OCR text differences. # (40% catches borderline cases like "Stick"/"Stück" at 48% overlap) spatial_match = False if not is_same: overlap_left = max(pw["left"], tw["left"]) overlap_right = min( pw["left"] + pw.get("width", 0), tw["left"] + tw.get("width", 0), ) overlap_w = max(0, overlap_right - overlap_left) min_w = min(pw.get("width", 1), tw.get("width", 1)) if min_w > 0 and overlap_w / min_w >= 0.4: is_same = True spatial_match = True if is_same: # Matched — average coordinates weighted by confidence pc = pw.get("conf", 80) tc = tw.get("conf", 50) total = pc + tc if total == 0: total = 1 # Text: prefer higher-confidence engine when texts differ # (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80) if spatial_match and pc < tc: best_text = tw["text"] else: best_text = pw["text"] merged.append({ "text": best_text, "left": round((pw["left"] * pc + tw["left"] * tc) / total), "top": round((pw["top"] * pc + tw["top"] * tc) / total), "width": round((pw["width"] * pc + tw["width"] * tc) / total), "height": round((pw["height"] * pc + tw["height"] * tc) / total), "conf": max(pc, tc), }) pi += 1 ti += 1 else: # Different text — one engine found something extra # Look ahead: is the current Paddle word somewhere in Tesseract ahead? paddle_ahead = any( tess_row[t].get("text", "").lower().strip() == pt for t in range(ti + 1, min(ti + 4, len(tess_row))) ) # Is the current Tesseract word somewhere in Paddle ahead? tess_ahead = any( paddle_row[p].get("text", "").lower().strip() == tt for p in range(pi + 1, min(pi + 4, len(paddle_row))) ) if paddle_ahead and not tess_ahead: # Tesseract has an extra word (e.g. "!" or bullet) → include it if tw.get("conf", 0) >= 30: merged.append(tw) ti += 1 elif tess_ahead and not paddle_ahead: # Paddle has an extra word → include it merged.append(pw) pi += 1 else: # Both have unique words or neither found ahead → take leftmost first if pw["left"] <= tw["left"]: merged.append(pw) pi += 1 else: if tw.get("conf", 0) >= 30: merged.append(tw) ti += 1 # Remaining words from either engine while pi < len(paddle_row): merged.append(paddle_row[pi]) pi += 1 while ti < len(tess_row): tw = tess_row[ti] if tw.get("conf", 0) >= 30: merged.append(tw) ti += 1 return merged def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: """Merge word boxes from PaddleOCR and Tesseract using row-based sequence alignment. Strategy: 1. Group each engine's words into rows (by Y-position clustering) 2. Match rows between engines (by vertical center proximity) 3. Within each matched row: merge sequences left-to-right, deduplicating words that appear in both engines at the same sequence position 4. Unmatched rows from either engine: keep as-is This prevents: - Cross-line averaging (words from different lines being merged) - Duplicate words (same word from both engines shown twice) """ if not paddle_words and not tess_words: return [] if not paddle_words: return [w for w in tess_words if w.get("conf", 0) >= 40] if not tess_words: return list(paddle_words) # Step 1: Group into rows paddle_rows = _group_words_into_rows(paddle_words) tess_rows = _group_words_into_rows(tess_words) # Step 2: Match rows between engines by vertical center proximity used_tess_rows: set = set() merged_all: list = [] for pr in paddle_rows: pr_cy = _row_center_y(pr) best_dist, best_tri = float("inf"), -1 for tri, tr in enumerate(tess_rows): if tri in used_tess_rows: continue tr_cy = _row_center_y(tr) dist = abs(pr_cy - tr_cy) if dist < best_dist: best_dist, best_tri = dist, tri # Row height threshold — rows must be within ~1.5x typical line height max_row_dist = max( max((w.get("height", 20) for w in pr), default=20), 15, ) if best_tri >= 0 and best_dist <= max_row_dist: # Matched row — merge sequences tr = tess_rows[best_tri] used_tess_rows.add(best_tri) merged_all.extend(_merge_row_sequences(pr, tr)) else: # No matching Tesseract row — keep Paddle row as-is merged_all.extend(pr) # Add unmatched Tesseract rows for tri, tr in enumerate(tess_rows): if tri not in used_tess_rows: for tw in tr: if tw.get("conf", 0) >= 40: merged_all.append(tw) return merged_all def _deduplicate_words(words: list) -> list: """Remove duplicate words with same text at overlapping positions. PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =") that produce duplicate words after splitting. This pass removes them. A word is a duplicate only when BOTH horizontal AND vertical overlap exceed 50% — same text on the same visual line at the same position. """ if not words: return words result: list = [] for w in words: wt = w.get("text", "").lower().strip() if not wt: continue is_dup = False w_right = w["left"] + w.get("width", 0) w_bottom = w["top"] + w.get("height", 0) for existing in result: et = existing.get("text", "").lower().strip() if wt != et: continue # Horizontal overlap ox_l = max(w["left"], existing["left"]) ox_r = min(w_right, existing["left"] + existing.get("width", 0)) ox = max(0, ox_r - ox_l) min_w = min(w.get("width", 1), existing.get("width", 1)) if min_w <= 0 or ox / min_w < 0.5: continue # Vertical overlap — must also be on the same line oy_t = max(w["top"], existing["top"]) oy_b = min(w_bottom, existing["top"] + existing.get("height", 0)) oy = max(0, oy_b - oy_t) min_h = min(w.get("height", 1), existing.get("height", 1)) if min_h > 0 and oy / min_h >= 0.5: is_dup = True break if not is_dup: result.append(w) removed = len(words) - len(result) if removed: logger.info("dedup: removed %d duplicate words", removed) return result # --------------------------------------------------------------------------- # Kombi endpoints # --------------------------------------------------------------------------- @router.post("/sessions/{session_id}/paddle-kombi") async def paddle_kombi(session_id: str): """Run PaddleOCR + Tesseract on the preprocessed image and merge results. Both engines run on the same preprocessed (cropped/dewarped) image. Word boxes are matched by IoU and coordinates are averaged weighted by confidence. Unmatched Tesseract words (bullets, symbols) are added. """ img_png = await get_session_image(session_id, "cropped") if not img_png: img_png = await get_session_image(session_id, "dewarped") if not img_png: img_png = await get_session_image(session_id, "original") if not img_png: raise HTTPException(status_code=404, detail="No image found for this session") img_arr = np.frombuffer(img_png, dtype=np.uint8) img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR) if img_bgr is None: raise HTTPException(status_code=400, detail="Failed to decode image") img_h, img_w = img_bgr.shape[:2] from cv_ocr_engines import ocr_region_paddle t0 = time.time() # --- PaddleOCR --- paddle_words = await ocr_region_paddle(img_bgr, region=None) if not paddle_words: paddle_words = [] # --- Tesseract --- from PIL import Image import pytesseract pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) data = pytesseract.image_to_data( pil_img, lang="eng+deu", config="--psm 6 --oem 3", output_type=pytesseract.Output.DICT, ) tess_words = [] for i in range(len(data["text"])): text = str(data["text"][i]).strip() conf_raw = str(data["conf"][i]) conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 if not text or conf < 20: continue tess_words.append({ "text": text, "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i], "conf": conf, }) # --- Split multi-word Paddle boxes into individual words --- paddle_words_split = _split_paddle_multi_words(paddle_words) logger.info( "paddle_kombi: split %d paddle boxes → %d individual words", len(paddle_words), len(paddle_words_split), ) # --- Merge --- if not paddle_words_split and not tess_words: raise HTTPException(status_code=400, detail="Both OCR engines returned no words") merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words) merged_words = _deduplicate_words(merged_words) cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) duration = time.time() - t0 for cell in cells: cell["ocr_engine"] = "kombi" n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 n_cols = len(columns_meta) col_types = {c.get("type") for c in columns_meta} is_vocab = bool(col_types & {"column_en", "column_de"}) word_result = { "cells": cells, "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)}, "columns_used": columns_meta, "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), "ocr_engine": "kombi", "grid_method": "kombi", "raw_paddle_words": paddle_words, "raw_paddle_words_split": paddle_words_split, "raw_tesseract_words": tess_words, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), "paddle_words": len(paddle_words), "paddle_words_split": len(paddle_words_split), "tesseract_words": len(tess_words), "merged_words": len(merged_words), }, } await update_session_db( session_id, word_result=word_result, cropped_png=img_png, current_step=8, ) # Update in-memory cache so detect-structure can access word_result if session_id in _cache: _cache[session_id]["word_result"] = word_result logger.info( "paddle_kombi session %s: %d cells (%d rows, %d cols) in %.2fs " "[paddle=%d, tess=%d, merged=%d]", session_id, len(cells), n_rows, n_cols, duration, len(paddle_words), len(tess_words), len(merged_words), ) await _append_pipeline_log(session_id, "paddle_kombi", { "total_cells": len(cells), "non_empty_cells": word_result["summary"]["non_empty_cells"], "paddle_words": len(paddle_words), "tesseract_words": len(tess_words), "merged_words": len(merged_words), "ocr_engine": "kombi", }, duration_ms=int(duration * 1000)) return {"session_id": session_id, **word_result} @router.post("/sessions/{session_id}/rapid-kombi") async def rapid_kombi(session_id: str): """Run RapidOCR + Tesseract on the preprocessed image and merge results. Same merge logic as paddle-kombi, but uses local RapidOCR (ONNX Runtime) instead of remote PaddleOCR service. """ img_png = await get_session_image(session_id, "cropped") if not img_png: img_png = await get_session_image(session_id, "dewarped") if not img_png: img_png = await get_session_image(session_id, "original") if not img_png: raise HTTPException(status_code=404, detail="No image found for this session") img_arr = np.frombuffer(img_png, dtype=np.uint8) img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR) if img_bgr is None: raise HTTPException(status_code=400, detail="Failed to decode image") img_h, img_w = img_bgr.shape[:2] from cv_ocr_engines import ocr_region_rapid from cv_vocab_types import PageRegion t0 = time.time() # --- RapidOCR (local, synchronous) --- full_region = PageRegion( type="full_page", x=0, y=0, width=img_w, height=img_h, ) rapid_words = ocr_region_rapid(img_bgr, full_region) if not rapid_words: rapid_words = [] # --- Tesseract --- from PIL import Image import pytesseract pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) data = pytesseract.image_to_data( pil_img, lang="eng+deu", config="--psm 6 --oem 3", output_type=pytesseract.Output.DICT, ) tess_words = [] for i in range(len(data["text"])): text = str(data["text"][i]).strip() conf_raw = str(data["conf"][i]) conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 if not text or conf < 20: continue tess_words.append({ "text": text, "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i], "conf": conf, }) # --- Split multi-word RapidOCR boxes into individual words --- rapid_words_split = _split_paddle_multi_words(rapid_words) logger.info( "rapid_kombi: split %d rapid boxes → %d individual words", len(rapid_words), len(rapid_words_split), ) # --- Merge --- if not rapid_words_split and not tess_words: raise HTTPException(status_code=400, detail="Both OCR engines returned no words") merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words) merged_words = _deduplicate_words(merged_words) cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) duration = time.time() - t0 for cell in cells: cell["ocr_engine"] = "rapid_kombi" n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 n_cols = len(columns_meta) col_types = {c.get("type") for c in columns_meta} is_vocab = bool(col_types & {"column_en", "column_de"}) word_result = { "cells": cells, "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)}, "columns_used": columns_meta, "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), "ocr_engine": "rapid_kombi", "grid_method": "rapid_kombi", "raw_rapid_words": rapid_words, "raw_rapid_words_split": rapid_words_split, "raw_tesseract_words": tess_words, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), "rapid_words": len(rapid_words), "rapid_words_split": len(rapid_words_split), "tesseract_words": len(tess_words), "merged_words": len(merged_words), }, } await update_session_db( session_id, word_result=word_result, cropped_png=img_png, current_step=8, ) # Update in-memory cache so detect-structure can access word_result if session_id in _cache: _cache[session_id]["word_result"] = word_result logger.info( "rapid_kombi session %s: %d cells (%d rows, %d cols) in %.2fs " "[rapid=%d, tess=%d, merged=%d]", session_id, len(cells), n_rows, n_cols, duration, len(rapid_words), len(tess_words), len(merged_words), ) await _append_pipeline_log(session_id, "rapid_kombi", { "total_cells": len(cells), "non_empty_cells": word_result["summary"]["non_empty_cells"], "rapid_words": len(rapid_words), "tesseract_words": len(tess_words), "merged_words": len(merged_words), "ocr_engine": "rapid_kombi", }, duration_ms=int(duration * 1000)) return {"session_id": session_id, **word_result}