"""OCR merge helpers for combining PaddleOCR/RapidOCR with Tesseract results.

Extracted from ocr_pipeline_ocr_merge.py.

License: Apache 2.0
PRIVACY: All processing is performed locally.
"""

import logging
import re
from typing import List

logger = logging.getLogger(__name__)

# Token boundaries inside one OCR box: runs of whitespace, the position right
# before "[" (IPA bracket), and the position between "!" and a following
# Latin letter. Compiled once instead of once per word.
_TOKEN_SPLIT_RE = re.compile(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])')


def _word_center_y(word: dict) -> float:
    """Return the vertical center of a word box (missing height counts as 0)."""
    return word["top"] + word.get("height", 0) / 2


def _norm_text(word: dict) -> str:
    """Return the word's text lower-cased and stripped, for comparisons."""
    return word.get("text", "").lower().strip()


def _split_paddle_multi_words(words: list) -> list:
    """Split PaddleOCR multi-word boxes into individual word boxes.

    PaddleOCR often returns entire phrases as a single box, e.g.
    "More than 200 singers took part in the" with one bounding box.
    This splits them into individual words with proportional widths.
    Also handles leading "!" (e.g. "!Betonung" -> ["!", "Betonung"])
    and IPA brackets (e.g. "badge[bxd3]" -> ["badge", "[bxd3]"]).

    Args:
        words: Word dicts. Every dict is read for "text"; dicts that get
            split must also carry "left", "top", "width" and "height".

    Returns:
        A new list. Words with blank text are dropped, single-token words
        are passed through as the same dict object, and multi-token words
        are replaced by one new dict per token (keys: text, left, top,
        width, height, conf).
    """
    result = []
    for w in words:
        raw_text = w.get("text", "").strip()
        if not raw_text:
            continue

        tokens = [t for t in _TOKEN_SPLIT_RE.split(raw_text) if t]
        if len(tokens) <= 1:
            result.append(w)
            continue

        # Distribute the box width proportionally to token length, leaving a
        # gap of 2% of the box width between neighbouring tokens.
        # total_chars is always > 0 here because empty tokens were filtered.
        total_chars = sum(len(t) for t in tokens)
        gap_px = w["width"] * 0.02
        usable_w = w["width"] - gap_px * (len(tokens) - 1)
        cursor = w["left"]
        for t in tokens:
            token_w = max(1, usable_w * len(t) / total_chars)
            result.append({
                "text": t,
                "left": round(cursor),
                "top": w["top"],
                "width": round(token_w),
                "height": w["height"],
                "conf": w.get("conf", 0),
            })
            cursor += token_w + gap_px
    return result


def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
    """Group words into rows by Y-position clustering.

    Words are visited in order of vertical center. A word joins the current
    row while its center is within `row_gap` pixels of the center of the
    word that opened that row; otherwise it opens a new row.

    Args:
        words: Word dicts with "top" and "left" (and optionally "height").
        row_gap: Maximum vertical-center distance to the row's first word.

    Returns:
        List of rows; each row is a list of words sorted left-to-right.
    """
    if not words:
        return []

    ordered = sorted(words, key=_word_center_y)
    rows: list = []
    current_row: list = [ordered[0]]
    anchor_cy = _word_center_y(ordered[0])

    for w in ordered[1:]:
        cy = _word_center_y(w)
        if abs(cy - anchor_cy) <= row_gap:
            current_row.append(w)
            continue
        # Row finished: store it left-to-right and start a new one.
        rows.append(sorted(current_row, key=lambda item: item["left"]))
        current_row = [w]
        anchor_cy = cy

    rows.append(sorted(current_row, key=lambda item: item["left"]))
    return rows


def _row_center_y(row: list) -> float:
    """Average vertical center of a row of words (0.0 for an empty row)."""
    if not row:
        return 0.0
    return sum(_word_center_y(w) for w in row) / len(row)


def _fuse_word_pair(pw: dict, tw: dict, spatial_match: bool) -> dict:
    """Fuse a Paddle word and a Tesseract word that denote the same word.

    Coordinates are a confidence-weighted average of both boxes. Weights are
    clamped to be non-negative, and when both are zero the boxes are averaged
    equally, so the result always lies between the two input boxes.
    """
    pc = pw.get("conf", 80)
    tc = tw.get("conf", 50)

    weight_p = max(pc, 0)
    weight_t = max(tc, 0)
    if weight_p + weight_t == 0:
        # No usable confidence on either side: plain average instead of
        # collapsing the box to 0.
        weight_p = weight_t = 1
    total = weight_p + weight_t

    # Paddle text wins unless the pair was matched only by position and
    # Tesseract is the more confident engine.
    if spatial_match and pc < tc:
        best_text = tw["text"]
    else:
        best_text = pw["text"]

    def blend(key: str) -> int:
        return round((pw[key] * weight_p + tw[key] * weight_t) / total)

    return {
        "text": best_text,
        "left": blend("left"),
        "top": blend("top"),
        "width": blend("width"),
        "height": blend("height"),
        "conf": max(pc, tc),
    }


def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
    """Merge two word sequences from the same row using sequence alignment.

    Both sequences are expected to be sorted left-to-right. They are walked
    simultaneously:

    - If the current words match (equal text, one text containing the other
      when both are longer than one character, or horizontal overlap of at
      least 40% of the narrower box), one fused word is emitted.
    - Otherwise the word that looks like an extra (the other engine's
      current text shows up within the next three words of its row) is
      emitted on its own; ties are broken by the smaller "left".

    Unmatched Tesseract words are only kept when their "conf" is >= 30;
    unmatched Paddle words are always kept.

    Args:
        paddle_row: Paddle word dicts of one row.
        tess_row: Tesseract word dicts of the matching row.

    Returns:
        The merged list of word dicts.
    """
    merged = []
    pi, ti = 0, 0

    while pi < len(paddle_row) and ti < len(tess_row):
        pw = paddle_row[pi]
        tw = tess_row[ti]
        pt = _norm_text(pw)
        tt = _norm_text(tw)

        is_same = (pt == tt) or (
            len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt)
        )

        # Fall back to a spatial check when the texts disagree.
        spatial_match = False
        if not is_same:
            overlap_left = max(pw["left"], tw["left"])
            overlap_right = min(
                pw["left"] + pw.get("width", 0),
                tw["left"] + tw.get("width", 0),
            )
            overlap_w = max(0, overlap_right - overlap_left)
            min_w = min(pw.get("width", 1), tw.get("width", 1))
            if min_w > 0 and overlap_w / min_w >= 0.4:
                is_same = True
                spatial_match = True

        if is_same:
            merged.append(_fuse_word_pair(pw, tw, spatial_match))
            pi += 1
            ti += 1
            continue

        # Look up to three words ahead in the other row for the current text.
        paddle_ahead = any(
            _norm_text(cand) == pt for cand in tess_row[ti + 1:ti + 4]
        )
        tess_ahead = any(
            _norm_text(cand) == tt for cand in paddle_row[pi + 1:pi + 4]
        )

        if paddle_ahead and not tess_ahead:
            # The Tesseract word is an extra in front of the Paddle word.
            if tw.get("conf", 0) >= 30:
                merged.append(tw)
            ti += 1
        elif tess_ahead and not paddle_ahead:
            # The Paddle word is an extra in front of the Tesseract word.
            merged.append(pw)
            pi += 1
        elif pw["left"] <= tw["left"]:
            merged.append(pw)
            pi += 1
        else:
            if tw.get("conf", 0) >= 30:
                merged.append(tw)
            ti += 1

    # Flush whatever is left in either row.
    merged.extend(paddle_row[pi:])
    merged.extend(tw for tw in tess_row[ti:] if tw.get("conf", 0) >= 30)
    return merged


def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract using row-based alignment.

    Both inputs are grouped into rows. Each Paddle row is paired with the
    nearest not-yet-used Tesseract row (by average vertical center) if that
    row is no farther away than max(tallest word in the Paddle row, 15)
    pixels; paired rows are merged word by word. Unpaired Paddle rows are
    kept as they are; words of unpaired Tesseract rows are kept only when
    their "conf" is >= 40.

    Args:
        paddle_words: Word dicts from PaddleOCR/RapidOCR.
        tess_words: Word dicts from Tesseract.

    Returns:
        The merged list of word dicts.
    """
    if not paddle_words and not tess_words:
        return []
    if not paddle_words:
        return [w for w in tess_words if w.get("conf", 0) >= 40]
    if not tess_words:
        return list(paddle_words)

    paddle_rows = _group_words_into_rows(paddle_words)
    tess_rows = _group_words_into_rows(tess_words)

    used_tess_rows: set = set()
    merged_all: list = []

    for pr in paddle_rows:
        pr_cy = _row_center_y(pr)

        # Nearest unused Tesseract row; the first one wins on equal distance.
        best_dist, best_tri = float("inf"), -1
        for tri, tr in enumerate(tess_rows):
            if tri in used_tess_rows:
                continue
            dist = abs(pr_cy - _row_center_y(tr))
            if dist < best_dist:
                best_dist, best_tri = dist, tri

        max_row_dist = max(
            max((w.get("height", 20) for w in pr), default=20),
            15,
        )

        if best_tri >= 0 and best_dist <= max_row_dist:
            used_tess_rows.add(best_tri)
            merged_all.extend(_merge_row_sequences(pr, tess_rows[best_tri]))
        else:
            merged_all.extend(pr)

    for tri, tr in enumerate(tess_rows):
        if tri in used_tess_rows:
            continue
        merged_all.extend(tw for tw in tr if tw.get("conf", 0) >= 40)

    return merged_all


def _deduplicate_words(words: list) -> list:
    """Remove duplicate words with same text at overlapping positions.

    A word is a duplicate of an already kept word when both have the same
    text (case-insensitive, stripped) and their boxes overlap by at least
    50% of the smaller width and 50% of the smaller height. The first
    occurrence is kept. Words with blank text are dropped as well, but are
    not counted as duplicates in the log message.

    Args:
        words: Word dicts with "left" and "top" (and optionally "text",
            "width", "height").

    Returns:
        The input object itself when it is empty, otherwise a new list.
    """
    if not words:
        return words

    result: list = []
    duplicates = 0

    for w in words:
        wt = _norm_text(w)
        if not wt:
            continue

        w_right = w["left"] + w.get("width", 0)
        w_bottom = w["top"] + w.get("height", 0)
        is_dup = False

        for existing in result:
            if wt != _norm_text(existing):
                continue

            # Horizontal overlap relative to the narrower box.
            ox_l = max(w["left"], existing["left"])
            ox_r = min(w_right, existing["left"] + existing.get("width", 0))
            ox = max(0, ox_r - ox_l)
            min_w = min(w.get("width", 1), existing.get("width", 1))
            if min_w <= 0 or ox / min_w < 0.5:
                continue

            # Vertical overlap relative to the shorter box.
            oy_t = max(w["top"], existing["top"])
            oy_b = min(w_bottom, existing["top"] + existing.get("height", 0))
            oy = max(0, oy_b - oy_t)
            min_h = min(w.get("height", 1), existing.get("height", 1))
            if min_h > 0 and oy / min_h >= 0.5:
                is_dup = True
                break

        if is_dup:
            duplicates += 1
        else:
            result.append(w)

    if duplicates:
        logger.info("dedup: removed %d duplicate words", duplicates)
    return result