# breakpilot-lehrer/klausur-service/backend/ocr_merge_helpers.py

"""
OCR Merge Helpers — functions for combining PaddleOCR/RapidOCR with Tesseract results.

Extracted from ocr_pipeline_ocr_merge.py.

Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
from typing import List

logger = logging.getLogger(__name__)


def _split_paddle_multi_words(words: list) -> list:
    """Split PaddleOCR multi-word boxes into individual word boxes.

    PaddleOCR often returns entire phrases as a single box, e.g.
    "More than 200 singers took part in the" with one bounding box.
    This splits them into individual words with proportional widths.
    Also handles leading "!" (e.g. "!Betonung" -> ["!", "Betonung"])
    and IPA brackets (e.g. "badge[bxd3]" -> ["badge", "[bxd3]"]).
    """
    import re

    result = []
    for w in words:
        raw_text = w.get("text", "").strip()
        if not raw_text:
            continue
        # Split on whitespace, before "[" (IPA), and after "!" before letter
        tokens = re.split(
            r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text
        )
        tokens = [t for t in tokens if t]

        if len(tokens) <= 1:
            result.append(w)
        else:
            # Split proportionally by character count
            total_chars = sum(len(t) for t in tokens)
            if total_chars == 0:
                continue
            n_gaps = len(tokens) - 1
            gap_px = w["width"] * 0.02
            usable_w = w["width"] - gap_px * n_gaps
            cursor = w["left"]
            for t in tokens:
                token_w = max(1, usable_w * len(t) / total_chars)
                result.append({
                    "text": t,
                    "left": round(cursor),
                    "top": w["top"],
                    "width": round(token_w),
                    "height": w["height"],
                    "conf": w.get("conf", 0),
                })
                cursor += token_w + gap_px
    return result


def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
    """Group words into rows by Y-position clustering.

    Words whose vertical centers are within `row_gap` pixels are on the same row.
    Returns list of rows, each row is a list of words sorted left-to-right.
    """
    if not words:
        return []
    # Sort by vertical center
    sorted_words = sorted(words, key=lambda w: w["top"] + w.get("height", 0) / 2)
    rows: list = []
    current_row: list = [sorted_words[0]]
    current_cy = sorted_words[0]["top"] + sorted_words[0].get("height", 0) / 2

    for w in sorted_words[1:]:
        cy = w["top"] + w.get("height", 0) / 2
        if abs(cy - current_cy) <= row_gap:
            current_row.append(w)
        else:
            # Sort current row left-to-right before saving
            rows.append(sorted(current_row, key=lambda w: w["left"]))
            current_row = [w]
            current_cy = cy
    if current_row:
        rows.append(sorted(current_row, key=lambda w: w["left"]))
    return rows


def _row_center_y(row: list) -> float:
    """Average vertical center of a row of words."""
    if not row:
        return 0.0
    return sum(w["top"] + w.get("height", 0) / 2 for w in row) / len(row)


def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
    """Merge two word sequences from the same row using sequence alignment.

    Both sequences are sorted left-to-right. Walk through both simultaneously:
    - If words match (same/similar text): take Paddle text with averaged coords
    - If they don't match: the extra word is unique to one engine, include it
    """
    merged = []
    pi, ti = 0, 0

    while pi < len(paddle_row) and ti < len(tess_row):
        pw = paddle_row[pi]
        tw = tess_row[ti]

        pt = pw.get("text", "").lower().strip()
        tt = tw.get("text", "").lower().strip()

        is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))

        # Spatial overlap check
        spatial_match = False
        if not is_same:
            overlap_left = max(pw["left"], tw["left"])
            overlap_right = min(
                pw["left"] + pw.get("width", 0),
                tw["left"] + tw.get("width", 0),
            )
            overlap_w = max(0, overlap_right - overlap_left)
            min_w = min(pw.get("width", 1), tw.get("width", 1))
            if min_w > 0 and overlap_w / min_w >= 0.4:
                is_same = True
                spatial_match = True

        if is_same:
            pc = pw.get("conf", 80)
            tc = tw.get("conf", 50)
            total = pc + tc
            if total == 0:
                total = 1
            if spatial_match and pc < tc:
                best_text = tw["text"]
            else:
                best_text = pw["text"]
            merged.append({
                "text": best_text,
                "left": round((pw["left"] * pc + tw["left"] * tc) / total),
                "top": round((pw["top"] * pc + tw["top"] * tc) / total),
                "width": round((pw["width"] * pc + tw["width"] * tc) / total),
                "height": round((pw["height"] * pc + tw["height"] * tc) / total),
                "conf": max(pc, tc),
            })
            pi += 1
            ti += 1
        else:
            paddle_ahead = any(
                tess_row[t].get("text", "").lower().strip() == pt
                for t in range(ti + 1, min(ti + 4, len(tess_row)))
            )
            tess_ahead = any(
                paddle_row[p].get("text", "").lower().strip() == tt
                for p in range(pi + 1, min(pi + 4, len(paddle_row)))
            )

            if paddle_ahead and not tess_ahead:
                if tw.get("conf", 0) >= 30:
                    merged.append(tw)
                ti += 1
            elif tess_ahead and not paddle_ahead:
                merged.append(pw)
                pi += 1
            else:
                if pw["left"] <= tw["left"]:
                    merged.append(pw)
                    pi += 1
                else:
                    if tw.get("conf", 0) >= 30:
                        merged.append(tw)
                    ti += 1

    while pi < len(paddle_row):
        merged.append(paddle_row[pi])
        pi += 1
    while ti < len(tess_row):
        tw = tess_row[ti]
        if tw.get("conf", 0) >= 30:
            merged.append(tw)
        ti += 1

    return merged


def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract using row-based sequence alignment."""
    if not paddle_words and not tess_words:
        return []
    if not paddle_words:
        return [w for w in tess_words if w.get("conf", 0) >= 40]
    if not tess_words:
        return list(paddle_words)

    paddle_rows = _group_words_into_rows(paddle_words)
    tess_rows = _group_words_into_rows(tess_words)

    used_tess_rows: set = set()
    merged_all: list = []

    for pr in paddle_rows:
        pr_cy = _row_center_y(pr)
        best_dist, best_tri = float("inf"), -1
        for tri, tr in enumerate(tess_rows):
            if tri in used_tess_rows:
                continue
            tr_cy = _row_center_y(tr)
            dist = abs(pr_cy - tr_cy)
            if dist < best_dist:
                best_dist, best_tri = dist, tri

        max_row_dist = max(
            max((w.get("height", 20) for w in pr), default=20),
            15,
        )

        if best_tri >= 0 and best_dist <= max_row_dist:
            tr = tess_rows[best_tri]
            used_tess_rows.add(best_tri)
            merged_all.extend(_merge_row_sequences(pr, tr))
        else:
            merged_all.extend(pr)

    for tri, tr in enumerate(tess_rows):
        if tri not in used_tess_rows:
            for tw in tr:
                if tw.get("conf", 0) >= 40:
                    merged_all.append(tw)

    return merged_all


def _deduplicate_words(words: list) -> list:
    """Remove duplicate words with same text at overlapping positions."""
    if not words:
        return words

    result: list = []
    for w in words:
        wt = w.get("text", "").lower().strip()
        if not wt:
            continue
        is_dup = False
        w_right = w["left"] + w.get("width", 0)
        w_bottom = w["top"] + w.get("height", 0)
        for existing in result:
            et = existing.get("text", "").lower().strip()
            if wt != et:
                continue
            ox_l = max(w["left"], existing["left"])
            ox_r = min(w_right, existing["left"] + existing.get("width", 0))
            ox = max(0, ox_r - ox_l)
            min_w = min(w.get("width", 1), existing.get("width", 1))
            if min_w <= 0 or ox / min_w < 0.5:
                continue
            oy_t = max(w["top"], existing["top"])
            oy_b = min(w_bottom, existing["top"] + existing.get("height", 0))
            oy = max(0, oy_b - oy_t)
            min_h = min(w.get("height", 1), existing.get("height", 1))
            if min_h > 0 and oy / min_h >= 0.5:
                is_dup = True
                break
        if not is_dup:
            result.append(w)

    removed = len(words) - len(result)
    if removed:
        logger.info("dedup: removed %d duplicate words", removed)
    return result