fix: merge overlapping OCR words with different text (Stick/Stück)
Two issues in the paddle-kombi word merge:

1. Overlap threshold too strict: PaddleOCR "Stick" and Tesseract "Stück" overlap by 48.6%, just below the 50% threshold, so both words ended up in the result, overlapping at the same position. Fix: lower the threshold from 50% to 40%.

2. Text selection blind to confidence: the merge always took the PaddleOCR text, even when Tesseract had higher confidence and the correct text. Fix: when two words are matched only by spatial overlap (their texts differ), prefer the text from the engine with the higher confidence.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3219,8 +3219,10 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
||||
# Same text or one contains the other
|
||||
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
|
||||
|
||||
# Spatial overlap check: if words overlap >= 50% horizontally,
|
||||
# they're the same physical word regardless of OCR text differences
|
||||
# Spatial overlap check: if words overlap >= 40% horizontally,
|
||||
# they're the same physical word regardless of OCR text differences.
|
||||
# (40% catches borderline cases like "Stick"/"Stück" at 48% overlap)
|
||||
spatial_match = False
|
||||
if not is_same:
|
||||
overlap_left = max(pw["left"], tw["left"])
|
||||
overlap_right = min(
|
||||
@@ -3229,8 +3231,9 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
||||
)
|
||||
overlap_w = max(0, overlap_right - overlap_left)
|
||||
min_w = min(pw.get("width", 1), tw.get("width", 1))
|
||||
if min_w > 0 and overlap_w / min_w >= 0.5:
|
||||
if min_w > 0 and overlap_w / min_w >= 0.4:
|
||||
is_same = True
|
||||
spatial_match = True
|
||||
|
||||
if is_same:
|
||||
# Matched — average coordinates weighted by confidence
|
||||
@@ -3239,8 +3242,14 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
||||
total = pc + tc
|
||||
if total == 0:
|
||||
total = 1
|
||||
# Text: prefer higher-confidence engine when texts differ
|
||||
# (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80)
|
||||
if spatial_match and pc < tc:
|
||||
best_text = tw["text"]
|
||||
else:
|
||||
best_text = pw["text"]
|
||||
merged.append({
|
||||
"text": pw["text"], # Paddle text preferred
|
||||
"text": best_text,
|
||||
"left": round((pw["left"] * pc + tw["left"] * tc) / total),
|
||||
"top": round((pw["top"] * pc + tw["top"] * tc) / total),
|
||||
"width": round((pw["width"] * pc + tw["width"] * tc) / total),
|
||||
|
||||
Reference in New Issue
Block a user