fix: merge overlapping OCR words with different text (Stick/Stück)
Two issues in the paddle-kombi word merge:

1. Overlap threshold too strict: PaddleOCR "Stick" and Tesseract "Stück" overlap at 48.6%, just below the 50% threshold, so both words ended up in the result, overlapping at the same position. Fix: lower the threshold from 50% to 40%.

2. Text selection blind to confidence: the merge always took the PaddleOCR text, even when Tesseract had higher confidence and the correct text. Fix: when the texts differ and the words were matched only by spatial overlap, use the Tesseract text if its confidence is strictly higher; otherwise keep the PaddleOCR text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3219,8 +3219,10 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
|||||||
# Same text or one contains the other
|
# Same text or one contains the other
|
||||||
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
|
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
|
||||||
|
|
||||||
# Spatial overlap check: if words overlap >= 50% horizontally,
|
# Spatial overlap check: if words overlap >= 40% horizontally,
|
||||||
# they're the same physical word regardless of OCR text differences
|
# they're the same physical word regardless of OCR text differences.
|
||||||
|
# (40% catches borderline cases like "Stick"/"Stück" at 48% overlap)
|
||||||
|
spatial_match = False
|
||||||
if not is_same:
|
if not is_same:
|
||||||
overlap_left = max(pw["left"], tw["left"])
|
overlap_left = max(pw["left"], tw["left"])
|
||||||
overlap_right = min(
|
overlap_right = min(
|
||||||
@@ -3229,8 +3231,9 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
|||||||
)
|
)
|
||||||
overlap_w = max(0, overlap_right - overlap_left)
|
overlap_w = max(0, overlap_right - overlap_left)
|
||||||
min_w = min(pw.get("width", 1), tw.get("width", 1))
|
min_w = min(pw.get("width", 1), tw.get("width", 1))
|
||||||
if min_w > 0 and overlap_w / min_w >= 0.5:
|
if min_w > 0 and overlap_w / min_w >= 0.4:
|
||||||
is_same = True
|
is_same = True
|
||||||
|
spatial_match = True
|
||||||
|
|
||||||
if is_same:
|
if is_same:
|
||||||
# Matched — average coordinates weighted by confidence
|
# Matched — average coordinates weighted by confidence
|
||||||
@@ -3239,8 +3242,14 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
|||||||
total = pc + tc
|
total = pc + tc
|
||||||
if total == 0:
|
if total == 0:
|
||||||
total = 1
|
total = 1
|
||||||
|
# Text: prefer higher-confidence engine when texts differ
|
||||||
|
# (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80)
|
||||||
|
if spatial_match and pc < tc:
|
||||||
|
best_text = tw["text"]
|
||||||
|
else:
|
||||||
|
best_text = pw["text"]
|
||||||
merged.append({
|
merged.append({
|
||||||
"text": pw["text"], # Paddle text preferred
|
"text": best_text,
|
||||||
"left": round((pw["left"] * pc + tw["left"] * tc) / total),
|
"left": round((pw["left"] * pc + tw["left"] * tc) / total),
|
||||||
"top": round((pw["top"] * pc + tw["top"] * tc) / total),
|
"top": round((pw["top"] * pc + tw["top"] * tc) / total),
|
||||||
"width": round((pw["width"] * pc + tw["width"] * tc) / total),
|
"width": round((pw["width"] * pc + tw["width"] * tc) / total),
|
||||||
|
|||||||
Reference in New Issue
Block a user