fix: force 3x upscale for short RapidOCR crops + lower box_thresh

- Short cell crops (<80px height) are now always upscaled 3x for RapidOCR to improve recognition of periods, ellipses, and phonetic symbols.
- Lowered Det.box_thresh from 0.6 to 0.4 to detect small characters (dots, brackets, IPA symbols) that were previously being filtered out.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 19:47:36 +01:00
parent bb0e23303c
commit 90ecb46bed
1 changed file with 23 additions and 12 deletions
@@ -3692,7 +3692,8 @@ def _get_rapid_engine():
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            # Tighter detection boxes to reduce word merging
            "Det.unclip_ratio": 1.3,
-            "Det.box_thresh": 0.6,
+            # Lower threshold to detect small chars (periods, ellipsis, phonetics)
+            "Det.box_thresh": 0.4,
            # Silence verbose logging
            "Global.log_level": "critical",
        })
@@ -4760,22 +4761,32 @@ def _ocr_cell_crop(
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
-        # Upscale small BGR crops for RapidOCR — use same min_dim as Tesseract (150px)
+        # Upscale small BGR crops for RapidOCR.
+        # Cell crops typically have height 35-55px but width >300px.
+        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
+        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
+        # For very short heights (< 80px), force 3× upscale for better OCR
+        # of small characters like periods, ellipsis, and phonetic symbols.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
-            upscaled_bin = _ensure_minimum_crop_size(
-                bgr_crop, min_dim=150, max_scale=3,
-            )
-            up_h, up_w = upscaled_bin.shape[:2]
-            logger.info("_ocr_cell_crop R%02d_C%d: rapid upscale %dx%d -> %dx%d",
-                        row_idx, col_idx, cw, ch, up_w, up_h)
-            scale_x = up_w / max(cw, 1)
-            scale_y = up_h / max(ch, 1)
-            was_scaled = (up_w != cw or up_h != ch)
+            crop_h, crop_w = bgr_crop.shape[:2]
+            if crop_h < 80:
+                # Force 3× upscale for short rows — small chars need more pixels
+                scale = 3.0
+                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
+                                    interpolation=cv2.INTER_CUBIC)
+            else:
+                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
+            up_h, up_w = bgr_up.shape[:2]
+            scale_x = up_w / max(crop_w, 1)
+            scale_y = up_h / max(crop_h, 1)
+            was_scaled = (up_w != crop_w or up_h != crop_h)
+            logger.info("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
+                        row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
-            words = ocr_region_rapid(upscaled_bin, tmp_region)
+            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words: