fix: add 3px cell padding + upscale small RapidOCR crops + diagnostic logging

- Add 3px padding around cell crops to avoid clipping edge characters
  (parentheses in "Tanz(veranstaltung)", descenders, etc.)
- Upscale small BGR crops for RapidOCR, same as the Tesseract path
- Add info-level diagnostic logging to _ocr_cell_crop for debugging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 16:45:59 +01:00
parent e4bdb3cc24
commit 113a1c10e5
1 changed file with 44 additions and 16 deletions
@@ -4703,11 +4703,16 @@ def _ocr_cell_crop(
    disp_w = col.width
    disp_h = row.height

-    # Crop boundaries (clamped to image)
-    cx = max(0, disp_x)
-    cy = max(0, disp_y)
-    cw = min(disp_w, img_w - cx)
-    ch = min(disp_h, img_h - cy)
+    # Crop boundaries: pad the cell outward by 3px on each side to avoid
+    # clipping characters near column/row edges (e.g. parentheses, descenders).
+    # The crop is clamped to the image bounds but may extend slightly beyond the strict cell.
+    _PAD = 3
+    cx = max(0, disp_x - _PAD)
+    cy = max(0, disp_y - _PAD)
+    cx2 = min(img_w, disp_x + disp_w + _PAD)
+    cy2 = min(img_h, disp_y + disp_h + _PAD)
+    cw = cx2 - cx
+    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -4727,7 +4732,7 @@ def _ocr_cell_crop(
    }

    if cw <= 0 or ch <= 0:
-        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
+        logger.info("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
@@ -4736,8 +4741,8 @@ def _ocr_cell_crop(
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
-                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
-                             row_idx, col_idx, dark_ratio, cw, ch)
+                logger.info("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
+                            row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
@@ -4755,8 +4760,31 @@ def _ocr_cell_crop(
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
-        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
-        words = ocr_region_rapid(img_bgr, cell_region)
+        # Upscale small BGR crops for RapidOCR (same as Tesseract path)
+        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
+        if bgr_crop.size == 0:
+            words = []
+        else:
+            min_dim = 64
+            scale = 1.0
+            if ch < min_dim or cw < min_dim:
+                scale = max(min_dim / max(ch, 1), min_dim / max(cw, 1), 2.0)
+                bgr_crop = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
+                                      interpolation=cv2.INTER_CUBIC)
+            up_h, up_w = bgr_crop.shape[:2]
+            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
+            words = ocr_region_rapid(bgr_crop, tmp_region)
+            # Remap positions back to original image coords
+            if words and scale != 1.0:
+                for w in words:
+                    w['left'] = int(w['left'] / scale) + cx
+                    w['top'] = int(w['top'] / scale) + cy
+                    w['width'] = int(w['width'] / scale)
+                    w['height'] = int(w['height'] / scale)
+            elif words:
+                for w in words:
+                    w['left'] += cx
+                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
@@ -4790,11 +4818,11 @@ def _ocr_cell_crop(
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
-        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
-                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
+        logger.info("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
+                    row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
-        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
-                     row_idx, col_idx, cw, ch, psm, engine_name)
+        logger.info("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
+                    row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
@@ -4819,8 +4847,8 @@ def _ocr_cell_crop(
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
-            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
-                         row_idx, col_idx, pre_filter)
+            logger.info("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
+                        row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)