debug: add diagnostic logging to _ocr_cell_crop for empty cell investigation
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4727,6 +4727,7 @@ def _ocr_cell_crop(
|
||||
}
|
||||
|
||||
if cw <= 0 or ch <= 0:
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
|
||||
return empty_cell
|
||||
|
||||
# --- Pixel-density check: skip truly empty cells ---
|
||||
@@ -4735,6 +4736,8 @@ def _ocr_cell_crop(
|
||||
if crop.size > 0:
|
||||
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||||
if dark_ratio < 0.005:
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
|
||||
row_idx, col_idx, dark_ratio, cw, ch)
|
||||
return empty_cell
|
||||
|
||||
# --- Prepare crop for OCR ---
|
||||
@@ -4787,6 +4790,11 @@ def _ocr_cell_crop(
|
||||
y_tol = max(15, ch)
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
|
||||
row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
|
||||
else:
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
|
||||
row_idx, col_idx, cw, ch, psm, engine_name)
|
||||
|
||||
# --- PSM 7 fallback for still-empty Tesseract cells ---
|
||||
if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
|
||||
@@ -4808,8 +4816,11 @@ def _ocr_cell_crop(
|
||||
|
||||
# --- Noise filter ---
|
||||
if text.strip():
|
||||
pre_filter = text
|
||||
text = _clean_cell_text_lite(text)
|
||||
if not text:
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
|
||||
row_idx, col_idx, pre_filter)
|
||||
avg_conf = 0.0
|
||||
|
||||
result = dict(empty_cell)
|
||||
|
||||
Reference in New Issue
Block a user