diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 837fcc6..42e7314 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4727,6 +4727,7 @@ def _ocr_cell_crop( } if cw <= 0 or ch <= 0: + logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch) return empty_cell # --- Pixel-density check: skip truly empty cells --- @@ -4735,6 +4736,8 @@ def _ocr_cell_crop( if crop.size > 0: dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size if dark_ratio < 0.005: + logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)", + row_idx, col_idx, dark_ratio, cw, ch) return empty_cell # --- Prepare crop for OCR --- @@ -4787,6 +4790,11 @@ def _ocr_cell_crop( y_tol = max(15, ch) text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) + logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s", + row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name) + else: + logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)", + row_idx, col_idx, cw, ch, psm, engine_name) # --- PSM 7 fallback for still-empty Tesseract cells --- if not text.strip() and engine_name == "tesseract" and ocr_img is not None: @@ -4808,8 +4816,11 @@ def _ocr_cell_crop( # --- Noise filter --- if text.strip(): + pre_filter = text text = _clean_cell_text_lite(text) if not text: + logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r", + row_idx, col_idx, pre_filter) avg_conf = 0.0 result = dict(empty_cell)