diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index d56323d..3c4e304 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3103,8 +3103,9 @@ def _ocr_single_cell( 'ocr_engine': 'word_lookup', } - # Use pre-assigned words (exclusive per column) if provided + # --- PRIMARY: Word-lookup from full-page Tesseract --- words = preassigned_words if preassigned_words is not None else [] + used_engine = 'word_lookup' if words: # Use row height as Y-tolerance so all words within a single row @@ -3112,12 +3113,36 @@ def _ocr_single_cell( # across two lines due to slight vertical offset). y_tol = max(15, row.height) text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) + avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) else: text = '' + avg_conf = 0.0 - avg_conf = 0.0 - if words: - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) + # --- FALLBACK: Cell-OCR for empty cells --- + # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). + # Re-run OCR on the cell crop to catch what word-lookup missed. + if not text.strip() and cell_w > 0 and cell_h > 0: + cell_region = PageRegion( + type=col.type, + x=cell_x, y=cell_y, + width=cell_w, height=cell_h, + ) + if use_rapid and img_bgr is not None: + fallback_words = ocr_region_rapid(img_bgr, cell_region) + else: + cell_lang = lang_map.get(col.type, lang) + fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) + + if fallback_words: + fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words) + fb_y_tol = max(10, int(fb_avg_h * 0.5)) + fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol) + if fb_text.strip(): + text = fb_text + avg_conf = round( + sum(w['conf'] for w in fallback_words) / len(fallback_words), 1 + ) + used_engine = 'cell_ocr_fallback' return { 'cell_id': f"R{row_idx:02d}_C{col_idx}", @@ -3133,7 +3158,7 @@ def _ocr_single_cell( 'w': round(cell_w / img_w * 100, 2), 'h': round(cell_h / img_h * 100, 2), }, - 'ocr_engine': 'word_lookup', + 'ocr_engine': used_engine, }