From 77869e32f4076866211d681333e160b7b633d180 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 2 Mar 2026 07:24:46 +0100
Subject: [PATCH] feat(ocr-pipeline): use word-lookup instead of cell-OCR for cell grid

Replace per-cell Tesseract re-runs with lookup of pre-existing
full-page words from row.words. Words are filtered by X-overlap
with column bounds. This fixes phantom rows with garbage text,
missing last words, and incomplete example text by using the
more reliable full-page OCR results.

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 75 ++++++++++++++------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index e3b88a7..23dceb7 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3009,6 +3009,48 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
     return _PHONETIC_BRACKET_RE.sub(replacer, text)
 
 
+def _lookup_cell_words(
+    row: RowGeometry,
+    col: PageRegion,
+    pad: int = 8,
+) -> Tuple[List[Dict], float]:
+    """Look up pre-existing Tesseract words that fall within a cell region.
+
+    Instead of re-running OCR on a cell crop, this keeps the full-page
+    Tesseract words (row.words) whose X-center lies inside the padded column.
+
+    Words use coordinates relative to the content ROI; columns use absolute
+    coordinates. row.x equals the content-ROI left_x, so we convert with:
+        col_left_rel = col.x - row.x
+
+    Returns:
+        (words_in_cell, avg_confidence): the matching words with their
+        original relative coordinates (compatible with
+        _words_to_reading_order_text) and their mean 'conf' (0.0 if none).
+    """
+    if not row.words:
+        return [], 0.0
+
+    left_x = row.x  # content ROI offset (absolute)
+    col_left_rel = col.x - left_x - pad
+    col_right_rel = col.x - left_x + col.width + pad
+
+    words_in_cell = []
+    for w in row.words:
+        w_left = w['left']
+        w_right = w_left + w['width']
+        # Keep a word only when its X-center lies in the padded column range
+        w_center_x = (w_left + w_right) / 2
+        if col_left_rel <= w_center_x <= col_right_rel:
+            words_in_cell.append(w)
+
+    avg_conf = 0.0
+    if words_in_cell:
+        avg_conf = round(sum(w['conf'] for w in words_in_cell) / len(words_in_cell), 1)
+
+    return words_in_cell, avg_conf
+
+
 def _ocr_single_cell(
     row_idx: int,
     col_idx: int,
@@ -3023,7 +3065,7 @@ def _ocr_single_cell(
     lang: str,
     lang_map: Dict[str, str],
 ) -> Dict[str, Any]:
-    """OCR a single cell (column × row intersection) and return its dict."""
+    """Populate a single cell (column x row intersection) via word lookup."""
     pad = 8  # pixels
     cell_x = max(0, col.x - pad)
     cell_y = max(0, row.y - pad)
@@ -3051,33 +3093,22 @@ def _ocr_single_cell(
                 'w': round(col.width / img_w * 100, 2),
                 'h': round(row.height / img_h * 100, 2),
             },
-            'ocr_engine': engine_name,
+            'ocr_engine': 'word_lookup',
         }
 
-    cell_region = PageRegion(
-        type=col.type,
-        x=cell_x, y=cell_y,
-        width=cell_w, height=cell_h,
-    )
+    # --- PRIMARY: Word-lookup from full-page Tesseract ---
+    # Use pre-existing words from row.words (Step 4) instead of
+    # re-running OCR on a small crop. This is more reliable because
+    # full-page Tesseract has better context for recognition.
+    words, avg_conf = _lookup_cell_words(row, col, pad=pad)
 
-    # OCR the cell
-    if use_rapid:
-        words = ocr_region_rapid(img_bgr, cell_region)
-    else:
-        cell_lang = lang_map.get(col.type, lang)
-        words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
-
-    # Group into lines, then join in reading order
     if words:
         avg_h = sum(w['height'] for w in words) / len(words)
         y_tol = max(10, int(avg_h * 0.5))
+        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
     else:
-        y_tol = 15
-    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
-
-    avg_conf = 0.0
-    if words:
-        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+        text = ''
+        avg_conf = 0.0
 
     return {
         'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -3093,7 +3124,7 @@ def _ocr_single_cell(
             'w': round(cell_w / img_w * 100, 2),
             'h': round(cell_h / img_h * 100, 2),
         },
-        'ocr_engine': engine_name,
+        'ocr_engine': 'word_lookup',
     }
 
 