From 1e0c6bb4b5635a01fcd07d1433fc72e9d7bd0a9c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 4 Mar 2026 23:38:44 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20hybrid=20OCR=20=E2=80=94=20full-page=20?= =?UTF-8?q?for=20broad=20columns,=20cell-crop=20for=20narrow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fundamentally rearchitect build_cell_grid_v2 to combine the best of both approaches: - Broad columns (>=15% image width): Use full-page Tesseract word assignment. Handles IPA brackets, punctuation, sentence flow, and ellipsis correctly. No garbled phonetics. - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent neighbour bleeding from adjacent broad columns. This eliminates the need for complex phonetic bracket replacement on broad columns since full-page Tesseract reads them correctly. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 143 +++++++++++++------ 1 file changed, 98 insertions(+), 45 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index a2bab0e..1e5ea05 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4965,6 +4965,14 @@ def _ocr_cell_crop( return result +# Threshold: columns narrower than this (% of image width) use single-cell +# crop OCR instead of full-page word assignment. Broad columns (EN, DE, +# Example) get full-page Tesseract which handles IPA brackets, punctuation, +# and sentence flow much better. Narrow columns (page_ref, marker) use +# isolated cell crops to prevent neighbour bleeding. 
+_NARROW_COL_THRESHOLD_PCT = 15.0 + + def build_cell_grid_v2( ocr_img: np.ndarray, column_regions: List[PageRegion], @@ -4975,30 +4983,24 @@ def build_cell_grid_v2( ocr_engine: str = "auto", img_bgr: Optional[np.ndarray] = None, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Cell-First Grid: crop each cell in isolation, then OCR. + """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones. Drop-in replacement for build_cell_grid() — same signature & return type. - No full-page word assignment; each cell is OCR'd from its own crop. + + Strategy: + - Broad columns (>15% image width): Use pre-assigned full-page Tesseract + words (from row.words). Handles IPA brackets, punctuation, sentence + continuity correctly. + - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent + neighbour bleeding from adjacent broad columns. """ - # Resolve engine — default to Tesseract for cell-first OCR. - # Tesseract excels at isolated text crops (binarized, upscaled). - # RapidOCR is optimized for full-page scene-text and produces artifacts - # on small cell crops (extra chars, missing punctuation, garbled IPA). 
- use_rapid = False + engine_name = "tesseract" if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): engine_name = ocr_engine - elif ocr_engine == "auto": - engine_name = "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" + elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE: + engine_name = "rapid" - logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'") + logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)") # Filter to content rows only content_rows = [r for r in row_geometries if r.row_type == 'content'] @@ -5033,9 +5035,7 @@ def build_cell_grid_v2( logger.warning("build_cell_grid_v2: no usable columns found") return [], [] - # Heal row gaps — use header/footer boundaries (NOT column bounds!) - # In Cell-First OCR, the crop IS the OCR input, so extending into - # header/footer means OCR'ing header/footer text ("VOCABULARY", page nums). + # Heal row gaps — use header/footer boundaries content_rows.sort(key=lambda r: r.y) header_rows = [r for r in row_geometries if r.row_type == 'header'] footer_rows = [r for r in row_geometries if r.row_type == 'footer'] @@ -5063,38 +5063,91 @@ def build_cell_grid_v2( 'column_example': 'eng+deu', } - # --- Parallel OCR with ThreadPoolExecutor --- - # Tesseract is single-threaded per call, so we benefit from parallelism. - # ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls. 
+ # --- Classify columns as broad vs narrow --- + narrow_col_indices = set() + for ci, col in enumerate(relevant_cols): + col_pct = (col.width / img_w * 100) if img_w > 0 else 0 + if col_pct < _NARROW_COL_THRESHOLD_PCT: + narrow_col_indices.add(ci) + + broad_col_count = len(relevant_cols) - len(narrow_col_indices) + logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), " + f"{len(narrow_col_indices)} narrow columns (cell-crop)") + + # --- Phase 1: Broad columns via full-page word assignment --- cells: List[Dict[str, Any]] = [] - cell_tasks = [] for row_idx, row in enumerate(content_rows): + # Assign full-page words to columns for this row + col_words = _assign_row_words_to_columns(row, relevant_cols) + for col_idx, col in enumerate(relevant_cols): - cell_tasks.append((row_idx, col_idx, row, col)) + if col_idx not in narrow_col_indices: + # BROAD column: use pre-assigned full-page words + words = col_words.get(col_idx, []) + # Filter low-confidence words + words = [w for w in words if w.get('conf', 0) >= 30] - max_workers = 4 if engine_name == "tesseract" else 2 + if words: + y_tol = max(15, row.height) + text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) + avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) + else: + text = '' + avg_conf = 0.0 - with ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = { - pool.submit( - _ocr_cell_crop, - ri, ci, row, col, - ocr_img, img_bgr, img_w, img_h, - engine_name, lang, lang_map, - ): (ri, ci) - for ri, ci, row, col in cell_tasks - } + # Apply noise filter + text = _clean_cell_text(text) - for future in as_completed(futures): - try: - cell = future.result() + cell = { + 'cell_id': f"R{row_idx:02d}_C{col_idx}", + 'row_index': row_idx, + 'col_index': col_idx, + 'col_type': col.type, + 'text': text, + 'confidence': avg_conf, + 'bbox_px': { + 'x': col.x, 'y': row.y, + 'w': col.width, 'h': row.height, + }, + 'bbox_pct': { + 'x': round(col.x / img_w * 100, 2) if img_w 
else 0, + 'y': round(row.y / img_h * 100, 2) if img_h else 0, + 'w': round(col.width / img_w * 100, 2) if img_w else 0, + 'h': round(row.height / img_h * 100, 2) if img_h else 0, + }, + 'ocr_engine': 'word_lookup', + } cells.append(cell) - except Exception as e: - ri, ci = futures[future] - logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}") - # Sort cells by (row_index, col_index) since futures complete out of order + # --- Phase 2: Narrow columns via cell-crop OCR (parallel) --- + narrow_tasks = [] + for row_idx, row in enumerate(content_rows): + for col_idx, col in enumerate(relevant_cols): + if col_idx in narrow_col_indices: + narrow_tasks.append((row_idx, col_idx, row, col)) + + if narrow_tasks: + max_workers = 4 if engine_name == "tesseract" else 2 + with ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = { + pool.submit( + _ocr_cell_crop, + ri, ci, row, col, + ocr_img, img_bgr, img_w, img_h, + engine_name, lang, lang_map, + ): (ri, ci) + for ri, ci, row, col in narrow_tasks + } + for future in as_completed(futures): + try: + cell = future.result() + cells.append(cell) + except Exception as e: + ri, ci = futures[future] + logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}") + + # Sort cells by (row_index, col_index) cells.sort(key=lambda c: (c['row_index'], c['col_index'])) # Remove all-empty rows @@ -5110,7 +5163,7 @@ def build_cell_grid_v2( logger.info(f"build_cell_grid_v2: {len(cells)} cells from " f"{len(content_rows)} rows × {len(relevant_cols)} columns, " - f"engine={engine_name}") + f"engine={engine_name} (hybrid)") return cells, columns_meta