""" Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop). Extracted from cv_cell_grid.py. Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any, Dict, List, Optional, Tuple import numpy as np from cv_vocab_types import PageRegion, RowGeometry from cv_ocr_engines import ( RAPIDOCR_AVAILABLE, _assign_row_words_to_columns, _clean_cell_text, _clean_cell_text_lite, _words_to_reading_order_text, _words_to_spaced_text, ocr_region_lighton, ocr_region_rapid, ocr_region_trocr, ) from cv_cell_grid_helpers import ( _MIN_WORD_CONF, _ensure_minimum_crop_size, _heal_row_gaps, _is_artifact_row, _select_psm_for_column, ) logger = logging.getLogger(__name__) try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] # --------------------------------------------------------------------------- # _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode # --------------------------------------------------------------------------- def _ocr_cell_crop( row_idx: int, col_idx: int, row: RowGeometry, col: PageRegion, ocr_img: np.ndarray, img_bgr: Optional[np.ndarray], img_w: int, img_h: int, engine_name: str, lang: str, lang_map: Dict[str, str], ) -> Dict[str, Any]: """OCR a single cell by cropping the exact column x row intersection. No padding beyond cell boundaries -> no neighbour bleeding. """ # Display bbox: exact column x row intersection disp_x = col.x disp_y = row.y disp_w = col.width disp_h = row.height # Crop boundaries: add small internal padding (3px each side) to avoid # clipping characters near column/row edges (e.g. parentheses, descenders). # Stays within image bounds but may extend slightly beyond strict cell. # 3px is small enough to avoid neighbour content at typical scan DPI (200-300). _PAD = 3 cx = max(0, disp_x - _PAD) cy = max(0, disp_y - _PAD) cx2 = min(img_w, disp_x + disp_w + _PAD) cy2 = min(img_h, disp_y + disp_h + _PAD) cw = cx2 - cx ch = cy2 - cy empty_cell = { 'cell_id': f"R{row_idx:02d}_C{col_idx}", 'row_index': row_idx, 'col_index': col_idx, 'col_type': col.type, 'text': '', 'confidence': 0.0, 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h}, 'bbox_pct': { 'x': round(disp_x / img_w * 100, 2) if img_w else 0, 'y': round(disp_y / img_h * 100, 2) if img_h else 0, 'w': round(disp_w / img_w * 100, 2) if img_w else 0, 'h': round(disp_h / img_h * 100, 2) if img_h else 0, }, 'ocr_engine': 'cell_crop_v2', 'is_bold': False, } if cw <= 0 or ch <= 0: logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch) return empty_cell # --- Pixel-density check: skip truly empty cells --- if ocr_img is not None: crop = ocr_img[cy:cy + ch, cx:cx + cw] if crop.size > 0: dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size if dark_ratio < 0.005: logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)", row_idx, col_idx, dark_ratio, cw, ch) return empty_cell # --- Prepare crop for OCR --- cell_lang = lang_map.get(col.type, lang) psm = _select_psm_for_column(col.type, col.width, row.height) text = '' avg_conf = 0.0 used_engine = 'cell_crop_v2' if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None: cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten")) elif engine_name == "lighton" and img_bgr is not None: cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) words = ocr_region_lighton(img_bgr, cell_region) elif engine_name == "rapid" and img_bgr is not None: # Upscale small BGR crops for RapidOCR. bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw] if bgr_crop.size == 0: words = [] else: crop_h, crop_w = bgr_crop.shape[:2] if crop_h < 80: # Force 3x upscale for short rows — small chars need more pixels scale = 3.0 bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) else: bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3) up_h, up_w = bgr_up.shape[:2] scale_x = up_w / max(crop_w, 1) scale_y = up_h / max(crop_h, 1) was_scaled = (up_w != crop_w or up_h != crop_h) logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)", row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y) tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) words = ocr_region_rapid(bgr_up, tmp_region) # Remap positions back to original image coords if words and was_scaled: for w in words: w['left'] = int(w['left'] / scale_x) + cx w['top'] = int(w['top'] / scale_y) + cy w['width'] = int(w['width'] / scale_x) w['height'] = int(w['height'] / scale_y) elif words: for w in words: w['left'] += cx w['top'] += cy else: # Tesseract: upscale tiny crops for better recognition if ocr_img is not None: crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] upscaled = _ensure_minimum_crop_size(crop_slice) up_h, up_w = upscaled.shape[:2] tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm) # Remap word positions back to original image coordinates if words and (up_w != cw or up_h != ch): sx = cw / max(up_w, 1) sy = ch / max(up_h, 1) for w in words: w['left'] = int(w['left'] * sx) + cx w['top'] = int(w['top'] * sy) + cy w['width'] = int(w['width'] * sx) w['height'] = int(w['height'] * sy) elif words: for w in words: w['left'] += cx w['top'] += cy else: words = [] # Filter low-confidence words if words: words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] if words: y_tol = max(15, ch) text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s", row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name) else: logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)", row_idx, col_idx, cw, ch, psm, engine_name) # --- PSM 7 fallback for still-empty Tesseract cells --- if not text.strip() and engine_name == "tesseract" and ocr_img is not None: crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] upscaled = _ensure_minimum_crop_size(crop_slice) up_h, up_w = upscaled.shape[:2] tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7) if psm7_words: psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] if psm7_words: p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) if p7_text.strip(): text = p7_text avg_conf = round( sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 ) used_engine = 'cell_crop_v2_psm7' # Remap PSM7 word positions back to original image coords if up_w != cw or up_h != ch: sx = cw / max(up_w, 1) sy = ch / max(up_h, 1) for w in psm7_words: w['left'] = int(w['left'] * sx) + cx w['top'] = int(w['top'] * sy) + cy w['width'] = int(w['width'] * sx) w['height'] = int(w['height'] * sy) else: for w in psm7_words: w['left'] += cx w['top'] += cy words = psm7_words # --- Noise filter --- if text.strip(): pre_filter = text text = _clean_cell_text_lite(text) if not text: logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r", row_idx, col_idx, pre_filter) avg_conf = 0.0 result = dict(empty_cell) result['text'] = text result['confidence'] = avg_conf result['ocr_engine'] = used_engine # Store individual word bounding boxes (absolute image coordinates) # for pixel-accurate overlay positioning in the frontend. if words and text.strip(): result['word_boxes'] = [ { 'text': w.get('text', ''), 'left': w['left'], 'top': w['top'], 'width': w['width'], 'height': w['height'], 'conf': w.get('conf', 0), } for w in words if w.get('text', '').strip() ] return result # Threshold: columns narrower than this (% of image width) use single-cell # crop OCR instead of full-page word assignment. _NARROW_COL_THRESHOLD_PCT = 15.0 # --------------------------------------------------------------------------- # build_cell_grid_v2 — hybrid grid builder (current default) # --------------------------------------------------------------------------- def build_cell_grid_v2( ocr_img: np.ndarray, column_regions: List[PageRegion], row_geometries: List[RowGeometry], img_w: int, img_h: int, lang: str = "eng+deu", ocr_engine: str = "auto", img_bgr: Optional[np.ndarray] = None, skip_heal_gaps: bool = False, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones. Drop-in replacement for build_cell_grid() -- same signature & return type. Strategy: - Broad columns (>15% image width): Use pre-assigned full-page Tesseract words (from row.words). Handles IPA brackets, punctuation, sentence continuity correctly. - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent neighbour bleeding from adjacent broad columns. """ engine_name = "tesseract" if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): engine_name = ocr_engine elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE: engine_name = "rapid" logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)") # Filter to content rows only content_rows = [r for r in row_geometries if r.row_type == 'content'] if not content_rows: logger.warning("build_cell_grid_v2: no content rows found") return [], [] # Filter phantom rows (word_count=0) and artifact rows before = len(content_rows) content_rows = [r for r in content_rows if r.word_count > 0] skipped = before - len(content_rows) if skipped > 0: logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)") if not content_rows: logger.warning("build_cell_grid_v2: no content rows with words found") return [], [] before_art = len(content_rows) content_rows = [r for r in content_rows if not _is_artifact_row(r)] artifact_skipped = before_art - len(content_rows) if artifact_skipped > 0: logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows") if not content_rows: logger.warning("build_cell_grid_v2: no content rows after artifact filtering") return [], [] # Filter columns _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} relevant_cols = [c for c in column_regions if c.type not in _skip_types] if not relevant_cols: logger.warning("build_cell_grid_v2: no usable columns found") return [], [] # Heal row gaps -- use header/footer boundaries content_rows.sort(key=lambda r: r.y) header_rows = [r for r in row_geometries if r.row_type == 'header'] footer_rows = [r for r in row_geometries if r.row_type == 'footer'] if header_rows: top_bound = max(r.y + r.height for r in header_rows) else: top_bound = content_rows[0].y if footer_rows: bottom_bound = min(r.y for r in footer_rows) else: bottom_bound = content_rows[-1].y + content_rows[-1].height # skip_heal_gaps: When True, keep cell positions at their exact row geometry # positions without expanding to fill gaps from removed rows. if not skip_heal_gaps: _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) relevant_cols.sort(key=lambda c: c.x) columns_meta = [ {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width} for ci, c in enumerate(relevant_cols) ] lang_map = { 'column_en': 'eng', 'column_de': 'deu', 'column_example': 'eng+deu', } # --- Classify columns as broad vs narrow --- narrow_col_indices = set() for ci, col in enumerate(relevant_cols): col_pct = (col.width / img_w * 100) if img_w > 0 else 0 if col_pct < _NARROW_COL_THRESHOLD_PCT: narrow_col_indices.add(ci) broad_col_count = len(relevant_cols) - len(narrow_col_indices) logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), " f"{len(narrow_col_indices)} narrow columns (cell-crop)") # --- Phase 1: Broad columns via full-page word assignment --- cells: List[Dict[str, Any]] = [] for row_idx, row in enumerate(content_rows): # Assign full-page words to columns for this row col_words = _assign_row_words_to_columns(row, relevant_cols) for col_idx, col in enumerate(relevant_cols): if col_idx not in narrow_col_indices: # BROAD column: use pre-assigned full-page words words = col_words.get(col_idx, []) # Filter low-confidence words words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] # Single full-width column (box sub-session): preserve spacing is_single_full_column = ( len(relevant_cols) == 1 and img_w > 0 and relevant_cols[0].width / img_w > 0.9 ) if words: y_tol = max(15, row.height) if is_single_full_column: text = _words_to_spaced_text(words, y_tolerance_px=y_tol) logger.info(f"R{row_idx:02d}: {len(words)} words, " f"text={text!r:.100}") else: text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) else: text = '' avg_conf = 0.0 if is_single_full_column: logger.info(f"R{row_idx:02d}: 0 words (row has " f"{row.word_count} total, y={row.y}..{row.y+row.height})") # Apply noise filter -- but NOT for single-column sub-sessions if not is_single_full_column: text = _clean_cell_text(text) cell = { 'cell_id': f"R{row_idx:02d}_C{col_idx}", 'row_index': row_idx, 'col_index': col_idx, 'col_type': col.type, 'text': text, 'confidence': avg_conf, 'bbox_px': { 'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height, }, 'bbox_pct': { 'x': round(col.x / img_w * 100, 2) if img_w else 0, 'y': round(row.y / img_h * 100, 2) if img_h else 0, 'w': round(col.width / img_w * 100, 2) if img_w else 0, 'h': round(row.height / img_h * 100, 2) if img_h else 0, }, 'ocr_engine': 'word_lookup', 'is_bold': False, } # Store word bounding boxes for pixel-accurate overlay if words and text.strip(): cell['word_boxes'] = [ { 'text': w.get('text', ''), 'left': w['left'], 'top': w['top'], 'width': w['width'], 'height': w['height'], 'conf': w.get('conf', 0), } for w in words if w.get('text', '').strip() ] cells.append(cell) # --- Phase 2: Narrow columns via cell-crop OCR (parallel) --- narrow_tasks = [] for row_idx, row in enumerate(content_rows): for col_idx, col in enumerate(relevant_cols): if col_idx in narrow_col_indices: narrow_tasks.append((row_idx, col_idx, row, col)) if narrow_tasks: max_workers = 4 if engine_name == "tesseract" else 2 with ThreadPoolExecutor(max_workers=max_workers) as pool: futures = { pool.submit( _ocr_cell_crop, ri, ci, row, col, ocr_img, img_bgr, img_w, img_h, engine_name, lang, lang_map, ): (ri, ci) for ri, ci, row, col in narrow_tasks } for future in as_completed(futures): try: cell = future.result() cells.append(cell) except Exception as e: ri, ci = futures[future] logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}") # Sort cells by (row_index, col_index) cells.sort(key=lambda c: (c['row_index'], c['col_index'])) # Remove all-empty rows rows_with_text: set = set() for cell in cells: if cell['text'].strip(): rows_with_text.add(cell['row_index']) before_filter = len(cells) cells = [c for c in cells if c['row_index'] in rows_with_text] empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) if empty_rows_removed > 0: logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows") logger.info(f"build_cell_grid_v2: {len(cells)} cells from " f"{len(content_rows)} rows x {len(relevant_cols)} columns, " f"engine={engine_name} (hybrid)") return cells, columns_meta