""" Shared helpers for cell-grid construction (v2 + legacy). Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and cv_cell_grid_legacy. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from typing import List import numpy as np from cv_vocab_types import RowGeometry logger = logging.getLogger(__name__) try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] # Minimum OCR word confidence to keep (used across multiple functions) _MIN_WORD_CONF = 30 def _compute_cell_padding(col_width: int, img_w: int) -> int: """Adaptive padding for OCR crops based on column width. Narrow columns (page_ref, marker) need more surrounding context so Tesseract can segment characters correctly. Wide columns keep the minimal 4 px padding to avoid pulling in neighbours. """ col_pct = col_width / img_w * 100 if img_w > 0 else 100 if col_pct < 5: return max(20, col_width // 2) if col_pct < 10: return max(12, col_width // 4) if col_pct < 15: return 8 return 4 def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150, max_scale: int = 3) -> np.ndarray: """Upscale tiny crops so Tesseract gets enough pixel data. If either dimension is below *min_dim*, the crop is bicubic-upscaled so the smallest dimension reaches *min_dim* (capped at *max_scale* x). """ h, w = crop.shape[:2] if h >= min_dim and w >= min_dim: return crop scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1))) if scale <= 1.0: return crop new_w = int(w * scale) new_h = int(h * scale) return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) def _select_psm_for_column(col_type: str, col_width: int, row_height: int) -> int: """Choose the best Tesseract PSM for a given column geometry. - page_ref columns are almost always single short tokens -> PSM 8 - Very narrow or short cells -> PSM 7 (single text line) - Everything else -> PSM 6 (uniform block) """ if col_type in ('page_ref', 'marker'): return 8 # single word if col_width < 100 or row_height < 30: return 7 # single line return 6 # uniform block def _is_artifact_row(row: RowGeometry) -> bool: """Return True if this row contains only scan artifacts, not real text. Artifact rows (scanner shadows, noise) typically produce only single-character detections. A real content row always has at least one token with 2+ characters. """ if row.word_count == 0: return True texts = [w.get('text', '').strip() for w in row.words] return all(len(t) <= 1 for t in texts) def _heal_row_gaps( rows: List[RowGeometry], top_bound: int, bottom_bound: int, ) -> None: """Expand row y/height to fill vertical gaps caused by removed adjacent rows. After filtering out empty or artifact rows, remaining content rows may have gaps between them where the removed rows used to be. This function mutates each row to extend upward/downward to the midpoint of such gaps so that OCR crops cover the full available content area. The first row always extends to top_bound; the last row to bottom_bound. """ if not rows: return rows.sort(key=lambda r: r.y) n = len(rows) orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation for i, row in enumerate(rows): # New top: midpoint between previous row's bottom and this row's top if i == 0: new_top = top_bound else: prev_bot = orig[i - 1][1] my_top = orig[i][0] gap = my_top - prev_bot new_top = prev_bot + gap // 2 if gap > 1 else my_top # New bottom: midpoint between this row's bottom and next row's top if i == n - 1: new_bottom = bottom_bound else: my_bot = orig[i][1] next_top = orig[i + 1][0] gap = next_top - my_bot new_bottom = my_bot + gap // 2 if gap > 1 else my_bot row.y = new_top row.height = max(5, new_bottom - new_top) logger.debug( f"_heal_row_gaps: {n} rows -> y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] " f"(bounds: top={top_bound}, bottom={bottom_bound})" )