""" Words-First Grid Builder (Bottom-Up). Builds a cell grid from Tesseract word_boxes directly, without requiring pre-detected columns or rows. Algorithm: 1. Cluster words into columns by X-gap analysis 2. Cluster words into rows by Y-proximity 3. Build cells at (column, row) intersections Returns the same (cells, columns_meta) format as build_cell_grid_v2(). Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re import statistics from typing import Any, Dict, List, Tuple from cv_ocr_engines import ( _group_words_into_lines, _words_to_reading_order_text, ) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # 1. Column clustering # --------------------------------------------------------------------------- def _cluster_columns( words: List[Dict], img_w: int, min_gap_pct: float = 3.0, ) -> List[Dict[str, Any]]: """Cluster words into columns by finding large horizontal gaps. Returns a list of column dicts: [{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...] sorted left-to-right. """ if not words: return [] # Sort by X center sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2) # Collect word heights to compute adaptive threshold heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0] median_h = statistics.median(heights) if heights else 30 # Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3 # Find X-gap boundaries between consecutive words (sorted by X-center) # For each word, compute right edge; for next word, compute left edge boundaries: List[float] = [] # X positions where columns split for i in range(len(sorted_w) - 1): right_edge = sorted_w[i]['left'] + sorted_w[i]['width'] left_edge = sorted_w[i + 1]['left'] gap = left_edge - right_edge if gap > min_gap_px: # Split point is midway through the gap boundaries.append((right_edge + left_edge) / 2) # Build column ranges from boundaries # Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf) col_edges = [0.0] + boundaries + [float(img_w)] columns = [] for ci in range(len(col_edges) - 1): columns.append({ 'index': ci, 'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text', 'x_min': col_edges[ci], 'x_max': col_edges[ci + 1], }) return columns # --------------------------------------------------------------------------- # 2. Row clustering # --------------------------------------------------------------------------- def _cluster_rows( words: List[Dict], ) -> List[Dict[str, Any]]: """Cluster words into visual rows by Y-proximity. Uses half the median word height as Y-tolerance. Returns a list of row dicts: [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...] sorted top-to-bottom. """ if not words: return [] heights = [w['height'] for w in words if w.get('height', 0) > 0] median_h = statistics.median(heights) if heights else 20 y_tol = max(median_h * 0.5, 5) lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol)) rows = [] for ri, line_words in enumerate(lines): y_min = min(w['top'] for w in line_words) y_max = max(w['top'] + w['height'] for w in line_words) rows.append({ 'index': ri, 'y_min': y_min, 'y_max': y_max, 'y_center': (y_min + y_max) / 2, }) return rows # --------------------------------------------------------------------------- # 3. Build cells # --------------------------------------------------------------------------- def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int: """Return column index for a word based on its X-center.""" x_center = word['left'] + word['width'] / 2 for col in columns: if col['x_min'] <= x_center < col['x_max']: return col['index'] # Fallback: nearest column return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index'] def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int: """Return row index for a word based on its Y-center.""" y_center = word['top'] + word['height'] / 2 # Find the row whose y_range contains this word's center for row in rows: if row['y_min'] <= y_center <= row['y_max']: return row['index'] # Fallback: nearest row by Y-center return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index'] def _build_cells( words: List[Dict], columns: List[Dict], rows: List[Dict], img_w: int, img_h: int, ) -> List[Dict[str, Any]]: """Build cell dicts from word assignments to (column, row) pairs.""" if not columns or not rows: return [] # Bucket words into (col_idx, row_idx) buckets: Dict[Tuple[int, int], List[Dict]] = {} for w in words: ci = _assign_word_to_column(w, columns) ri = _assign_word_to_row(w, rows) buckets.setdefault((ci, ri), []).append(w) cells = [] for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])): col = columns[ci] row = rows[ri] # Compute tight bbox from actual word positions x_min = min(w['left'] for w in cell_words) y_min = min(w['top'] for w in cell_words) x_max = max(w['left'] + w['width'] for w in cell_words) y_max = max(w['top'] + w['height'] for w in cell_words) bw = x_max - x_min bh = y_max - y_min # Text from words in reading order text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4))) # Average confidence confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0] avg_conf = sum(confs) / len(confs) if confs else 0.0 # Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py). # PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"), # but the overlay slide mechanism expects one box per word. Split multi-word # boxes into individual word positions proportional to character length. # Also split at "[" boundaries (IPA patterns like "badge[bxd3]"). word_boxes = [] for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])): raw_text = w.get('text', '').strip() # Split by whitespace, at "[" boundaries (IPA), and after leading "!" # e.g. "badge[bxd3]" → ["badge", "[bxd3]"] # e.g. "profit['proft]" → ["profit", "['proft]"] # e.g. "!Betonung" → ["!", "Betonung"] tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text) tokens = [t for t in tokens if t] # remove empty strings if len(tokens) <= 1: # Single word — keep as-is word_boxes.append({ 'text': raw_text, 'left': w['left'], 'top': w['top'], 'width': w['width'], 'height': w['height'], 'conf': w.get('conf', 0), }) else: # Multi-word phrase — split proportionally by character count total_chars = sum(len(t) for t in tokens) if total_chars == 0: continue # Small gap between words (2% of box width per gap) n_gaps = len(tokens) - 1 gap_px = w['width'] * 0.02 usable_w = w['width'] - gap_px * n_gaps cursor = w['left'] for t in tokens: token_w = max(1, usable_w * len(t) / total_chars) word_boxes.append({ 'text': t, 'left': round(cursor), 'top': w['top'], 'width': round(token_w), 'height': w['height'], 'conf': w.get('conf', 0), }) cursor += token_w + gap_px cells.append({ 'cell_id': f"R{ri:02d}_C{ci}", 'row_index': ri, 'col_index': ci, 'col_type': col['type'], 'text': text, 'confidence': round(avg_conf, 1), 'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh}, 'bbox_pct': { 'x': round(x_min / img_w * 100, 2) if img_w else 0, 'y': round(y_min / img_h * 100, 2) if img_h else 0, 'w': round(bw / img_w * 100, 2) if img_w else 0, 'h': round(bh / img_h * 100, 2) if img_h else 0, }, 'word_boxes': word_boxes, 'ocr_engine': 'words_first', 'is_bold': False, }) return cells # --------------------------------------------------------------------------- # 4. Public API # --------------------------------------------------------------------------- def build_grid_from_words( word_dicts: List[Dict], img_w: int, img_h: int, min_confidence: int = 30, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """Build a cell grid bottom-up from Tesseract word boxes. Args: word_dicts: Flat list of word dicts with keys: text, left, top, width, height, conf (absolute pixel coordinates). img_w: Image width in pixels. img_h: Image height in pixels. min_confidence: Minimum OCR confidence to keep a word. Returns: (cells, columns_meta) — same format as build_cell_grid_v2(). cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc. columns_meta: list of {'index', 'type', 'x', 'width'} dicts. """ if not word_dicts: logger.info("build_grid_from_words: no words — returning empty grid") return [], [] # Filter by confidence words = [ w for w in word_dicts if w.get('conf', 0) >= min_confidence and w.get('text', '').strip() ] if not words: logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence) return [], [] logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts)) # Step 1: cluster columns columns = _cluster_columns(words, img_w) logger.info("build_grid_from_words: %d column(s) detected", len(columns)) # Step 2: cluster rows rows = _cluster_rows(words) logger.info("build_grid_from_words: %d row(s) detected", len(rows)) # Step 3: build cells cells = _build_cells(words, columns, rows, img_w, img_h) logger.info("build_grid_from_words: %d cells built", len(cells)) # Build columns_meta in same format as build_cell_grid_v2 columns_meta = [] for col in columns: x = int(col['x_min']) w = int(col['x_max'] - col['x_min']) columns_meta.append({ 'index': col['index'], 'type': col['type'], 'x': x, 'width': w, }) return cells, columns_meta