""" Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper. Extracted from cv_cell_grid.py. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from typing import Any, Dict, List from cv_ocr_engines import ( _attach_example_sentences, _fix_phonetic_brackets, _split_comma_entries, ) from cv_cell_grid_legacy import build_cell_grid from cv_cell_grid_merge import ( _merge_continuation_rows, _merge_phonetic_continuation_rows, _merge_wrapped_rows, ) logger = logging.getLogger(__name__) def _cells_to_vocab_entries( cells: List[Dict[str, Any]], columns_meta: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: """Map generic cells to vocab entries with english/german/example fields. Groups cells by row_index, maps col_type -> field name, and produces one entry per row (only rows with at least one non-empty field). """ col_type_to_field = { 'column_en': 'english', 'column_de': 'german', 'column_example': 'example', 'page_ref': 'source_page', 'column_marker': 'marker', 'column_text': 'text', # generic single-column (box sub-sessions) } bbox_key_map = { 'column_en': 'bbox_en', 'column_de': 'bbox_de', 'column_example': 'bbox_ex', 'page_ref': 'bbox_ref', 'column_marker': 'bbox_marker', 'column_text': 'bbox_text', } # Group cells by row_index rows: Dict[int, List[Dict]] = {} for cell in cells: ri = cell['row_index'] rows.setdefault(ri, []).append(cell) entries: List[Dict[str, Any]] = [] for row_idx in sorted(rows.keys()): row_cells = rows[row_idx] entry: Dict[str, Any] = { 'row_index': row_idx, 'english': '', 'german': '', 'example': '', 'text': '', # generic single-column (box sub-sessions) 'source_page': '', 'marker': '', 'confidence': 0.0, 'bbox': None, 'bbox_en': None, 'bbox_de': None, 'bbox_ex': None, 'bbox_ref': None, 'bbox_marker': None, 'bbox_text': None, 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '', } confidences = [] for cell in row_cells: col_type = cell['col_type'] field = col_type_to_field.get(col_type) if field: entry[field] = cell['text'] bbox_field = bbox_key_map.get(col_type) if bbox_field: entry[bbox_field] = cell['bbox_pct'] if cell['confidence'] > 0: confidences.append(cell['confidence']) # Compute row-level bbox as union of all cell bboxes all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')] if all_bboxes: min_x = min(b['x'] for b in all_bboxes) min_y = min(b['y'] for b in all_bboxes) max_x2 = max(b['x'] + b['w'] for b in all_bboxes) max_y2 = max(b['y'] + b['h'] for b in all_bboxes) entry['bbox'] = { 'x': round(min_x, 2), 'y': round(min_y, 2), 'w': round(max_x2 - min_x, 2), 'h': round(max_y2 - min_y, 2), } entry['confidence'] = round( sum(confidences) / len(confidences), 1 ) if confidences else 0.0 # Only include if at least one mapped field has text has_content = any( entry.get(f) for f in col_type_to_field.values() ) if has_content: entries.append(entry) return entries def build_word_grid( ocr_img, column_regions, row_geometries, img_w: int, img_h: int, lang: str = "eng+deu", ocr_engine: str = "auto", img_bgr=None, pronunciation: str = "british", ) -> List[Dict[str, Any]]: """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing. Wrapper around build_cell_grid() that adds vocabulary-specific logic: - Maps cells to english/german/example entries - Applies character confusion fixes, IPA lookup, comma splitting, etc. - Falls back to returning raw cells if no vocab columns detected. Args: ocr_img: Binarized full-page image (for Tesseract). column_regions: Classified columns from Step 3. row_geometries: Rows from Step 4. img_w, img_h: Image dimensions. lang: Default Tesseract language. ocr_engine: 'tesseract', 'rapid', or 'auto'. img_bgr: BGR color image (required for RapidOCR). pronunciation: 'british' or 'american' for IPA lookup. Returns: List of entry dicts with english/german/example text and bbox info (percent). """ cells, columns_meta = build_cell_grid( ocr_img, column_regions, row_geometries, img_w, img_h, lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr, ) if not cells: return [] # Check if vocab layout is present col_types = {c['type'] for c in columns_meta} if not (col_types & {'column_en', 'column_de'}): logger.info("build_word_grid: no vocab columns -- returning raw cells") return cells # Vocab mapping: cells -> entries entries = _cells_to_vocab_entries(cells, columns_meta) # --- Post-processing pipeline (deterministic, no LLM) --- n_raw = len(entries) # 0. Merge cell-wrap continuation rows (empty primary column = text wrap) entries = _merge_wrapped_rows(entries) # 0a. Merge phonetic-only continuation rows into previous entry entries = _merge_phonetic_continuation_rows(entries) # 0b. Merge multi-line continuation rows (lowercase EN, empty DE) entries = _merge_continuation_rows(entries) # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in # llm_review_entries_streaming so changes are visible to the user in Step 6. # 2. Replace OCR'd phonetics with dictionary IPA entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) # 3. Split comma-separated word forms (break, broke, broken -> 3 entries) entries = _split_comma_entries(entries) # 4. Attach example sentences (rows without DE -> examples for preceding entry) entries = _attach_example_sentences(entries) engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown' logger.info(f"build_word_grid: {len(entries)} entries from " f"{n_raw} raw -> {len(entries)} after post-processing " f"(engine={engine_name})") return entries