""" Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page. Extracted from vocab_worksheet_api.py to keep file sizes manageable. Pipeline steps: orientation → deskew → dewarp → crop → scan-quality → enhance → dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build → vocab extraction → row merging """ import logging import uuid from typing import Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Optional heavy dependencies (not available in every environment) # --------------------------------------------------------------------------- try: import cv2 import numpy as np except ImportError: cv2 = None # type: ignore[assignment] np = None # type: ignore[assignment] logger.warning("cv2 / numpy not available — OCR pipeline disabled") try: from PIL import Image except ImportError: Image = None # type: ignore[assignment] try: import pytesseract except ImportError: pytesseract = None # type: ignore[assignment] # CV pipeline helpers try: from cv_vocab_pipeline import ( deskew_two_pass, dewarp_image, detect_and_fix_orientation, _cells_to_vocab_entries, _fix_phonetic_brackets, ) except ImportError: deskew_two_pass = None # type: ignore[assignment] dewarp_image = None # type: ignore[assignment] detect_and_fix_orientation = None # type: ignore[assignment] _cells_to_vocab_entries = None # type: ignore[assignment] _fix_phonetic_brackets = None # type: ignore[assignment] try: from cv_cell_grid import ( _merge_wrapped_rows, _merge_phonetic_continuation_rows, _merge_continuation_rows, ) except ImportError: _merge_wrapped_rows = None # type: ignore[assignment] _merge_phonetic_continuation_rows = None # type: ignore[assignment] _merge_continuation_rows = None # type: ignore[assignment] try: from cv_ocr_engines import ocr_region_rapid except ImportError: ocr_region_rapid = None # type: ignore[assignment] try: from cv_vocab_types import PageRegion except ImportError: PageRegion = None # type: ignore[assignment] try: from ocr_pipeline_ocr_merge import ( _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words, ) except ImportError: _split_paddle_multi_words = None # type: ignore[assignment] _merge_paddle_tesseract = None # type: ignore[assignment] _deduplicate_words = None # type: ignore[assignment] try: from cv_words_first import build_grid_from_words except ImportError: build_grid_from_words = None # type: ignore[assignment] try: from ocr_pipeline_session_store import ( create_session_db as create_pipeline_session_db, update_session_db as update_pipeline_session_db, ) except ImportError: create_pipeline_session_db = None # type: ignore[assignment] update_pipeline_session_db = None # type: ignore[assignment] # --------------------------------------------------------------------------- # Main pipeline function # --------------------------------------------------------------------------- async def _run_ocr_pipeline_for_page( img_bgr: "np.ndarray", page_number: int, vocab_session_id: str, *, ipa_mode: str = "none", syllable_mode: str = "none", enable_enhance: bool = True, max_columns: Optional[int] = 3, override_min_conf: Optional[int] = None, ) -> tuple: """Run the full Kombi OCR pipeline on a single page and return vocab entries. Uses the same pipeline as the admin OCR Kombi pipeline: orientation → deskew → dewarp → crop → dual-engine OCR → grid-build (with pipe-autocorrect, word-gap merge, dictionary detection, etc.) Args: img_bgr: BGR numpy array. page_number: 0-indexed page number. vocab_session_id: Vocab session ID for logging. ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de". syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de". Returns (entries, rotation_deg) where entries is a list of dicts and rotation_deg is the orientation correction applied (0, 90, 180, 270). """ import time as _time t_total = _time.time() img_h, img_w = img_bgr.shape[:2] logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}") # 1. Orientation detection (fix upside-down scans) t0 = _time.time() img_bgr, rotation = detect_and_fix_orientation(img_bgr) if rotation: img_h, img_w = img_bgr.shape[:2] logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)") else: logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)") # 2. Create pipeline session in DB (visible in admin Kombi UI) pipeline_session_id = str(uuid.uuid4()) try: _, png_buf = cv2.imencode(".png", img_bgr) original_png = png_buf.tobytes() await create_pipeline_session_db( pipeline_session_id, name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}", filename=f"page_{page_number + 1}.png", original_png=original_png, ) except Exception as e: logger.warning(f"Could not create pipeline session in DB: {e}") # 3. Three-pass deskew t0 = _time.time() deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy()) logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)") # 4. Dewarp t0 = _time.time() dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)") # 5. Content crop (removes scanner borders, gutter shadows) t0 = _time.time() try: from page_crop import detect_and_crop_page cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr) if crop_result.get("crop_applied"): dewarped_bgr = cropped_bgr logger.info(f" crop: applied ({_time.time() - t0:.1f}s)") else: logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)") except Exception as e: logger.warning(f" crop: failed ({e}), continuing with uncropped image") # 5b. Scan quality assessment scan_quality_report = None try: from scan_quality import score_scan_quality scan_quality_report = score_scan_quality(dewarped_bgr) except Exception as e: logger.warning(f" scan quality: failed ({e})") if override_min_conf: min_ocr_conf = override_min_conf else: min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40 # 5c. Image enhancement for degraded scans is_degraded = scan_quality_report.is_degraded if scan_quality_report else False if is_degraded and enable_enhance: try: from ocr_image_enhance import enhance_for_ocr dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True) logger.info(" enhancement: applied (degraded scan)") except Exception as e: logger.warning(f" enhancement: failed ({e})") # 6. Dual-engine OCR (RapidOCR + Tesseract → merge) t0 = _time.time() img_h, img_w = dewarped_bgr.shape[:2] # RapidOCR (local ONNX) try: from cv_ocr_engines import ocr_region_rapid from cv_vocab_types import PageRegion full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h) rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or [] except Exception as e: logger.warning(f" RapidOCR failed: {e}") rapid_words = [] # Tesseract from PIL import Image import pytesseract pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB)) data = pytesseract.image_to_data( pil_img, lang="eng+deu", config="--psm 6 --oem 3", output_type=pytesseract.Output.DICT, ) tess_words = [] for i in range(len(data["text"])): text = str(data["text"][i]).strip() conf_raw = str(data["conf"][i]) conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 if not text or conf < min_ocr_conf: continue tess_words.append({ "text": text, "left": data["left"][i], "top": data["top"][i], "width": data["width"][i], "height": data["height"][i], "conf": conf, }) # Merge dual-engine results from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words from cv_words_first import build_grid_from_words rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else [] if rapid_split or tess_words: merged_words = _merge_paddle_tesseract(rapid_split, tess_words) merged_words = _deduplicate_words(merged_words) else: merged_words = tess_words # fallback to Tesseract only # Build initial grid from merged words cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns) for cell in cells: cell["ocr_engine"] = "rapid_kombi" n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 n_cols = len(columns_meta) logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, " f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)") # 7. Save word_result to pipeline session (needed by _build_grid_core) word_result = { "cells": cells, "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)}, "columns_used": columns_meta, "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": 0, "ocr_engine": "rapid_kombi", "raw_tesseract_words": tess_words, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), }, } # Save images + word_result to pipeline session for admin visibility try: _, dsk_buf = cv2.imencode(".png", deskewed_bgr) _, dwp_buf = cv2.imencode(".png", dewarped_bgr) await update_pipeline_session_db( pipeline_session_id, deskewed_png=dsk_buf.tobytes(), dewarped_png=dwp_buf.tobytes(), cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(), word_result=word_result, deskew_result={"angle_applied": round(angle_applied, 3)}, dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)}, current_step=8, ) except Exception as e: logger.warning(f"Could not update pipeline session: {e}") # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.) t0 = _time.time() try: from grid_editor_api import _build_grid_core session_data = { "word_result": word_result, } grid_result = await _build_grid_core( pipeline_session_id, session_data, ipa_mode=ipa_mode, syllable_mode=syllable_mode, ) logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells " f"({_time.time() - t0:.1f}s)") # Save grid result to pipeline session try: await update_pipeline_session_db( pipeline_session_id, grid_editor_result=grid_result, current_step=11, ) except Exception: pass except Exception as e: logger.warning(f" grid-build failed: {e}, falling back to basic grid") grid_result = None # 9. Extract vocab entries # Prefer grid-build result (better column detection, more cells) over # the initial build_grid_from_words() which often under-clusters. page_vocabulary = [] extraction_source = "none" # A) Try grid-build zones first (best quality: 4-column detection, autocorrect) if grid_result and grid_result.get("zones"): for zone in grid_result["zones"]: zone_cols = zone.get("columns", []) zone_cells = zone.get("cells", []) if not zone_cols or not zone_cells: continue # Sort columns by x position to determine roles sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0)) col_idx_to_pos = {} for pos, col in enumerate(sorted_cols): ci = col.get("col_index", col.get("index", -1)) col_idx_to_pos[ci] = pos # Skip zones with only 1 column (likely headers/boxes) if len(sorted_cols) < 2: continue # Group cells by row rows_map: dict = {} for cell in zone_cells: ri = cell.get("row_index", 0) if ri not in rows_map: rows_map[ri] = {} ci = cell.get("col_index", 0) rows_map[ri][ci] = (cell.get("text") or "").strip() n_cols = len(sorted_cols) for ri in sorted(rows_map.keys()): row = rows_map[ri] # Collect texts in column-position order texts = [] for col in sorted_cols: ci = col.get("col_index", col.get("index", -1)) texts.append(row.get(ci, "")) if not any(texts): continue # Map by position, skipping narrow first column (page refs/markers) # Heuristic: if first column is very narrow (<15% of zone width), # it's likely a marker/ref column — skip it for vocab first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0) zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0))) skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3 data_texts = texts[1:] if skip_first else texts entry = { "id": str(uuid.uuid4()), "english": data_texts[0] if len(data_texts) > 0 else "", "german": data_texts[1] if len(data_texts) > 1 else "", "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "", "source_page": page_number + 1, } if entry["english"] or entry["german"]: page_vocabulary.append(entry) if page_vocabulary: extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)" # B) Fallback: original cells with column classification if not page_vocabulary: col_types = {c.get("type") for c in columns_meta} is_vocab = bool(col_types & {"column_en", "column_de"}) if is_vocab: entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_phonetic_brackets(entries, pronunciation="british") for entry in entries: if not entry.get("english") and not entry.get("german"): continue page_vocabulary.append({ "id": str(uuid.uuid4()), "english": entry.get("english", ""), "german": entry.get("german", ""), "example_sentence": entry.get("example", ""), "source_page": page_number + 1, }) extraction_source = f"classified ({len(columns_meta)} cols)" else: # Last resort: all cells by position rows_map2: dict = {} for cell in cells: ri = cell.get("row_index", 0) if ri not in rows_map2: rows_map2[ri] = {} ci = cell.get("col_index", 0) rows_map2[ri][ci] = (cell.get("text") or "").strip() all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()}) for ri in sorted(rows_map2.keys()): row = rows_map2[ri] texts = [row.get(ci, "") for ci in all_ci] if not any(texts): continue page_vocabulary.append({ "id": str(uuid.uuid4()), "english": texts[0] if len(texts) > 0 else "", "german": texts[1] if len(texts) > 1 else "", "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "", "source_page": page_number + 1, }) extraction_source = f"generic ({len(all_ci)} cols)" # --- Post-processing: merge cell-wrap continuation rows --- if len(page_vocabulary) >= 2: try: # Convert to internal format (example_sentence → example) internal = [] for v in page_vocabulary: internal.append({ 'row_index': len(internal), 'english': v.get('english', ''), 'german': v.get('german', ''), 'example': v.get('example_sentence', ''), }) n_before = len(internal) internal = _merge_wrapped_rows(internal) internal = _merge_phonetic_continuation_rows(internal) internal = _merge_continuation_rows(internal) if len(internal) < n_before: # Rebuild page_vocabulary from merged entries merged_vocab = [] for entry in internal: if not entry.get('english') and not entry.get('german'): continue merged_vocab.append({ 'id': str(uuid.uuid4()), 'english': entry.get('english', ''), 'german': entry.get('german', ''), 'example_sentence': entry.get('example', ''), 'source_page': page_number + 1, }) logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries") page_vocabulary = merged_vocab except Exception as e: logger.warning(f" row merging failed (non-critical): {e}") logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}") total_duration = _time.time() - t_total logger.info(f"Kombi Pipeline page {page_number + 1}: " f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s") return page_vocabulary, rotation, scan_quality_report