"""Grid Build Core — the main _build_grid_core() function.

Extracted from grid_editor_api.py for maintainability. Takes merged OCR word
positions and builds a structured, zone-aware grid.

The function delegates to phase-specific modules:
- grid_build_zones.py     — image loading, graphic/box detection, zone grids
- grid_build_cleanup.py   — junk rows, artifacts, pipes, border strips
- grid_build_text_ops.py  — color, headings, IPA, page refs
- grid_build_finalize.py  — bullets, max_columns, dictionary, spelling, result
"""

import logging
import time
from typing import Any, Dict, List, Optional

from grid_editor_helpers import (
    _flatten_word_boxes,
    _get_content_bounds,
    _filter_decorative_margin,
    _filter_footer_words,
    _filter_header_junk,
)
from grid_build_zones import _build_zones
from grid_build_cleanup import _cleanup_zones
from grid_build_text_ops import _process_text
from grid_build_finalize import _finalize_grid

logger = logging.getLogger(__name__)


def _collect_rects(structure_result: Optional[dict], key: str) -> List[Dict[str, int]]:
    """Extract the x/y/w/h rectangles stored under *key* in the structure result.

    Returns an empty list when the structure step has not run (result is None)
    or when the key is absent.
    """
    if not structure_result:
        return []
    return [
        {"x": r["x"], "y": r["y"], "w": r["w"], "h": r["h"]}
        for r in structure_result.get(key, [])
    ]


def _words_outside_rects(
    words: List[dict], rects: List[Dict[str, int]]
) -> List[dict]:
    """Return the words whose *center point* lies inside none of *rects*.

    A word is dropped as soon as its bounding-box center falls inside any
    rectangle; edge-touching centers count as inside (inclusive bounds).
    """

    def _center_inside(w: dict) -> bool:
        cx = w["left"] + w.get("width", 0) / 2
        cy = w["top"] + w.get("height", 0) / 2
        return any(
            r["x"] <= cx <= r["x"] + r["w"] and r["y"] <= cy <= r["y"] + r["h"]
            for r in rects
        )

    return [w for w in words if not _center_inside(w)]


async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.

    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db().
        ipa_mode: "auto" (only when English headwords detected),
            "all" (force IPA on all content columns),
            "en" (English column only),
            "de" (German/definition columns only), or
            "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words),
            "en" (English only), "de" (German only), or "none" (skip).
        enhance: Passed through for logging; enhancement itself happens in
            the phase modules.
        max_columns: Optional hard cap on column count, applied in finalize.
        min_conf: Optional minimum OCR confidence; words below it are dropped
            before any other filtering.

    Returns:
        StructuredGrid result dict.

    Raises:
        ValueError: If session data is incomplete.
    """
    t0 = time.time()

    # ── Phase 1: Input Validation & Word Filtering ──────────────────
    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")

    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")

    # 2a-pre. Apply min_conf filter if specified (words without a 'conf'
    # field are treated as fully confident and kept).
    if min_conf and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)

    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]), enhance, max_columns, min_conf)

    # 2b. Filter decorative margin columns (alphabet graphics)
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)

    # Read document_category from session
    document_category = session.get("document_category")

    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)

    # 2c2. Filter OCR junk from header illustrations
    _filter_header_junk(all_words, img_h, logger, session_id)

    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = _collect_rects(structure_result, "exclude_regions")
    if exclude_rects:
        before = len(all_words)
        filtered = _words_outside_rects(all_words, exclude_rects)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )

    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = _collect_rects(structure_result, "graphics")
    if graphic_rects:
        before = len(all_words)
        all_words = _words_outside_rects(all_words, graphic_rects)
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )

    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)

    # ── Phase 2: Image Processing & Zone Detection ──────────────────
    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h, img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]

    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
    border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)

    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )

    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
    duration = time.time() - t0
    result = _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
    )
    return result