""" Layout analysis for OCR vocabulary pages — orchestration and re-exports. This module provides the high-level entry points for layout analysis and re-exports all functions from sub-modules for backward compatibility. Sub-modules: - cv_layout_detection: Document type detection, image creation, content bounds, header/footer - cv_layout_analyze: Legacy projection-based layout analysis - cv_layout_columns: Core column geometry detection - cv_layout_column_refine: Sub-column, broad-column, expand operations - cv_layout_rows: Row geometry detection - cv_layout_row_regularize: Row grid regularization - cv_layout_scoring: Language/role scoring, dictionary signals - cv_layout_classify: Column type classification (Phase B) - cv_layout_classify_position: Position-based classification fallbacks """ import logging from typing import Any, Dict, List, Optional, Tuple import numpy as np from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion logger = logging.getLogger(__name__) # ── Re-exports (backward compatibility) ─────────────────────────────────── from cv_layout_detection import ( # noqa: F401 detect_document_type, create_ocr_image, create_layout_image, _filter_narrow_runs, _find_content_bounds, _detect_header_footer_gaps, _region_has_content, _add_header_footer, ) from cv_layout_analyze import ( # noqa: F401 analyze_layout, ) from cv_layout_columns import ( # noqa: F401 detect_column_geometry, _detect_columns_by_clustering, _build_geometries_from_starts, ) from cv_layout_column_refine import ( # noqa: F401 _detect_sub_columns, _split_broad_columns, expand_narrow_columns, ) from cv_layout_rows import ( # noqa: F401 detect_row_geometry, _build_rows_from_word_grouping, ) from cv_layout_row_regularize import ( # noqa: F401 _regularize_row_grid, ) from cv_layout_scoring import ( # noqa: F401 _score_language, _score_role, _score_dictionary_signals, _classify_dictionary_columns, ) from cv_layout_classify import ( # noqa: F401 _build_margin_regions, positional_column_regions, classify_column_types, _classify_by_content, ) from cv_layout_classify_position import ( # noqa: F401 _classify_by_position_enhanced, _classify_by_position_fallback, ) # ── Orchestration Functions ─────────────────────────────────────────────── def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: """Detect columns using two-phase approach: geometry then content classification. Phase A: detect_column_geometry() — clustering word positions into columns. Phase B: classify_column_types() — content-based type assignment with fallback. Falls back to projection-based analyze_layout() if geometry detection fails. """ h, w = ocr_img.shape[:2] result = detect_column_geometry(ocr_img, dewarped_bgr) if result is None: logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles") layout_img = create_layout_image(dewarped_bgr) return analyze_layout(layout_img, ocr_img) geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result content_w = right_x - left_x header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None) geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y, header_y=header_y, footer_y=footer_y) geometries = _split_broad_columns(geometries, content_w, left_x=left_x) content_h = bottom_y - top_y regions = positional_column_regions(geometries, content_w, content_h, left_x) col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref']) methods = set(r.classification_method for r in regions if r.classification_method) logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): " f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}") return regions def detect_column_geometry_zoned( ocr_img: np.ndarray, dewarped_bgr: np.ndarray, ) -> Optional[Tuple[ List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray, List[Dict], List[DetectedBox], ]]: """Zone-aware column geometry detection. 1. Finds content bounds. 2. Runs box detection. 3. If boxes found: splits page into zones, runs detect_column_geometry() per content zone on the corresponding sub-image. 4. If no boxes: delegates entirely to detect_column_geometry(). """ from cv_box_detect import detect_boxes, split_page_into_zones geo_result = detect_column_geometry(ocr_img, dewarped_bgr) if geo_result is None: return None geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result content_w = right_x - left_x content_h = bottom_y - top_y boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h) if not boxes: zone_data = [{ "index": 0, "zone_type": "content", "y": top_y, "height": content_h, "x": left_x, "width": content_w, "columns": [], }] return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zone_data, boxes) zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes) content_strips: List[Tuple[int, int]] = [] for zone in zones: if zone.zone_type == 'content' and zone.height >= 40: content_strips.append((zone.y, zone.y + zone.height)) if not content_strips: logger.info("ZonedColumns: no content zones with height >= 40, using original result") zone_data = [{"index": 0, "zone_type": "content", "y": top_y, "height": content_h, "x": left_x, "width": content_w, "columns": []}] return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zone_data, boxes) ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips] bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips] combined_ocr = np.vstack(ocr_strips) combined_bgr = np.vstack(bgr_strips) logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} " f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}") combined_result = detect_column_geometry(combined_ocr, combined_bgr) if combined_result is not None: combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result else: logger.info("ZonedColumns: combined image column detection failed, using original") combined_geoms = geometries strip_offsets: List[Tuple[int, int, int]] = [] cum_y = 0 for ys, ye in content_strips: h = ye - ys strip_offsets.append((cum_y, h, ys)) cum_y += h def _combined_y_to_abs(cy: int) -> int: for c_start, s_h, abs_start in strip_offsets: if cy < c_start + s_h: return abs_start + (cy - c_start) last_c, last_h, last_abs = strip_offsets[-1] return last_abs + last_h if combined_result is not None: for g in combined_geoms: abs_y = _combined_y_to_abs(g.y) abs_y_end = _combined_y_to_abs(g.y + g.height) g.y = abs_y g.height = abs_y_end - abs_y if word_dicts: content_words = [] for w in word_dicts: w_abs_cx = w['left'] + left_x + w['width'] / 2 w_abs_cy = w['top'] + top_y + w['height'] / 2 inside_box = any( box.x <= w_abs_cx <= box.x + box.width and box.y <= w_abs_cy <= box.y + box.height for box in boxes ) if not inside_box: content_words.append(w) target_geoms = combined_geoms if combined_result is not None else geometries for g in target_geoms: g_left_rel = g.x - left_x g_right_rel = g_left_rel + g.width g.words = [ w for w in content_words if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel ] g.word_count = len(g.words) excluded_count = len(word_dicts) - len(content_words) if excluded_count: logger.info( "ZonedColumns: enriched geometries with %d content words " "(excluded %d box-interior words)", len(content_words), excluded_count, ) zones_data: List[Dict] = [] for zone in zones: zone_dict: Dict = { "index": zone.index, "zone_type": zone.zone_type, "y": zone.y, "height": zone.height, "x": zone.x, "width": zone.width, "columns": [], } if zone.box is not None: zone_dict["box"] = { "x": zone.box.x, "y": zone.box.y, "width": zone.box.width, "height": zone.box.height, "confidence": zone.box.confidence, "border_thickness": zone.box.border_thickness, } zones_data.append(zone_dict) all_geometries = combined_geoms if combined_geoms else geometries logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), " f"{len(all_geometries)} total columns (combined-image approach)") return (all_geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zones_data, boxes)