# NOTE(review): stripped pasted web-view artifacts (a commit message and
# "275 lines / 9.7 KiB / Python" file-stat lines) that preceded the module
# docstring and made the file unparseable.
"""
|
|
Layout analysis for OCR vocabulary pages — orchestration and re-exports.
|
|
|
|
This module provides the high-level entry points for layout analysis and
|
|
re-exports all functions from sub-modules for backward compatibility.
|
|
|
|
Sub-modules:
|
|
- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
|
|
- cv_layout_analyze: Legacy projection-based layout analysis
|
|
- cv_layout_columns: Core column geometry detection
|
|
- cv_layout_column_refine: Sub-column, broad-column, expand operations
|
|
- cv_layout_rows: Row geometry detection
|
|
- cv_layout_row_regularize: Row grid regularization
|
|
- cv_layout_scoring: Language/role scoring, dictionary signals
|
|
- cv_layout_classify: Column type classification (Phase B)
|
|
- cv_layout_classify_position: Position-based classification fallbacks
|
|
"""
|
|
|
|
import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion

logger = logging.getLogger(__name__)


# ── Re-exports (backward compatibility) ───────────────────────────────────

from cv_layout_detection import ( # noqa: F401
    detect_document_type,
    create_ocr_image,
    create_layout_image,
    _filter_narrow_runs,
    _find_content_bounds,
    _detect_header_footer_gaps,
    _region_has_content,
    _add_header_footer,
)

from cv_layout_analyze import ( # noqa: F401
    analyze_layout,
)

from cv_layout_columns import ( # noqa: F401
    detect_column_geometry,
    _detect_columns_by_clustering,
    _build_geometries_from_starts,
)

from cv_layout_column_refine import ( # noqa: F401
    _detect_sub_columns,
    _split_broad_columns,
    expand_narrow_columns,
)

from cv_layout_rows import ( # noqa: F401
    detect_row_geometry,
    _build_rows_from_word_grouping,
)

from cv_layout_row_regularize import ( # noqa: F401
    _regularize_row_grid,
)

from cv_layout_scoring import ( # noqa: F401
    _score_language,
    _score_role,
    _score_dictionary_signals,
    _classify_dictionary_columns,
)

from cv_layout_classify import ( # noqa: F401
    _build_margin_regions,
    positional_column_regions,
    classify_column_types,
    _classify_by_content,
)

from cv_layout_classify_position import ( # noqa: F401
    _classify_by_position_enhanced,
    _classify_by_position_fallback,
)
# ── Orchestration Functions ───────────────────────────────────────────────

def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect columns using two-phase approach: geometry then content classification.

    Phase A: detect_column_geometry() — clustering word positions into columns.
    Phase B: classify_column_types() — content-based type assignment with fallback.

    Falls back to projection-based analyze_layout() if geometry detection fails.

    Args:
        ocr_img: Page image used for geometry detection; only its
            height/width are read directly here.
        dewarped_bgr: Dewarped colour page image, forwarded to the geometry
            detector and to the projection-profile fallback path.

    Returns:
        List of PageRegion objects (columns, page_ref, header/footer and
        margin regions) as produced by positional_column_regions().
    """
    h, w = ocr_img.shape[:2]

    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Phase A failed — fall back to the legacy projection-profile analysis.
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Header/footer gap detection needs the inverted ink image; skip when absent.
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Refine geometry: first split merged sub-columns, then split over-broad ones.
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    # Summary logging: count column-like regions and the classification methods used.
    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
|
def detect_column_geometry_zoned(
    ocr_img: np.ndarray,
    dewarped_bgr: np.ndarray,
) -> Optional[Tuple[
    List[ColumnGeometry],
    int, int, int, int,
    List[Dict],
    np.ndarray,
    List[Dict],
    List[DetectedBox],
]]:
    """Zone-aware column geometry detection.

    1. Finds content bounds.
    2. Runs box detection.
    3. If boxes found: splits page into zones, runs detect_column_geometry()
       per content zone on the corresponding sub-image.
    4. If no boxes: delegates entirely to detect_column_geometry().

    Args:
        ocr_img: Page image fed to column geometry detection.
        dewarped_bgr: Dewarped colour page image fed to box detection and
            geometry detection.

    Returns:
        None when base geometry detection fails; otherwise a 9-tuple of
        (geometries, left_x, right_x, top_y, bottom_y, word_dicts,
        inverted_image, zones_data, boxes).
    """
    # Local import keeps cv_box_detect out of this module's import cycle.
    from cv_box_detect import detect_boxes, split_page_into_zones

    # Base (whole-page) geometry detection; everything below refines it.
    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        return None

    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)

    if not boxes:
        # No boxed regions: the entire content area is a single content zone.
        zone_data = [{
            "index": 0, "zone_type": "content",
            "y": top_y, "height": content_h,
            "x": left_x, "width": content_w, "columns": [],
        }]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, boxes)

    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

    # Keep only non-box content strips tall enough (>= 40 px) to analyse.
    content_strips: List[Tuple[int, int]] = []
    for zone in zones:
        if zone.zone_type == 'content' and zone.height >= 40:
            content_strips.append((zone.y, zone.y + zone.height))

    if not content_strips:
        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
        zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
                      "height": content_h, "x": left_x, "width": content_w, "columns": []}]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, boxes)

    # Stack the content strips into one synthetic image so column detection
    # only sees text outside the detected boxes.
    ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
    bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
    combined_ocr = np.vstack(ocr_strips)
    combined_bgr = np.vstack(bgr_strips)

    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")

    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
    if combined_result is not None:
        # Only the geometries of the combined image are used; its bounds,
        # words and inverted image are intentionally discarded.
        combined_geoms, _c_lx, _c_rx, _c_ty, _c_by, _c_words, _c_inv = combined_result
    else:
        logger.info("ZonedColumns: combined image column detection failed, using original")
        combined_geoms = geometries

    # Cumulative offsets map y in the stacked image back to absolute page y.
    strip_offsets: List[Tuple[int, int, int]] = []
    cum_y = 0
    for ys, ye in content_strips:
        h = ye - ys
        strip_offsets.append((cum_y, h, ys))
        cum_y += h

    def _combined_y_to_abs(cy: int) -> int:
        # Find the strip containing combined-image row cy and translate it.
        for c_start, s_h, abs_start in strip_offsets:
            if cy < c_start + s_h:
                return abs_start + (cy - c_start)
        # Past the last strip: clamp to the end of the final strip.
        last_c, last_h, last_abs = strip_offsets[-1]
        return last_abs + last_h

    if combined_result is not None:
        # Rewrite combined-image column geometry into absolute page coordinates.
        for g in combined_geoms:
            abs_y = _combined_y_to_abs(g.y)
            abs_y_end = _combined_y_to_abs(g.y + g.height)
            g.y = abs_y
            g.height = abs_y_end - abs_y

    if word_dicts:
        # Enrich geometries with words whose centre lies outside every box.
        # NOTE(review): word coords look content-relative (left_x/top_y added
        # back to get absolute centres) — confirm against the producer.
        content_words = []
        for w in word_dicts:
            w_abs_cx = w['left'] + left_x + w['width'] / 2
            w_abs_cy = w['top'] + top_y + w['height'] / 2
            inside_box = any(
                box.x <= w_abs_cx <= box.x + box.width
                and box.y <= w_abs_cy <= box.y + box.height
                for box in boxes
            )
            if not inside_box:
                content_words.append(w)

        target_geoms = combined_geoms if combined_result is not None else geometries
        for g in target_geoms:
            # Assign each content word to the column whose x-span holds its centre.
            g_left_rel = g.x - left_x
            g_right_rel = g_left_rel + g.width
            g.words = [
                w for w in content_words
                if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
            ]
            g.word_count = len(g.words)

        excluded_count = len(word_dicts) - len(content_words)
        if excluded_count:
            logger.info(
                "ZonedColumns: enriched geometries with %d content words "
                "(excluded %d box-interior words)",
                len(content_words), excluded_count,
            )

    # Serialize zones (and the box each one wraps, when present) for callers.
    zones_data: List[Dict] = []
    for zone in zones:
        zone_dict: Dict = {
            "index": zone.index,
            "zone_type": zone.zone_type,
            "y": zone.y,
            "height": zone.height,
            "x": zone.x,
            "width": zone.width,
            "columns": [],
        }
        if zone.box is not None:
            zone_dict["box"] = {
                "x": zone.box.x, "y": zone.box.y,
                "width": zone.box.width, "height": zone.box.height,
                "confidence": zone.box.confidence,
                "border_thickness": zone.box.border_thickness,
            }
        zones_data.append(zone_dict)

    # Prefer combined-image geometries; fall back if that list is empty.
    all_geometries = combined_geoms if combined_geoms else geometries

    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
                f"{len(all_geometries)} total columns (combined-image approach)")

    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)