Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
453 lines
16 KiB
Python
453 lines
16 KiB
Python
"""
|
|
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
|
|
dictionary detection, syllable dividers, spell checking, empty column
|
|
removal, and result assembly.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from grid_build_cell_ops import (
|
|
_remove_bullets_and_artifacts,
|
|
_remove_garbled_cells,
|
|
_normalize_word_order,
|
|
_enforce_max_columns,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.

    Returns the final result dict produced by _assemble_result.
    """
    # NOTE: counted before cell cleanup and max_columns enforcement, so this
    # reflects the column layout as originally detected.
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge (best-effort: failure is logged, not fatal) ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction (best-effort) ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    # Import hoisted out of the loop: it is loop-invariant.
    from collections import Counter as _Counter
    for z in zones_data:
        if z.get("zone_type") == "content":
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
|
|
|
|
|
|
def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid.

    Builds pseudo ColumnGeometry objects from each multi-column content
    zone and scores them with _score_dictionary_signals; the result with
    the highest confidence across zones wins.

    Returns:
        Dict with at least "is_dictionary" (bool) and "confidence" (float);
        defaults to not-a-dictionary when detection is skipped or fails.
    """
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        # Both optional imports live inside the try so a missing module
        # degrades gracefully to the default result instead of raising
        # (previously only cv_vocab_types was guarded).
        from cv_layout import _score_dictionary_signals
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            # Need at least two columns and enough cells for a meaningful score.
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    # Cells without word boxes contribute one synthetic word
                    # built from the cell-level text and bbox.
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                # Keep the most confident zone-level verdict.
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
|
|
|
|
|
|
def _insert_syllable_dividers(
|
|
zones_data: List[Dict[str, Any]],
|
|
img_bgr: Any,
|
|
session_id: str,
|
|
syllable_mode: str,
|
|
dict_detection: Dict[str, Any],
|
|
en_col_type: Optional[str],
|
|
all_content_cols: set,
|
|
total_cols: int,
|
|
) -> int:
|
|
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
|
|
syllable_insertions = 0
|
|
if syllable_mode == "none" or img_bgr is None:
|
|
if syllable_mode == "none":
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
t = cell.get("text", "")
|
|
if "|" in t:
|
|
cell["text"] = t.replace("|", "")
|
|
return syllable_insertions
|
|
|
|
_syllable_eligible = False
|
|
if syllable_mode in ("all", "de", "en"):
|
|
_syllable_eligible = True
|
|
elif (dict_detection.get("is_dictionary")
|
|
and dict_detection.get("article_col_index") is not None):
|
|
_syllable_eligible = True
|
|
|
|
_syllable_col_filter: Optional[set] = None
|
|
if syllable_mode == "en":
|
|
_syllable_col_filter = {en_col_type} if en_col_type else set()
|
|
elif syllable_mode == "de":
|
|
if en_col_type and total_cols >= 3:
|
|
_syllable_col_filter = all_content_cols - {en_col_type}
|
|
|
|
if _syllable_eligible:
|
|
try:
|
|
from cv_syllable_detect import insert_syllable_dividers
|
|
force_syllables = (syllable_mode in ("all", "de", "en"))
|
|
syllable_insertions = insert_syllable_dividers(
|
|
zones_data, img_bgr, session_id,
|
|
force=force_syllables,
|
|
col_filter=_syllable_col_filter,
|
|
)
|
|
except Exception as e:
|
|
logger.warning("Syllable insertion failed: %s", e)
|
|
|
|
return syllable_insertions
|
|
|
|
|
|
def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
) -> None:
    """Split merged words using dictionary lookup.

    Mutates cell["text"] in place. For each whitespace token, strips an IPA
    bracket suffix, trailing punctuation, and an apostrophe-contraction (in
    that order), asks _try_split_merged_word whether the remaining alphabetic
    core is two words run together, and reattaches the stripped suffixes to
    the split result. No-op when cv_review / its spell backend is missing.
    """
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            # Spell backend missing: splitting would be guesswork, bail out.
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    # 1) Detach a trailing IPA transcription like "word[ipa]".
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    # 2) Detach trailing punctuation.
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    # 3) Detach a contraction ("don't" -> "don" + "'t");
                    # index >= 2 avoids stripping leading-apostrophe forms.
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                        suffix = contraction + suffix
                    # Only try to split plausible alphabetic cores.
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            # Reattach contraction + punctuation + IPA suffix.
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        # cv_review unavailable: silently skip this best-effort pass.
        pass
|
|
|
|
|
|
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
|
|
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
text = cell.get("text", "")
|
|
if text and "[" in text:
|
|
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
|
|
if fixed != text:
|
|
cell["text"] = fixed
|
|
|
|
|
|
def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all cells.

    Mutates cell["text"] in place for cells the checker corrects. Only
    non-empty cells whose col_type starts with "column_" are checked.
    A missing smart_spell module downgrades to a debug log; any other
    error is logged as a warning. Never raises.
    """
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0

        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue

                # With >= 3 columns and a known English column the language
                # can be pinned per column; otherwise let the checker
                # auto-detect. (Collapsed a redundant elif/else pair that
                # both assigned "auto".)
                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                else:
                    lang = "auto"

                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1

        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)
|
|
|
|
|
|
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove columns that have no cells assigned."""
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
used_col_indices = {c.get("col_index") for c in cells}
|
|
old_cols = z.get("columns", [])
|
|
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
|
|
if len(new_cols) < len(old_cols):
|
|
old_to_new = {}
|
|
for new_i, col in enumerate(new_cols):
|
|
old_i = col.get("col_index", col.get("index", new_i))
|
|
old_to_new[old_i] = new_i
|
|
col["col_index"] = new_i
|
|
col["index"] = new_i
|
|
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
|
|
for cell in cells:
|
|
old_ci = cell.get("col_index", 0)
|
|
cell["col_index"] = old_to_new.get(old_ci, old_ci)
|
|
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
|
|
z["columns"] = new_cols
|
|
|
|
|
|
def _assemble_result(
|
|
zones_data: List[Dict[str, Any]],
|
|
all_words: List[Dict[str, Any]],
|
|
img_w: int,
|
|
img_h: int,
|
|
session_id: str,
|
|
ipa_mode: str,
|
|
syllable_mode: str,
|
|
ipa_target_cols: set,
|
|
skip_ipa: bool,
|
|
dict_detection: Dict[str, Any],
|
|
page_number_info: Optional[Dict],
|
|
boxes_detected: int,
|
|
recovered_count: int,
|
|
duration: float,
|
|
syllable_insertions: int,
|
|
) -> dict:
|
|
"""Build the final result dict (Phase 6)."""
|
|
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
|
|
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
|
|
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
|
|
|
|
# Collect color statistics
|
|
color_stats: Dict[str, int] = {}
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
for wb in cell.get("word_boxes", []):
|
|
cn = wb.get("color_name", "black")
|
|
color_stats[cn] = color_stats.get(cn, 0) + 1
|
|
|
|
# Compute layout metrics
|
|
all_content_row_heights: List[float] = []
|
|
for z in zones_data:
|
|
for row in z.get("rows", []):
|
|
if not row.get("is_header", False):
|
|
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
|
|
if h > 0:
|
|
all_content_row_heights.append(h)
|
|
avg_row_height = (
|
|
sum(all_content_row_heights) / len(all_content_row_heights)
|
|
if all_content_row_heights else 30.0
|
|
)
|
|
font_size_suggestion = max(10, int(avg_row_height * 0.6))
|
|
|
|
return {
|
|
"session_id": session_id,
|
|
"image_width": img_w,
|
|
"image_height": img_h,
|
|
"zones": zones_data,
|
|
"boxes_detected": boxes_detected,
|
|
"summary": {
|
|
"total_zones": len(zones_data),
|
|
"total_columns": total_columns,
|
|
"total_rows": total_rows,
|
|
"total_cells": total_cells,
|
|
"total_words": len(all_words),
|
|
"recovered_colored": recovered_count,
|
|
"color_stats": color_stats,
|
|
},
|
|
"formatting": {
|
|
"bold_columns": [],
|
|
"header_rows": [],
|
|
},
|
|
"layout_metrics": {
|
|
"page_width_px": img_w,
|
|
"page_height_px": img_h,
|
|
"avg_row_height_px": round(avg_row_height, 1),
|
|
"font_size_suggestion_px": font_size_suggestion,
|
|
},
|
|
"dictionary_detection": {
|
|
"is_dictionary": dict_detection.get("is_dictionary", False),
|
|
"confidence": dict_detection.get("confidence", 0.0),
|
|
"signals": dict_detection.get("signals", {}),
|
|
"article_col_index": dict_detection.get("article_col_index"),
|
|
"headword_col_index": dict_detection.get("headword_col_index"),
|
|
},
|
|
"processing_modes": {
|
|
"ipa_mode": ipa_mode,
|
|
"syllable_mode": syllable_mode,
|
|
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
|
"syllables_applied": syllable_insertions > 0,
|
|
},
|
|
"page_number": page_number_info,
|
|
"duration_seconds": round(duration, 2),
|
|
}
|