breakpilot-lehrer/klausur-service/backend/grid_build_text_ops.py

"""
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
parenthesis fix, IPA phonetic correction, page ref extraction, and
slash-IPA conversion.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple

from cv_color_detect import detect_word_colors
from cv_ocr_engines import (
    fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
    _lookup_ipa,
)
from grid_editor_helpers import (
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
)

logger = logging.getLogger(__name__)


def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode.
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)

    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)

    # 5b. Fix unmatched parentheses in cell text
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text

    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")

    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)

    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)

    # 5f. Strip IPA from headings
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text", "")
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            if stripped and stripped != text:
                cell["text"] = stripped

    # 5g. Extract page_ref cells and footer rows
    _extract_page_refs_and_footers(zones_data, page_number_info)

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }


def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
    en_col_type = None
    all_content_cols: set = set()

    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)

    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols

    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
    fix_cell_phonetics(all_cells, pronunciation="british")
    for cell in all_cells:
        orig = cell.pop("_orig_col_type", None)
        if orig:
            cell["col_type"] = orig
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True

    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)

    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols

    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True

    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                    if not cell_text:
                        continue

                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )

                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                else:
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
                    if len(_words_in_text) >= 3:
                        continue

                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)

    return en_col_type, ipa_target_cols, all_content_cols


def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
) -> None:
    """Extract page_ref cells and footer rows from content zones.

    Modifies zones_data in place. Updates page_number_info if a page number
    footer is found.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }

    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue

        # Extract column_1 cells that look like page references
        page_refs = []
        page_ref_cell_ids = set()
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })
            page_ref_cell_ids.add(cell.get("cell_id"))

        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
            last_cells = [c for c in z["cells"]
                          if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })

        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)

        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                page_number_info = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
                if pn_digits:
                    page_number_info["number"] = int(pn_digits)

        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in z["cells"]:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"

        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )

        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers


def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
) -> None:
    """Convert slash-delimited IPA to bracket notation.

    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
    """
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
        r"(/[^/]{2,}/)"              # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    slash_ipa_fixed = 0

    for z in ([] if skip_ipa else zones_data):
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue

            def _replace_slash_ipa(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)
                inner_raw = ocr_ipa.strip("/").strip()
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
                if ipa:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                inner = inner_raw.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
                return m.group(0)

            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)

            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')

            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)
            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)

            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1

            if new_text != text:
                cell["text"] = new_text

    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)