# NOTE: pasted CI-status / file-viewer metadata removed during cleanup.
# Relevant commit message preserved: "sed replacement left orphaned hostname
# references in story page and empty lines in getApiBase functions."
"""
|
|
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
|
|
divider removal, connector normalization, border strip detection, and
|
|
alphabet sidebar removal.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List
|
|
|
|
from cv_ocr_engines import _words_to_reading_order_text
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_PIPE_RE = re.compile(r"^\|+$")
|
|
|
|
|
|
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Clean up zone data: remove junk rows, artifacts, pipes, border strips.

    Runs the Phase-3 cleanup passes in a fixed order; every helper mutates
    ``zones_data`` in place.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging.

    Returns:
        Updated border_prefiltered flag.
    """
    # Row- and cell-level noise first ...
    _remove_junk_rows(zones_data)
    _remove_artifact_cells(zones_data)
    _remove_oversized_word_boxes(zones_data)
    # ... then divider / connector artifacts ...
    _remove_pipe_dividers(zones_data)
    _normalize_connector_columns(zones_data)
    # ... and finally page-edge decoration.
    updated_flag = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return updated_flag
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove rows where ALL cells contain only short, low-confidence text.
|
|
|
|
Also removes 'oversized stub' rows and 'scattered debris' rows.
|
|
"""
|
|
_JUNK_CONF_THRESHOLD = 50
|
|
_JUNK_MAX_TEXT_LEN = 3
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
rows = z.get("rows", [])
|
|
if not cells or not rows:
|
|
continue
|
|
|
|
# Compute median word height across the zone for oversized detection
|
|
all_wb_heights = [
|
|
wb["height"]
|
|
for cell in cells
|
|
for wb in cell.get("word_boxes") or []
|
|
if wb.get("height", 0) > 0
|
|
]
|
|
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
|
|
|
|
junk_row_indices = set()
|
|
for row in rows:
|
|
ri = row["index"]
|
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
if not row_cells:
|
|
continue
|
|
|
|
row_wbs = [
|
|
wb for cell in row_cells
|
|
for wb in cell.get("word_boxes") or []
|
|
]
|
|
|
|
# Rule 1: ALL word_boxes are low-conf AND short text
|
|
all_junk = True
|
|
for wb in row_wbs:
|
|
text = (wb.get("text") or "").strip()
|
|
conf = wb.get("conf", 0)
|
|
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
|
all_junk = False
|
|
break
|
|
if all_junk and row_wbs:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
# Rule 2: oversized stub -- <=3 words, short total text,
|
|
# and word height > 1.8x median
|
|
if len(row_wbs) <= 3:
|
|
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
|
|
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
|
|
has_page_ref = any(
|
|
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
|
|
for wb in row_wbs
|
|
)
|
|
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
# Rule 3: scattered debris -- rows with only tiny fragments
|
|
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
|
|
if longest <= 2:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
if junk_row_indices:
|
|
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
|
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
|
logger.info(
|
|
"build-grid: removed %d junk rows from zone %d: %s",
|
|
len(junk_row_indices), z["zone_index"],
|
|
sorted(junk_row_indices),
|
|
)
|
|
|
|
|
|
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove individual cells with a single very-short, low-conf word."""
|
|
_ARTIFACT_MAX_LEN = 2
|
|
_ARTIFACT_CONF_THRESHOLD = 65
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
artifact_ids = set()
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
if len(wbs) != 1:
|
|
continue
|
|
wb = wbs[0]
|
|
text = (wb.get("text") or "").strip()
|
|
conf = wb.get("conf", 100)
|
|
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
|
|
artifact_ids.add(cell.get("cell_id"))
|
|
if artifact_ids:
|
|
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
|
|
logger.info(
|
|
"build-grid: removed %d artifact cells from zone %d: %s",
|
|
len(artifact_ids), z.get("zone_index", 0),
|
|
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
|
|
)
|
|
|
|
|
|
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
all_wh = [
|
|
wb["height"]
|
|
for cell in cells
|
|
for wb in cell.get("word_boxes") or []
|
|
if wb.get("height", 0) > 0
|
|
]
|
|
if not all_wh:
|
|
continue
|
|
med_h = sorted(all_wh)[len(all_wh) // 2]
|
|
oversized_threshold = med_h * 3
|
|
removed_oversized = 0
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
|
|
if len(filtered) < len(wbs):
|
|
removed_oversized += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
if removed_oversized:
|
|
z["cells"] = [c for c in cells if c.get("word_boxes")]
|
|
logger.info(
|
|
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
|
|
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
|
)
|
|
|
|
|
|
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove pipe-character word_boxes (column divider artifacts)."""
|
|
for z in zones_data:
|
|
if z.get("vsplit_group") is not None:
|
|
continue # pipes already removed before split
|
|
removed_pipes = 0
|
|
for cell in z.get("cells", []):
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
|
|
if len(filtered) < len(wbs):
|
|
removed_pipes += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
if removed_pipes:
|
|
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
|
logger.info(
|
|
"build-grid: removed %d pipe-divider word_boxes from zone %d",
|
|
removed_pipes, z.get("zone_index", 0),
|
|
)
|
|
|
|
# Strip pipe chars ONLY from cell edges (OCR artifacts).
|
|
# Preserve pipes embedded in words as syllable separators.
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
text = cell.get("text", "")
|
|
if "|" in text:
|
|
cleaned = text.strip("|").strip()
|
|
if cleaned != text.strip():
|
|
cell["text"] = cleaned
|
|
|
|
|
|
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Normalize narrow connector columns where OCR appends noise chars.
|
|
|
|
In synonym dictionaries a narrow column repeats the same word
|
|
(e.g. "oder") in every row. OCR sometimes appends noise chars.
|
|
"""
|
|
for z in zones_data:
|
|
cols = z.get("columns", [])
|
|
cells = z.get("cells", [])
|
|
if not cols or not cells:
|
|
continue
|
|
for col in cols:
|
|
ci = col.get("index")
|
|
col_cells = [c for c in cells if c.get("col_index") == ci]
|
|
if len(col_cells) < 3:
|
|
continue
|
|
text_counts: Dict[str, int] = {}
|
|
for c in col_cells:
|
|
t = (c.get("text") or "").strip()
|
|
if t:
|
|
text_counts[t] = text_counts.get(t, 0) + 1
|
|
if not text_counts:
|
|
continue
|
|
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
|
|
dominant_count = text_counts[dominant_text]
|
|
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
|
|
continue
|
|
fixed = 0
|
|
for c in col_cells:
|
|
t = (c.get("text") or "").strip()
|
|
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
|
|
c["text"] = dominant_text
|
|
wbs = c.get("word_boxes") or []
|
|
if len(wbs) == 1:
|
|
wbs[0]["text"] = dominant_text
|
|
fixed += 1
|
|
if fixed:
|
|
logger.info(
|
|
"build-grid: normalized %d outlier cells in connector column %d "
|
|
"(dominant='%s') zone %d",
|
|
fixed, ci, dominant_text, z.get("zone_index", 0),
|
|
)
|
|
|
|
|
|
def _remove_border_strips(
|
|
zones_data: List[Dict[str, Any]],
|
|
border_prefiltered: bool,
|
|
) -> bool:
|
|
"""Detect and remove page-border decoration strips.
|
|
|
|
Returns updated border_prefiltered flag.
|
|
"""
|
|
border_strip_removed = 0
|
|
if border_prefiltered:
|
|
logger.info("Step 4e: skipped (border pre-filter already applied)")
|
|
return border_prefiltered
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
all_wbs_with_cell: list = []
|
|
for cell in cells:
|
|
for wb in cell.get("word_boxes") or []:
|
|
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
|
if len(all_wbs_with_cell) < 10:
|
|
continue
|
|
all_wbs_with_cell.sort(key=lambda t: t[0])
|
|
total = len(all_wbs_with_cell)
|
|
|
|
# -- Left-edge scan --
|
|
left_strip_count = 0
|
|
left_gap = 0
|
|
running_right = 0
|
|
for gi in range(total - 1):
|
|
running_right = max(
|
|
running_right,
|
|
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
|
|
)
|
|
gap = all_wbs_with_cell[gi + 1][0] - running_right
|
|
if gap > 30:
|
|
left_strip_count = gi + 1
|
|
left_gap = gap
|
|
break
|
|
|
|
# -- Right-edge scan --
|
|
right_strip_count = 0
|
|
right_gap = 0
|
|
running_left = all_wbs_with_cell[-1][0]
|
|
for gi in range(total - 1, 0, -1):
|
|
running_left = min(running_left, all_wbs_with_cell[gi][0])
|
|
prev_right = (
|
|
all_wbs_with_cell[gi - 1][0]
|
|
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
|
|
)
|
|
gap = running_left - prev_right
|
|
if gap > 30:
|
|
right_strip_count = total - gi
|
|
right_gap = gap
|
|
break
|
|
|
|
strip_wbs: set = set()
|
|
strip_side = ""
|
|
strip_gap = 0
|
|
strip_count = 0
|
|
if left_strip_count > 0 and left_strip_count / total < 0.20:
|
|
strip_side = "left"
|
|
strip_count = left_strip_count
|
|
strip_gap = left_gap
|
|
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
|
|
elif right_strip_count > 0 and right_strip_count / total < 0.20:
|
|
strip_side = "right"
|
|
strip_count = right_strip_count
|
|
strip_gap = right_gap
|
|
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
|
|
|
|
if not strip_wbs:
|
|
continue
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
|
if len(filtered) < len(wbs):
|
|
border_strip_removed += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
z["cells"] = [c for c in cells
|
|
if (c.get("word_boxes") or c.get("text", "").strip())]
|
|
logger.info(
|
|
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
|
"(gap=%dpx, strip=%d/%d wbs)",
|
|
border_strip_removed, strip_side, z.get("zone_index", 0),
|
|
strip_gap, strip_count, total,
|
|
)
|
|
|
|
return border_prefiltered
|
|
|
|
|
|
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove decorative edge columns (alphabet sidebar safety net).
|
|
|
|
Dictionary pages have A-Z letter sidebars that OCR reads as single-
|
|
character word_boxes.
|
|
"""
|
|
for z in zones_data:
|
|
columns = z.get("columns", [])
|
|
cells = z.get("cells", [])
|
|
if len(columns) < 3 or not cells:
|
|
continue
|
|
col_cells: Dict[str, List[Dict]] = {}
|
|
for cell in cells:
|
|
ct = cell.get("col_type", "")
|
|
if ct.startswith("column_"):
|
|
col_cells.setdefault(ct, []).append(cell)
|
|
col_types_ordered = sorted(col_cells.keys())
|
|
if len(col_types_ordered) < 3:
|
|
continue
|
|
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
|
edge_cells_list = col_cells.get(edge_ct, [])
|
|
if len(edge_cells_list) < 3:
|
|
continue
|
|
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
|
|
avg_len = sum(len(t) for t in texts) / len(texts)
|
|
single_char = sum(1 for t in texts if len(t) <= 1)
|
|
single_ratio = single_char / len(texts)
|
|
if avg_len > 1.5:
|
|
continue
|
|
if single_ratio < 0.7:
|
|
continue
|
|
removed_count = len(edge_cells_list)
|
|
edge_ids = {id(c) for c in edge_cells_list}
|
|
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
|
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
|
logger.info(
|
|
"Step 4f: removed decorative edge column '%s' from zone %d "
|
|
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
|
|
edge_ct, z.get("zone_index", 0), removed_count,
|
|
avg_len, single_ratio * 100,
|
|
)
|
|
break # only remove one edge per zone
|