klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
357 lines
13 KiB
Python
"""
|
|
Gutter Repair Grid — grid analysis and suggestion application.
|
|
|
|
Extracted from cv_gutter_repair.py for modularity.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
from cv_gutter_repair_core import (
|
|
_init_spellcheckers,
|
|
_is_ipa_text,
|
|
_is_known,
|
|
_MIN_WORD_LEN_HYPHEN,
|
|
_SPELL_AVAILABLE,
|
|
_STOPWORDS,
|
|
_TRAILING_PUNCT_RE,
|
|
_try_hyphen_join,
|
|
_try_spell_fix,
|
|
_word_is_at_gutter_edge,
|
|
GutterSuggestion,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Grid analysis
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def analyse_grid_for_gutter_repair(
    grid_data: Dict[str, Any],
    image_width: int = 0,
) -> Dict[str, Any]:
    """Analyse a structured grid and return gutter repair suggestions.

    Scans every non-empty cell for a last word that sits at the right
    (gutter) edge of its column and is not a known dictionary word, then
    tries two repair strategies in order:

    1. ``hyphen_join`` — join the word with the first word of the cell in
       the next row (line-wrap continuation across the gutter).
    2. ``spell_fix`` — single-word spell correction for gutter blur.

    Args:
        grid_data: The grid_editor_result from the session (zones→cells structure).
        image_width: Image width in pixels (for determining gutter side).
            NOTE(review): currently unused by this implementation — kept for
            interface compatibility; confirm before removing.

    Returns:
        Dict with "suggestions" list (each a ``GutterSuggestion.to_dict()``),
        "stats" (counters, or an "error" key when the spellchecker is
        unavailable), and "duration_seconds".
    """
    t0 = time.time()
    _init_spellcheckers()

    # Without pyspellchecker neither strategy can run — bail out early
    # with an explicit error in the stats rather than raising.
    if not _SPELL_AVAILABLE:
        return {
            "suggestions": [],
            "stats": {"error": "pyspellchecker not installed"},
            "duration_seconds": 0,
        }

    zones = grid_data.get("zones", [])
    suggestions: List[GutterSuggestion] = []
    words_checked = 0
    gutter_candidates = 0

    for zi, zone in enumerate(zones):
        columns = zone.get("columns", [])
        cells = zone.get("cells", [])
        if not columns or not cells:
            continue

        # Build column lookup: col_index → {x, width, type}.
        # Both key spellings are supported because upstream producers
        # differ ("index"/"x_min_px" vs "col_index"/"x"/"width").
        col_info: Dict[int, Dict] = {}
        for col in columns:
            ci = col.get("index", col.get("col_index", -1))
            col_info[ci] = {
                "x": col.get("x_min_px", col.get("x", 0)),
                "width": col.get("x_max_px", col.get("width", 0)) - col.get("x_min_px", col.get("x", 0)),
                "type": col.get("type", col.get("col_type", "")),
            }

        # Build row→col→cell lookup
        cell_map: Dict[Tuple[int, int], Dict] = {}
        max_row = 0  # NOTE(review): computed but not read below — confirm intent
        for cell in cells:
            ri = cell.get("row_index", 0)
            ci = cell.get("col_index", 0)
            cell_map[(ri, ci)] = cell
            if ri > max_row:
                max_row = ri

        # Determine which columns are at the gutter edge.
        # For a left page: rightmost content columns.
        # For now, check ALL columns — a word is a candidate if it's at the
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            # IPA transcriptions would trip the spellchecker constantly.
            if _is_ipa_text(text):
                continue

            words_checked += 1
            col = col_info.get(ci, {})
            col_type = col.get("type", "")

            # Get word boxes to check position
            word_boxes = cell.get("word_boxes", [])

            # Check the LAST word in the cell (rightmost, closest to gutter)
            cell_words = text.split()
            if not cell_words:
                continue

            last_word = cell_words[-1]

            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue

            last_word_clean = last_word.rstrip(".,;:!?)(")
            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Check if the last word is at the gutter edge
            is_at_edge = False
            if word_boxes:
                last_wb = word_boxes[-1]
                is_at_edge = _word_is_at_gutter_edge(
                    last_wb, col.get("x", 0), col.get("width", 1)
                )
            else:
                # No word boxes — use cell bbox
                bbox = cell.get("bbox_px", {})
                is_at_edge = _word_is_at_gutter_edge(
                    {"left": bbox.get("x", 0), "width": bbox.get("w", 0)},
                    col.get("x", 0), col.get("width", 1)
                )

            if not is_at_edge:
                continue

            # Word is at gutter edge — check if it's a known word
            if _is_known(last_word_clean):
                continue

            # Check if the word ends with "-" (explicit hyphen break)
            ends_with_hyphen = last_word.endswith("-")

            # If the word already ends with "-" and the stem (without
            # the hyphen) is a known word, this is a VALID line-break
            # hyphenation — not a gutter error. Gutter problems cause
            # the hyphen to be LOST ("ve" instead of "ver-"), so a
            # visible hyphen + known stem = intentional word-wrap.
            # Example: "wunder-" → "wunder" is known → skip.
            if ends_with_hyphen:
                stem = last_word_clean.rstrip("-")
                if stem and _is_known(stem):
                    continue

            gutter_candidates += 1

            # --- Strategy 1: Hyphen join with next row ---
            next_cell = cell_map.get((ri + 1, ci))
            if next_cell:
                next_text = (next_cell.get("text") or "").strip()
                next_words = next_text.split()
                if next_words:
                    first_next = next_words[0]
                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
                    # First alphabetic char decides "continuation" (lowercase).
                    first_alpha = next((c for c in first_next if c.isalpha()), "")

                    # Also skip if the joined word is known (covers compound
                    # words where the stem alone might not be in the dictionary)
                    if ends_with_hyphen and first_next_clean:
                        direct = last_word_clean.rstrip("-") + first_next_clean
                        if _is_known(direct):
                            continue

                    # Continuation likely if:
                    # - explicit hyphen, OR
                    # - next row starts lowercase (= not a new entry)
                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
                        result = _try_hyphen_join(last_word_clean, first_next)
                        if result:
                            joined, missing, conf = result
                            # Build display parts: show hyphenation for original layout
                            if ends_with_hyphen:
                                display_p1 = last_word_clean.rstrip("-")
                                if missing:
                                    display_p1 += missing
                                display_p1 += "-"
                            else:
                                display_p1 = last_word_clean
                                if missing:
                                    display_p1 += missing + "-"
                                else:
                                    display_p1 += "-"

                            suggestion = GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
                                row_index=ri,
                                col_index=ci,
                                col_type=col_type,
                                cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                                original_text=last_word,
                                suggested_text=joined,
                                next_row_index=ri + 1,
                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
                                next_row_text=next_text,
                                missing_chars=missing,
                                display_parts=[display_p1, first_next],
                                confidence=conf,
                                reason="gutter_truncation" if missing else "hyphen_continuation",
                            )
                            suggestions.append(suggestion)
                            continue  # skip spell_fix if hyphen_join found

            # --- Strategy 2: Single-word spell fix (only for longer words) ---
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
                corrected, conf, alts = fix_result
                suggestion = GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
                    row_index=ri,
                    col_index=ci,
                    col_type=col_type,
                    cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                    original_text=last_word,
                    suggested_text=corrected,
                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                )
                suggestions.append(suggestion)

    duration = round(time.time() - t0, 3)

    logger.info(
        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
        words_checked, gutter_candidates, len(suggestions), duration,
    )

    return {
        "suggestions": [s.to_dict() for s in suggestions],
        "stats": {
            "words_checked": words_checked,
            "gutter_candidates": gutter_candidates,
            "suggestions_found": len(suggestions),
        },
        "duration_seconds": duration,
    }
|
|
|
|
|
|
def apply_gutter_suggestions(
    grid_data: Dict[str, Any],
    accepted_ids: List[str],
    suggestions: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Apply accepted gutter repair suggestions to the grid data.

    Modifies cells in-place and returns a summary of changes. Suggestions
    that cannot be applied (zone index out of range, target cell missing,
    or the original word no longer present in the cell text) are skipped
    silently and excluded from the result.

    Args:
        grid_data: The grid_editor_result (zones→cells). Mutated in place.
        accepted_ids: List of suggestion IDs the user accepted.
        suggestions: The full suggestions list (from analyse_grid_for_gutter_repair).

    Returns:
        Dict with "applied_count" (number of changes actually made) and
        "changes" list (one entry per modified cell).
    """
    accepted_set = set(accepted_ids)
    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]

    zones = grid_data.get("zones", [])
    changes: List[Dict[str, Any]] = []

    for s in accepted_suggestions:
        zi = s.get("zone_index", 0)
        ri = s.get("row_index", 0)
        ci = s.get("col_index", 0)
        stype = s.get("type", "")

        # Guard both ends: a negative zone_index would silently index from
        # the end of the list and patch the wrong zone.
        if not 0 <= zi < len(zones):
            continue
        zone_cells = zones[zi].get("cells", [])

        # Find the target cell
        target_cell = None
        for cell in zone_cells:
            if cell.get("row_index") == ri and cell.get("col_index") == ci:
                target_cell = cell
                break

        if not target_cell:
            continue

        old_text = target_cell.get("text", "")

        if stype == "spell_fix":
            # Replace the last word in the cell text
            original_word = s.get("original_text", "")
            corrected = s.get("suggested_text", "")
            if original_word and corrected:
                # Replace from the right (last occurrence) — the suggestion
                # targets the word closest to the gutter edge.
                idx = old_text.rfind(original_word)
                if idx >= 0:
                    new_text = old_text[:idx] + corrected + old_text[idx + len(original_word):]
                    target_cell["text"] = new_text
                    changes.append({
                        "type": "spell_fix",
                        "zone_index": zi,
                        "row_index": ri,
                        "col_index": ci,
                        "cell_id": target_cell.get("cell_id", ""),
                        "old_text": old_text,
                        "new_text": new_text,
                    })

        elif stype == "hyphen_join":
            # Current cell: replace last word with the hyphenated first part
            original_word = s.get("original_text", "")
            joined = s.get("suggested_text", "")
            display_parts = s.get("display_parts", [])
            next_ri = s.get("next_row_index", -1)

            if not original_word or not joined or not display_parts:
                continue

            # The first display part is what goes in the current row
            first_part = display_parts[0] if display_parts else ""

            # Replace the last word in current cell with the restored form.
            # The next row is NOT modified — "künden" stays in its row
            # because the original book layout has it there. We only fix
            # the truncated word in the current row (e.g. "ve" → "ver-").
            idx = old_text.rfind(original_word)
            if idx >= 0:
                new_text = old_text[:idx] + first_part + old_text[idx + len(original_word):]
                target_cell["text"] = new_text
                changes.append({
                    "type": "hyphen_join",
                    "zone_index": zi,
                    "row_index": ri,
                    "col_index": ci,
                    "cell_id": target_cell.get("cell_id", ""),
                    "old_text": old_text,
                    "new_text": new_text,
                    "joined_word": joined,
                })

    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))

    return {
        # BUGFIX: previously reported len(accepted_suggestions), counting
        # suggestions that were skipped (missing zone/cell/word). Report the
        # number of changes actually made, matching the log line above.
        "applied_count": len(changes),
        "changes": changes,
    }
|