Files
breakpilot-lehrer/klausur-service/backend/vocab_worksheet_ocr.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

482 lines
19 KiB
Python

"""
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
Pipeline steps:
orientation → deskew → dewarp → crop → scan-quality → enhance →
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
vocab extraction → row merging
"""
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Optional heavy dependencies (not available in every environment)
# ---------------------------------------------------------------------------
try:
import cv2
import numpy as np
except ImportError:
cv2 = None # type: ignore[assignment]
np = None # type: ignore[assignment]
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment]
try:
import pytesseract
except ImportError:
pytesseract = None # type: ignore[assignment]
# CV pipeline helpers
try:
from cv_vocab_pipeline import (
deskew_two_pass,
dewarp_image,
detect_and_fix_orientation,
_cells_to_vocab_entries,
_fix_phonetic_brackets,
)
except ImportError:
deskew_two_pass = None # type: ignore[assignment]
dewarp_image = None # type: ignore[assignment]
detect_and_fix_orientation = None # type: ignore[assignment]
_cells_to_vocab_entries = None # type: ignore[assignment]
_fix_phonetic_brackets = None # type: ignore[assignment]
try:
from cv_cell_grid import (
_merge_wrapped_rows,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
)
except ImportError:
_merge_wrapped_rows = None # type: ignore[assignment]
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
_merge_continuation_rows = None # type: ignore[assignment]
try:
from cv_ocr_engines import ocr_region_rapid
except ImportError:
ocr_region_rapid = None # type: ignore[assignment]
try:
from cv_vocab_types import PageRegion
except ImportError:
PageRegion = None # type: ignore[assignment]
try:
from ocr_pipeline_ocr_merge import (
_split_paddle_multi_words,
_merge_paddle_tesseract,
_deduplicate_words,
)
except ImportError:
_split_paddle_multi_words = None # type: ignore[assignment]
_merge_paddle_tesseract = None # type: ignore[assignment]
_deduplicate_words = None # type: ignore[assignment]
try:
from cv_words_first import build_grid_from_words
except ImportError:
build_grid_from_words = None # type: ignore[assignment]
try:
from ocr_pipeline_session_store import (
create_session_db as create_pipeline_session_db,
update_session_db as update_pipeline_session_db,
)
except ImportError:
create_pipeline_session_db = None # type: ignore[assignment]
update_pipeline_session_db = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Main pipeline function
# ---------------------------------------------------------------------------
async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        enable_enhance: If True, apply image enhancement on degraded scans.
        max_columns: Upper bound for column clustering in the initial grid build.
        override_min_conf: If given (including 0), overrides the minimum Tesseract
            word confidence instead of using the scan-quality recommendation.

    Returns:
        Tuple of (entries, rotation_deg, scan_quality_report) where entries is a
        list of vocab dicts, rotation_deg is the orientation correction applied
        (0, 90, 180, 270), and scan_quality_report is the quality assessment
        object (or None if scoring failed).
    """
    import time as _time
    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
    # 2. Create pipeline session in DB (visible in admin Kombi UI).
    # Best-effort: the OCR itself proceeds even if the DB write fails.
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")
    # 3. Three-pass deskew
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f" crop: failed ({e}), continuing with uncropped image")
    # 5b. Scan quality assessment (drives min OCR confidence + enhancement)
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f" scan quality: failed ({e})")
    # Explicit `is not None` so a caller-supplied 0 is honored rather than
    # silently falling back to the scan-quality recommendation.
    if override_min_conf is not None:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info(" enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f" enhancement: failed ({e})")
    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]
    # RapidOCR (local ONNX). Imported locally so a missing engine degrades to
    # an empty word list instead of disabling the whole module at import time.
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f" RapidOCR failed: {e}")
        rapid_words = []
    # Tesseract (hard requirement here — local import raises if unavailable)
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        # Tesseract reports "-1" for non-word boxes; non-numeric → -1 as well
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })
    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only
    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }
    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dwp_buf.tobytes(),
            # cropped view equals the dewarped buffer here — reuse it instead
            # of re-encoding the same image a second time
            cropped_png=dwp_buf.tobytes(),
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")
    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")
        # Save grid result to pipeline session (best-effort)
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception:
            pass
    except Exception as e:
        logger.warning(f" grid-build failed: {e}, falling back to basic grid")
        grid_result = None
    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"
    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue
            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue
            # Group cells by row
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map:
                    rows_map[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map[ri][ci] = (cell.get("text") or "").strip()
            zone_n_cols = len(sorted_cols)
            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = []
                for col in sorted_cols:
                    ci = col.get("col_index", col.get("index", -1))
                    texts.append(row.get(ci, ""))
                if not any(texts):
                    continue
                # Map by position, skipping narrow first column (page refs/markers)
                # Heuristic: if first column is very narrow (<15% of zone width),
                # it's likely a marker/ref column — skip it for vocab
                first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
                zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
                skip_first = first_col_width / zone_width < 0.15 and zone_n_cols >= 3
                data_texts = texts[1:] if skip_first else texts
                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)
        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})
        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map2:
                    rows_map2[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map2[ri][ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"
    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = []
            for v in page_vocabulary:
                internal.append({
                    'row_index': len(internal),
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                })
            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)
            if len(internal) < n_before:
                # Rebuild page_vocabulary from merged entries
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f" row merging: {n_before} -> {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f" row merging failed (non-critical): {e}")
    logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
    return page_vocabulary, rotation, scan_quality_report