breakpilot-lehrer/klausur-service/backend/cv_cell_grid_vocab.py

"""
Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.

Extracted from cv_cell_grid.py.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
from typing import Any, Dict, List

from cv_ocr_engines import (
    _attach_example_sentences,
    _fix_phonetic_brackets,
    _split_comma_entries,
)
from cv_cell_grid_legacy import build_cell_grid
from cv_cell_grid_merge import (
    _merge_continuation_rows,
    _merge_phonetic_continuation_rows,
    _merge_wrapped_rows,
)

logger = logging.getLogger(__name__)


def _cells_to_vocab_entries(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Map generic grid cells to one vocab entry per row.

    Cells are grouped by ``row_index``. Within a row, each cell's
    ``col_type`` selects the text field (``english``, ``german``,
    ``example``, ``source_page``, ``marker`` or ``text``) and the matching
    per-column bbox field that the cell fills. Cells with an unmapped
    ``col_type`` fill no field but still contribute to the row-level bbox
    and to the confidence average.

    Args:
        cells: Cell dicts. ``row_index`` and ``col_type`` are required.
            ``text``, ``bbox_pct``, ``confidence`` and ``ocr_engine`` are
            optional; a missing or ``None`` text becomes ``''``, a missing
            bbox stays ``None`` and a missing confidence is ignored.
        columns_meta: Not used by this function; accepted so the call
            signature stays stable for existing callers.

    Returns:
        Entries ordered by ascending ``row_index``. Rows in which every
        mapped text field is empty are dropped.

    Raises:
        KeyError: If a cell lacks ``row_index`` or ``col_type``.
    """
    col_type_to_field = {
        'column_en': 'english',
        'column_de': 'german',
        'column_example': 'example',
        'page_ref': 'source_page',
        'column_marker': 'marker',
        'column_text': 'text',  # generic single-column (box sub-sessions)
    }
    bbox_key_map = {
        'column_en': 'bbox_en',
        'column_de': 'bbox_de',
        'column_example': 'bbox_ex',
        'page_ref': 'bbox_ref',
        'column_marker': 'bbox_marker',
        'column_text': 'bbox_text',
    }

    # Group cells by row_index (insertion order inside a row is preserved).
    rows: Dict[int, List[Dict[str, Any]]] = {}
    for cell in cells:
        rows.setdefault(cell['row_index'], []).append(cell)

    entries: List[Dict[str, Any]] = []
    for row_idx in sorted(rows):
        row_cells = rows[row_idx]
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'text': '',  # generic single-column (box sub-sessions)
            'source_page': '',
            'marker': '',
            'confidence': 0.0,
            'bbox': None,
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
            'bbox_ref': None,
            'bbox_marker': None,
            'bbox_text': None,
            # Every group holds at least one cell, so index 0 is safe.
            'ocr_engine': row_cells[0].get('ocr_engine', ''),
        }

        confidences = []
        for cell in row_cells:
            col_type = cell['col_type']
            field = col_type_to_field.get(col_type)
            if field:
                # Normalize a missing/None text to '' so downstream string
                # handling never sees a non-string value.
                entry[field] = cell.get('text') or ''
            bbox_field = bbox_key_map.get(col_type)
            if bbox_field:
                # Same tolerance as the row-level union below: a cell
                # without a bbox simply leaves the field at None.
                entry[bbox_field] = cell.get('bbox_pct')
            confidence = cell.get('confidence') or 0
            if confidence > 0:
                # Only positive confidences take part in the average.
                confidences.append(confidence)

        # Row-level bbox: union of all cell bboxes that are present.
        all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
        if all_bboxes:
            min_x = min(b['x'] for b in all_bboxes)
            min_y = min(b['y'] for b in all_bboxes)
            max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
            max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
            entry['bbox'] = {
                'x': round(min_x, 2),
                'y': round(min_y, 2),
                'w': round(max_x2 - min_x, 2),
                'h': round(max_y2 - min_y, 2),
            }

        if confidences:
            entry['confidence'] = round(sum(confidences) / len(confidences), 1)

        # Keep the row only if at least one mapped field has text.
        if any(entry[f] for f in col_type_to_field.values()):
            entries.append(entry)

    return entries


def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Build the cell grid and turn it into post-processed vocab entries.

    Thin vocabulary-specific layer on top of ``build_cell_grid()``:

    1. Run ``build_cell_grid()`` to obtain cells and column metadata.
    2. If no ``column_en`` / ``column_de`` column was detected, hand the raw
       cells back unchanged.
    3. Otherwise map cells to entries and run the deterministic
       post-processing steps (row merges, phonetic fix, comma split,
       example attachment) in a fixed order.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american'; forwarded to the phonetic
            bracket fix.

    Returns:
        An empty list when the grid has no cells, the raw cell dicts when no
        vocab column is present, otherwise the list of vocab entry dicts.
    """
    cells, columns_meta = build_cell_grid(
        ocr_img,
        column_regions,
        row_geometries,
        img_w,
        img_h,
        lang=lang,
        ocr_engine=ocr_engine,
        img_bgr=img_bgr,
    )
    if not cells:
        return []

    # Without an EN or DE column there is no vocab layout to map onto.
    detected_types = {column['type'] for column in columns_meta}
    if detected_types.isdisjoint(('column_en', 'column_de')):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    entries = _cells_to_vocab_entries(cells, columns_meta)
    n_raw = len(entries)

    # --- Deterministic post-processing (no LLM); order matters ---
    # Row merges, in sequence:
    #   0.  cell-wrap continuation rows (empty primary column = text wrap)
    #   0a. phonetic-only continuation rows into the previous entry
    #   0b. multi-line continuation rows (lowercase EN, empty DE)
    row_merges = (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    )
    for merge_rows in row_merges:
        entries = merge_rows(entries)

    # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is handled in
    #    llm_review_entries_streaming so the user sees the changes in Step 6.

    # 2. Swap OCR'd phonetics for dictionary IPA.
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
    # 3. Split comma-separated word forms (break, broke, broken -> 3 entries).
    entries = _split_comma_entries(entries)
    # 4. Rows without DE become example sentences of the preceding entry.
    entries = _attach_example_sentences(entries)

    if cells:
        engine_name = cells[0].get('ocr_engine', 'unknown')
    else:
        engine_name = 'unknown'
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw -> {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries