Files
breakpilot-lehrer/klausur-service/backend/cv_review_pipeline.py
Benjamin Admin b2a0126f14 [split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00

431 lines
14 KiB
Python

"""
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import time
from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import (
CV_PIPELINE_AVAILABLE,
PageRegion,
PipelineResult,
VocabRow,
)
from cv_preprocessing import (
deskew_image,
dewarp_image,
render_image_high_res,
render_pdf_high_res,
)
from cv_layout import (
analyze_layout,
create_layout_image,
create_ocr_image,
)
from cv_ocr_engines import (
_group_words_into_lines,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with a given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string (e.g. 'eng', 'deu', 'eng+deu').
        psm: Page Segmentation Mode for the primary pass.
        fallback_psm: If the average confidence is too low — or the primary
            pass found nothing at all — retry line by line with this PSM.
        min_confidence: Minimum average confidence before falling back.

    Returns:
        List of word dicts with text, page-absolute position, confidence,
        and the region type they came from.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []
    pil_img = Image.fromarray(crop)
    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        # Tesseract can fail on degenerate crops; treat as "no words".
        logger.warning("Tesseract failed for region %s: %s", region.type, e)
        return []
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        # conf == -1 marks layout (non-word) entries; also drop noise < 10%.
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            # Translate crop-local coordinates back to page coordinates.
            'left': data['left'][i] + region.x,
            'top': data['top'][i] + region.y,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })
    if fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words) if words else 0.0
        # Fix: previously the fallback was skipped when the primary pass
        # returned zero words — exactly the case that needs it most.
        if avg_conf < min_confidence:
            logger.info("Region %s: avg confidence %.0f%% < %s%%, "
                        "trying fallback PSM %s",
                        region.type, avg_conf, min_confidence, fallback_psm)
            fallback_words = _ocr_region_line_by_line(ocr_img, region, lang,
                                                      fallback_psm)
            # Fix: keep the low-confidence primary result rather than
            # discarding it when the fallback finds nothing at all.
            if fallback_words or not words:
                words = fallback_words
    return words
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Segments the crop into text lines via a horizontal ink projection,
    then runs Tesseract on each line individually with the given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode applied per line.

    Returns:
        List of word dicts with page-absolute positions.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []
    # Horizontal projection of the inverted crop (ink becomes white); rows
    # whose ink mass exceeds 5% of the peak are treated as text rows.
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    peak = np.max(h_proj)  # hoisted: was computed twice per call
    threshold = peak * 0.05 if peak > 0 else 0
    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:  # ignore slivers <= 5 px tall
                lines.append((line_start, y))
            in_text = False
    # Close a line that runs to the bottom edge of the crop.
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))
    all_words = []
    config = f'--psm {psm} --oem 3'
    pad = 3  # vertical padding so ascenders/descenders are not clipped
    for line_y_start, line_y_end in lines:
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]
        if line_crop.size == 0:
            continue
        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            # Best-effort per line: one failing line must not abort the region.
            continue
        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            conf = int(data['conf'][i])
            # conf == -1 marks layout entries; also drop noise < 10%.
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                # Translate back to page coordinates; y1 re-anchors the
                # padded line crop inside the region.
                'left': data['left'][i] + region.x,
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })
    return all_words
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with per-region-type settings.

    English/German columns get language-specific single-column PSM 4;
    example columns get PSM 6 with a per-line PSM 7 fallback; decorative
    regions (header, footer, margins) are skipped entirely.

    Args:
        ocr_img: Binarized full-page image.
        regions: Regions produced by layout analysis.
        lang: Default Tesseract language string for mixed regions.

    Returns:
        Mapping of region type -> list of word dicts (a later region of
        the same type overwrites an earlier one).
    """
    skip_types = {'header', 'footer', 'margin_top', 'margin_bottom',
                  'margin_left', 'margin_right'}
    results: Dict[str, List[Dict]] = {}
    for region in regions:
        if region.type in skip_types:
            continue
        if region.type == 'column_en':
            words = ocr_region(ocr_img, region, lang='eng', psm=4)
        elif region.type == 'column_de':
            words = ocr_region(ocr_img, region, lang='deu', psm=4)
        elif region.type == 'column_example':
            words = ocr_region(ocr_img, region, lang=lang, psm=6,
                               fallback_psm=7, min_confidence=40.0)
        else:
            words = ocr_region(ocr_img, region, lang=lang, psm=6)
        results[region.type] = words
        # Lazy %-args: avoids building the message when INFO is disabled.
        logger.info("OCR %s: %d words", region.type, len(words))
    return results
# =============================================================================
# Stage 7: Line Alignment -> Vocabulary Entries
# =============================================================================
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    English lines act as row anchors; the vertically closest German and
    example lines (within ``y_tolerance_px``) are attached to each anchor.
    Example lines that match no row are treated as wrapped continuations
    and appended to the closest row above them.

    Args:
        ocr_results: Mapping of region type -> word dicts from OCR.
        regions: Layout regions. NOTE(review): currently unused; kept for
            interface stability with the pipeline orchestrator.
        y_tolerance_px: Max vertical distance for same-row matching.

    Returns:
        VocabRows sorted by vertical position on the page.
    """
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    def closest_line(lines: List[List[Dict]], y: float):
        """Return (text, confidence) of the line nearest to y within
        tolerance, or ("", 0.0) when none qualifies.  Replaces the two
        copy-pasted German/example matching loops."""
        best_dist = float('inf')
        best_idx = -1
        for idx, line in enumerate(lines):
            dist = abs(line_y_center(line) - y)
            if dist < y_tolerance_px and dist < best_dist:
                best_dist = dist
                best_idx = idx
        if best_idx >= 0:
            return line_text(lines[best_idx]), line_confidence(lines[best_idx])
        return "", 0.0

    vocab_rows: List[VocabRow] = []
    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        if len(en_text.strip()) < 2:
            continue  # single stray characters are OCR noise, not entries
        en_conf = line_confidence(en_line)
        de_text, de_conf = closest_line(de_lines, en_y)
        ex_text, ex_conf = closest_line(ex_lines, en_y)
        # Average confidence over the columns that actually matched.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1
        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))
    # Handle multi-line wrapping in the example column: any example line
    # that matched no row is appended to the closest row ABOVE it,
    # searching up to 3x the normal tolerance.
    matched_ex_ys = {row.y_position for row in vocab_rows if row.example}
    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        if any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys):
            continue
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row
        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()
    vocab_rows.sort(key=lambda r: r.y_position)
    return vocab_rows
# =============================================================================
# Stage 8: Optional LLM Post-Correction
# =============================================================================
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Currently a stub: correction is not yet implemented, so rows are
    always returned unchanged.

    Args:
        img: Full page image (source for LLM crops once implemented).
        vocab_rows: Rows produced by line alignment.
        confidence_threshold: Rows below this confidence would be sent
            for correction once implemented.
        enabled: Feature flag; when False this stage is a silent no-op.

    Returns:
        The (unmodified) vocabulary rows.
    """
    if not enabled:
        return vocab_rows
    # Fixed F541: this was an f-string with no placeholders.
    logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows
# =============================================================================
# Orchestrator
# =============================================================================
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Orchestrates stages 1-8: render -> deskew -> (optional) dewarp ->
    dual image prep -> layout analysis -> multi-pass OCR -> line
    alignment -> (optional) LLM correction.  Per-stage wall-clock times
    are recorded in ``result.stages``.

    Args:
        pdf_data: Raw PDF bytes; takes precedence over image_data.
        image_data: Raw image bytes (used only when pdf_data is falsy).
        page_number: Zero-based PDF page to render.
        zoom: Render scale factor for PDF rasterization.
        enable_dewarp: Whether to run stage 3 (page dewarping).
        enable_llm_correction: Whether to run stage 8 (LLM post-correction).
        lang: Tesseract language string for mixed-language regions.

    Returns:
        PipelineResult with vocabulary entries, stage timings, and image
        metadata; on failure ``error`` is set instead of raising.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")
    result = PipelineResult()
    total_start = time.time()
    try:
        # Stage 1: Render — rasterize the PDF page or decode raw image bytes.
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")
        # Stage 2: Deskew — rotate so text baselines are horizontal.
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")
        # Stage 3: Dewarp (optional) — flatten page curvature.
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)
        # Stage 4: Dual image preparation — one image tuned for OCR, a
        # separate one tuned for layout analysis.
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)
        # Stage 5: Layout analysis — detect columns, headers, margins.
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")
        # Stage 6: Multi-pass OCR — per-region Tesseract settings.
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")
        # Stage 7: Line alignment — merge column lines into vocab rows.
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)
        # Stage 8: Optional LLM correction (currently a no-op stub).
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)
        # Convert to the output format; drop rows that have neither an
        # English nor a German side.
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german
        ]
        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")
    except Exception as e:
        # Top-level boundary: report failures in the result instead of
        # raising, so API callers always get a PipelineResult back.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)
    return result