refactor: split ocr_pipeline_api.py (5426 lines) into 8 modules

Each module is under 1050 lines: - ocr_pipeline_common.py (354) - shared state, cache, models, helpers - ocr_pipeline_sessions.py (483) - session CRUD, image serving, doc-type - ocr_pipeline_geometry.py (1025) - deskew, dewarp, structure, columns - ocr_pipeline_rows.py (348) - row detection, box-overlay helper - ocr_pipeline_words.py (876) - word detection (SSE), paddle-direct - ocr_pipeline_ocr_merge.py (615) - merge helpers, kombi endpoints - ocr_pipeline_postprocess.py (929) - LLM review, reconstruction, export - ocr_pipeline_auto.py (705) - auto-mode orchestrator, reprocess ocr_pipeline_api.py is now a 61-line thin wrapper that re-exports router, _cache, and test-imported symbols for backward compatibility. No changes needed in main.py or tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 08:42:00 +01:00
parent 98f7f7d7d5
commit ec287fd12e
9 changed files with 5382 additions and 5412 deletions
@@ -0,0 +1,705 @@
 """
 OCR Pipeline Auto-Mode Orchestrator and Reprocess Endpoints.
 Extracted from ocr_pipeline_api.py — contains:
 - POST /sessions/{session_id}/reprocess  (clear downstream + restart from step)
 - POST /sessions/{session_id}/run-auto   (full auto-mode with SSE streaming)
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import json
 import logging
 import os
 import re
 import time
 from dataclasses import asdict
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from fastapi import APIRouter, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from cv_vocab_pipeline import (
    OLLAMA_REVIEW_MODEL,
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _detect_header_footer_gaps,
    _detect_sub_columns,
    _fix_character_confusion,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    analyze_layout,
    build_cell_grid,
    classify_column_types,
    create_layout_image,
    create_ocr_image,
    deskew_image,
    deskew_image_by_word_alignment,
    detect_column_geometry,
    detect_row_geometry,
    _apply_shear,
    dewarp_image,
    llm_review_entries,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _get_base_image_png,
    _append_pipeline_log,
 )
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Reprocess endpoint
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/reprocess")
 async def reprocess_session(session_id: str, request: Request):
    """Re-run pipeline from a specific step, clearing downstream data.
    Body: {"from_step": 5}  (1-indexed step number)
    Pipeline order: Orientation(1) → Deskew(2) → Dewarp(3) → Crop(4) → Columns(5) →
                    Rows(6) → Words(7) → LLM-Review(8) → Reconstruction(9) → Validation(10)
    Clears downstream results:
    - from_step <= 1: orientation_result + all downstream
    - from_step <= 2: deskew_result + all downstream
    - from_step <= 3: dewarp_result + all downstream
    - from_step <= 4: crop_result + all downstream
    - from_step <= 5: column_result, row_result, word_result
    - from_step <= 6: row_result, word_result
    - from_step <= 7: word_result (cells, vocab_entries)
    - from_step <= 8: word_result.llm_review only
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    body = await request.json()
    from_step = body.get("from_step", 1)
    if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
    update_kwargs: Dict[str, Any] = {"current_step": from_step}
    # Clear downstream data based on from_step
    # New pipeline order: Orient(2) → Deskew(3) → Dewarp(4) → Crop(5) →
    #   Columns(6) → Rows(7) → Words(8) → LLM(9) → Recon(10) → GT(11)
    if from_step <= 8:
        update_kwargs["word_result"] = None
    elif from_step == 9:
        # Only clear LLM review from word_result
        word_result = session.get("word_result")
        if word_result:
            word_result.pop("llm_review", None)
            word_result.pop("llm_corrections", None)
            update_kwargs["word_result"] = word_result
    if from_step <= 7:
        update_kwargs["row_result"] = None
    if from_step <= 6:
        update_kwargs["column_result"] = None
    if from_step <= 4:
        update_kwargs["crop_result"] = None
    if from_step <= 3:
        update_kwargs["dewarp_result"] = None
    if from_step <= 2:
        update_kwargs["deskew_result"] = None
    if from_step <= 1:
        update_kwargs["orientation_result"] = None
    await update_session_db(session_id, **update_kwargs)
    # Also clear cache
    if session_id in _cache:
        for key in list(update_kwargs.keys()):
            if key != "current_step":
                _cache[session_id][key] = update_kwargs[key]
        _cache[session_id]["current_step"] = from_step
    logger.info(f"Session {session_id} reprocessing from step {from_step}")
    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [k for k in update_kwargs if k != "current_step"],
    }
 # ---------------------------------------------------------------------------
 # VLM shear detection helper (used by dewarp step in auto-mode)
 # ---------------------------------------------------------------------------
 async def _detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Ask qwen2.5vl:32b to estimate the vertical shear angle of a scanned page.
    The VLM is shown the image and asked: are the column/table borders tilted?
    If yes, by how many degrees? Returns a dict with shear_degrees and confidence.
    Confidence is 0.0 if Ollama is unavailable or parsing fails.
    """
    import httpx
    import base64
    import re
    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )
    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")
        # Parse JSON from response (may have surrounding text)
        match = re.search(r'\{[^}]+\}', text)
        if match:
            import json
            data = json.loads(match.group(0))
            shear = float(data.get("shear_degrees", 0.0))
            conf = float(data.get("confidence", 0.0))
            # Clamp to reasonable range
            shear = max(-3.0, min(3.0, shear))
            conf = max(0.0, min(1.0, conf))
            return {"method": "vlm_qwen2.5vl", "shear_degrees": round(shear, 3), "confidence": round(conf, 2)}
    except Exception as e:
        logger.warning(f"VLM dewarp failed: {e}")
    return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}
 # ---------------------------------------------------------------------------
 # Auto-mode orchestrator
 # ---------------------------------------------------------------------------
 class RunAutoRequest(BaseModel):
    from_step: int = 1          # 1=deskew, 2=dewarp, 3=columns, 4=rows, 5=words, 6=llm-review
    ocr_engine: str = "auto"    # "auto" | "rapid" | "tesseract"
    pronunciation: str = "british"
    skip_llm_review: bool = False
    dewarp_method: str = "ensemble"  # "ensemble" | "vlm" | "cv"
 async def _auto_sse_event(step: str, status: str, data: Dict[str, Any]) -> str:
    """Format a single SSE event line."""
    import json as _json
    payload = {"step": step, "status": status, **data}
    return f"data: {_json.dumps(payload)}\n\n"
@router.post("/sessions/{session_id}/run-auto")
 async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
    """Run the full OCR pipeline automatically from a given step, streaming SSE progress.
    Steps:
        1. Deskew  — straighten the scan
        2. Dewarp  — correct vertical shear (ensemble CV or VLM)
        3. Columns — detect column layout
        4. Rows    — detect row layout
        5. Words   — OCR each cell
        6. LLM review — correct OCR errors (optional)
    Already-completed steps are skipped unless `from_step` forces a rerun.
    Yields SSE events of the form:
        data: {"step": "deskew", "status": "start"|"done"|"skipped"|"error", ...}
    Final event:
        data: {"step": "complete", "status": "done", "steps_run": [...], "steps_skipped": [...]}
    """
    if req.from_step < 1 or req.from_step > 6:
        raise HTTPException(status_code=400, detail="from_step must be 1-6")
    if req.dewarp_method not in ("ensemble", "vlm", "cv"):
        raise HTTPException(status_code=400, detail="dewarp_method must be: ensemble, vlm, cv")
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    async def _generate():
        steps_run: List[str] = []
        steps_skipped: List[str] = []
        error_step: Optional[str] = None
        session = await get_session_db(session_id)
        if not session:
            yield await _auto_sse_event("error", "error", {"message": f"Session {session_id} not found"})
            return
        cached = _get_cached(session_id)
        # -----------------------------------------------------------------
        # Step 1: Deskew
        # -----------------------------------------------------------------
        if req.from_step <= 1:
            yield await _auto_sse_event("deskew", "start", {})
            try:
                t0 = time.time()
                orig_bgr = cached.get("original_bgr")
                if orig_bgr is None:
                    raise ValueError("Original image not loaded")
                # Method 1: Hough lines
                try:
                    deskewed_hough, angle_hough = deskew_image(orig_bgr.copy())
                except Exception:
                    deskewed_hough, angle_hough = orig_bgr, 0.0
                # Method 2: Word alignment
                success_enc, png_orig = cv2.imencode(".png", orig_bgr)
                orig_bytes = png_orig.tobytes() if success_enc else b""
                try:
                    deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes)
                except Exception:
                    deskewed_wa_bytes, angle_wa = orig_bytes, 0.0
                # Pick best method
                if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1:
                    method_used = "word_alignment"
                    angle_applied = angle_wa
                    wa_arr = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8)
                    deskewed_bgr = cv2.imdecode(wa_arr, cv2.IMREAD_COLOR)
                    if deskewed_bgr is None:
                        deskewed_bgr = deskewed_hough
                        method_used = "hough"
                        angle_applied = angle_hough
                else:
                    method_used = "hough"
                    angle_applied = angle_hough
                    deskewed_bgr = deskewed_hough
                success, png_buf = cv2.imencode(".png", deskewed_bgr)
                deskewed_png = png_buf.tobytes() if success else b""
                deskew_result = {
                    "method_used": method_used,
                    "rotation_degrees": round(float(angle_applied), 3),
                    "duration_seconds": round(time.time() - t0, 2),
                }
                cached["deskewed_bgr"] = deskewed_bgr
                cached["deskew_result"] = deskew_result
                await update_session_db(
                    session_id,
                    deskewed_png=deskewed_png,
                    deskew_result=deskew_result,
                    auto_rotation_degrees=float(angle_applied),
                    current_step=3,
                )
                session = await get_session_db(session_id)
                steps_run.append("deskew")
                yield await _auto_sse_event("deskew", "done", deskew_result)
            except Exception as e:
                logger.error(f"Auto-mode deskew failed for {session_id}: {e}")
                error_step = "deskew"
                yield await _auto_sse_event("deskew", "error", {"message": str(e)})
                yield await _auto_sse_event("complete", "error", {"error_step": error_step})
                return
        else:
            steps_skipped.append("deskew")
            yield await _auto_sse_event("deskew", "skipped", {"reason": "from_step > 1"})
        # -----------------------------------------------------------------
        # Step 2: Dewarp
        # -----------------------------------------------------------------
        if req.from_step <= 2:
            yield await _auto_sse_event("dewarp", "start", {"method": req.dewarp_method})
            try:
                t0 = time.time()
                deskewed_bgr = cached.get("deskewed_bgr")
                if deskewed_bgr is None:
                    raise ValueError("Deskewed image not available")
                if req.dewarp_method == "vlm":
                    success_enc, png_buf = cv2.imencode(".png", deskewed_bgr)
                    img_bytes = png_buf.tobytes() if success_enc else b""
                    vlm_det = await _detect_shear_with_vlm(img_bytes)
                    shear_deg = vlm_det["shear_degrees"]
                    if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
                        dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
                    else:
                        dewarped_bgr = deskewed_bgr
                    dewarp_info = {
                        "method": vlm_det["method"],
                        "shear_degrees": shear_deg,
                        "confidence": vlm_det["confidence"],
                        "detections": [vlm_det],
                    }
                else:
                    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
                success_enc, png_buf = cv2.imencode(".png", dewarped_bgr)
                dewarped_png = png_buf.tobytes() if success_enc else b""
                dewarp_result = {
                    "method_used": dewarp_info["method"],
                    "shear_degrees": dewarp_info["shear_degrees"],
                    "confidence": dewarp_info["confidence"],
                    "duration_seconds": round(time.time() - t0, 2),
                    "detections": dewarp_info.get("detections", []),
                }
                cached["dewarped_bgr"] = dewarped_bgr
                cached["dewarp_result"] = dewarp_result
                await update_session_db(
                    session_id,
                    dewarped_png=dewarped_png,
                    dewarp_result=dewarp_result,
                    auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
                    current_step=4,
                )
                session = await get_session_db(session_id)
                steps_run.append("dewarp")
                yield await _auto_sse_event("dewarp", "done", dewarp_result)
            except Exception as e:
                logger.error(f"Auto-mode dewarp failed for {session_id}: {e}")
                error_step = "dewarp"
                yield await _auto_sse_event("dewarp", "error", {"message": str(e)})
                yield await _auto_sse_event("complete", "error", {"error_step": error_step})
                return
        else:
            steps_skipped.append("dewarp")
            yield await _auto_sse_event("dewarp", "skipped", {"reason": "from_step > 2"})
        # -----------------------------------------------------------------
        # Step 3: Columns
        # -----------------------------------------------------------------
        if req.from_step <= 3:
            yield await _auto_sse_event("columns", "start", {})
            try:
                t0 = time.time()
                col_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
                if col_img is None:
                    raise ValueError("Cropped/dewarped image not available")
                ocr_img = create_ocr_image(col_img)
                h, w = ocr_img.shape[:2]
                geo_result = detect_column_geometry(ocr_img, col_img)
                if geo_result is None:
                    layout_img = create_layout_image(col_img)
                    regions = analyze_layout(layout_img, ocr_img)
                    cached["_word_dicts"] = None
                    cached["_inv"] = None
                    cached["_content_bounds"] = None
                else:
                    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
                    content_w = right_x - left_x
                    cached["_word_dicts"] = word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
                    header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
                    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                                     top_y=top_y, header_y=header_y, footer_y=footer_y)
                    regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
                                                   left_x=left_x, right_x=right_x, inv=inv)
                columns = [asdict(r) for r in regions]
                column_result = {
                    "columns": columns,
                    "classification_methods": list({c.get("classification_method", "") for c in columns if c.get("classification_method")}),
                    "duration_seconds": round(time.time() - t0, 2),
                }
                cached["column_result"] = column_result
                await update_session_db(session_id, column_result=column_result,
                                        row_result=None, word_result=None, current_step=6)
                session = await get_session_db(session_id)
                steps_run.append("columns")
                yield await _auto_sse_event("columns", "done", {
                    "column_count": len(columns),
                    "duration_seconds": column_result["duration_seconds"],
                })
            except Exception as e:
                logger.error(f"Auto-mode columns failed for {session_id}: {e}")
                error_step = "columns"
                yield await _auto_sse_event("columns", "error", {"message": str(e)})
                yield await _auto_sse_event("complete", "error", {"error_step": error_step})
                return
        else:
            steps_skipped.append("columns")
            yield await _auto_sse_event("columns", "skipped", {"reason": "from_step > 3"})
        # -----------------------------------------------------------------
        # Step 4: Rows
        # -----------------------------------------------------------------
        if req.from_step <= 4:
            yield await _auto_sse_event("rows", "start", {})
            try:
                t0 = time.time()
                row_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
                session = await get_session_db(session_id)
                column_result = session.get("column_result") or cached.get("column_result")
                if not column_result or not column_result.get("columns"):
                    raise ValueError("Column detection must complete first")
                col_regions = [
                    PageRegion(
                        type=c["type"], x=c["x"], y=c["y"],
                        width=c["width"], height=c["height"],
                        classification_confidence=c.get("classification_confidence", 1.0),
                        classification_method=c.get("classification_method", ""),
                    )
                    for c in column_result["columns"]
                ]
                word_dicts = cached.get("_word_dicts")
                inv = cached.get("_inv")
                content_bounds = cached.get("_content_bounds")
                if word_dicts is None or inv is None or content_bounds is None:
                    ocr_img_tmp = create_ocr_image(row_img)
                    geo_result = detect_column_geometry(ocr_img_tmp, row_img)
                    if geo_result is None:
                        raise ValueError("Column geometry detection failed — cannot detect rows")
                    _g, lx, rx, ty, by, word_dicts, inv = geo_result
                    cached["_word_dicts"] = word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (lx, rx, ty, by)
                    content_bounds = (lx, rx, ty, by)
                left_x, right_x, top_y, bottom_y = content_bounds
                row_geoms = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
                row_list = [
                    {
                        "index": r.index, "x": r.x, "y": r.y,
                        "width": r.width, "height": r.height,
                        "word_count": r.word_count,
                        "row_type": r.row_type,
                        "gap_before": r.gap_before,
                    }
                    for r in row_geoms
                ]
                row_result = {
                    "rows": row_list,
                    "row_count": len(row_list),
                    "content_rows": len([r for r in row_geoms if r.row_type == "content"]),
                    "duration_seconds": round(time.time() - t0, 2),
                }
                cached["row_result"] = row_result
                await update_session_db(session_id, row_result=row_result, current_step=7)
                session = await get_session_db(session_id)
                steps_run.append("rows")
                yield await _auto_sse_event("rows", "done", {
                    "row_count": len(row_list),
                    "content_rows": row_result["content_rows"],
                    "duration_seconds": row_result["duration_seconds"],
                })
            except Exception as e:
                logger.error(f"Auto-mode rows failed for {session_id}: {e}")
                error_step = "rows"
                yield await _auto_sse_event("rows", "error", {"message": str(e)})
                yield await _auto_sse_event("complete", "error", {"error_step": error_step})
                return
        else:
            steps_skipped.append("rows")
            yield await _auto_sse_event("rows", "skipped", {"reason": "from_step > 4"})
        # -----------------------------------------------------------------
        # Step 5: Words (OCR)
        # -----------------------------------------------------------------
        if req.from_step <= 5:
            yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
            try:
                t0 = time.time()
                word_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
                session = await get_session_db(session_id)
                column_result = session.get("column_result") or cached.get("column_result")
                row_result = session.get("row_result") or cached.get("row_result")
                col_regions = [
                    PageRegion(
                        type=c["type"], x=c["x"], y=c["y"],
                        width=c["width"], height=c["height"],
                        classification_confidence=c.get("classification_confidence", 1.0),
                        classification_method=c.get("classification_method", ""),
                    )
                    for c in column_result["columns"]
                ]
                row_geoms = [
                    RowGeometry(
                        index=r["index"], x=r["x"], y=r["y"],
                        width=r["width"], height=r["height"],
                        word_count=r.get("word_count", 0), words=[],
                        row_type=r.get("row_type", "content"),
                        gap_before=r.get("gap_before", 0),
                    )
                    for r in row_result["rows"]
                ]
                word_dicts = cached.get("_word_dicts")
                if word_dicts is not None:
                    content_bounds = cached.get("_content_bounds")
                    top_y = content_bounds[2] if content_bounds else min(r.y for r in row_geoms)
                    for row in row_geoms:
                        row_y_rel = row.y - top_y
                        row_bottom_rel = row_y_rel + row.height
                        row.words = [
                            w for w in word_dicts
                            if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
                        ]
                        row.word_count = len(row.words)
                ocr_img = create_ocr_image(word_img)
                img_h, img_w = word_img.shape[:2]
                cells, columns_meta = build_cell_grid(
                    ocr_img, col_regions, row_geoms, img_w, img_h,
                    ocr_engine=req.ocr_engine, img_bgr=word_img,
                )
                duration = time.time() - t0
                col_types = {c['type'] for c in columns_meta}
                is_vocab = bool(col_types & {'column_en', 'column_de'})
                n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
                used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine
                # Apply IPA phonetic fixes directly to cell texts
                fix_cell_phonetics(cells, pronunciation=req.pronunciation)
                word_result_data = {
                    "cells": cells,
                    "grid_shape": {
                        "rows": n_content_rows,
                        "cols": len(columns_meta),
                        "total_cells": len(cells),
                    },
                    "columns_used": columns_meta,
                    "layout": "vocab" if is_vocab else "generic",
                    "image_width": img_w,
                    "image_height": img_h,
                    "duration_seconds": round(duration, 2),
                    "ocr_engine": used_engine,
                    "summary": {
                        "total_cells": len(cells),
                        "non_empty_cells": sum(1 for c in cells if c.get("text")),
                        "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
                    },
                }
                has_text_col = 'column_text' in col_types
                if is_vocab or has_text_col:
                    entries = _cells_to_vocab_entries(cells, columns_meta)
                    entries = _fix_character_confusion(entries)
                    entries = _fix_phonetic_brackets(entries, pronunciation=req.pronunciation)
                    word_result_data["vocab_entries"] = entries
                    word_result_data["entries"] = entries
                    word_result_data["entry_count"] = len(entries)
                    word_result_data["summary"]["total_entries"] = len(entries)
                await update_session_db(session_id, word_result=word_result_data, current_step=8)
                cached["word_result"] = word_result_data
                session = await get_session_db(session_id)
                steps_run.append("words")
                yield await _auto_sse_event("words", "done", {
                    "total_cells": len(cells),
                    "layout": word_result_data["layout"],
                    "duration_seconds": round(duration, 2),
                    "ocr_engine": used_engine,
                    "summary": word_result_data["summary"],
                })
            except Exception as e:
                logger.error(f"Auto-mode words failed for {session_id}: {e}")
                error_step = "words"
                yield await _auto_sse_event("words", "error", {"message": str(e)})
                yield await _auto_sse_event("complete", "error", {"error_step": error_step})
                return
        else:
            steps_skipped.append("words")
            yield await _auto_sse_event("words", "skipped", {"reason": "from_step > 5"})
        # -----------------------------------------------------------------
        # Step 6: LLM Review (optional)
        # -----------------------------------------------------------------
        if req.from_step <= 6 and not req.skip_llm_review:
            yield await _auto_sse_event("llm_review", "start", {"model": OLLAMA_REVIEW_MODEL})
            try:
                session = await get_session_db(session_id)
                word_result = session.get("word_result") or cached.get("word_result")
                entries = word_result.get("entries") or word_result.get("vocab_entries") or []
                if not entries:
                    yield await _auto_sse_event("llm_review", "skipped", {"reason": "no entries"})
                    steps_skipped.append("llm_review")
                else:
                    reviewed = await llm_review_entries(entries)
                    session = await get_session_db(session_id)
                    word_result_updated = dict(session.get("word_result") or {})
                    word_result_updated["entries"] = reviewed
                    word_result_updated["vocab_entries"] = reviewed
                    word_result_updated["llm_reviewed"] = True
                    word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL
                    await update_session_db(session_id, word_result=word_result_updated, current_step=9)
                    cached["word_result"] = word_result_updated
                    steps_run.append("llm_review")
                    yield await _auto_sse_event("llm_review", "done", {
                        "entries_reviewed": len(reviewed),
                        "model": OLLAMA_REVIEW_MODEL,
                    })
            except Exception as e:
                logger.warning(f"Auto-mode llm_review failed for {session_id} (non-fatal): {e}")
                yield await _auto_sse_event("llm_review", "error", {"message": str(e), "fatal": False})
                steps_skipped.append("llm_review")
        else:
            steps_skipped.append("llm_review")
            reason = "skipped by request" if req.skip_llm_review else "from_step > 6"
            yield await _auto_sse_event("llm_review", "skipped", {"reason": reason})
        # -----------------------------------------------------------------
        # Final event
        # -----------------------------------------------------------------
        yield await _auto_sse_event("complete", "done", {
            "steps_run": steps_run,
            "steps_skipped": steps_skipped,
        })
    return StreamingResponse(
        _generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
@@ -0,0 +1,354 @@
 """
 Shared common module for the OCR pipeline.
 Contains in-memory cache, helper functions, Pydantic request models,
 pipeline logging, and border-ghost word filtering used by the pipeline
 API endpoints and related modules.
 """
 import logging
 import re
 import time
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from fastapi import HTTPException
 from pydantic import BaseModel
 from ocr_pipeline_session_store import get_session_db, get_session_image, update_session_db
 __all__ = [
    # Cache
    "_cache",
    # Helper functions
    "_get_base_image_png",
    "_load_session_to_cache",
    "_get_cached",
    # Pydantic models
    "ManualDeskewRequest",
    "DeskewGroundTruthRequest",
    "ManualDewarpRequest",
    "CombinedAdjustRequest",
    "DewarpGroundTruthRequest",
    "VALID_DOCUMENT_CATEGORIES",
    "UpdateSessionRequest",
    "ManualColumnsRequest",
    "ColumnGroundTruthRequest",
    "ManualRowsRequest",
    "RowGroundTruthRequest",
    "RemoveHandwritingRequest",
    # Pipeline log
    "_append_pipeline_log",
    # Border-ghost filter
    "_BORDER_GHOST_CHARS",
    "_filter_border_ghost_words",
 ]
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # In-memory cache for active sessions (BGR numpy arrays for processing)
 # DB is source of truth, cache holds BGR arrays during active processing.
 # ---------------------------------------------------------------------------
 _cache: Dict[str, Dict[str, Any]] = {}
 async def _get_base_image_png(session_id: str) -> Optional[bytes]:
    """Get the best available base image for a session (cropped > dewarped > original)."""
    for img_type in ("cropped", "dewarped", "original"):
        png_data = await get_session_image(session_id, img_type)
        if png_data:
            return png_data
    return None
 async def _load_session_to_cache(session_id: str) -> Dict[str, Any]:
    """Load session from DB into cache, decoding PNGs to BGR arrays."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    if session_id in _cache:
        return _cache[session_id]
    cache_entry: Dict[str, Any] = {
        "id": session_id,
        **session,
        "original_bgr": None,
        "oriented_bgr": None,
        "cropped_bgr": None,
        "deskewed_bgr": None,
        "dewarped_bgr": None,
    }
    # Decode images from DB into BGR numpy arrays
    for img_type, bgr_key in [
        ("original", "original_bgr"),
        ("oriented", "oriented_bgr"),
        ("cropped", "cropped_bgr"),
        ("deskewed", "deskewed_bgr"),
        ("dewarped", "dewarped_bgr"),
    ]:
        png_data = await get_session_image(session_id, img_type)
        if png_data:
            arr = np.frombuffer(png_data, dtype=np.uint8)
            bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
            cache_entry[bgr_key] = bgr
    # Sub-sessions: original image IS the cropped box region.
    # Promote original_bgr to cropped_bgr so downstream steps find it.
    if session.get("parent_session_id") and cache_entry["original_bgr"] is not None:
        if cache_entry["cropped_bgr"] is None and cache_entry["dewarped_bgr"] is None:
            cache_entry["cropped_bgr"] = cache_entry["original_bgr"]
    _cache[session_id] = cache_entry
    return cache_entry
 def _get_cached(session_id: str) -> Dict[str, Any]:
    """Get from cache or raise 404."""
    entry = _cache.get(session_id)
    if not entry:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not in cache — reload first")
    return entry
 # ---------------------------------------------------------------------------
 # Pydantic Models
 # ---------------------------------------------------------------------------
 class ManualDeskewRequest(BaseModel):
    angle: float
 class DeskewGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_angle: Optional[float] = None
    notes: Optional[str] = None
 class ManualDewarpRequest(BaseModel):
    shear_degrees: float
 class CombinedAdjustRequest(BaseModel):
    rotation_degrees: float = 0.0
    shear_degrees: float = 0.0
 class DewarpGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_shear: Optional[float] = None
    notes: Optional[str] = None
 VALID_DOCUMENT_CATEGORIES = {
    'vokabelseite', 'buchseite', 'arbeitsblatt', 'klausurseite',
    'mathearbeit', 'statistik', 'zeitung', 'formular', 'handschrift', 'sonstiges',
 }
 class UpdateSessionRequest(BaseModel):
    name: Optional[str] = None
    document_category: Optional[str] = None
 class ManualColumnsRequest(BaseModel):
    columns: List[Dict[str, Any]]
 class ColumnGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_columns: Optional[List[Dict[str, Any]]] = None
    notes: Optional[str] = None
 class ManualRowsRequest(BaseModel):
    rows: List[Dict[str, Any]]
 class RowGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_rows: Optional[List[Dict[str, Any]]] = None
    notes: Optional[str] = None
 class RemoveHandwritingRequest(BaseModel):
    method: str = "auto"       # "auto" | "telea" | "ns"
    target_ink: str = "all"    # "all" | "colored" | "pencil"
    dilation: int = 2          # mask dilation iterations (0-5)
    use_source: str = "auto"   # "original" | "deskewed" | "auto"
 # ---------------------------------------------------------------------------
 # Pipeline Log Helper
 # ---------------------------------------------------------------------------
 async def _append_pipeline_log(
    session_id: str,
    step_name: str,
    metrics: Dict[str, Any],
    success: bool = True,
    duration_ms: Optional[int] = None,
 ):
    """Append a step entry to the session's pipeline_log JSONB."""
    session = await get_session_db(session_id)
    if not session:
        return
    log = session.get("pipeline_log") or {"steps": []}
    if not isinstance(log, dict):
        log = {"steps": []}
    entry = {
        "step": step_name,
        "completed_at": datetime.utcnow().isoformat(),
        "success": success,
        "metrics": metrics,
    }
    if duration_ms is not None:
        entry["duration_ms"] = duration_ms
    log.setdefault("steps", []).append(entry)
    await update_session_db(session_id, pipeline_log=log)
 # ---------------------------------------------------------------------------
 # Border-ghost word filter
 # ---------------------------------------------------------------------------
 # Characters that OCR produces when reading box-border lines.
 _BORDER_GHOST_CHARS = set("|1lI![](){}iíì/\\-—–_~.,;:'\"")
 def _filter_border_ghost_words(
    word_result: Dict,
    boxes: List,
 ) -> int:
    """Remove OCR words that are actually box border lines.
    A word is considered a border ghost when it sits on a known box edge
    (left, right, top, or bottom) and looks like a line artefact (narrow
    aspect ratio or text consists only of line-like characters).
    After removing ghost cells, columns that have become empty are also
    removed from ``columns_used`` so the grid no longer shows phantom
    columns.
    Modifies *word_result* in-place and returns the number of removed cells.
    """
    if not boxes or not word_result:
        return 0
    cells = word_result.get("cells")
    if not cells:
        return 0
    # Build border bands — vertical (X) and horizontal (Y)
    x_bands = []  # list of (x_lo, x_hi)
    y_bands = []  # list of (y_lo, y_hi)
    for b in boxes:
        bx = b.x if hasattr(b, "x") else b.get("x", 0)
        by = b.y if hasattr(b, "y") else b.get("y", 0)
        bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
        bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
        bt = b.border_thickness if hasattr(b, "border_thickness") else b.get("border_thickness", 3)
        margin = max(bt * 2, 10) + 6  # generous margin
        # Vertical edges (left / right)
        x_bands.append((bx - margin, bx + margin))
        x_bands.append((bx + bw - margin, bx + bw + margin))
        # Horizontal edges (top / bottom)
        y_bands.append((by - margin, by + margin))
        y_bands.append((by + bh - margin, by + bh + margin))
    img_w = word_result.get("image_width", 1)
    img_h = word_result.get("image_height", 1)
    def _is_ghost(cell: Dict) -> bool:
        text = (cell.get("text") or "").strip()
        if not text:
            return False
        # Compute absolute pixel position
        if cell.get("bbox_px"):
            px = cell["bbox_px"]
            cx = px["x"] + px["w"] / 2
            cy = px["y"] + px["h"] / 2
            cw = px["w"]
            ch = px["h"]
        elif cell.get("bbox_pct"):
            pct = cell["bbox_pct"]
            cx = (pct["x"] / 100) * img_w + (pct["w"] / 100) * img_w / 2
            cy = (pct["y"] / 100) * img_h + (pct["h"] / 100) * img_h / 2
            cw = (pct["w"] / 100) * img_w
            ch = (pct["h"] / 100) * img_h
        else:
            return False
        # Check if center sits on a vertical or horizontal border
        on_vertical = any(lo <= cx <= hi for lo, hi in x_bands)
        on_horizontal = any(lo <= cy <= hi for lo, hi in y_bands)
        if not on_vertical and not on_horizontal:
            return False
        # Very short text (1-2 chars) on a border → very likely ghost
        if len(text) <= 2:
            # Narrow vertically (line-like) or narrow horizontally (dash-like)?
            if ch > 0 and cw / ch < 0.5:
                return True
            if cw > 0 and ch / cw < 0.5:
                return True
            # Text is only border-ghost characters?
            if all(c in _BORDER_GHOST_CHARS for c in text):
                return True
        # Longer text but still only ghost chars and very narrow
        if all(c in _BORDER_GHOST_CHARS for c in text):
            if ch > 0 and cw / ch < 0.35:
                return True
            if cw > 0 and ch / cw < 0.35:
                return True
            return True  # all ghost chars on a border → remove
        return False
    before = len(cells)
    word_result["cells"] = [c for c in cells if not _is_ghost(c)]
    removed = before - len(word_result["cells"])
    # --- Remove empty columns from columns_used ---
    columns_used = word_result.get("columns_used")
    if removed and columns_used and len(columns_used) > 1:
        remaining_cells = word_result["cells"]
        occupied_cols = {c.get("col_index") for c in remaining_cells}
        before_cols = len(columns_used)
        columns_used = [col for col in columns_used if col.get("index") in occupied_cols]
        # Re-index columns and remap cell col_index values
        if len(columns_used) < before_cols:
            old_to_new = {}
            for new_i, col in enumerate(columns_used):
                old_to_new[col["index"]] = new_i
                col["index"] = new_i
            for cell in remaining_cells:
                old_ci = cell.get("col_index")
                if old_ci in old_to_new:
                    cell["col_index"] = old_to_new[old_ci]
            word_result["columns_used"] = columns_used
            logger.info("border-ghost: removed %d empty column(s), %d remaining",
                        before_cols - len(columns_used), len(columns_used))
    if removed:
        # Update summary counts
        summary = word_result.get("summary", {})
        summary["total_cells"] = len(word_result["cells"])
        summary["non_empty_cells"] = sum(1 for c in word_result["cells"] if c.get("text"))
        word_result["summary"] = summary
        gs = word_result.get("grid_shape", {})
        gs["total_cells"] = len(word_result["cells"])
        if columns_used is not None:
            gs["cols"] = len(columns_used)
        word_result["grid_shape"] = gs
    return removed
@@ -0,0 +1,615 @@
 """
 OCR Merge Helpers and Kombi Endpoints.
 Contains merge helper functions for combining PaddleOCR/RapidOCR with Tesseract
 results, plus the paddle-kombi and rapid-kombi endpoints.
 Extracted from ocr_pipeline_api.py for modularity.
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import time
 from typing import Any, Dict, List
 import cv2
 import httpx
 import numpy as np
 from fastapi import APIRouter, HTTPException
 from cv_words_first import build_grid_from_words
 from ocr_pipeline_common import _cache, _append_pipeline_log
 from ocr_pipeline_session_store import get_session_image, update_session_db
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Merge helper functions
 # ---------------------------------------------------------------------------
 def _split_paddle_multi_words(words: list) -> list:
    """Split PaddleOCR multi-word boxes into individual word boxes.
    PaddleOCR often returns entire phrases as a single box, e.g.
    "More than 200 singers took part in the" with one bounding box.
    This splits them into individual words with proportional widths.
    Also handles leading "!" (e.g. "!Betonung" → ["!", "Betonung"])
    and IPA brackets (e.g. "badge[bxd3]" → ["badge", "[bxd3]"]).
    """
    import re
    result = []
    for w in words:
        raw_text = w.get("text", "").strip()
        if not raw_text:
            continue
        # Split on whitespace, before "[" (IPA), and after "!" before letter
        tokens = re.split(
            r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text
        )
        tokens = [t for t in tokens if t]
        if len(tokens) <= 1:
            result.append(w)
        else:
            # Split proportionally by character count
            total_chars = sum(len(t) for t in tokens)
            if total_chars == 0:
                continue
            n_gaps = len(tokens) - 1
            gap_px = w["width"] * 0.02
            usable_w = w["width"] - gap_px * n_gaps
            cursor = w["left"]
            for t in tokens:
                token_w = max(1, usable_w * len(t) / total_chars)
                result.append({
                    "text": t,
                    "left": round(cursor),
                    "top": w["top"],
                    "width": round(token_w),
                    "height": w["height"],
                    "conf": w.get("conf", 0),
                })
                cursor += token_w + gap_px
    return result
 def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
    """Group words into rows by Y-position clustering.
    Words whose vertical centers are within `row_gap` pixels are on the same row.
    Returns list of rows, each row is a list of words sorted left-to-right.
    """
    if not words:
        return []
    # Sort by vertical center
    sorted_words = sorted(words, key=lambda w: w["top"] + w.get("height", 0) / 2)
    rows: list = []
    current_row: list = [sorted_words[0]]
    current_cy = sorted_words[0]["top"] + sorted_words[0].get("height", 0) / 2
    for w in sorted_words[1:]:
        cy = w["top"] + w.get("height", 0) / 2
        if abs(cy - current_cy) <= row_gap:
            current_row.append(w)
        else:
            # Sort current row left-to-right before saving
            rows.append(sorted(current_row, key=lambda w: w["left"]))
            current_row = [w]
            current_cy = cy
    if current_row:
        rows.append(sorted(current_row, key=lambda w: w["left"]))
    return rows
 def _row_center_y(row: list) -> float:
    """Average vertical center of a row of words."""
    if not row:
        return 0.0
    return sum(w["top"] + w.get("height", 0) / 2 for w in row) / len(row)
 def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
    """Merge two word sequences from the same row using sequence alignment.
    Both sequences are sorted left-to-right. Walk through both simultaneously:
    - If words match (same/similar text): take Paddle text with averaged coords
    - If they don't match: the extra word is unique to one engine, include it
    This prevents duplicates because both engines produce words in the same order.
    """
    merged = []
    pi, ti = 0, 0
    while pi < len(paddle_row) and ti < len(tess_row):
        pw = paddle_row[pi]
        tw = tess_row[ti]
        # Check if these are the same word
        pt = pw.get("text", "").lower().strip()
        tt = tw.get("text", "").lower().strip()
        # Same text or one contains the other
        is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
        # Spatial overlap check: if words overlap >= 40% horizontally,
        # they're the same physical word regardless of OCR text differences.
        # (40% catches borderline cases like "Stick"/"Stück" at 48% overlap)
        spatial_match = False
        if not is_same:
            overlap_left = max(pw["left"], tw["left"])
            overlap_right = min(
                pw["left"] + pw.get("width", 0),
                tw["left"] + tw.get("width", 0),
            )
            overlap_w = max(0, overlap_right - overlap_left)
            min_w = min(pw.get("width", 1), tw.get("width", 1))
            if min_w > 0 and overlap_w / min_w >= 0.4:
                is_same = True
                spatial_match = True
        if is_same:
            # Matched — average coordinates weighted by confidence
            pc = pw.get("conf", 80)
            tc = tw.get("conf", 50)
            total = pc + tc
            if total == 0:
                total = 1
            # Text: prefer higher-confidence engine when texts differ
            # (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80)
            if spatial_match and pc < tc:
                best_text = tw["text"]
            else:
                best_text = pw["text"]
            merged.append({
                "text": best_text,
                "left": round((pw["left"] * pc + tw["left"] * tc) / total),
                "top": round((pw["top"] * pc + tw["top"] * tc) / total),
                "width": round((pw["width"] * pc + tw["width"] * tc) / total),
                "height": round((pw["height"] * pc + tw["height"] * tc) / total),
                "conf": max(pc, tc),
            })
            pi += 1
            ti += 1
        else:
            # Different text — one engine found something extra
            # Look ahead: is the current Paddle word somewhere in Tesseract ahead?
            paddle_ahead = any(
                tess_row[t].get("text", "").lower().strip() == pt
                for t in range(ti + 1, min(ti + 4, len(tess_row)))
            )
            # Is the current Tesseract word somewhere in Paddle ahead?
            tess_ahead = any(
                paddle_row[p].get("text", "").lower().strip() == tt
                for p in range(pi + 1, min(pi + 4, len(paddle_row)))
            )
            if paddle_ahead and not tess_ahead:
                # Tesseract has an extra word (e.g. "!" or bullet) → include it
                if tw.get("conf", 0) >= 30:
                    merged.append(tw)
                ti += 1
            elif tess_ahead and not paddle_ahead:
                # Paddle has an extra word → include it
                merged.append(pw)
                pi += 1
            else:
                # Both have unique words or neither found ahead → take leftmost first
                if pw["left"] <= tw["left"]:
                    merged.append(pw)
                    pi += 1
                else:
                    if tw.get("conf", 0) >= 30:
                        merged.append(tw)
                    ti += 1
    # Remaining words from either engine
    while pi < len(paddle_row):
        merged.append(paddle_row[pi])
        pi += 1
    while ti < len(tess_row):
        tw = tess_row[ti]
        if tw.get("conf", 0) >= 30:
            merged.append(tw)
        ti += 1
    return merged
 def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract using row-based sequence alignment.
    Strategy:
    1. Group each engine's words into rows (by Y-position clustering)
    2. Match rows between engines (by vertical center proximity)
    3. Within each matched row: merge sequences left-to-right, deduplicating
       words that appear in both engines at the same sequence position
    4. Unmatched rows from either engine: keep as-is
    This prevents:
    - Cross-line averaging (words from different lines being merged)
    - Duplicate words (same word from both engines shown twice)
    """
    if not paddle_words and not tess_words:
        return []
    if not paddle_words:
        return [w for w in tess_words if w.get("conf", 0) >= 40]
    if not tess_words:
        return list(paddle_words)
    # Step 1: Group into rows
    paddle_rows = _group_words_into_rows(paddle_words)
    tess_rows = _group_words_into_rows(tess_words)
    # Step 2: Match rows between engines by vertical center proximity
    used_tess_rows: set = set()
    merged_all: list = []
    for pr in paddle_rows:
        pr_cy = _row_center_y(pr)
        best_dist, best_tri = float("inf"), -1
        for tri, tr in enumerate(tess_rows):
            if tri in used_tess_rows:
                continue
            tr_cy = _row_center_y(tr)
            dist = abs(pr_cy - tr_cy)
            if dist < best_dist:
                best_dist, best_tri = dist, tri
        # Row height threshold — rows must be within ~1.5x typical line height
        max_row_dist = max(
            max((w.get("height", 20) for w in pr), default=20),
            15,
        )
        if best_tri >= 0 and best_dist <= max_row_dist:
            # Matched row — merge sequences
            tr = tess_rows[best_tri]
            used_tess_rows.add(best_tri)
            merged_all.extend(_merge_row_sequences(pr, tr))
        else:
            # No matching Tesseract row — keep Paddle row as-is
            merged_all.extend(pr)
    # Add unmatched Tesseract rows
    for tri, tr in enumerate(tess_rows):
        if tri not in used_tess_rows:
            for tw in tr:
                if tw.get("conf", 0) >= 40:
                    merged_all.append(tw)
    return merged_all
 def _deduplicate_words(words: list) -> list:
    """Remove duplicate words with same text at overlapping positions.
    PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =")
    that produce duplicate words after splitting.  This pass removes them.
    A word is a duplicate only when BOTH horizontal AND vertical overlap
    exceed 50% — same text on the same visual line at the same position.
    """
    if not words:
        return words
    result: list = []
    for w in words:
        wt = w.get("text", "").lower().strip()
        if not wt:
            continue
        is_dup = False
        w_right = w["left"] + w.get("width", 0)
        w_bottom = w["top"] + w.get("height", 0)
        for existing in result:
            et = existing.get("text", "").lower().strip()
            if wt != et:
                continue
            # Horizontal overlap
            ox_l = max(w["left"], existing["left"])
            ox_r = min(w_right, existing["left"] + existing.get("width", 0))
            ox = max(0, ox_r - ox_l)
            min_w = min(w.get("width", 1), existing.get("width", 1))
            if min_w <= 0 or ox / min_w < 0.5:
                continue
            # Vertical overlap — must also be on the same line
            oy_t = max(w["top"], existing["top"])
            oy_b = min(w_bottom, existing["top"] + existing.get("height", 0))
            oy = max(0, oy_b - oy_t)
            min_h = min(w.get("height", 1), existing.get("height", 1))
            if min_h > 0 and oy / min_h >= 0.5:
                is_dup = True
                break
        if not is_dup:
            result.append(w)
    removed = len(words) - len(result)
    if removed:
        logger.info("dedup: removed %d duplicate words", removed)
    return result
 # ---------------------------------------------------------------------------
 # Kombi endpoints
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/paddle-kombi")
 async def paddle_kombi(session_id: str):
    """Run PaddleOCR + Tesseract on the preprocessed image and merge results.
    Both engines run on the same preprocessed (cropped/dewarped) image.
    Word boxes are matched by IoU and coordinates are averaged weighted by
    confidence. Unmatched Tesseract words (bullets, symbols) are added.
    """
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")
    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")
    img_h, img_w = img_bgr.shape[:2]
    from cv_ocr_engines import ocr_region_paddle
    t0 = time.time()
    # --- PaddleOCR ---
    paddle_words = await ocr_region_paddle(img_bgr, region=None)
    if not paddle_words:
        paddle_words = []
    # --- Tesseract ---
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i],
            "top": data["top"][i],
            "width": data["width"][i],
            "height": data["height"][i],
            "conf": conf,
        })
    # --- Split multi-word Paddle boxes into individual words ---
    paddle_words_split = _split_paddle_multi_words(paddle_words)
    logger.info(
        "paddle_kombi: split %d paddle boxes → %d individual words",
        len(paddle_words), len(paddle_words_split),
    )
    # --- Merge ---
    if not paddle_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
    merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)
    merged_words = _deduplicate_words(merged_words)
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0
    for cell in cells:
        cell["ocr_engine"] = "kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "kombi",
        "grid_method": "kombi",
        "raw_paddle_words": paddle_words,
        "raw_paddle_words_split": paddle_words_split,
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "paddle_words": len(paddle_words),
            "paddle_words_split": len(paddle_words_split),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    # Update in-memory cache so detect-structure can access word_result
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(
        "paddle_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[paddle=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(paddle_words), len(tess_words), len(merged_words),
    )
    await _append_pipeline_log(session_id, "paddle_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "paddle_words": len(paddle_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "kombi",
    }, duration_ms=int(duration * 1000))
    return {"session_id": session_id, **word_result}
@router.post("/sessions/{session_id}/rapid-kombi")
 async def rapid_kombi(session_id: str):
    """Run RapidOCR + Tesseract on the preprocessed image and merge results.
    Same merge logic as paddle-kombi, but uses local RapidOCR (ONNX Runtime)
    instead of remote PaddleOCR service.
    """
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")
    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")
    img_h, img_w = img_bgr.shape[:2]
    from cv_ocr_engines import ocr_region_rapid
    from cv_vocab_types import PageRegion
    t0 = time.time()
    # --- RapidOCR (local, synchronous) ---
    full_region = PageRegion(
        type="full_page", x=0, y=0, width=img_w, height=img_h,
    )
    rapid_words = ocr_region_rapid(img_bgr, full_region)
    if not rapid_words:
        rapid_words = []
    # --- Tesseract ---
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i],
            "top": data["top"][i],
            "width": data["width"][i],
            "height": data["height"][i],
            "conf": conf,
        })
    # --- Split multi-word RapidOCR boxes into individual words ---
    rapid_words_split = _split_paddle_multi_words(rapid_words)
    logger.info(
        "rapid_kombi: split %d rapid boxes → %d individual words",
        len(rapid_words), len(rapid_words_split),
    )
    # --- Merge ---
    if not rapid_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
    merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
    merged_words = _deduplicate_words(merged_words)
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "rapid_kombi",
        "grid_method": "rapid_kombi",
        "raw_rapid_words": rapid_words,
        "raw_rapid_words_split": rapid_words_split,
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "rapid_words": len(rapid_words),
            "rapid_words_split": len(rapid_words_split),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    # Update in-memory cache so detect-structure can access word_result
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(
        "rapid_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[rapid=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(rapid_words), len(tess_words), len(merged_words),
    )
    await _append_pipeline_log(session_id, "rapid_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "rapid_words": len(rapid_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "rapid_kombi",
    }, duration_ms=int(duration * 1000))
    return {"session_id": session_id, **word_result}
@@ -0,0 +1,929 @@
 """
 OCR Pipeline Postprocessing API — LLM review, reconstruction, export, validation,
 image detection/generation, and handwriting removal endpoints.
 Extracted from ocr_pipeline_api.py to keep the main module manageable.
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import json
 import logging
 import os
 import re
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 from fastapi import APIRouter, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from cv_vocab_pipeline import (
    OLLAMA_REVIEW_MODEL,
    llm_review_entries,
    llm_review_entries_streaming,
 )
 from ocr_pipeline_session_store import (
    get_session_db,
    get_session_image,
    get_sub_sessions,
    update_session_db,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _get_base_image_png,
    _append_pipeline_log,
    RemoveHandwritingRequest,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Pydantic Models
 # ---------------------------------------------------------------------------
 STYLE_SUFFIXES = {
    "educational": "educational illustration, textbook style, clear, colorful",
    "cartoon": "cartoon, child-friendly, simple shapes",
    "sketch": "pencil sketch, hand-drawn, black and white",
    "clipart": "clipart, flat vector style, simple",
    "realistic": "photorealistic, high detail",
 }
 class ValidationRequest(BaseModel):
    notes: Optional[str] = None
    score: Optional[int] = None
 class GenerateImageRequest(BaseModel):
    region_index: int
    prompt: str
    style: str = "educational"
 # ---------------------------------------------------------------------------
 # Step 8: LLM Review
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/llm-review")
 async def run_llm_review(session_id: str, request: Request, stream: bool = False):
    """Run LLM-based correction on vocab entries from Step 5.
    Query params:
        stream: false (default) for JSON response, true for SSE streaming
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found — run Step 5 first")
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if not entries:
        raise HTTPException(status_code=400, detail="No vocab entries found — run Step 5 first")
    # Optional model override from request body
    body = {}
    try:
        body = await request.json()
    except Exception:
        pass
    model = body.get("model") or OLLAMA_REVIEW_MODEL
    if stream:
        return StreamingResponse(
            _llm_review_stream_generator(session_id, entries, word_result, model, request),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
        )
    # Non-streaming path
    try:
        result = await llm_review_entries(entries, model=model)
    except Exception as e:
        import traceback
        logger.error(f"LLM review failed for session {session_id}: {type(e).__name__}: {e}\n{traceback.format_exc()}")
        raise HTTPException(status_code=502, detail=f"LLM review failed ({type(e).__name__}): {e}")
    # Store result inside word_result as a sub-key
    word_result["llm_review"] = {
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "entries_corrected": result["entries_corrected"],
    }
    await update_session_db(session_id, word_result=word_result, current_step=9)
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, "
                f"{result['duration_ms']}ms, model={result['model_used']}")
    await _append_pipeline_log(session_id, "correction", {
        "engine": "llm",
        "model": result["model_used"],
        "total_entries": len(entries),
        "corrections_proposed": len(result["changes"]),
    }, duration_ms=result["duration_ms"])
    return {
        "session_id": session_id,
        "changes": result["changes"],
        "model_used": result["model_used"],
        "duration_ms": result["duration_ms"],
        "total_entries": len(entries),
        "corrections_found": len(result["changes"]),
    }
 async def _llm_review_stream_generator(
    session_id: str,
    entries: List[Dict],
    word_result: Dict,
    model: str,
    request: Request,
 ):
    """SSE generator that yields batch-by-batch LLM review progress."""
    try:
        async for event in llm_review_entries_streaming(entries, model=model):
            if await request.is_disconnected():
                logger.info(f"SSE: client disconnected during LLM review for {session_id}")
                return
            yield f"data: {json.dumps(event, ensure_ascii=False)}\n\n"
            # On complete: persist to DB
            if event.get("type") == "complete":
                word_result["llm_review"] = {
                    "changes": event["changes"],
                    "model_used": event["model_used"],
                    "duration_ms": event["duration_ms"],
                    "entries_corrected": event["entries_corrected"],
                }
                await update_session_db(session_id, word_result=word_result, current_step=9)
                if session_id in _cache:
                    _cache[session_id]["word_result"] = word_result
                logger.info(f"LLM review SSE session {session_id}: {event['corrections_found']} changes, "
                            f"{event['duration_ms']}ms, skipped={event['skipped']}, model={event['model_used']}")
    except Exception as e:
        import traceback
        logger.error(f"LLM review SSE failed for {session_id}: {type(e).__name__}: {e}\n{traceback.format_exc()}")
        error_event = {"type": "error", "detail": f"{type(e).__name__}: {e}"}
        yield f"data: {json.dumps(error_event)}\n\n"
@router.post("/sessions/{session_id}/llm-review/apply")
 async def apply_llm_corrections(session_id: str, request: Request):
    """Apply selected LLM corrections to vocab entries."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    llm_review = word_result.get("llm_review")
    if not llm_review:
        raise HTTPException(status_code=400, detail="No LLM review found — run /llm-review first")
    body = await request.json()
    accepted_indices = set(body.get("accepted_indices", []))  # indices into changes[]
    changes = llm_review.get("changes", [])
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    # Build a lookup: (row_index, field) -> new_value for accepted changes
    corrections = {}
    applied_count = 0
    for idx, change in enumerate(changes):
        if idx in accepted_indices:
            key = (change["row_index"], change["field"])
            corrections[key] = change["new"]
            applied_count += 1
    # Apply corrections to entries
    for entry in entries:
        row_idx = entry.get("row_index", -1)
        for field_name in ("english", "german", "example"):
            key = (row_idx, field_name)
            if key in corrections:
                entry[field_name] = corrections[key]
                entry["llm_corrected"] = True
    # Update word_result
    word_result["vocab_entries"] = entries
    word_result["entries"] = entries
    word_result["llm_review"]["applied_count"] = applied_count
    word_result["llm_review"]["applied_at"] = datetime.utcnow().isoformat()
    await update_session_db(session_id, word_result=word_result)
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    logger.info(f"Applied {applied_count}/{len(changes)} LLM corrections for session {session_id}")
    return {
        "session_id": session_id,
        "applied_count": applied_count,
        "total_changes": len(changes),
    }
 # ---------------------------------------------------------------------------
 # Step 9: Reconstruction + Fabric JSON export
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/reconstruction")
 async def save_reconstruction(session_id: str, request: Request):
    """Save edited cell texts from reconstruction step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    body = await request.json()
    cell_updates = body.get("cells", [])
    if not cell_updates:
        await update_session_db(session_id, current_step=10)
        return {"session_id": session_id, "updated": 0}
    # Build update map: cell_id -> new text
    update_map = {c["cell_id"]: c["text"] for c in cell_updates}
    # Separate sub-session updates (cell_ids prefixed with "box{N}_")
    sub_updates: Dict[int, Dict[str, str]] = {}  # box_index -> {original_cell_id: text}
    main_updates: Dict[str, str] = {}
    for cell_id, text in update_map.items():
        m = re.match(r'^box(\d+)_(.+)$', cell_id)
        if m:
            bi = int(m.group(1))
            original_id = m.group(2)
            sub_updates.setdefault(bi, {})[original_id] = text
        else:
            main_updates[cell_id] = text
    # Update main session cells
    cells = word_result.get("cells", [])
    updated_count = 0
    for cell in cells:
        if cell["cell_id"] in main_updates:
            cell["text"] = main_updates[cell["cell_id"]]
            cell["status"] = "edited"
            updated_count += 1
    word_result["cells"] = cells
    # Also update vocab_entries if present
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if entries:
        # Map cell_id pattern "R{row}_C{col}" to entry fields
        for entry in entries:
            row_idx = entry.get("row_index", -1)
            # Check each field's cell
            for col_idx, field_name in enumerate(["english", "german", "example"]):
                cell_id = f"R{row_idx:02d}_C{col_idx}"
                # Also try without zero-padding
                cell_id_alt = f"R{row_idx}_C{col_idx}"
                new_text = main_updates.get(cell_id) or main_updates.get(cell_id_alt)
                if new_text is not None:
                    entry[field_name] = new_text
        word_result["vocab_entries"] = entries
        if "entries" in word_result:
            word_result["entries"] = entries
    await update_session_db(session_id, word_result=word_result, current_step=10)
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result
    # Route sub-session updates
    sub_updated = 0
    if sub_updates:
        subs = await get_sub_sessions(session_id)
        sub_by_index = {s.get("box_index"): s["id"] for s in subs}
        for bi, updates in sub_updates.items():
            sub_id = sub_by_index.get(bi)
            if not sub_id:
                continue
            sub_session = await get_session_db(sub_id)
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result")
            if not sub_word:
                continue
            sub_cells = sub_word.get("cells", [])
            for cell in sub_cells:
                if cell["cell_id"] in updates:
                    cell["text"] = updates[cell["cell_id"]]
                    cell["status"] = "edited"
                    sub_updated += 1
            sub_word["cells"] = sub_cells
            await update_session_db(sub_id, word_result=sub_word)
            if sub_id in _cache:
                _cache[sub_id]["word_result"] = sub_word
    total_updated = updated_count + sub_updated
    logger.info(f"Reconstruction saved for session {session_id}: "
                f"{updated_count} main + {sub_updated} sub-session cells updated")
    return {
        "session_id": session_id,
        "updated": total_updated,
        "main_updated": updated_count,
        "sub_updated": sub_updated,
    }
@router.get("/sessions/{session_id}/reconstruction/fabric-json")
 async def get_fabric_json(session_id: str):
    """Return cell grid as Fabric.js-compatible JSON for the canvas editor.
    If the session has sub-sessions (box regions), their cells are merged
    into the result at the correct Y positions.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    cells = list(word_result.get("cells", []))
    img_w = word_result.get("image_width", 800)
    img_h = word_result.get("image_height", 600)
    # Merge sub-session cells at box positions
    subs = await get_sub_sessions(session_id)
    if subs:
        column_result = session.get("column_result") or {}
        zones = column_result.get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
        for sub in subs:
            sub_session = await get_session_db(sub["id"])
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result")
            if not sub_word or not sub_word.get("cells"):
                continue
            bi = sub.get("box_index", 0)
            if bi < len(box_zones):
                box = box_zones[bi]["box"]
                box_y, box_x = box["y"], box["x"]
            else:
                box_y, box_x = 0, 0
            # Offset sub-session cells to absolute page coordinates
            for cell in sub_word["cells"]:
                cell_copy = dict(cell)
                # Prefix cell_id with box index
                cell_copy["cell_id"] = f"box{bi}_{cell_copy.get('cell_id', '')}"
                cell_copy["source"] = f"box_{bi}"
                # Offset bbox_px
                bbox = cell_copy.get("bbox_px", {})
                if bbox:
                    bbox = dict(bbox)
                    bbox["x"] = bbox.get("x", 0) + box_x
                    bbox["y"] = bbox.get("y", 0) + box_y
                    cell_copy["bbox_px"] = bbox
                cells.append(cell_copy)
    from services.layout_reconstruction_service import cells_to_fabric_json
    fabric_json = cells_to_fabric_json(cells, img_w, img_h)
    return fabric_json
 # ---------------------------------------------------------------------------
 # Vocab entries merged + PDF/DOCX export
 # ---------------------------------------------------------------------------
@router.get("/sessions/{session_id}/vocab-entries/merged")
 async def get_merged_vocab_entries(session_id: str):
    """Return vocab entries from main session + all sub-sessions, sorted by Y position."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result") or {}
    entries = list(word_result.get("vocab_entries") or word_result.get("entries") or [])
    # Tag main entries
    for e in entries:
        e.setdefault("source", "main")
    # Merge sub-session entries
    subs = await get_sub_sessions(session_id)
    if subs:
        column_result = session.get("column_result") or {}
        zones = column_result.get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
        for sub in subs:
            sub_session = await get_session_db(sub["id"])
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result") or {}
            sub_entries = sub_word.get("vocab_entries") or sub_word.get("entries") or []
            bi = sub.get("box_index", 0)
            box_y = 0
            if bi < len(box_zones):
                box_y = box_zones[bi]["box"]["y"]
            for e in sub_entries:
                e_copy = dict(e)
                e_copy["source"] = f"box_{bi}"
                e_copy["source_y"] = box_y  # for sorting
                entries.append(e_copy)
    # Sort by approximate Y position
    def _sort_key(e):
        if e.get("source", "main") == "main":
            return e.get("row_index", 0) * 100  # main entries by row index
        return e.get("source_y", 0) * 100 + e.get("row_index", 0)
    entries.sort(key=_sort_key)
    return {
        "session_id": session_id,
        "entries": entries,
        "total": len(entries),
        "sources": list(set(e.get("source", "main") for e in entries)),
    }
@router.get("/sessions/{session_id}/reconstruction/export/pdf")
 async def export_reconstruction_pdf(session_id: str):
    """Export the reconstructed cell grid as a PDF table."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)
    # Build table data: rows x columns
    table_data: list[list[str]] = []
    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]
    table_data.append(header)
    for r in range(n_rows):
        row_texts = []
        for ci in range(n_cols):
            cell_id = f"R{r:02d}_C{ci}"
            cell = next((c for c in cells if c.get("cell_id") == cell_id), None)
            row_texts.append(cell.get("text", "") if cell else "")
        table_data.append(row_texts)
    # Generate PDF with reportlab
    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.lib import colors
        from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
        import io as _io
        buf = _io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=A4)
        if not table_data or not table_data[0]:
            raise HTTPException(status_code=400, detail="No data to export")
        t = Table(table_data)
        t.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#0d9488')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('WORDWRAP', (0, 0), (-1, -1), True),
        ]))
        doc.build([t])
        buf.seek(0)
        from fastapi.responses import StreamingResponse
        return StreamingResponse(
            buf,
            media_type="application/pdf",
            headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.pdf"'},
        )
    except ImportError:
        raise HTTPException(status_code=501, detail="reportlab not installed")
@router.get("/sessions/{session_id}/reconstruction/export/docx")
 async def export_reconstruction_docx(session_id: str):
    """Export the reconstructed cell grid as a DOCX table."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)
    try:
        from docx import Document
        from docx.shared import Pt
        import io as _io
        doc = Document()
        doc.add_heading(f'Rekonstruktion – Session {session_id[:8]}', level=1)
        # Build header
        header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
        if not header:
            header = [f"Col {i}" for i in range(n_cols)]
        table = doc.add_table(rows=1 + n_rows, cols=max(n_cols, 1))
        table.style = 'Table Grid'
        # Header row
        for ci, h in enumerate(header):
            table.rows[0].cells[ci].text = h
        # Data rows
        for r in range(n_rows):
            for ci in range(n_cols):
                cell_id = f"R{r:02d}_C{ci}"
                cell = next((c for c in cells if c.get("cell_id") == cell_id), None)
                table.rows[r + 1].cells[ci].text = cell.get("text", "") if cell else ""
        buf = _io.BytesIO()
        doc.save(buf)
        buf.seek(0)
        from fastapi.responses import StreamingResponse
        return StreamingResponse(
            buf,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.docx"'},
        )
    except ImportError:
        raise HTTPException(status_code=501, detail="python-docx not installed")
 # ---------------------------------------------------------------------------
 # Step 8: Validation — Original vs. Reconstruction
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/reconstruction/detect-images")
 async def detect_image_regions(session_id: str):
    """Detect illustration/image regions in the original scan using VLM.
    Sends the original image to qwen2.5vl to find non-text, non-table
    image areas, returning bounding boxes (in %) and descriptions.
    """
    import base64
    import httpx
    import re
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Get original image bytes
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=400, detail="No original image found")
    # Build context from vocab entries for richer descriptions
    word_result = session.get("word_result") or {}
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    vocab_context = ""
    if entries:
        sample = entries[:10]
        words = [f"{e.get('english', '')} / {e.get('german', '')}" for e in sample if e.get('english')]
        if words:
            vocab_context = f"\nContext: This is a vocabulary page with words like: {', '.join(words)}"
    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
    prompt = (
        "Analyze this scanned page. Find ALL illustration/image/picture regions "
        "(NOT text, NOT table cells, NOT blank areas). "
        "For each image region found, return its bounding box as percentage of page dimensions "
        "and a short English description of what the image shows. "
        "Reply with ONLY a JSON array like: "
        '[{"x": 10, "y": 20, "w": 30, "h": 25, "description": "drawing of a cat"}] '
        "where x, y, w, h are percentages (0-100) of the page width/height. "
        "If there are NO images on the page, return an empty array: []"
        f"{vocab_context}"
    )
    img_b64 = base64.b64encode(original_png).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")
        # Parse JSON array from response
        match = re.search(r'\[.*?\]', text, re.DOTALL)
        if match:
            raw_regions = json.loads(match.group(0))
        else:
            raw_regions = []
        # Normalize to ImageRegion format
        regions = []
        for r in raw_regions:
            regions.append({
                "bbox_pct": {
                    "x": max(0, min(100, float(r.get("x", 0)))),
                    "y": max(0, min(100, float(r.get("y", 0)))),
                    "w": max(1, min(100, float(r.get("w", 10)))),
                    "h": max(1, min(100, float(r.get("h", 10)))),
                },
                "description": r.get("description", ""),
                "prompt": r.get("description", ""),
                "image_b64": None,
                "style": "educational",
            })
        # Enrich prompts with nearby vocab context
        if entries:
            for region in regions:
                ry = region["bbox_pct"]["y"]
                rh = region["bbox_pct"]["h"]
                nearby = [
                    e for e in entries
                    if e.get("bbox") and abs(e["bbox"].get("y", 0) - ry) < rh + 10
                ]
                if nearby:
                    en_words = [e.get("english", "") for e in nearby if e.get("english")]
                    de_words = [e.get("german", "") for e in nearby if e.get("german")]
                    if en_words or de_words:
                        context = f" (vocabulary context: {', '.join(en_words[:5])}"
                        if de_words:
                            context += f" / {', '.join(de_words[:5])}"
                        context += ")"
                        region["prompt"] = region["description"] + context
        # Save to ground_truth JSONB
        ground_truth = session.get("ground_truth") or {}
        validation = ground_truth.get("validation") or {}
        validation["image_regions"] = regions
        validation["detected_at"] = datetime.utcnow().isoformat()
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth
        logger.info(f"Detected {len(regions)} image regions for session {session_id}")
        return {"regions": regions, "count": len(regions)}
    except httpx.ConnectError:
        logger.warning(f"VLM not available at {ollama_base} for image detection")
        return {"regions": [], "count": 0, "error": "VLM not available"}
    except Exception as e:
        logger.error(f"Image detection failed for {session_id}: {e}")
        return {"regions": [], "count": 0, "error": str(e)}
@router.post("/sessions/{session_id}/reconstruction/generate-image")
 async def generate_image_for_region(session_id: str, req: GenerateImageRequest):
    """Generate a replacement image for a detected region using mflux.
    Sends the prompt (with style suffix) to the mflux-service running
    natively on the Mac Mini (Metal GPU required).
    """
    import httpx
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    regions = validation.get("image_regions") or []
    if req.region_index < 0 or req.region_index >= len(regions):
        raise HTTPException(status_code=400, detail=f"Invalid region_index {req.region_index}, have {len(regions)} regions")
    mflux_url = os.getenv("MFLUX_URL", "http://host.docker.internal:8095")
    style_suffix = STYLE_SUFFIXES.get(req.style, STYLE_SUFFIXES["educational"])
    full_prompt = f"{req.prompt}, {style_suffix}"
    # Determine image size from region aspect ratio (snap to multiples of 64)
    region = regions[req.region_index]
    bbox = region["bbox_pct"]
    aspect = bbox["w"] / max(bbox["h"], 1)
    if aspect > 1.3:
        width, height = 768, 512
    elif aspect < 0.7:
        width, height = 512, 768
    else:
        width, height = 512, 512
    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(f"{mflux_url}/generate", json={
                "prompt": full_prompt,
                "width": width,
                "height": height,
                "steps": 4,
            })
            resp.raise_for_status()
            data = resp.json()
            image_b64 = data.get("image_b64")
        if not image_b64:
            return {"image_b64": None, "success": False, "error": "No image returned"}
        # Save to ground_truth
        regions[req.region_index]["image_b64"] = image_b64
        regions[req.region_index]["prompt"] = req.prompt
        regions[req.region_index]["style"] = req.style
        validation["image_regions"] = regions
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth
        logger.info(f"Generated image for session {session_id} region {req.region_index}")
        return {"image_b64": image_b64, "success": True}
    except httpx.ConnectError:
        logger.warning(f"mflux-service not available at {mflux_url}")
        return {"image_b64": None, "success": False, "error": f"mflux-service not available at {mflux_url}"}
    except Exception as e:
        logger.error(f"Image generation failed for {session_id}: {e}")
        return {"image_b64": None, "success": False, "error": str(e)}
@router.post("/sessions/{session_id}/reconstruction/validate")
 async def save_validation(session_id: str, req: ValidationRequest):
    """Save final validation results for step 8.
    Stores notes, score, and preserves any detected/generated image regions.
    Sets current_step = 10 to mark pipeline as complete.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    validation["validated_at"] = datetime.utcnow().isoformat()
    validation["notes"] = req.notes
    validation["score"] = req.score
    ground_truth["validation"] = validation
    await update_session_db(session_id, ground_truth=ground_truth, current_step=11)
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    logger.info(f"Validation saved for session {session_id}: score={req.score}")
    return {"session_id": session_id, "validation": validation}
@router.get("/sessions/{session_id}/reconstruction/validation")
 async def get_validation(session_id: str):
    """Retrieve saved validation data for step 8."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation")
    return {
        "session_id": session_id,
        "validation": validation,
        "word_result": session.get("word_result"),
    }
 # ---------------------------------------------------------------------------
 # Remove handwriting
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/remove-handwriting")
 async def remove_handwriting_endpoint(session_id: str, req: RemoveHandwritingRequest):
    """
    Remove handwriting from a session image using inpainting.
    Steps:
    1. Load source image (auto -> deskewed if available, else original)
    2. Detect handwriting mask (filtered by target_ink)
    3. Dilate mask to cover stroke edges
    4. Inpaint the image
    5. Store result as clean_png in the session
    Returns metadata including the URL to fetch the clean image.
    """
    import time as _time
    t0 = _time.monotonic()
    from services.handwriting_detection import detect_handwriting
    from services.inpainting_service import inpaint_image, dilate_mask as _dilate_mask, InpaintingMethod, image_to_png
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # 1. Determine source image
    source = req.use_source
    if source == "auto":
        deskewed = await get_session_image(session_id, "deskewed")
        source = "deskewed" if deskewed else "original"
    image_bytes = await get_session_image(session_id, source)
    if not image_bytes:
        raise HTTPException(status_code=404, detail=f"Source image '{source}' not available")
    # 2. Detect handwriting mask
    detection = detect_handwriting(image_bytes, target_ink=req.target_ink)
    # 3. Convert mask to PNG bytes and dilate
    import io
    from PIL import Image as _PILImage
    mask_img = _PILImage.fromarray(detection.mask)
    mask_buf = io.BytesIO()
    mask_img.save(mask_buf, format="PNG")
    mask_bytes = mask_buf.getvalue()
    if req.dilation > 0:
        mask_bytes = _dilate_mask(mask_bytes, iterations=req.dilation)
    # 4. Inpaint
    method_map = {
        "telea": InpaintingMethod.OPENCV_TELEA,
        "ns": InpaintingMethod.OPENCV_NS,
        "auto": InpaintingMethod.AUTO,
    }
    inpaint_method = method_map.get(req.method, InpaintingMethod.AUTO)
    result = inpaint_image(image_bytes, mask_bytes, method=inpaint_method)
    if not result.success:
        raise HTTPException(status_code=500, detail="Inpainting failed")
    elapsed_ms = int((_time.monotonic() - t0) * 1000)
    meta = {
        "method_used": result.method_used.value if hasattr(result.method_used, "value") else str(result.method_used),
        "handwriting_ratio": round(detection.handwriting_ratio, 4),
        "detection_confidence": round(detection.confidence, 4),
        "target_ink": req.target_ink,
        "dilation": req.dilation,
        "source_image": source,
        "processing_time_ms": elapsed_ms,
    }
    # 5. Persist clean image (convert BGR ndarray -> PNG bytes)
    clean_png_bytes = image_to_png(result.image)
    await update_session_db(session_id, clean_png=clean_png_bytes, handwriting_removal_meta=meta)
    return {
        **meta,
        "image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/clean",
        "session_id": session_id,
    }
@@ -0,0 +1,348 @@
 """
 OCR Pipeline - Row Detection Endpoints.
 Extracted from ocr_pipeline_api.py.
 Handles row detection (auto + manual) and row ground truth.
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import time
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from fastapi import APIRouter, HTTPException
 from cv_vocab_pipeline import (
    create_ocr_image,
    detect_column_geometry,
    detect_row_geometry,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _append_pipeline_log,
    ManualRowsRequest,
    RowGroundTruthRequest,
 )
 from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Helper: Box-exclusion overlay (used by rows overlay and columns overlay)
 # ---------------------------------------------------------------------------
 def _draw_box_exclusion_overlay(
    img: np.ndarray,
    zones: List[Dict],
    *,
    label: str = "BOX — separat verarbeitet",
 ) -> None:
    """Draw red semi-transparent rectangles over box zones (in-place).
    Reusable for columns, rows, and words overlays.
    """
    for zone in zones:
        if zone.get("zone_type") != "box" or not zone.get("box"):
            continue
        box = zone["box"]
        bx, by = box["x"], box["y"]
        bw, bh = box["width"], box["height"]
        # Red semi-transparent fill (~25 %)
        box_overlay = img.copy()
        cv2.rectangle(box_overlay, (bx, by), (bx + bw, by + bh), (0, 0, 200), -1)
        cv2.addWeighted(box_overlay, 0.25, img, 0.75, 0, img)
        # Border
        cv2.rectangle(img, (bx, by), (bx + bw, by + bh), (0, 0, 200), 2)
        # Label
        cv2.putText(img, label, (bx + 10, by + bh - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
 # ---------------------------------------------------------------------------
 # Row Detection Endpoints
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/rows")
 async def detect_rows(session_id: str):
    """Run row detection on the cropped (or dewarped) image using horizontal gap analysis."""
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before row detection")
    t0 = time.time()
    # Try to reuse cached word_dicts and inv from column detection
    word_dicts = cached.get("_word_dicts")
    inv = cached.get("_inv")
    content_bounds = cached.get("_content_bounds")
    if word_dicts is None or inv is None or content_bounds is None:
        # Not cached — run column geometry to get intermediates
        ocr_img = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
        if geo_result is None:
            raise HTTPException(status_code=400, detail="Column geometry detection failed — cannot detect rows")
        _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
    else:
        left_x, right_x, top_y, bottom_y = content_bounds
    # Read zones from column_result to exclude box regions
    session = await get_session_db(session_id)
    column_result = (session or {}).get("column_result") or {}
    is_sub_session = bool((session or {}).get("parent_session_id"))
    # Sub-sessions (box crops): use word-grouping instead of gap-based
    # row detection.  Box images are small with complex internal layouts
    # (headings, sub-columns) where the horizontal projection approach
    # merges rows.  Word-grouping directly clusters words by Y proximity,
    # which is more robust for these cases.
    if is_sub_session and word_dicts:
        from cv_layout import _build_rows_from_word_grouping
        rows = _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y,
            right_x - left_x, bottom_y - top_y,
        )
        logger.info(f"OCR Pipeline: sub-session {session_id}: word-grouping found {len(rows)} rows")
    else:
        zones = column_result.get("zones") or []  # zones can be None for sub-sessions
        # Collect box y-ranges for filtering.
        # Use border_thickness to shrink the exclusion zone: the border pixels
        # belong visually to the box frame, but text rows above/below the box
        # may overlap with the border area and must not be clipped.
        box_ranges = []  # [(y_start, y_end)]
        box_ranges_inner = []  # [(y_start + border, y_end - border)] for row filtering
        for zone in zones:
            if zone.get("zone_type") == "box" and zone.get("box"):
                box = zone["box"]
                bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
                box_ranges.append((box["y"], box["y"] + box["height"]))
                # Inner range: shrink by border thickness so boundary rows aren't excluded
                box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
        if box_ranges and inv is not None:
            # Combined-image approach: strip box regions from inv image,
            # run row detection on the combined image, then remap y-coords back.
            content_strips = []  # [(y_start, y_end)] in absolute coords
            # Build content strips by subtracting box inner ranges from [top_y, bottom_y].
            # Using inner ranges means the border area is included in the content
            # strips, so the last row above a box isn't clipped by the border.
            sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0])
            strip_start = top_y
            for by_start, by_end in sorted_boxes:
                if by_start > strip_start:
                    content_strips.append((strip_start, by_start))
                strip_start = max(strip_start, by_end)
            if strip_start < bottom_y:
                content_strips.append((strip_start, bottom_y))
            # Filter to strips with meaningful height
            content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
            if content_strips:
                # Stack content strips vertically
                inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
                combined_inv = np.vstack(inv_strips)
                # Filter word_dicts to only include words from content strips
                combined_words = []
                cum_y = 0
                strip_offsets = []  # (combined_y_start, strip_height, abs_y_start)
                for ys, ye in content_strips:
                    h = ye - ys
                    strip_offsets.append((cum_y, h, ys))
                    for w in word_dicts:
                        w_abs_y = w['top'] + top_y  # word y is relative to content top
                        w_center = w_abs_y + w['height'] / 2
                        if ys <= w_center < ye:
                            # Remap to combined coordinates
                            w_copy = dict(w)
                            w_copy['top'] = cum_y + (w_abs_y - ys)
                            combined_words.append(w_copy)
                    cum_y += h
                # Run row detection on combined image
                combined_h = combined_inv.shape[0]
                rows = detect_row_geometry(
                    combined_inv, combined_words, left_x, right_x, 0, combined_h,
                )
                # Remap y-coordinates back to absolute page coords
                def _combined_y_to_abs(cy: int) -> int:
                    for c_start, s_h, abs_start in strip_offsets:
                        if cy < c_start + s_h:
                            return abs_start + (cy - c_start)
                    last_c, last_h, last_abs = strip_offsets[-1]
                    return last_abs + last_h
                for r in rows:
                    abs_y = _combined_y_to_abs(r.y)
                    abs_y_end = _combined_y_to_abs(r.y + r.height)
                    r.y = abs_y
                    r.height = abs_y_end - abs_y
            else:
                rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
        else:
            # No boxes — standard row detection
            rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
    duration = time.time() - t0
    # Assign zone_index based on which content zone each row falls in
    # Build content zone list with indices
    zones = column_result.get("zones") or []
    content_zones = [(i, z) for i, z in enumerate(zones) if z.get("zone_type") == "content"] if zones else []
    # Build serializable result (exclude words to keep payload small)
    rows_data = []
    for r in rows:
        # Determine zone_index
        zone_idx = 0
        row_center_y = r.y + r.height / 2
        for zi, zone in content_zones:
            zy = zone["y"]
            zh = zone["height"]
            if zy <= row_center_y < zy + zh:
                zone_idx = zi
                break
        rd = {
            "index": r.index,
            "x": r.x,
            "y": r.y,
            "width": r.width,
            "height": r.height,
            "word_count": r.word_count,
            "row_type": r.row_type,
            "gap_before": r.gap_before,
            "zone_index": zone_idx,
        }
        rows_data.append(rd)
    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    row_result = {
        "rows": rows_data,
        "summary": type_counts,
        "total_rows": len(rows),
        "duration_seconds": round(duration, 2),
    }
    # Persist to DB — also invalidate word_result since rows changed
    await update_session_db(
        session_id,
        row_result=row_result,
        word_result=None,
        current_step=7,
    )
    cached["row_result"] = row_result
    cached.pop("word_result", None)
    logger.info(f"OCR Pipeline: rows session {session_id}: "
                f"{len(rows)} rows detected ({duration:.2f}s): {type_counts}")
    content_rows = sum(1 for r in rows if r.row_type == "content")
    avg_height = round(sum(r.height for r in rows) / len(rows)) if rows else 0
    await _append_pipeline_log(session_id, "rows", {
        "total_rows": len(rows),
        "content_rows": content_rows,
        "artifact_rows_removed": type_counts.get("header", 0) + type_counts.get("footer", 0),
        "avg_row_height_px": avg_height,
    }, duration_ms=int(duration * 1000))
    return {
        "session_id": session_id,
        **row_result,
    }
@router.post("/sessions/{session_id}/rows/manual")
 async def set_manual_rows(session_id: str, req: ManualRowsRequest):
    """Override detected rows with manual definitions."""
    row_result = {
        "rows": req.rows,
        "total_rows": len(req.rows),
        "duration_seconds": 0,
        "method": "manual",
    }
    await update_session_db(session_id, row_result=row_result, word_result=None)
    if session_id in _cache:
        _cache[session_id]["row_result"] = row_result
        _cache[session_id].pop("word_result", None)
    logger.info(f"OCR Pipeline: manual rows session {session_id}: "
                f"{len(req.rows)} rows set")
    return {"session_id": session_id, **row_result}
@router.post("/sessions/{session_id}/ground-truth/rows")
 async def save_row_ground_truth(session_id: str, req: RowGroundTruthRequest):
    """Save ground truth feedback for the row detection step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_rows": req.corrected_rows,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "row_result": session.get("row_result"),
    }
    ground_truth["rows"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": gt}
@router.get("/sessions/{session_id}/ground-truth/rows")
 async def get_row_ground_truth(session_id: str):
    """Retrieve saved ground truth for row detection."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    rows_gt = ground_truth.get("rows")
    if not rows_gt:
        raise HTTPException(status_code=404, detail="No row ground truth saved")
    return {
        "session_id": session_id,
        "rows_gt": rows_gt,
        "rows_auto": session.get("row_result"),
    }
@@ -0,0 +1,483 @@
 """
 OCR Pipeline Sessions API - Session management and image serving endpoints.
 Extracted from ocr_pipeline_api.py for modularity.
 Handles: CRUD for sessions, thumbnails, pipeline logs, categories,
 image serving (with overlay dispatch), and document type detection.
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import logging
 import time
 import uuid
 from typing import Any, Dict, Optional
 import cv2
 import numpy as np
 from fastapi import APIRouter, File, Form, HTTPException, Query, UploadFile
 from fastapi.responses import Response
 from cv_vocab_pipeline import (
    create_ocr_image,
    detect_document_type,
    render_image_high_res,
    render_pdf_high_res,
 )
 from ocr_pipeline_common import (
    VALID_DOCUMENT_CATEGORIES,
    UpdateSessionRequest,
    _append_pipeline_log,
    _cache,
    _get_base_image_png,
    _get_cached,
    _load_session_to_cache,
 )
 from ocr_pipeline_overlays import render_overlay
 from ocr_pipeline_session_store import (
    create_session_db,
    delete_all_sessions_db,
    delete_session_db,
    get_session_db,
    get_session_image,
    get_sub_sessions,
    list_sessions_db,
    update_session_db,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Session Management Endpoints
 # ---------------------------------------------------------------------------
@router.get("/sessions")
 async def list_sessions(include_sub_sessions: bool = False):
    """List OCR pipeline sessions.
    By default, sub-sessions (box regions) are hidden.
    Pass ?include_sub_sessions=true to show them.
    """
    sessions = await list_sessions_db(include_sub_sessions=include_sub_sessions)
    return {"sessions": sessions}
@router.post("/sessions")
 async def create_session(
    file: UploadFile = File(...),
    name: Optional[str] = Form(None),
 ):
    """Upload a PDF or image file and create a pipeline session."""
    file_data = await file.read()
    filename = file.filename or "upload"
    content_type = file.content_type or ""
    session_id = str(uuid.uuid4())
    is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf")
    try:
        if is_pdf:
            img_bgr = render_pdf_high_res(file_data, page_number=0, zoom=3.0)
        else:
            img_bgr = render_image_high_res(file_data)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not process file: {e}")
    # Encode original as PNG bytes
    success, png_buf = cv2.imencode(".png", img_bgr)
    if not success:
        raise HTTPException(status_code=500, detail="Failed to encode image")
    original_png = png_buf.tobytes()
    session_name = name or filename
    # Persist to DB
    await create_session_db(
        session_id=session_id,
        name=session_name,
        filename=filename,
        original_png=original_png,
    )
    # Cache BGR array for immediate processing
    _cache[session_id] = {
        "id": session_id,
        "filename": filename,
        "name": session_name,
        "original_bgr": img_bgr,
        "oriented_bgr": None,
        "cropped_bgr": None,
        "deskewed_bgr": None,
        "dewarped_bgr": None,
        "orientation_result": None,
        "crop_result": None,
        "deskew_result": None,
        "dewarp_result": None,
        "ground_truth": {},
        "current_step": 1,
    }
    logger.info(f"OCR Pipeline: created session {session_id} from {filename} "
                f"({img_bgr.shape[1]}x{img_bgr.shape[0]})")
    return {
        "session_id": session_id,
        "filename": filename,
        "name": session_name,
        "image_width": img_bgr.shape[1],
        "image_height": img_bgr.shape[0],
        "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
    }
@router.get("/sessions/{session_id}")
 async def get_session_info(session_id: str):
    """Get session info including deskew/dewarp/column results for step navigation."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Get image dimensions from original PNG
    original_png = await get_session_image(session_id, "original")
    if original_png:
        arr = np.frombuffer(original_png, dtype=np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        img_w, img_h = img.shape[1], img.shape[0] if img is not None else (0, 0)
    else:
        img_w, img_h = 0, 0
    result = {
        "session_id": session["id"],
        "filename": session.get("filename", ""),
        "name": session.get("name", ""),
        "image_width": img_w,
        "image_height": img_h,
        "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
        "current_step": session.get("current_step", 1),
        "document_category": session.get("document_category"),
        "doc_type": session.get("doc_type"),
    }
    if session.get("orientation_result"):
        result["orientation_result"] = session["orientation_result"]
    if session.get("crop_result"):
        result["crop_result"] = session["crop_result"]
    if session.get("deskew_result"):
        result["deskew_result"] = session["deskew_result"]
    if session.get("dewarp_result"):
        result["dewarp_result"] = session["dewarp_result"]
    if session.get("column_result"):
        result["column_result"] = session["column_result"]
    if session.get("row_result"):
        result["row_result"] = session["row_result"]
    if session.get("word_result"):
        result["word_result"] = session["word_result"]
    if session.get("doc_type_result"):
        result["doc_type_result"] = session["doc_type_result"]
    # Sub-session info
    if session.get("parent_session_id"):
        result["parent_session_id"] = session["parent_session_id"]
        result["box_index"] = session.get("box_index")
    else:
        # Check for sub-sessions
        subs = await get_sub_sessions(session_id)
        if subs:
            result["sub_sessions"] = [
                {"id": s["id"], "name": s.get("name"), "box_index": s.get("box_index")}
                for s in subs
            ]
    return result
@router.put("/sessions/{session_id}")
 async def update_session(session_id: str, req: UpdateSessionRequest):
    """Update session name and/or document category."""
    kwargs: Dict[str, Any] = {}
    if req.name is not None:
        kwargs["name"] = req.name
    if req.document_category is not None:
        if req.document_category not in VALID_DOCUMENT_CATEGORIES:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid category '{req.document_category}'. Valid: {sorted(VALID_DOCUMENT_CATEGORIES)}",
            )
        kwargs["document_category"] = req.document_category
    if not kwargs:
        raise HTTPException(status_code=400, detail="Nothing to update")
    updated = await update_session_db(session_id, **kwargs)
    if not updated:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    return {"session_id": session_id, **kwargs}
@router.delete("/sessions/{session_id}")
 async def delete_session(session_id: str):
    """Delete a session."""
    _cache.pop(session_id, None)
    deleted = await delete_session_db(session_id)
    if not deleted:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    return {"session_id": session_id, "deleted": True}
@router.delete("/sessions")
 async def delete_all_sessions():
    """Delete ALL sessions (cleanup)."""
    _cache.clear()
    count = await delete_all_sessions_db()
    return {"deleted_count": count}
@router.post("/sessions/{session_id}/create-box-sessions")
 async def create_box_sessions(session_id: str):
    """Create sub-sessions for each detected box region.
    Crops box regions from the cropped/dewarped image and creates
    independent sub-sessions that can be processed through the pipeline.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    column_result = session.get("column_result")
    if not column_result:
        raise HTTPException(status_code=400, detail="Column detection must be completed first")
    zones = column_result.get("zones") or []
    box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
    if not box_zones:
        return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"}
    # Check for existing sub-sessions
    existing = await get_sub_sessions(session_id)
    if existing:
        return {
            "session_id": session_id,
            "sub_sessions": [{"id": s["id"], "box_index": s.get("box_index")} for s in existing],
            "message": f"{len(existing)} sub-session(s) already exist",
        }
    # Load base image
    base_png = await get_session_image(session_id, "cropped")
    if not base_png:
        base_png = await get_session_image(session_id, "dewarped")
    if not base_png:
        raise HTTPException(status_code=400, detail="No base image available")
    arr = np.frombuffer(base_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")
    parent_name = session.get("name", "Session")
    created = []
    for i, zone in enumerate(box_zones):
        box = zone["box"]
        bx, by = box["x"], box["y"]
        bw, bh = box["width"], box["height"]
        # Crop box region with small padding
        pad = 5
        y1 = max(0, by - pad)
        y2 = min(img.shape[0], by + bh + pad)
        x1 = max(0, bx - pad)
        x2 = min(img.shape[1], bx + bw + pad)
        crop = img[y1:y2, x1:x2]
        # Encode as PNG
        success, png_buf = cv2.imencode(".png", crop)
        if not success:
            logger.warning(f"Failed to encode box {i} crop for session {session_id}")
            continue
        sub_id = str(uuid.uuid4())
        sub_name = f"{parent_name} — Box {i + 1}"
        await create_session_db(
            session_id=sub_id,
            name=sub_name,
            filename=session.get("filename", "box-crop.png"),
            original_png=png_buf.tobytes(),
            parent_session_id=session_id,
            box_index=i,
        )
        # Cache the BGR for immediate processing
        # Promote original to cropped so column/row/word detection finds it
        box_bgr = crop.copy()
        _cache[sub_id] = {
            "id": sub_id,
            "filename": session.get("filename", "box-crop.png"),
            "name": sub_name,
            "parent_session_id": session_id,
            "original_bgr": box_bgr,
            "oriented_bgr": None,
            "cropped_bgr": box_bgr,
            "deskewed_bgr": None,
            "dewarped_bgr": None,
            "orientation_result": None,
            "crop_result": None,
            "deskew_result": None,
            "dewarp_result": None,
            "ground_truth": {},
            "current_step": 1,
        }
        created.append({
            "id": sub_id,
            "name": sub_name,
            "box_index": i,
            "box": box,
            "image_width": crop.shape[1],
            "image_height": crop.shape[0],
        })
        logger.info(f"Created box sub-session {sub_id} for session {session_id} "
                     f"(box {i}, {crop.shape[1]}x{crop.shape[0]})")
    return {
        "session_id": session_id,
        "sub_sessions": created,
        "total": len(created),
    }
@router.get("/sessions/{session_id}/thumbnail")
 async def get_session_thumbnail(session_id: str, size: int = Query(default=80, ge=16, le=400)):
    """Return a small thumbnail of the original image."""
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found or no image")
    arr = np.frombuffer(original_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")
    h, w = img.shape[:2]
    scale = size / max(h, w)
    new_w, new_h = int(w * scale), int(h * scale)
    thumb = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
    _, png_bytes = cv2.imencode(".png", thumb)
    return Response(content=png_bytes.tobytes(), media_type="image/png",
                    headers={"Cache-Control": "public, max-age=3600"})
@router.get("/sessions/{session_id}/pipeline-log")
 async def get_pipeline_log(session_id: str):
    """Get the pipeline execution log for a session."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    return {"session_id": session_id, "pipeline_log": session.get("pipeline_log") or {"steps": []}}
@router.get("/categories")
 async def list_categories():
    """List valid document categories."""
    return {"categories": sorted(VALID_DOCUMENT_CATEGORIES)}
 # ---------------------------------------------------------------------------
 # Image Endpoints
 # ---------------------------------------------------------------------------
@router.get("/sessions/{session_id}/image/{image_type}")
 async def get_image(session_id: str, image_type: str):
    """Serve session images: original, deskewed, dewarped, binarized, structure-overlay, columns-overlay, or rows-overlay."""
    valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "structure-overlay", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
    if image_type not in valid_types:
        raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
    if image_type == "structure-overlay":
        return await render_overlay("structure", session_id)
    if image_type == "columns-overlay":
        return await render_overlay("columns", session_id)
    if image_type == "rows-overlay":
        return await render_overlay("rows", session_id)
    if image_type == "words-overlay":
        return await render_overlay("words", session_id)
    # Try cache first for fast serving
    cached = _cache.get(session_id)
    if cached:
        png_key = f"{image_type}_png" if image_type != "original" else None
        bgr_key = f"{image_type}_bgr" if image_type != "binarized" else None
        # For binarized, check if we have it cached as PNG
        if image_type == "binarized" and cached.get("binarized_png"):
            return Response(content=cached["binarized_png"], media_type="image/png")
    # Load from DB — for cropped/dewarped, fall back through the chain
    if image_type in ("cropped", "dewarped"):
        data = await _get_base_image_png(session_id)
    else:
        data = await get_session_image(session_id, image_type)
    if not data:
        raise HTTPException(status_code=404, detail=f"Image '{image_type}' not available yet")
    return Response(content=data, media_type="image/png")
 # ---------------------------------------------------------------------------
 # Document Type Detection (between Dewarp and Columns)
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/detect-type")
 async def detect_type(session_id: str):
    """Detect document type (vocab_table, full_text, generic_table).
    Should be called after crop (clean image available).
    Falls back to dewarped if crop was skipped.
    Stores result in session for frontend to decide pipeline flow.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    img_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
    t0 = time.time()
    ocr_img = create_ocr_image(img_bgr)
    result = detect_document_type(ocr_img, img_bgr)
    duration = time.time() - t0
    result_dict = {
        "doc_type": result.doc_type,
        "confidence": result.confidence,
        "pipeline": result.pipeline,
        "skip_steps": result.skip_steps,
        "features": result.features,
        "duration_seconds": round(duration, 2),
    }
    # Persist to DB
    await update_session_db(
        session_id,
        doc_type=result.doc_type,
        doc_type_result=result_dict,
    )
    cached["doc_type_result"] = result_dict
    logger.info(f"OCR Pipeline: detect-type session {session_id}: "
                f"{result.doc_type} (confidence={result.confidence}, {duration:.2f}s)")
    await _append_pipeline_log(session_id, "detect_type", {
        "doc_type": result.doc_type,
        "pipeline": result.pipeline,
        "confidence": result.confidence,
        **{k: v for k, v in (result.features or {}).items() if isinstance(v, (int, float, str, bool))},
    }, duration_ms=int(duration * 1000))
    return {"session_id": session_id, **result_dict}
@@ -0,0 +1,876 @@
 """
 OCR Pipeline Words - Word detection and ground truth endpoints.
 Extracted from ocr_pipeline_api.py.
 Handles:
 - POST /sessions/{session_id}/words — main SSE streaming word detection
 - POST /sessions/{session_id}/paddle-direct — PaddleOCR direct endpoint
 - POST /sessions/{session_id}/ground-truth/words — save ground truth
 - GET  /sessions/{session_id}/ground-truth/words — get ground truth
 Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import json
 import logging
 import time
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
 from fastapi import APIRouter, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_character_confusion,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    build_cell_grid_v2_streaming,
    create_ocr_image,
    detect_column_geometry,
 )
 from cv_words_first import build_grid_from_words
 from ocr_pipeline_session_store import (
    get_session_db,
    get_session_image,
    update_session_db,
 )
 from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _get_base_image_png,
    _append_pipeline_log,
 )
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
 # ---------------------------------------------------------------------------
 # Pydantic models
 # ---------------------------------------------------------------------------
 class WordGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    notes: Optional[str] = None
 # ---------------------------------------------------------------------------
 # Word Detection Endpoint (Step 7)
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
 async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
 ):
    """Build word grid from columns × rows, OCR each cell.
    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming
        skip_heal_gaps: false (default). When true, cells keep exact row geometry
            positions without gap-healing expansion. Better for overlay rendering.
        grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
            'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection needed).
    """
    # PaddleOCR is full-page remote OCR → force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"
    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        # No column detection — synthesize a single full-page pseudo-column.
        # This enables the overlay pipeline which skips column detection.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": img_w_tmp, "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")
    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]
    # Convert row dicts back to RowGeometry objects
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in row_result["rows"]
    ]
    # Cell-First OCR (v2): no full-page word re-population needed.
    # Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
    # We still need word_count > 0 for row filtering in build_cell_grid_v2,
    # so populate from cached words if available (just for counting).
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
    if word_dicts:
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0
        for row in row_geoms:
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)
    # Exclude rows that fall within box zones.
    # Use inner box range (shrunk by border_thickness) so that rows at
    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info(f"detect_words: excluded {excluded} rows inside box zones")
    # --- Words-First path: bottom-up grid from word boxes ---
    if grid_method == "words_first":
        t0 = time.time()
        img_h, img_w = dewarped_bgr.shape[:2]
        # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
        if engine == "paddle":
            from cv_ocr_engines import ocr_region_paddle
            wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
            # PaddleOCR returns absolute coordinates, no content_bounds offset needed
            cached["_paddle_word_dicts"] = wf_word_dicts
        else:
            # Get word_dicts from cache or run Tesseract full-page
            wf_word_dicts = cached.get("_word_dicts")
            if wf_word_dicts is None:
                ocr_img_tmp = create_ocr_image(dewarped_bgr)
                geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
                if geo_result is not None:
                    _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                    cached["_word_dicts"] = wf_word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
        if not wf_word_dicts:
            raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
        # Convert word coordinates to absolute image coordinates if needed
        # (detect_column_geometry returns words relative to content ROI)
        # PaddleOCR already returns absolute coordinates — skip offset.
        if engine != "paddle":
            content_bounds = cached.get("_content_bounds")
            if content_bounds:
                lx, _rx, ty, _by = content_bounds
                abs_words = []
                for w in wf_word_dicts:
                    abs_words.append({
                        **w,
                        'left': w['left'] + lx,
                        'top': w['top'] + ty,
                    })
                wf_word_dicts = abs_words
        # Extract box rects for box-aware column clustering
        box_rects = []
        for zone in zones:
            if zone.get("zone_type") == "box" and zone.get("box"):
                box_rects.append(zone["box"])
        cells, columns_meta = build_grid_from_words(
            wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
        )
        duration = time.time() - t0
        # Apply IPA phonetic fixes
        fix_cell_phonetics(cells, pronunciation=pronunciation)
        # Add zone_index for backward compat
        for cell in cells:
            cell.setdefault("zone_index", 0)
        col_types = {c['type'] for c in columns_meta}
        is_vocab = bool(col_types & {'column_en', 'column_de'})
        n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
        n_cols = len(columns_meta)
        used_engine = "paddle" if engine == "paddle" else "words_first"
        word_result = {
            "cells": cells,
            "grid_shape": {
                "rows": n_rows,
                "cols": n_cols,
                "total_cells": len(cells),
            },
            "columns_used": columns_meta,
            "layout": "vocab" if is_vocab else "generic",
            "image_width": img_w,
            "image_height": img_h,
            "duration_seconds": round(duration, 2),
            "ocr_engine": used_engine,
            "grid_method": "words_first",
            "summary": {
                "total_cells": len(cells),
                "non_empty_cells": sum(1 for c in cells if c.get("text")),
                "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            },
        }
        if is_vocab or 'column_text' in col_types:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
            word_result["vocab_entries"] = entries
            word_result["entries"] = entries
            word_result["entry_count"] = len(entries)
            word_result["summary"]["total_entries"] = len(entries)
            word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
            word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        await update_session_db(session_id, word_result=word_result, current_step=8)
        cached["word_result"] = word_result
        logger.info(f"OCR Pipeline: words-first session {session_id}: "
                    f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
        await _append_pipeline_log(session_id, "words", {
            "grid_method": "words_first",
            "total_cells": len(cells),
            "non_empty_cells": word_result["summary"]["non_empty_cells"],
            "ocr_engine": used_engine,
            "layout": word_result["layout"],
        }, duration_ms=int(duration * 1000))
        return {"session_id": session_id, **word_result}
    if stream:
        # Cell-First OCR v2: use batch-then-stream approach instead of
        # per-cell streaming. The parallel ThreadPoolExecutor in
        # build_cell_grid_v2 is much faster than sequential streaming.
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )
    # --- Non-streaming path (grid_method=v2) ---
    t0 = time.time()
    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]
    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0
    # Add zone_index to each cell (default 0 for backward compatibility)
    for cell in cells:
        cell.setdefault("zone_index", 0)
    # Layout detection
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    # Count content rows and columns for grid_shape
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len(columns_meta)
    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine
    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)
    # Grid result (always generic)
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }
    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result
    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")
    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))
    return {
        "session_id": session_id,
        **word_result,
    }
 async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
 ):
    """SSE generator that runs batch OCR (parallel) then streams results.
    Unlike the old per-cell streaming, this uses build_cell_grid_v2 with
    ThreadPoolExecutor for parallel OCR, then emits all cells as SSE events.
    The 'preparing' event keeps the connection alive during OCR processing.
    """
    import asyncio
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    total_cells = n_content_rows * n_cols
    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"
    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"
    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    # connections after 30-60s. Send keepalive every 5s to prevent this.
    loop = asyncio.get_event_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )
    # Send keepalive events every 5 seconds while OCR runs
    keepalive_count = 0
    while not ocr_future.done():
        try:
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            keepalive_count += 1
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                ocr_future.cancel()
                return
    else:
        cells, columns_meta = ocr_future.result()
    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return
    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)
    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"
    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"
    # 6. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries
    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result
    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")
    # 7. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
 async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
 ):
    """SSE generator that yields cell-by-cell OCR progress."""
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]
    # Compute grid shape upfront for the meta event
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    # Determine layout
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    # Start streaming — first event: meta
    columns_meta = None  # will be set from first yield
    total_cells = n_content_rows * n_cols
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"
    # Keepalive: send preparing event so proxy doesn't timeout during OCR init
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"
    # Stream cells one by one
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0
    last_keepalive = time.time()
    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return
        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used as part of first cell or update meta
            meta_update = {
                "type": "columns",
                "columns_used": cols_meta,
            }
            yield f"data: {json.dumps(meta_update)}\n\n"
        all_cells.append(cell)
        cell_idx += 1
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"
    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []
    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")
    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine
    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)
    word_result = {
        "cells": all_cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(all_cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }
    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries
    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result
    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")
    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
 # ---------------------------------------------------------------------------
 # PaddleOCR Direct Endpoint
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/paddle-direct")
 async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly.
    Expects orientation/deskew/dewarp/crop to be done already.
    Uses the cropped image (falls back to dewarped, then original).
    The used image is stored as cropped_png so OverlayReconstruction
    can display it as the background.
    """
    # Try preprocessed images first (crop > dewarp > original)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")
    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")
    img_h, img_w = img_bgr.shape[:2]
    from cv_ocr_engines import ocr_region_paddle
    t0 = time.time()
    word_dicts = await ocr_region_paddle(img_bgr, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")
    # Reuse build_grid_from_words — same function that works in the regular
    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
    # Handles phrase splitting, column clustering, and reading order.
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    duration = time.time() - t0
    # Tag cells as paddle_direct
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }
    # Store preprocessed image as cropped_png so OverlayReconstruction shows it
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), n_rows, n_cols, duration,
    )
    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(duration * 1000))
    return {"session_id": session_id, **word_result}
 # ---------------------------------------------------------------------------
 # Ground Truth Words Endpoints
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/ground-truth/words")
 async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": session.get("word_result"),
    }
    ground_truth["words"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": gt}
@router.get("/sessions/{session_id}/ground-truth/words")
 async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    words_gt = ground_truth.get("words")
    if not words_gt:
        raise HTTPException(status_code=404, detail="No word ground truth saved")
    return {
        "session_id": session_id,
        "words_gt": words_gt,
        "words_auto": session.get("word_result"),
    }