""" OCR Pipeline Auto-Mode Orchestrator. POST /sessions/{session_id}/run-auto -- full auto-mode with SSE streaming. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import time from dataclasses import asdict from typing import Any, Dict, List, Optional import cv2 import numpy as np from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse from cv_vocab_pipeline import ( OLLAMA_REVIEW_MODEL, PageRegion, RowGeometry, _cells_to_vocab_entries, _detect_header_footer_gaps, _detect_sub_columns, _fix_character_confusion, _fix_phonetic_brackets, fix_cell_phonetics, analyze_layout, build_cell_grid, classify_column_types, create_layout_image, create_ocr_image, deskew_image, deskew_image_by_word_alignment, detect_column_geometry, detect_row_geometry, _apply_shear, dewarp_image, llm_review_entries, ) from ocr_pipeline_common import ( _cache, _load_session_to_cache, _get_cached, ) from ocr_pipeline_session_store import ( get_session_db, update_session_db, ) from ocr_pipeline_auto_helpers import ( RunAutoRequest, auto_sse_event as _auto_sse_event, detect_shear_with_vlm as _detect_shear_with_vlm, ) logger = logging.getLogger(__name__) router = APIRouter(tags=["ocr-pipeline"]) @router.post("/sessions/{session_id}/run-auto") async def run_auto(session_id: str, req: RunAutoRequest, request: Request): """Run the full OCR pipeline automatically from a given step, streaming SSE progress. Steps: 1. Deskew -- straighten the scan 2. Dewarp -- correct vertical shear (ensemble CV or VLM) 3. Columns -- detect column layout 4. Rows -- detect row layout 5. Words -- OCR each cell 6. LLM review -- correct OCR errors (optional) Already-completed steps are skipped unless `from_step` forces a rerun. Yields SSE events of the form: data: {"step": "deskew", "status": "start"|"done"|"skipped"|"error", ...} Final event: data: {"step": "complete", "status": "done", "steps_run": [...], "steps_skipped": [...]} """ if req.from_step < 1 or req.from_step > 6: raise HTTPException(status_code=400, detail="from_step must be 1-6") if req.dewarp_method not in ("ensemble", "vlm", "cv"): raise HTTPException(status_code=400, detail="dewarp_method must be: ensemble, vlm, cv") if session_id not in _cache: await _load_session_to_cache(session_id) async def _generate(): steps_run: List[str] = [] steps_skipped: List[str] = [] error_step: Optional[str] = None session = await get_session_db(session_id) if not session: yield await _auto_sse_event("error", "error", {"message": f"Session {session_id} not found"}) return cached = _get_cached(session_id) # Step 1: Deskew if req.from_step <= 1: yield await _auto_sse_event("deskew", "start", {}) try: t0 = time.time() orig_bgr = cached.get("original_bgr") if orig_bgr is None: raise ValueError("Original image not loaded") try: deskewed_hough, angle_hough = deskew_image(orig_bgr.copy()) except Exception: deskewed_hough, angle_hough = orig_bgr, 0.0 success_enc, png_orig = cv2.imencode(".png", orig_bgr) orig_bytes = png_orig.tobytes() if success_enc else b"" try: deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes) except Exception: deskewed_wa_bytes, angle_wa = orig_bytes, 0.0 if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1: method_used = "word_alignment" angle_applied = angle_wa wa_arr = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8) deskewed_bgr = cv2.imdecode(wa_arr, cv2.IMREAD_COLOR) if deskewed_bgr is None: deskewed_bgr = deskewed_hough method_used = "hough" angle_applied = angle_hough else: method_used = "hough" angle_applied = angle_hough deskewed_bgr = deskewed_hough success, png_buf = cv2.imencode(".png", deskewed_bgr) deskewed_png = png_buf.tobytes() if success else b"" deskew_result = { "method_used": method_used, "rotation_degrees": round(float(angle_applied), 3), "duration_seconds": round(time.time() - t0, 2), } cached["deskewed_bgr"] = deskewed_bgr cached["deskew_result"] = deskew_result await update_session_db( session_id, deskewed_png=deskewed_png, deskew_result=deskew_result, auto_rotation_degrees=float(angle_applied), current_step=3, ) session = await get_session_db(session_id) steps_run.append("deskew") yield await _auto_sse_event("deskew", "done", deskew_result) except Exception as e: logger.error(f"Auto-mode deskew failed for {session_id}: {e}") error_step = "deskew" yield await _auto_sse_event("deskew", "error", {"message": str(e)}) yield await _auto_sse_event("complete", "error", {"error_step": error_step}) return else: steps_skipped.append("deskew") yield await _auto_sse_event("deskew", "skipped", {"reason": "from_step > 1"}) # Step 2: Dewarp if req.from_step <= 2: yield await _auto_sse_event("dewarp", "start", {"method": req.dewarp_method}) try: t0 = time.time() deskewed_bgr = cached.get("deskewed_bgr") if deskewed_bgr is None: raise ValueError("Deskewed image not available") if req.dewarp_method == "vlm": success_enc, png_buf = cv2.imencode(".png", deskewed_bgr) img_bytes = png_buf.tobytes() if success_enc else b"" vlm_det = await _detect_shear_with_vlm(img_bytes) shear_deg = vlm_det["shear_degrees"] if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3: dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg) else: dewarped_bgr = deskewed_bgr dewarp_info = { "method": vlm_det["method"], "shear_degrees": shear_deg, "confidence": vlm_det["confidence"], "detections": [vlm_det], } else: dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) success_enc, png_buf = cv2.imencode(".png", dewarped_bgr) dewarped_png = png_buf.tobytes() if success_enc else b"" dewarp_result = { "method_used": dewarp_info["method"], "shear_degrees": dewarp_info["shear_degrees"], "confidence": dewarp_info["confidence"], "duration_seconds": round(time.time() - t0, 2), "detections": dewarp_info.get("detections", []), } cached["dewarped_bgr"] = dewarped_bgr cached["dewarp_result"] = dewarp_result await update_session_db( session_id, dewarped_png=dewarped_png, dewarp_result=dewarp_result, auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0), current_step=4, ) session = await get_session_db(session_id) steps_run.append("dewarp") yield await _auto_sse_event("dewarp", "done", dewarp_result) except Exception as e: logger.error(f"Auto-mode dewarp failed for {session_id}: {e}") error_step = "dewarp" yield await _auto_sse_event("dewarp", "error", {"message": str(e)}) yield await _auto_sse_event("complete", "error", {"error_step": error_step}) return else: steps_skipped.append("dewarp") yield await _auto_sse_event("dewarp", "skipped", {"reason": "from_step > 2"}) # Step 3: Columns if req.from_step <= 3: yield await _auto_sse_event("columns", "start", {}) try: t0 = time.time() col_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr") if col_img is None: raise ValueError("Cropped/dewarped image not available") ocr_img = create_ocr_image(col_img) h, w = ocr_img.shape[:2] geo_result = detect_column_geometry(ocr_img, col_img) if geo_result is None: layout_img = create_layout_image(col_img) regions = analyze_layout(layout_img, ocr_img) cached["_word_dicts"] = None cached["_inv"] = None cached["_content_bounds"] = None else: geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result content_w = right_x - left_x cached["_word_dicts"] = word_dicts cached["_inv"] = inv cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None) geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y, header_y=header_y, footer_y=footer_y) regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, left_x=left_x, right_x=right_x, inv=inv) columns = [asdict(r) for r in regions] column_result = { "columns": columns, "classification_methods": list({c.get("classification_method", "") for c in columns if c.get("classification_method")}), "duration_seconds": round(time.time() - t0, 2), } cached["column_result"] = column_result await update_session_db(session_id, column_result=column_result, row_result=None, word_result=None, current_step=6) session = await get_session_db(session_id) steps_run.append("columns") yield await _auto_sse_event("columns", "done", { "column_count": len(columns), "duration_seconds": column_result["duration_seconds"], }) except Exception as e: logger.error(f"Auto-mode columns failed for {session_id}: {e}") error_step = "columns" yield await _auto_sse_event("columns", "error", {"message": str(e)}) yield await _auto_sse_event("complete", "error", {"error_step": error_step}) return else: steps_skipped.append("columns") yield await _auto_sse_event("columns", "skipped", {"reason": "from_step > 3"}) # Step 4: Rows if req.from_step <= 4: yield await _auto_sse_event("rows", "start", {}) try: t0 = time.time() row_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr") session = await get_session_db(session_id) column_result = session.get("column_result") or cached.get("column_result") if not column_result or not column_result.get("columns"): raise ValueError("Column detection must complete first") col_regions = [ PageRegion( type=c["type"], x=c["x"], y=c["y"], width=c["width"], height=c["height"], classification_confidence=c.get("classification_confidence", 1.0), classification_method=c.get("classification_method", ""), ) for c in column_result["columns"] ] word_dicts = cached.get("_word_dicts") inv = cached.get("_inv") content_bounds = cached.get("_content_bounds") if word_dicts is None or inv is None or content_bounds is None: ocr_img_tmp = create_ocr_image(row_img) geo_result = detect_column_geometry(ocr_img_tmp, row_img) if geo_result is None: raise ValueError("Column geometry detection failed -- cannot detect rows") _g, lx, rx, ty, by, word_dicts, inv = geo_result cached["_word_dicts"] = word_dicts cached["_inv"] = inv cached["_content_bounds"] = (lx, rx, ty, by) content_bounds = (lx, rx, ty, by) left_x, right_x, top_y, bottom_y = content_bounds row_geoms = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) row_list = [ { "index": r.index, "x": r.x, "y": r.y, "width": r.width, "height": r.height, "word_count": r.word_count, "row_type": r.row_type, "gap_before": r.gap_before, } for r in row_geoms ] row_result = { "rows": row_list, "row_count": len(row_list), "content_rows": len([r for r in row_geoms if r.row_type == "content"]), "duration_seconds": round(time.time() - t0, 2), } cached["row_result"] = row_result await update_session_db(session_id, row_result=row_result, current_step=7) session = await get_session_db(session_id) steps_run.append("rows") yield await _auto_sse_event("rows", "done", { "row_count": len(row_list), "content_rows": row_result["content_rows"], "duration_seconds": row_result["duration_seconds"], }) except Exception as e: logger.error(f"Auto-mode rows failed for {session_id}: {e}") error_step = "rows" yield await _auto_sse_event("rows", "error", {"message": str(e)}) yield await _auto_sse_event("complete", "error", {"error_step": error_step}) return else: steps_skipped.append("rows") yield await _auto_sse_event("rows", "skipped", {"reason": "from_step > 4"}) # Step 5: Words (OCR) if req.from_step <= 5: yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine}) try: t0 = time.time() word_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr") session = await get_session_db(session_id) column_result = session.get("column_result") or cached.get("column_result") row_result = session.get("row_result") or cached.get("row_result") col_regions = [ PageRegion( type=c["type"], x=c["x"], y=c["y"], width=c["width"], height=c["height"], classification_confidence=c.get("classification_confidence", 1.0), classification_method=c.get("classification_method", ""), ) for c in column_result["columns"] ] row_geoms = [ RowGeometry( index=r["index"], x=r["x"], y=r["y"], width=r["width"], height=r["height"], word_count=r.get("word_count", 0), words=[], row_type=r.get("row_type", "content"), gap_before=r.get("gap_before", 0), ) for r in row_result["rows"] ] word_dicts = cached.get("_word_dicts") if word_dicts is not None: content_bounds = cached.get("_content_bounds") top_y = content_bounds[2] if content_bounds else min(r.y for r in row_geoms) for row in row_geoms: row_y_rel = row.y - top_y row_bottom_rel = row_y_rel + row.height row.words = [ w for w in word_dicts if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel ] row.word_count = len(row.words) ocr_img = create_ocr_image(word_img) img_h, img_w = word_img.shape[:2] cells, columns_meta = build_cell_grid( ocr_img, col_regions, row_geoms, img_w, img_h, ocr_engine=req.ocr_engine, img_bgr=word_img, ) duration = time.time() - t0 col_types = {c['type'] for c in columns_meta} is_vocab = bool(col_types & {'column_en', 'column_de'}) n_content_rows = len([r for r in row_geoms if r.row_type == 'content']) used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine fix_cell_phonetics(cells, pronunciation=req.pronunciation) word_result_data = { "cells": cells, "grid_shape": { "rows": n_content_rows, "cols": len(columns_meta), "total_cells": len(cells), }, "columns_used": columns_meta, "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), "ocr_engine": used_engine, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), }, } has_text_col = 'column_text' in col_types if is_vocab or has_text_col: entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=req.pronunciation) word_result_data["vocab_entries"] = entries word_result_data["entries"] = entries word_result_data["entry_count"] = len(entries) word_result_data["summary"]["total_entries"] = len(entries) await update_session_db(session_id, word_result=word_result_data, current_step=8) cached["word_result"] = word_result_data session = await get_session_db(session_id) steps_run.append("words") yield await _auto_sse_event("words", "done", { "total_cells": len(cells), "layout": word_result_data["layout"], "duration_seconds": round(duration, 2), "ocr_engine": used_engine, "summary": word_result_data["summary"], }) except Exception as e: logger.error(f"Auto-mode words failed for {session_id}: {e}") error_step = "words" yield await _auto_sse_event("words", "error", {"message": str(e)}) yield await _auto_sse_event("complete", "error", {"error_step": error_step}) return else: steps_skipped.append("words") yield await _auto_sse_event("words", "skipped", {"reason": "from_step > 5"}) # Step 6: LLM Review (optional) if req.from_step <= 6 and not req.skip_llm_review: yield await _auto_sse_event("llm_review", "start", {"model": OLLAMA_REVIEW_MODEL}) try: session = await get_session_db(session_id) word_result = session.get("word_result") or cached.get("word_result") entries = word_result.get("entries") or word_result.get("vocab_entries") or [] if not entries: yield await _auto_sse_event("llm_review", "skipped", {"reason": "no entries"}) steps_skipped.append("llm_review") else: reviewed = await llm_review_entries(entries) session = await get_session_db(session_id) word_result_updated = dict(session.get("word_result") or {}) word_result_updated["entries"] = reviewed word_result_updated["vocab_entries"] = reviewed word_result_updated["llm_reviewed"] = True word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL await update_session_db(session_id, word_result=word_result_updated, current_step=9) cached["word_result"] = word_result_updated steps_run.append("llm_review") yield await _auto_sse_event("llm_review", "done", { "entries_reviewed": len(reviewed), "model": OLLAMA_REVIEW_MODEL, }) except Exception as e: logger.warning(f"Auto-mode llm_review failed for {session_id} (non-fatal): {e}") yield await _auto_sse_event("llm_review", "error", {"message": str(e), "fatal": False}) steps_skipped.append("llm_review") else: steps_skipped.append("llm_review") reason = "skipped by request" if req.skip_llm_review else "from_step > 6" yield await _auto_sse_event("llm_review", "skipped", {"reason": reason}) # Final event yield await _auto_sse_event("complete", "done", { "steps_run": steps_run, "steps_skipped": steps_skipped, }) return StreamingResponse( _generate(), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no", }, )