""" OCR Pipeline API - Schrittweise Seitenrekonstruktion. Zerlegt den OCR-Prozess in 7 einzelne Schritte: 1. Deskewing - Scan begradigen 2. Dewarping - Buchwoelbung entzerren 3. Spaltenerkennung - Unsichtbare Spalten finden 4. Worterkennung - OCR mit Bounding Boxes 5. Koordinatenzuweisung - Exakte Positionen 6. Seitenrekonstruktion - Seite nachbauen 7. Ground Truth Validierung - Gesamtpruefung Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import io import logging import time import uuid from datetime import datetime, timedelta from typing import Any, Dict, Optional import cv2 import numpy as np from fastapi import APIRouter, File, HTTPException, UploadFile from fastapi.responses import Response from pydantic import BaseModel from cv_vocab_pipeline import ( create_ocr_image, deskew_image, deskew_image_by_word_alignment, dewarp_image, dewarp_image_manual, render_image_high_res, render_pdf_high_res, ) logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"]) # --------------------------------------------------------------------------- # In-memory session store (24h TTL) # --------------------------------------------------------------------------- _sessions: Dict[str, Dict[str, Any]] = {} SESSION_TTL_HOURS = 24 def _cleanup_expired(): """Remove sessions older than TTL.""" cutoff = datetime.utcnow() - timedelta(hours=SESSION_TTL_HOURS) expired = [sid for sid, s in _sessions.items() if s.get("created_at", datetime.utcnow()) < cutoff] for sid in expired: del _sessions[sid] logger.info(f"OCR Pipeline: expired session {sid}") def _get_session(session_id: str) -> Dict[str, Any]: """Get session or raise 404.""" session = _sessions.get(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") return session # --------------------------------------------------------------------------- # Pydantic Models # --------------------------------------------------------------------------- class ManualDeskewRequest(BaseModel): angle: float class DeskewGroundTruthRequest(BaseModel): is_correct: bool corrected_angle: Optional[float] = None notes: Optional[str] = None class ManualDewarpRequest(BaseModel): scale: float class DewarpGroundTruthRequest(BaseModel): is_correct: bool corrected_scale: Optional[float] = None notes: Optional[str] = None # --------------------------------------------------------------------------- # Endpoints # --------------------------------------------------------------------------- @router.post("/sessions") async def create_session(file: UploadFile = File(...)): """Upload a PDF or image file and create a pipeline session.""" _cleanup_expired() file_data = await file.read() filename = file.filename or "upload" content_type = file.content_type or "" session_id = str(uuid.uuid4()) is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf") try: if is_pdf: img_bgr = render_pdf_high_res(file_data, page_number=0, zoom=3.0) else: img_bgr = render_image_high_res(file_data) except Exception as e: raise HTTPException(status_code=400, detail=f"Could not process file: {e}") # Encode original as PNG bytes for serving success, png_buf = cv2.imencode(".png", img_bgr) if not success: raise HTTPException(status_code=500, detail="Failed to encode image") _sessions[session_id] = { "id": session_id, "filename": filename, "created_at": datetime.utcnow(), "original_bgr": img_bgr, "original_png": png_buf.tobytes(), "deskewed_bgr": None, "deskewed_png": None, "binarized_png": None, "deskew_result": None, "dewarped_bgr": None, "dewarped_png": None, "dewarp_result": None, "displacement_map": None, "ground_truth": {}, "current_step": 1, } logger.info(f"OCR Pipeline: created session {session_id} from {filename} " f"({img_bgr.shape[1]}x{img_bgr.shape[0]})") return { "session_id": session_id, "filename": filename, "image_width": img_bgr.shape[1], "image_height": img_bgr.shape[0], "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original", } @router.post("/sessions/{session_id}/deskew") async def auto_deskew(session_id: str): """Run both deskew methods and pick the best one.""" session = _get_session(session_id) img_bgr = session["original_bgr"] t0 = time.time() # Method 1: Hough Lines try: deskewed_hough, angle_hough = deskew_image(img_bgr.copy()) except Exception as e: logger.warning(f"Hough deskew failed: {e}") deskewed_hough, angle_hough = img_bgr, 0.0 # Method 2: Word Alignment (needs image bytes) success_enc, png_orig = cv2.imencode(".png", img_bgr) orig_bytes = png_orig.tobytes() if success_enc else b"" try: deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes) except Exception as e: logger.warning(f"Word alignment deskew failed: {e}") deskewed_wa_bytes, angle_wa = orig_bytes, 0.0 duration = time.time() - t0 # Pick method with larger detected angle (more correction needed = more skew found) # If both are ~0, prefer word alignment as it's more robust if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1: method_used = "word_alignment" angle_applied = angle_wa # Decode word alignment result to BGR wa_array = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8) deskewed_bgr = cv2.imdecode(wa_array, cv2.IMREAD_COLOR) if deskewed_bgr is None: deskewed_bgr = deskewed_hough method_used = "hough" angle_applied = angle_hough else: method_used = "hough" angle_applied = angle_hough deskewed_bgr = deskewed_hough # Encode deskewed as PNG success, deskewed_png_buf = cv2.imencode(".png", deskewed_bgr) deskewed_png = deskewed_png_buf.tobytes() if success else session["original_png"] # Create binarized version try: binarized = create_ocr_image(deskewed_bgr) success_bin, bin_buf = cv2.imencode(".png", binarized) binarized_png = bin_buf.tobytes() if success_bin else None except Exception as e: logger.warning(f"Binarization failed: {e}") binarized_png = None # Confidence: higher angle = lower confidence that we got it right confidence = max(0.5, 1.0 - abs(angle_applied) / 5.0) deskew_result = { "angle_hough": round(angle_hough, 3), "angle_word_alignment": round(angle_wa, 3), "angle_applied": round(angle_applied, 3), "method_used": method_used, "confidence": round(confidence, 2), "duration_seconds": round(duration, 2), } session["deskewed_bgr"] = deskewed_bgr session["deskewed_png"] = deskewed_png session["binarized_png"] = binarized_png session["deskew_result"] = deskew_result logger.info(f"OCR Pipeline: deskew session {session_id}: " f"hough={angle_hough:.2f}° wa={angle_wa:.2f}° → {method_used} {angle_applied:.2f}°") return { "session_id": session_id, **deskew_result, "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed", "binarized_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/binarized", } @router.post("/sessions/{session_id}/deskew/manual") async def manual_deskew(session_id: str, req: ManualDeskewRequest): """Apply a manual rotation angle to the original image.""" session = _get_session(session_id) img_bgr = session["original_bgr"] angle = max(-5.0, min(5.0, req.angle)) h, w = img_bgr.shape[:2] center = (w // 2, h // 2) M = cv2.getRotationMatrix2D(center, angle, 1.0) rotated = cv2.warpAffine(img_bgr, M, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE) success, png_buf = cv2.imencode(".png", rotated) deskewed_png = png_buf.tobytes() if success else session["original_png"] # Binarize try: binarized = create_ocr_image(rotated) success_bin, bin_buf = cv2.imencode(".png", binarized) binarized_png = bin_buf.tobytes() if success_bin else None except Exception: binarized_png = None session["deskewed_bgr"] = rotated session["deskewed_png"] = deskewed_png session["binarized_png"] = binarized_png session["deskew_result"] = { **(session.get("deskew_result") or {}), "angle_applied": round(angle, 3), "method_used": "manual", } logger.info(f"OCR Pipeline: manual deskew session {session_id}: {angle:.2f}°") return { "session_id": session_id, "angle_applied": round(angle, 3), "method_used": "manual", "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed", } @router.get("/sessions/{session_id}/image/{image_type}") async def get_image(session_id: str, image_type: str): """Serve session images: original, deskewed, dewarped, or binarized.""" session = _get_session(session_id) if image_type == "original": data = session.get("original_png") elif image_type == "deskewed": data = session.get("deskewed_png") elif image_type == "dewarped": data = session.get("dewarped_png") elif image_type == "binarized": data = session.get("binarized_png") else: raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}") if not data: raise HTTPException(status_code=404, detail=f"Image '{image_type}' not available yet") return Response(content=data, media_type="image/png") @router.post("/sessions/{session_id}/ground-truth/deskew") async def save_deskew_ground_truth(session_id: str, req: DeskewGroundTruthRequest): """Save ground truth feedback for the deskew step.""" session = _get_session(session_id) gt = { "is_correct": req.is_correct, "corrected_angle": req.corrected_angle, "notes": req.notes, "saved_at": datetime.utcnow().isoformat(), "deskew_result": session.get("deskew_result"), } session["ground_truth"]["deskew"] = gt logger.info(f"OCR Pipeline: ground truth deskew session {session_id}: " f"correct={req.is_correct}, corrected_angle={req.corrected_angle}") return {"session_id": session_id, "ground_truth": gt} # --------------------------------------------------------------------------- # Dewarp Endpoints # --------------------------------------------------------------------------- @router.post("/sessions/{session_id}/dewarp") async def auto_dewarp(session_id: str): """Run both dewarp methods on the deskewed image and pick the best.""" session = _get_session(session_id) deskewed_bgr = session.get("deskewed_bgr") if deskewed_bgr is None: raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp") t0 = time.time() dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) duration = time.time() - t0 # Encode dewarped as PNG success, png_buf = cv2.imencode(".png", dewarped_bgr) dewarped_png = png_buf.tobytes() if success else session["deskewed_png"] session["dewarped_bgr"] = dewarped_bgr session["dewarped_png"] = dewarped_png session["dewarp_result"] = { "method_used": dewarp_info["method"], "curvature_px": dewarp_info["curvature_px"], "confidence": dewarp_info["confidence"], "duration_seconds": round(duration, 2), } session["displacement_map"] = dewarp_info.get("displacement_map") logger.info(f"OCR Pipeline: dewarp session {session_id}: " f"method={dewarp_info['method']} curvature={dewarp_info['curvature_px']:.1f}px " f"conf={dewarp_info['confidence']:.2f} ({duration:.2f}s)") return { "session_id": session_id, "method_used": dewarp_info["method"], "curvature_px": dewarp_info["curvature_px"], "confidence": dewarp_info["confidence"], "duration_seconds": round(duration, 2), "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped", } @router.post("/sessions/{session_id}/dewarp/manual") async def manual_dewarp(session_id: str, req: ManualDewarpRequest): """Apply dewarp with a manually scaled displacement map.""" session = _get_session(session_id) deskewed_bgr = session.get("deskewed_bgr") displacement_map = session.get("displacement_map") if deskewed_bgr is None: raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp") scale = max(-3.0, min(3.0, req.scale)) if displacement_map is None or abs(scale) < 0.01: # No displacement map or zero scale — use deskewed as-is dewarped_bgr = deskewed_bgr else: dewarped_bgr = dewarp_image_manual(deskewed_bgr, displacement_map, scale) success, png_buf = cv2.imencode(".png", dewarped_bgr) dewarped_png = png_buf.tobytes() if success else session.get("deskewed_png") session["dewarped_bgr"] = dewarped_bgr session["dewarped_png"] = dewarped_png session["dewarp_result"] = { **(session.get("dewarp_result") or {}), "method_used": "manual", "scale_applied": round(scale, 2), } logger.info(f"OCR Pipeline: manual dewarp session {session_id}: scale={scale:.2f}") return { "session_id": session_id, "scale_applied": round(scale, 2), "method_used": "manual", "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped", } @router.post("/sessions/{session_id}/ground-truth/dewarp") async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthRequest): """Save ground truth feedback for the dewarp step.""" session = _get_session(session_id) gt = { "is_correct": req.is_correct, "corrected_scale": req.corrected_scale, "notes": req.notes, "saved_at": datetime.utcnow().isoformat(), "dewarp_result": session.get("dewarp_result"), } session["ground_truth"]["dewarp"] = gt logger.info(f"OCR Pipeline: ground truth dewarp session {session_id}: " f"correct={req.is_correct}, corrected_scale={req.corrected_scale}") return {"session_id": session_id, "ground_truth": gt}