Files
breakpilot-lehrer/klausur-service/backend/ocr_pipeline_api.py
Benjamin Admin 589d2f811a feat: Dewarp-Korrektur als Schritt 2 in OCR Pipeline (7 Schritte)
Implementiert Buchwoelbungs-Entzerrung mit zwei Methoden:
- Methode A: Vertikale-Kanten-Analyse (Sobel + Polynom 2. Grades)
- Methode B: Textzeilen-Baseline (Tesseract + Baseline-Kruemmung)
Beste Methode wird automatisch gewaehlt, manueller Slider (-3 bis +3).

Backend: 3 neue Endpoints (auto/manual dewarp, ground truth)
Frontend: StepDewarp + DewarpControls, Pipeline von 6 auf 7 Schritte

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 16:46:41 +01:00

424 lines
14 KiB
Python

"""
OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 7 einzelne Schritte:
1. Deskewing - Scan begradigen
2. Dewarping - Buchwoelbung entzerren
3. Spaltenerkennung - Unsichtbare Spalten finden
4. Worterkennung - OCR mit Bounding Boxes
5. Koordinatenzuweisung - Exakte Positionen
6. Seitenrekonstruktion - Seite nachbauen
7. Ground Truth Validierung - Gesamtpruefung
Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import time
import uuid
from datetime import datetime, timedelta
from typing import Any, Dict, Optional
import cv2
import numpy as np
from fastapi import APIRouter, File, HTTPException, UploadFile
from fastapi.responses import Response
from pydantic import BaseModel
from cv_vocab_pipeline import (
create_ocr_image,
deskew_image,
deskew_image_by_word_alignment,
dewarp_image,
dewarp_image_manual,
render_image_high_res,
render_pdf_high_res,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# ---------------------------------------------------------------------------
# In-memory session store (24h TTL)
# ---------------------------------------------------------------------------
_sessions: Dict[str, Dict[str, Any]] = {}
SESSION_TTL_HOURS = 24
def _cleanup_expired():
"""Remove sessions older than TTL."""
cutoff = datetime.utcnow() - timedelta(hours=SESSION_TTL_HOURS)
expired = [sid for sid, s in _sessions.items() if s.get("created_at", datetime.utcnow()) < cutoff]
for sid in expired:
del _sessions[sid]
logger.info(f"OCR Pipeline: expired session {sid}")
def _get_session(session_id: str) -> Dict[str, Any]:
"""Get session or raise 404."""
session = _sessions.get(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
return session
# ---------------------------------------------------------------------------
# Pydantic Models
# ---------------------------------------------------------------------------
class ManualDeskewRequest(BaseModel):
angle: float
class DeskewGroundTruthRequest(BaseModel):
is_correct: bool
corrected_angle: Optional[float] = None
notes: Optional[str] = None
class ManualDewarpRequest(BaseModel):
scale: float
class DewarpGroundTruthRequest(BaseModel):
is_correct: bool
corrected_scale: Optional[float] = None
notes: Optional[str] = None
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/sessions")
async def create_session(file: UploadFile = File(...)):
"""Upload a PDF or image file and create a pipeline session."""
_cleanup_expired()
file_data = await file.read()
filename = file.filename or "upload"
content_type = file.content_type or ""
session_id = str(uuid.uuid4())
is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf")
try:
if is_pdf:
img_bgr = render_pdf_high_res(file_data, page_number=0, zoom=3.0)
else:
img_bgr = render_image_high_res(file_data)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Could not process file: {e}")
# Encode original as PNG bytes for serving
success, png_buf = cv2.imencode(".png", img_bgr)
if not success:
raise HTTPException(status_code=500, detail="Failed to encode image")
_sessions[session_id] = {
"id": session_id,
"filename": filename,
"created_at": datetime.utcnow(),
"original_bgr": img_bgr,
"original_png": png_buf.tobytes(),
"deskewed_bgr": None,
"deskewed_png": None,
"binarized_png": None,
"deskew_result": None,
"dewarped_bgr": None,
"dewarped_png": None,
"dewarp_result": None,
"displacement_map": None,
"ground_truth": {},
"current_step": 1,
}
logger.info(f"OCR Pipeline: created session {session_id} from {filename} "
f"({img_bgr.shape[1]}x{img_bgr.shape[0]})")
return {
"session_id": session_id,
"filename": filename,
"image_width": img_bgr.shape[1],
"image_height": img_bgr.shape[0],
"original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
}
@router.post("/sessions/{session_id}/deskew")
async def auto_deskew(session_id: str):
"""Run both deskew methods and pick the best one."""
session = _get_session(session_id)
img_bgr = session["original_bgr"]
t0 = time.time()
# Method 1: Hough Lines
try:
deskewed_hough, angle_hough = deskew_image(img_bgr.copy())
except Exception as e:
logger.warning(f"Hough deskew failed: {e}")
deskewed_hough, angle_hough = img_bgr, 0.0
# Method 2: Word Alignment (needs image bytes)
success_enc, png_orig = cv2.imencode(".png", img_bgr)
orig_bytes = png_orig.tobytes() if success_enc else b""
try:
deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes)
except Exception as e:
logger.warning(f"Word alignment deskew failed: {e}")
deskewed_wa_bytes, angle_wa = orig_bytes, 0.0
duration = time.time() - t0
# Pick method with larger detected angle (more correction needed = more skew found)
# If both are ~0, prefer word alignment as it's more robust
if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1:
method_used = "word_alignment"
angle_applied = angle_wa
# Decode word alignment result to BGR
wa_array = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8)
deskewed_bgr = cv2.imdecode(wa_array, cv2.IMREAD_COLOR)
if deskewed_bgr is None:
deskewed_bgr = deskewed_hough
method_used = "hough"
angle_applied = angle_hough
else:
method_used = "hough"
angle_applied = angle_hough
deskewed_bgr = deskewed_hough
# Encode deskewed as PNG
success, deskewed_png_buf = cv2.imencode(".png", deskewed_bgr)
deskewed_png = deskewed_png_buf.tobytes() if success else session["original_png"]
# Create binarized version
try:
binarized = create_ocr_image(deskewed_bgr)
success_bin, bin_buf = cv2.imencode(".png", binarized)
binarized_png = bin_buf.tobytes() if success_bin else None
except Exception as e:
logger.warning(f"Binarization failed: {e}")
binarized_png = None
# Confidence: higher angle = lower confidence that we got it right
confidence = max(0.5, 1.0 - abs(angle_applied) / 5.0)
deskew_result = {
"angle_hough": round(angle_hough, 3),
"angle_word_alignment": round(angle_wa, 3),
"angle_applied": round(angle_applied, 3),
"method_used": method_used,
"confidence": round(confidence, 2),
"duration_seconds": round(duration, 2),
}
session["deskewed_bgr"] = deskewed_bgr
session["deskewed_png"] = deskewed_png
session["binarized_png"] = binarized_png
session["deskew_result"] = deskew_result
logger.info(f"OCR Pipeline: deskew session {session_id}: "
f"hough={angle_hough:.2f}° wa={angle_wa:.2f}° → {method_used} {angle_applied:.2f}°")
return {
"session_id": session_id,
**deskew_result,
"deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
"binarized_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/binarized",
}
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the original image."""
session = _get_session(session_id)
img_bgr = session["original_bgr"]
angle = max(-5.0, min(5.0, req.angle))
h, w = img_bgr.shape[:2]
center = (w // 2, h // 2)
M = cv2.getRotationMatrix2D(center, angle, 1.0)
rotated = cv2.warpAffine(img_bgr, M, (w, h),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_REPLICATE)
success, png_buf = cv2.imencode(".png", rotated)
deskewed_png = png_buf.tobytes() if success else session["original_png"]
# Binarize
try:
binarized = create_ocr_image(rotated)
success_bin, bin_buf = cv2.imencode(".png", binarized)
binarized_png = bin_buf.tobytes() if success_bin else None
except Exception:
binarized_png = None
session["deskewed_bgr"] = rotated
session["deskewed_png"] = deskewed_png
session["binarized_png"] = binarized_png
session["deskew_result"] = {
**(session.get("deskew_result") or {}),
"angle_applied": round(angle, 3),
"method_used": "manual",
}
logger.info(f"OCR Pipeline: manual deskew session {session_id}: {angle:.2f}°")
return {
"session_id": session_id,
"angle_applied": round(angle, 3),
"method_used": "manual",
"deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
}
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, or binarized."""
session = _get_session(session_id)
if image_type == "original":
data = session.get("original_png")
elif image_type == "deskewed":
data = session.get("deskewed_png")
elif image_type == "dewarped":
data = session.get("dewarped_png")
elif image_type == "binarized":
data = session.get("binarized_png")
else:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
if not data:
raise HTTPException(status_code=404, detail=f"Image '{image_type}' not available yet")
return Response(content=data, media_type="image/png")
@router.post("/sessions/{session_id}/ground-truth/deskew")
async def save_deskew_ground_truth(session_id: str, req: DeskewGroundTruthRequest):
"""Save ground truth feedback for the deskew step."""
session = _get_session(session_id)
gt = {
"is_correct": req.is_correct,
"corrected_angle": req.corrected_angle,
"notes": req.notes,
"saved_at": datetime.utcnow().isoformat(),
"deskew_result": session.get("deskew_result"),
}
session["ground_truth"]["deskew"] = gt
logger.info(f"OCR Pipeline: ground truth deskew session {session_id}: "
f"correct={req.is_correct}, corrected_angle={req.corrected_angle}")
return {"session_id": session_id, "ground_truth": gt}
# ---------------------------------------------------------------------------
# Dewarp Endpoints
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/dewarp")
async def auto_dewarp(session_id: str):
"""Run both dewarp methods on the deskewed image and pick the best."""
session = _get_session(session_id)
deskewed_bgr = session.get("deskewed_bgr")
if deskewed_bgr is None:
raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")
t0 = time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
duration = time.time() - t0
# Encode dewarped as PNG
success, png_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = png_buf.tobytes() if success else session["deskewed_png"]
session["dewarped_bgr"] = dewarped_bgr
session["dewarped_png"] = dewarped_png
session["dewarp_result"] = {
"method_used": dewarp_info["method"],
"curvature_px": dewarp_info["curvature_px"],
"confidence": dewarp_info["confidence"],
"duration_seconds": round(duration, 2),
}
session["displacement_map"] = dewarp_info.get("displacement_map")
logger.info(f"OCR Pipeline: dewarp session {session_id}: "
f"method={dewarp_info['method']} curvature={dewarp_info['curvature_px']:.1f}px "
f"conf={dewarp_info['confidence']:.2f} ({duration:.2f}s)")
return {
"session_id": session_id,
"method_used": dewarp_info["method"],
"curvature_px": dewarp_info["curvature_px"],
"confidence": dewarp_info["confidence"],
"duration_seconds": round(duration, 2),
"dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
}
@router.post("/sessions/{session_id}/dewarp/manual")
async def manual_dewarp(session_id: str, req: ManualDewarpRequest):
"""Apply dewarp with a manually scaled displacement map."""
session = _get_session(session_id)
deskewed_bgr = session.get("deskewed_bgr")
displacement_map = session.get("displacement_map")
if deskewed_bgr is None:
raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")
scale = max(-3.0, min(3.0, req.scale))
if displacement_map is None or abs(scale) < 0.01:
# No displacement map or zero scale — use deskewed as-is
dewarped_bgr = deskewed_bgr
else:
dewarped_bgr = dewarp_image_manual(deskewed_bgr, displacement_map, scale)
success, png_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = png_buf.tobytes() if success else session.get("deskewed_png")
session["dewarped_bgr"] = dewarped_bgr
session["dewarped_png"] = dewarped_png
session["dewarp_result"] = {
**(session.get("dewarp_result") or {}),
"method_used": "manual",
"scale_applied": round(scale, 2),
}
logger.info(f"OCR Pipeline: manual dewarp session {session_id}: scale={scale:.2f}")
return {
"session_id": session_id,
"scale_applied": round(scale, 2),
"method_used": "manual",
"dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
}
@router.post("/sessions/{session_id}/ground-truth/dewarp")
async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthRequest):
"""Save ground truth feedback for the dewarp step."""
session = _get_session(session_id)
gt = {
"is_correct": req.is_correct,
"corrected_scale": req.corrected_scale,
"notes": req.notes,
"saved_at": datetime.utcnow().isoformat(),
"dewarp_result": session.get("dewarp_result"),
}
session["ground_truth"]["dewarp"] = gt
logger.info(f"OCR Pipeline: ground truth dewarp session {session_id}: "
f"correct={req.is_correct}, corrected_scale={req.corrected_scale}")
return {"session_id": session_id, "ground_truth": gt}