feat(klausur-service): OCR-Pipeline Optimierungen (Improvements 2-4)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m46s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s

## Improvement 2: VLM-basierter Dewarp
- Neuer Query-Parameter `method` für POST /sessions/{id}/dewarp
  Optionen: ensemble (default) | vlm | cv
- `_detect_shear_with_vlm()`: fragt qwen2.5vl:32b per Ollama nach
  dem Scherwinkel — gibt Zahlenwert + Konfidenz zurück
- `os`, `Query` zu ocr_pipeline_api.py Imports hinzugefügt
- `_apply_shear` aus cv_vocab_pipeline importiert

## Improvement 4: 3-Methoden Ensemble-Dewarp
- `_detect_shear_by_projection()`: Varianz-Sweep ±3° / 0.25°-Schritte
  auf horizontalen Text-Zeilen-Projektionen (~30ms)
- `_detect_shear_by_hough()`: Gewichteter Median über HoughLinesP
  auf Tabellen-Linien, Vorzeichen-Inversion (~20ms)
- `_ensemble_shear()`: Kombiniert alle 3 Methoden (conf >= 0.3),
  Ausreißer-Filter bei >1° Abweichung, Bonus bei Agreement <0.5°
- `dewarp_image()` nutzt jetzt alle 3 Methoden parallel,
  `use_ensemble: bool = True` für Rückwärtskompatibilität
- auto_dewarp Response enthält jetzt `detections`-Array

## Improvement 3: Vollautomatik-Endpoint
- POST /sessions/{id}/run-auto mit RunAutoRequest:
  from_step (1-6), ocr_engine, pronunciation,
  skip_llm_review, dewarp_method
- SSE-Streaming für alle 5+1 Schritte (deskew→dewarp→columns→rows→words→llm-review)
- Jeder Schritt: start / done / skipped / error Events
- Abschluss-Event: {steps_run, steps_skipped}
- LLM-Review-Fehler sind nicht-fatal (Pipeline läuft weiter)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 13:13:20 +01:00
parent 2e0f8632f8
commit 50e1c964ee
2 changed files with 975 additions and 27 deletions

View File

@@ -484,6 +484,133 @@ def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
return result
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
"""Detect shear angle by maximising variance of horizontal text-line projections.
Principle: horizontal text lines produce a row-projection profile with sharp
peaks (high variance) when the image is correctly aligned. Any residual shear
smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
corrected projection has the highest variance.
Works best on pages with clear horizontal banding (vocabulary tables, prose).
Complements _detect_shear_angle() which needs strong vertical edges.
Returns:
Dict with keys: method, shear_degrees, confidence.
"""
import math
result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
h, w = img.shape[:2]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Otsu binarisation
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Work at half resolution for speed
small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
sh, sw = small.shape
# Angle sweep: ±3° in 0.25° steps
angles = [a * 0.25 for a in range(-12, 13)] # 25 values
best_angle = 0.0
best_variance = -1.0
variances: List[Tuple[float, float]] = []
for angle_deg in angles:
if abs(angle_deg) < 0.01:
rotated = small
else:
shear_tan = math.tan(math.radians(angle_deg))
M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
rotated = cv2.warpAffine(small, M, (sw, sh),
flags=cv2.INTER_NEAREST,
borderMode=cv2.BORDER_CONSTANT)
profile = np.sum(rotated, axis=1).astype(float)
var = float(np.var(profile))
variances.append((angle_deg, var))
if var > best_variance:
best_variance = var
best_angle = angle_deg
# Confidence: how much sharper is the best angle vs. the mean?
all_mean = sum(v for _, v in variances) / len(variances)
if all_mean > 0 and best_variance > all_mean:
confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
else:
confidence = 0.0
result["shear_degrees"] = round(best_angle, 3)
result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
return result
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
"""Detect shear using Hough transform on printed table / ruled lines.
Vocabulary worksheets have near-horizontal printed table borders. After
deskew these should be exactly horizontal; any residual tilt equals the
vertical shear angle (with inverted sign).
The sign convention: a horizontal line tilting +α degrees (left end lower)
means the page has vertical shear of -α degrees (left column edge drifts
to the left going downward).
Returns:
Dict with keys: method, shear_degrees, confidence.
"""
result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
h, w = img.shape[:2]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150, apertureSize=3)
min_len = int(w * 0.15)
lines = cv2.HoughLinesP(
edges, rho=1, theta=np.pi / 360,
threshold=int(w * 0.08),
minLineLength=min_len,
maxLineGap=20,
)
if lines is None or len(lines) < 3:
return result
horizontal_angles: List[Tuple[float, float]] = []
for line in lines:
x1, y1, x2, y2 = line[0]
if x1 == x2:
continue
angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
if abs(angle) <= 5.0:
length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
horizontal_angles.append((angle, length))
if len(horizontal_angles) < 3:
return result
# Weighted median
angles_arr = np.array([a for a, _ in horizontal_angles])
weights_arr = np.array([l for _, l in horizontal_angles])
sorted_idx = np.argsort(angles_arr)
s_angles = angles_arr[sorted_idx]
s_weights = weights_arr[sorted_idx]
cum = np.cumsum(s_weights)
mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
# Sign inversion: horizontal line tilt is complementary to vertical shear
shear_degrees = -median_angle
result["shear_degrees"] = round(shear_degrees, 3)
result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
return result
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
"""Apply a vertical shear correction to an image.
@@ -516,24 +643,78 @@ def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
return corrected
def dewarp_image(img: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]:
def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
"""Combine multiple shear detections into a single weighted estimate.
Only methods with confidence >= 0.3 are considered.
Results are outlier-filtered: if any accepted result differs by more than
1° from the weighted mean, it is discarded.
Returns:
(shear_degrees, ensemble_confidence, methods_used_str)
"""
accepted = [(d["shear_degrees"], d["confidence"], d["method"])
for d in detections if d["confidence"] >= 0.3]
if not accepted:
return 0.0, 0.0, "none"
if len(accepted) == 1:
deg, conf, method = accepted[0]
return deg, conf, method
# First pass: weighted mean
total_w = sum(c for _, c, _ in accepted)
w_mean = sum(d * c for d, c, _ in accepted) / total_w
# Outlier filter: keep results within 1° of weighted mean
filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
if not filtered:
filtered = accepted # fallback: keep all
# Second pass: weighted mean on filtered results
total_w2 = sum(c for _, c, _ in filtered)
final_deg = sum(d * c for d, c, _ in filtered) / total_w2
# Ensemble confidence: average of individual confidences, boosted when
# methods agree (all within 0.5° of each other)
avg_conf = total_w2 / len(filtered)
spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
agreement_bonus = 0.15 if spread < 0.5 else 0.0
ensemble_conf = min(1.0, avg_conf + agreement_bonus)
methods_str = "+".join(m for _, _, m in filtered)
return round(final_deg, 3), round(ensemble_conf, 2), methods_str
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
"""Correct vertical shear after deskew.
After deskew aligns horizontal text lines, vertical features (column
edges) may still be tilted. This detects the tilt angle of the strongest
vertical edge and applies an affine shear correction.
edges) may still be tilted. This detects the tilt angle using an ensemble
of three complementary methods and applies an affine shear correction.
Methods (all run in ~100ms total):
A. _detect_shear_angle() — vertical edge profile (~50ms)
B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
C. _detect_shear_by_hough() — Hough lines on table borders (~20ms)
Only methods with confidence >= 0.3 contribute to the ensemble.
Outlier filtering discards results deviating > 1° from the weighted mean.
Args:
img: BGR image (already deskewed).
use_ensemble: If False, fall back to single-method behaviour (method A only).
Returns:
Tuple of (corrected_image, dewarp_info).
dewarp_info keys: method, shear_degrees, confidence.
dewarp_info keys: method, shear_degrees, confidence, detections.
"""
no_correction = {
"method": "none",
"shear_degrees": 0.0,
"confidence": 0.0,
"detections": [],
}
if not CV2_AVAILABLE:
@@ -541,14 +722,31 @@ def dewarp_image(img: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]:
t0 = time.time()
detection = _detect_shear_angle(img)
if use_ensemble:
det_a = _detect_shear_angle(img)
det_b = _detect_shear_by_projection(img)
det_c = _detect_shear_by_hough(img)
detections = [det_a, det_b, det_c]
shear_deg, confidence, method = _ensemble_shear(detections)
else:
det_a = _detect_shear_angle(img)
detections = [det_a]
shear_deg = det_a["shear_degrees"]
confidence = det_a["confidence"]
method = det_a["method"]
duration = time.time() - t0
shear_deg = detection["shear_degrees"]
confidence = detection["confidence"]
logger.info(f"dewarp: detected shear={shear_deg:.3f}° "
f"conf={confidence:.2f} ({duration:.2f}s)")
logger.info(
"dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
"A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f",
shear_deg, confidence, method, duration,
detections[0]["shear_degrees"], detections[0]["confidence"],
detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
detections[1]["confidence"] if len(detections) > 1 else 0.0,
detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
detections[2]["confidence"] if len(detections) > 2 else 0.0,
)
# Only correct if shear is significant (> 0.05°)
if abs(shear_deg) < 0.05 or confidence < 0.3:
@@ -558,9 +756,14 @@ def dewarp_image(img: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]:
corrected = _apply_shear(img, -shear_deg)
info = {
"method": detection["method"],
"method": method,
"shear_degrees": shear_deg,
"confidence": confidence,
"detections": [
{"method": d["method"], "shear_degrees": d["shear_degrees"],
"confidence": d["confidence"]}
for d in detections
],
}
return corrected, info
@@ -3053,6 +3256,142 @@ def ocr_region_rapid(
return words
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
"""Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid).
Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation.
Bboxes are approximated from equal line-height distribution within the region.
Falls back to Tesseract if TrOCR is not available.
"""
from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available
if not _check_trocr_available():
logger.warning("TrOCR not available, falling back to Tesseract")
if region.height > 0 and region.width > 0:
ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
if ocr_img_crop is not None:
return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
return []
crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
if crop.size == 0:
return []
try:
import torch
from PIL import Image as _PILImage
processor, model = get_trocr_model(handwritten=handwritten)
if processor is None or model is None:
logger.warning("TrOCR model not loaded, falling back to Tesseract")
ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
lines = _split_into_lines(pil_crop)
if not lines:
lines = [pil_crop]
device = next(model.parameters()).device
all_text = []
confidences = []
for line_img in lines:
pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
with torch.no_grad():
generated_ids = model.generate(pixel_values, max_length=128)
text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
if text_line:
all_text.append(text_line)
confidences.append(0.85 if len(text_line) > 3 else 0.5)
if not all_text:
return []
avg_conf = int(sum(confidences) / len(confidences) * 100)
line_h = region.height // max(len(all_text), 1)
words = []
for i, line in enumerate(all_text):
words.append({
"text": line,
"left": region.x,
"top": region.y + i * line_h,
"width": region.width,
"height": line_h,
"conf": avg_conf,
"region_type": region.type,
})
return words
except Exception as e:
logger.error(f"ocr_region_trocr failed: {e}")
return []
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
"""Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).
Falls back to RapidOCR or Tesseract if LightOnOCR is not available.
"""
from services.lighton_ocr_service import get_lighton_model, _check_lighton_available
if not _check_lighton_available():
logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
if RAPIDOCR_AVAILABLE and img_bgr is not None:
return ocr_region_rapid(img_bgr, region)
ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []
crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
if crop.size == 0:
return []
try:
import io
import torch
from PIL import Image as _PILImage
processor, model = get_lighton_model()
if processor is None or model is None:
logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
if RAPIDOCR_AVAILABLE and img_bgr is not None:
return ocr_region_rapid(img_bgr, region)
ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
conversation = [{"role": "user", "content": [{"type": "image"}]}]
inputs = processor.apply_chat_template(
conversation, images=[pil_crop],
add_generation_prompt=True, return_tensors="pt"
).to(model.device)
with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=1024)
text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
if not text:
return []
lines = [l.strip() for l in text.split("\n") if l.strip()]
line_h = region.height // max(len(lines), 1)
words = []
for i, line in enumerate(lines):
words.append({
"text": line,
"left": region.x,
"top": region.y + i * line_h,
"width": region.width,
"height": line_h,
"conf": 85,
"region_type": region.type,
})
return words
except Exception as e:
logger.error(f"ocr_region_lighton failed: {e}")
return []
# =============================================================================
# Post-Processing: Deterministic Quality Fixes
# =============================================================================
@@ -3900,7 +4239,11 @@ def _ocr_single_cell(
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
if use_rapid and img_bgr is not None:
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
elif engine_name == "lighton" and img_bgr is not None:
fallback_words = ocr_region_lighton(img_bgr, cell_region)
elif use_rapid and img_bgr is not None:
fallback_words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_lang = lang_map.get(col.type, lang)
@@ -3981,8 +4324,8 @@ def build_cell_grid(
img_w: Image width in pixels.
img_h: Image height in pixels.
lang: Default Tesseract language.
ocr_engine: 'tesseract', 'rapid', or 'auto'.
img_bgr: BGR color image (required for RapidOCR).
ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).
Returns:
(cells, columns_meta) where cells is a list of cell dicts and
@@ -3990,15 +4333,20 @@ def build_cell_grid(
"""
# Resolve engine choice
use_rapid = False
if ocr_engine == "auto":
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
engine_name = "rapid" if use_rapid else "tesseract"
logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
# Filter to content rows only (skip header/footer)
@@ -4093,7 +4441,11 @@ def build_cell_grid(
)
strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
if use_rapid and img_bgr is not None:
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
elif engine_name == "lighton" and img_bgr is not None:
strip_words = ocr_region_lighton(img_bgr, strip_region)
elif use_rapid and img_bgr is not None:
strip_words = ocr_region_rapid(img_bgr, strip_region)
else:
strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
@@ -4169,15 +4521,19 @@ def build_cell_grid_streaming(
"""
# Resolve engine choice (same as build_cell_grid)
use_rapid = False
if ocr_engine == "auto":
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
content_rows = [r for r in row_geometries if r.row_type == 'content']
if not content_rows:
@@ -5026,8 +5382,10 @@ import os
import json as _json
import re as _re
_OLLAMA_URL = os.getenv("OLLAMA_URL", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:30b-a3b")
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')
@@ -5205,7 +5563,7 @@ async def llm_review_entries(
async def llm_review_entries_streaming(
entries: List[Dict],
model: str = None,
batch_size: int = 8,
batch_size: int = _REVIEW_BATCH_SIZE,
):
"""Async generator: yield SSE events while reviewing entries in batches."""
model = model or OLLAMA_REVIEW_MODEL

View File

@@ -17,6 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import json
import logging
import os
import time
import uuid
from dataclasses import asdict
@@ -25,7 +26,7 @@ from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
from fastapi import APIRouter, File, Form, HTTPException, Query, Request, UploadFile
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
@@ -50,6 +51,7 @@ from cv_vocab_pipeline import (
deskew_image_by_word_alignment,
detect_column_geometry,
detect_row_geometry,
_apply_shear,
dewarp_image,
dewarp_image_manual,
llm_review_entries,
@@ -544,9 +546,75 @@ async def save_deskew_ground_truth(session_id: str, req: DeskewGroundTruthReques
# Dewarp Endpoints
# ---------------------------------------------------------------------------
async def _detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
"""Ask qwen2.5vl:32b to estimate the vertical shear angle of a scanned page.
The VLM is shown the image and asked: are the column/table borders tilted?
If yes, by how many degrees? Returns a dict with shear_degrees and confidence.
Confidence is 0.0 if Ollama is unavailable or parsing fails.
"""
import httpx
import base64
import re
ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
prompt = (
"This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
"Are they perfectly vertical, or do they tilt slightly? "
"If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
"Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
"Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
"If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
)
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
payload = {
"model": model,
"prompt": prompt,
"images": [img_b64],
"stream": False,
}
try:
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.post(f"{ollama_base}/api/generate", json=payload)
resp.raise_for_status()
text = resp.json().get("response", "")
# Parse JSON from response (may have surrounding text)
match = re.search(r'\{[^}]+\}', text)
if match:
import json
data = json.loads(match.group(0))
shear = float(data.get("shear_degrees", 0.0))
conf = float(data.get("confidence", 0.0))
# Clamp to reasonable range
shear = max(-3.0, min(3.0, shear))
conf = max(0.0, min(1.0, conf))
return {"method": "vlm_qwen2.5vl", "shear_degrees": round(shear, 3), "confidence": round(conf, 2)}
except Exception as e:
logger.warning(f"VLM dewarp failed: {e}")
return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}
@router.post("/sessions/{session_id}/dewarp")
async def auto_dewarp(session_id: str):
"""Detect and correct vertical shear on the deskewed image."""
async def auto_dewarp(
session_id: str,
method: str = Query("ensemble", description="Detection method: ensemble | vlm | cv"),
):
"""Detect and correct vertical shear on the deskewed image.
Methods:
- **ensemble** (default): 3-method CV ensemble (vertical edges + projection + Hough)
- **cv**: CV ensemble only (same as ensemble)
- **vlm**: Ask qwen2.5vl:32b to estimate the shear angle visually
"""
if method not in ("ensemble", "cv", "vlm"):
raise HTTPException(status_code=400, detail="method must be one of: ensemble, cv, vlm")
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
@@ -556,7 +624,26 @@ async def auto_dewarp(session_id: str):
raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")
t0 = time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
if method == "vlm":
# Encode deskewed image to PNG for VLM
success, png_buf = cv2.imencode(".png", deskewed_bgr)
img_bytes = png_buf.tobytes() if success else b""
vlm_det = await _detect_shear_with_vlm(img_bytes)
shear_deg = vlm_det["shear_degrees"]
if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
else:
dewarped_bgr = deskewed_bgr
dewarp_info = {
"method": vlm_det["method"],
"shear_degrees": shear_deg,
"confidence": vlm_det["confidence"],
"detections": [vlm_det],
}
else:
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
duration = time.time() - t0
# Encode as PNG
@@ -568,6 +655,7 @@ async def auto_dewarp(session_id: str):
"shear_degrees": dewarp_info["shear_degrees"],
"confidence": dewarp_info["confidence"],
"duration_seconds": round(duration, 2),
"detections": dewarp_info.get("detections", []),
}
# Update cache
@@ -2000,3 +2088,505 @@ async def remove_handwriting_endpoint(session_id: str, req: RemoveHandwritingReq
"image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/clean",
"session_id": session_id,
}
# ---------------------------------------------------------------------------
# Auto-Mode Endpoint (Improvement 3)
# ---------------------------------------------------------------------------
class RunAutoRequest(BaseModel):
from_step: int = 1 # 1=deskew, 2=dewarp, 3=columns, 4=rows, 5=words, 6=llm-review
ocr_engine: str = "auto" # "auto" | "rapid" | "tesseract"
pronunciation: str = "british"
skip_llm_review: bool = False
dewarp_method: str = "ensemble" # "ensemble" | "vlm" | "cv"
async def _auto_sse_event(step: str, status: str, data: Dict[str, Any]) -> str:
"""Format a single SSE event line."""
import json as _json
payload = {"step": step, "status": status, **data}
return f"data: {_json.dumps(payload)}\n\n"
@router.post("/sessions/{session_id}/run-auto")
async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
"""Run the full OCR pipeline automatically from a given step, streaming SSE progress.
Steps:
1. Deskew — straighten the scan
2. Dewarp — correct vertical shear (ensemble CV or VLM)
3. Columns — detect column layout
4. Rows — detect row layout
5. Words — OCR each cell
6. LLM review — correct OCR errors (optional)
Already-completed steps are skipped unless `from_step` forces a rerun.
Yields SSE events of the form:
data: {"step": "deskew", "status": "start"|"done"|"skipped"|"error", ...}
Final event:
data: {"step": "complete", "status": "done", "steps_run": [...], "steps_skipped": [...]}
"""
if req.from_step < 1 or req.from_step > 6:
raise HTTPException(status_code=400, detail="from_step must be 1-6")
if req.dewarp_method not in ("ensemble", "vlm", "cv"):
raise HTTPException(status_code=400, detail="dewarp_method must be: ensemble, vlm, cv")
if session_id not in _cache:
await _load_session_to_cache(session_id)
async def _generate():
steps_run: List[str] = []
steps_skipped: List[str] = []
error_step: Optional[str] = None
session = await get_session_db(session_id)
if not session:
yield await _auto_sse_event("error", "error", {"message": f"Session {session_id} not found"})
return
cached = _get_cached(session_id)
# -----------------------------------------------------------------
# Step 1: Deskew
# -----------------------------------------------------------------
if req.from_step <= 1:
yield await _auto_sse_event("deskew", "start", {})
try:
t0 = time.time()
orig_bgr = cached.get("original_bgr")
if orig_bgr is None:
raise ValueError("Original image not loaded")
# Method 1: Hough lines
try:
deskewed_hough, angle_hough = deskew_image(orig_bgr.copy())
except Exception:
deskewed_hough, angle_hough = orig_bgr, 0.0
# Method 2: Word alignment
success_enc, png_orig = cv2.imencode(".png", orig_bgr)
orig_bytes = png_orig.tobytes() if success_enc else b""
try:
deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes)
except Exception:
deskewed_wa_bytes, angle_wa = orig_bytes, 0.0
# Pick best method
if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1:
method_used = "word_alignment"
angle_applied = angle_wa
wa_arr = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8)
deskewed_bgr = cv2.imdecode(wa_arr, cv2.IMREAD_COLOR)
if deskewed_bgr is None:
deskewed_bgr = deskewed_hough
method_used = "hough"
angle_applied = angle_hough
else:
method_used = "hough"
angle_applied = angle_hough
deskewed_bgr = deskewed_hough
success, png_buf = cv2.imencode(".png", deskewed_bgr)
deskewed_png = png_buf.tobytes() if success else b""
deskew_result = {
"method_used": method_used,
"rotation_degrees": round(float(angle_applied), 3),
"duration_seconds": round(time.time() - t0, 2),
}
cached["deskewed_bgr"] = deskewed_bgr
cached["deskew_result"] = deskew_result
await update_session_db(
session_id,
deskewed_png=deskewed_png,
deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied),
current_step=2,
)
session = await get_session_db(session_id)
steps_run.append("deskew")
yield await _auto_sse_event("deskew", "done", deskew_result)
except Exception as e:
logger.error(f"Auto-mode deskew failed for {session_id}: {e}")
error_step = "deskew"
yield await _auto_sse_event("deskew", "error", {"message": str(e)})
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
return
else:
steps_skipped.append("deskew")
yield await _auto_sse_event("deskew", "skipped", {"reason": "from_step > 1"})
# -----------------------------------------------------------------
# Step 2: Dewarp
# -----------------------------------------------------------------
if req.from_step <= 2:
yield await _auto_sse_event("dewarp", "start", {"method": req.dewarp_method})
try:
t0 = time.time()
deskewed_bgr = cached.get("deskewed_bgr")
if deskewed_bgr is None:
raise ValueError("Deskewed image not available")
if req.dewarp_method == "vlm":
success_enc, png_buf = cv2.imencode(".png", deskewed_bgr)
img_bytes = png_buf.tobytes() if success_enc else b""
vlm_det = await _detect_shear_with_vlm(img_bytes)
shear_deg = vlm_det["shear_degrees"]
if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
else:
dewarped_bgr = deskewed_bgr
dewarp_info = {
"method": vlm_det["method"],
"shear_degrees": shear_deg,
"confidence": vlm_det["confidence"],
"detections": [vlm_det],
}
else:
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
success_enc, png_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = png_buf.tobytes() if success_enc else b""
dewarp_result = {
"method_used": dewarp_info["method"],
"shear_degrees": dewarp_info["shear_degrees"],
"confidence": dewarp_info["confidence"],
"duration_seconds": round(time.time() - t0, 2),
"detections": dewarp_info.get("detections", []),
}
cached["dewarped_bgr"] = dewarped_bgr
cached["dewarp_result"] = dewarp_result
await update_session_db(
session_id,
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=3,
)
session = await get_session_db(session_id)
steps_run.append("dewarp")
yield await _auto_sse_event("dewarp", "done", dewarp_result)
except Exception as e:
logger.error(f"Auto-mode dewarp failed for {session_id}: {e}")
error_step = "dewarp"
yield await _auto_sse_event("dewarp", "error", {"message": str(e)})
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
return
else:
steps_skipped.append("dewarp")
yield await _auto_sse_event("dewarp", "skipped", {"reason": "from_step > 2"})
# -----------------------------------------------------------------
# Step 3: Columns
# -----------------------------------------------------------------
if req.from_step <= 3:
yield await _auto_sse_event("columns", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise ValueError("Dewarped image not available")
ocr_img = create_ocr_image(dewarped_bgr)
h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result is None:
layout_img = create_layout_image(dewarped_bgr)
regions = analyze_layout(layout_img, ocr_img)
cached["_word_dicts"] = None
cached["_inv"] = None
cached["_content_bounds"] = None
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
content_w = right_x - left_x
cached["_word_dicts"] = word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
left_x=left_x, right_x=right_x, inv=inv)
columns = [asdict(r) for r in regions]
column_result = {
"columns": columns,
"classification_methods": list({c.get("classification_method", "") for c in columns if c.get("classification_method")}),
"duration_seconds": round(time.time() - t0, 2),
}
cached["column_result"] = column_result
await update_session_db(session_id, column_result=column_result,
row_result=None, word_result=None, current_step=4)
session = await get_session_db(session_id)
steps_run.append("columns")
yield await _auto_sse_event("columns", "done", {
"column_count": len(columns),
"duration_seconds": column_result["duration_seconds"],
})
except Exception as e:
logger.error(f"Auto-mode columns failed for {session_id}: {e}")
error_step = "columns"
yield await _auto_sse_event("columns", "error", {"message": str(e)})
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
return
else:
steps_skipped.append("columns")
yield await _auto_sse_event("columns", "skipped", {"reason": "from_step > 3"})
# -----------------------------------------------------------------
# Step 4: Rows
# -----------------------------------------------------------------
if req.from_step <= 4:
yield await _auto_sse_event("rows", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
if not column_result or not column_result.get("columns"):
raise ValueError("Column detection must complete first")
col_regions = [
PageRegion(
type=c["type"], x=c["x"], y=c["y"],
width=c["width"], height=c["height"],
classification_confidence=c.get("classification_confidence", 1.0),
classification_method=c.get("classification_method", ""),
)
for c in column_result["columns"]
]
word_dicts = cached.get("_word_dicts")
inv = cached.get("_inv")
content_bounds = cached.get("_content_bounds")
if word_dicts is None or inv is None or content_bounds is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_g, lx, rx, ty, by, word_dicts, inv = geo_result
cached["_word_dicts"] = word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (lx, rx, ty, by)
content_bounds = (lx, rx, ty, by)
left_x, right_x, top_y, bottom_y = content_bounds
row_geoms = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
row_list = [
{
"index": r.index, "x": r.x, "y": r.y,
"width": r.width, "height": r.height,
"word_count": r.word_count,
"row_type": r.row_type,
"gap_before": r.gap_before,
}
for r in row_geoms
]
row_result = {
"rows": row_list,
"row_count": len(row_list),
"content_rows": len([r for r in row_geoms if r.row_type == "content"]),
"duration_seconds": round(time.time() - t0, 2),
}
cached["row_result"] = row_result
await update_session_db(session_id, row_result=row_result, current_step=5)
session = await get_session_db(session_id)
steps_run.append("rows")
yield await _auto_sse_event("rows", "done", {
"row_count": len(row_list),
"content_rows": row_result["content_rows"],
"duration_seconds": row_result["duration_seconds"],
})
except Exception as e:
logger.error(f"Auto-mode rows failed for {session_id}: {e}")
error_step = "rows"
yield await _auto_sse_event("rows", "error", {"message": str(e)})
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
return
else:
steps_skipped.append("rows")
yield await _auto_sse_event("rows", "skipped", {"reason": "from_step > 4"})
# -----------------------------------------------------------------
# Step 5: Words (OCR)
# -----------------------------------------------------------------
if req.from_step <= 5:
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
row_result = session.get("row_result") or cached.get("row_result")
col_regions = [
PageRegion(
type=c["type"], x=c["x"], y=c["y"],
width=c["width"], height=c["height"],
classification_confidence=c.get("classification_confidence", 1.0),
classification_method=c.get("classification_method", ""),
)
for c in column_result["columns"]
]
row_geoms = [
RowGeometry(
index=r["index"], x=r["x"], y=r["y"],
width=r["width"], height=r["height"],
word_count=r.get("word_count", 0), words=[],
row_type=r.get("row_type", "content"),
gap_before=r.get("gap_before", 0),
)
for r in row_result["rows"]
]
word_dicts = cached.get("_word_dicts")
if word_dicts is not None:
content_bounds = cached.get("_content_bounds")
top_y = content_bounds[2] if content_bounds else min(r.y for r in row_geoms)
for row in row_geoms:
row_y_rel = row.y - top_y
row_bottom_rel = row_y_rel + row.height
row.words = [
w for w in word_dicts
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
]
row.word_count = len(row.words)
ocr_img = create_ocr_image(dewarped_bgr)
img_h, img_w = dewarped_bgr.shape[:2]
cells, columns_meta = build_cell_grid(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=req.ocr_engine, img_bgr=dewarped_bgr,
)
duration = time.time() - t0
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine
word_result_data = {
"cells": cells,
"grid_shape": {
"rows": n_content_rows,
"cols": len(columns_meta),
"total_cells": len(cells),
},
"columns_used": columns_meta,
"layout": "vocab" if is_vocab else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
},
}
if is_vocab:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_character_confusion(entries)
entries = _fix_phonetic_brackets(entries, pronunciation=req.pronunciation)
word_result_data["vocab_entries"] = entries
word_result_data["entries"] = entries
word_result_data["entry_count"] = len(entries)
word_result_data["summary"]["total_entries"] = len(entries)
await update_session_db(session_id, word_result=word_result_data, current_step=6)
cached["word_result"] = word_result_data
session = await get_session_db(session_id)
steps_run.append("words")
yield await _auto_sse_event("words", "done", {
"total_cells": len(cells),
"layout": word_result_data["layout"],
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"summary": word_result_data["summary"],
})
except Exception as e:
logger.error(f"Auto-mode words failed for {session_id}: {e}")
error_step = "words"
yield await _auto_sse_event("words", "error", {"message": str(e)})
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
return
else:
steps_skipped.append("words")
yield await _auto_sse_event("words", "skipped", {"reason": "from_step > 5"})
# -----------------------------------------------------------------
# Step 6: LLM Review (optional)
# -----------------------------------------------------------------
if req.from_step <= 6 and not req.skip_llm_review:
yield await _auto_sse_event("llm_review", "start", {"model": OLLAMA_REVIEW_MODEL})
try:
session = await get_session_db(session_id)
word_result = session.get("word_result") or cached.get("word_result")
entries = word_result.get("entries") or word_result.get("vocab_entries") or []
if not entries:
yield await _auto_sse_event("llm_review", "skipped", {"reason": "no entries"})
steps_skipped.append("llm_review")
else:
reviewed = await llm_review_entries(entries)
session = await get_session_db(session_id)
word_result_updated = dict(session.get("word_result") or {})
word_result_updated["entries"] = reviewed
word_result_updated["vocab_entries"] = reviewed
word_result_updated["llm_reviewed"] = True
word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL
await update_session_db(session_id, word_result=word_result_updated, current_step=7)
cached["word_result"] = word_result_updated
steps_run.append("llm_review")
yield await _auto_sse_event("llm_review", "done", {
"entries_reviewed": len(reviewed),
"model": OLLAMA_REVIEW_MODEL,
})
except Exception as e:
logger.warning(f"Auto-mode llm_review failed for {session_id} (non-fatal): {e}")
yield await _auto_sse_event("llm_review", "error", {"message": str(e), "fatal": False})
steps_skipped.append("llm_review")
else:
steps_skipped.append("llm_review")
reason = "skipped by request" if req.skip_llm_review else "from_step > 6"
yield await _auto_sse_event("llm_review", "skipped", {"reason": reason})
# -----------------------------------------------------------------
# Final event
# -----------------------------------------------------------------
yield await _auto_sse_event("complete", "done", {
"steps_run": steps_run,
"steps_skipped": steps_skipped,
})
return StreamingResponse(
_generate(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
},
)