Each module is under 1050 lines: - ocr_pipeline_common.py (354) - shared state, cache, models, helpers - ocr_pipeline_sessions.py (483) - session CRUD, image serving, doc-type - ocr_pipeline_geometry.py (1025) - deskew, dewarp, structure, columns - ocr_pipeline_rows.py (348) - row detection, box-overlay helper - ocr_pipeline_words.py (876) - word detection (SSE), paddle-direct - ocr_pipeline_ocr_merge.py (615) - merge helpers, kombi endpoints - ocr_pipeline_postprocess.py (929) - LLM review, reconstruction, export - ocr_pipeline_auto.py (705) - auto-mode orchestrator, reprocess ocr_pipeline_api.py is now a 61-line thin wrapper that re-exports router, _cache, and test-imported symbols for backward compatibility. No changes needed in main.py or tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
706 lines
31 KiB
Python
706 lines
31 KiB
Python
"""
|
|
OCR Pipeline Auto-Mode Orchestrator and Reprocess Endpoints.
|
|
|
|
Extracted from ocr_pipeline_api.py — contains:
|
|
- POST /sessions/{session_id}/reprocess (clear downstream + restart from step)
|
|
- POST /sessions/{session_id}/run-auto (full auto-mode with SSE streaming)
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import time
|
|
from dataclasses import asdict
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from fastapi import APIRouter, HTTPException, Request
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
|
|
from cv_vocab_pipeline import (
|
|
OLLAMA_REVIEW_MODEL,
|
|
PageRegion,
|
|
RowGeometry,
|
|
_cells_to_vocab_entries,
|
|
_detect_header_footer_gaps,
|
|
_detect_sub_columns,
|
|
_fix_character_confusion,
|
|
_fix_phonetic_brackets,
|
|
fix_cell_phonetics,
|
|
analyze_layout,
|
|
build_cell_grid,
|
|
classify_column_types,
|
|
create_layout_image,
|
|
create_ocr_image,
|
|
deskew_image,
|
|
deskew_image_by_word_alignment,
|
|
detect_column_geometry,
|
|
detect_row_geometry,
|
|
_apply_shear,
|
|
dewarp_image,
|
|
llm_review_entries,
|
|
)
|
|
from ocr_pipeline_common import (
|
|
_cache,
|
|
_load_session_to_cache,
|
|
_get_cached,
|
|
_get_base_image_png,
|
|
_append_pipeline_log,
|
|
)
|
|
from ocr_pipeline_session_store import (
|
|
get_session_db,
|
|
update_session_db,
|
|
)
|
|
|
|
# Module-level logger; handlers/level are inherited from the application's
# logging configuration.
logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under /api/v1/ocr-pipeline,
# matching the other ocr_pipeline_* routers this file was extracted alongside.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Reprocess endpoint
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@router.post("/sessions/{session_id}/reprocess")
|
|
async def reprocess_session(session_id: str, request: Request):
|
|
"""Re-run pipeline from a specific step, clearing downstream data.
|
|
|
|
Body: {"from_step": 5} (1-indexed step number)
|
|
|
|
Pipeline order: Orientation(1) → Deskew(2) → Dewarp(3) → Crop(4) → Columns(5) →
|
|
Rows(6) → Words(7) → LLM-Review(8) → Reconstruction(9) → Validation(10)
|
|
|
|
Clears downstream results:
|
|
- from_step <= 1: orientation_result + all downstream
|
|
- from_step <= 2: deskew_result + all downstream
|
|
- from_step <= 3: dewarp_result + all downstream
|
|
- from_step <= 4: crop_result + all downstream
|
|
- from_step <= 5: column_result, row_result, word_result
|
|
- from_step <= 6: row_result, word_result
|
|
- from_step <= 7: word_result (cells, vocab_entries)
|
|
- from_step <= 8: word_result.llm_review only
|
|
"""
|
|
session = await get_session_db(session_id)
|
|
if not session:
|
|
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
|
|
|
body = await request.json()
|
|
from_step = body.get("from_step", 1)
|
|
if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
|
|
raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
|
|
|
|
update_kwargs: Dict[str, Any] = {"current_step": from_step}
|
|
|
|
# Clear downstream data based on from_step
|
|
# New pipeline order: Orient(2) → Deskew(3) → Dewarp(4) → Crop(5) →
|
|
# Columns(6) → Rows(7) → Words(8) → LLM(9) → Recon(10) → GT(11)
|
|
if from_step <= 8:
|
|
update_kwargs["word_result"] = None
|
|
elif from_step == 9:
|
|
# Only clear LLM review from word_result
|
|
word_result = session.get("word_result")
|
|
if word_result:
|
|
word_result.pop("llm_review", None)
|
|
word_result.pop("llm_corrections", None)
|
|
update_kwargs["word_result"] = word_result
|
|
|
|
if from_step <= 7:
|
|
update_kwargs["row_result"] = None
|
|
if from_step <= 6:
|
|
update_kwargs["column_result"] = None
|
|
if from_step <= 4:
|
|
update_kwargs["crop_result"] = None
|
|
if from_step <= 3:
|
|
update_kwargs["dewarp_result"] = None
|
|
if from_step <= 2:
|
|
update_kwargs["deskew_result"] = None
|
|
if from_step <= 1:
|
|
update_kwargs["orientation_result"] = None
|
|
|
|
await update_session_db(session_id, **update_kwargs)
|
|
|
|
# Also clear cache
|
|
if session_id in _cache:
|
|
for key in list(update_kwargs.keys()):
|
|
if key != "current_step":
|
|
_cache[session_id][key] = update_kwargs[key]
|
|
_cache[session_id]["current_step"] = from_step
|
|
|
|
logger.info(f"Session {session_id} reprocessing from step {from_step}")
|
|
|
|
return {
|
|
"session_id": session_id,
|
|
"from_step": from_step,
|
|
"cleared": [k for k in update_kwargs if k != "current_step"],
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# VLM shear detection helper (used by dewarp step in auto-mode)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def _detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Ask qwen2.5vl:32b to estimate the vertical shear angle of a scanned page.

    The VLM is shown the image and asked: are the column/table borders tilted?
    If yes, by how many degrees? Returns a dict with keys ``method``,
    ``shear_degrees`` (clamped to ±3.0) and ``confidence`` (clamped to 0..1).
    Confidence is 0.0 if Ollama is unavailable or parsing fails — callers can
    treat that as "no correction".

    Fix vs. original: dropped the redundant function-local ``import re`` and
    ``import json`` that shadowed the module-level imports; behavior is
    otherwise unchanged.
    """
    # httpx/base64 stay local: httpx is an optional HTTP client and this
    # helper is only hit on the VLM dewarp path.
    import httpx
    import base64

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )

    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")

            # Parse JSON from response (may have surrounding text).
            match = re.search(r'\{[^}]+\}', text)
            if match:
                data = json.loads(match.group(0))
                shear = float(data.get("shear_degrees", 0.0))
                conf = float(data.get("confidence", 0.0))
                # Clamp to a plausible shear range; VLMs occasionally
                # hallucinate large angles.
                shear = max(-3.0, min(3.0, shear))
                conf = max(0.0, min(1.0, conf))
                return {"method": "vlm_qwen2.5vl", "shear_degrees": round(shear, 3), "confidence": round(conf, 2)}
    except Exception as e:
        # Best-effort helper: any failure (network, HTTP status, bad JSON)
        # degrades to the zero-confidence fallback below.
        logger.warning(f"VLM dewarp failed: {e}")

    return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Auto-mode orchestrator
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class RunAutoRequest(BaseModel):
    """Request body for POST /sessions/{session_id}/run-auto."""

    from_step: int = 1  # 1=deskew, 2=dewarp, 3=columns, 4=rows, 5=words, 6=llm-review
    ocr_engine: str = "auto"  # "auto" | "rapid" | "tesseract"
    pronunciation: str = "british"  # IPA variant for phonetic fix-ups (fix_cell_phonetics / _fix_phonetic_brackets)
    skip_llm_review: bool = False  # when True, step 6 (LLM review) is skipped entirely
    dewarp_method: str = "ensemble"  # "ensemble" | "vlm" | "cv" — validated in run_auto
|
|
|
|
|
|
async def _auto_sse_event(step: str, status: str, data: Dict[str, Any]) -> str:
|
|
"""Format a single SSE event line."""
|
|
import json as _json
|
|
payload = {"step": step, "status": status, **data}
|
|
return f"data: {_json.dumps(payload)}\n\n"
|
|
|
|
|
|
@router.post("/sessions/{session_id}/run-auto")
|
|
async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
|
|
"""Run the full OCR pipeline automatically from a given step, streaming SSE progress.
|
|
|
|
Steps:
|
|
1. Deskew — straighten the scan
|
|
2. Dewarp — correct vertical shear (ensemble CV or VLM)
|
|
3. Columns — detect column layout
|
|
4. Rows — detect row layout
|
|
5. Words — OCR each cell
|
|
6. LLM review — correct OCR errors (optional)
|
|
|
|
Already-completed steps are skipped unless `from_step` forces a rerun.
|
|
Yields SSE events of the form:
|
|
data: {"step": "deskew", "status": "start"|"done"|"skipped"|"error", ...}
|
|
|
|
Final event:
|
|
data: {"step": "complete", "status": "done", "steps_run": [...], "steps_skipped": [...]}
|
|
"""
|
|
if req.from_step < 1 or req.from_step > 6:
|
|
raise HTTPException(status_code=400, detail="from_step must be 1-6")
|
|
if req.dewarp_method not in ("ensemble", "vlm", "cv"):
|
|
raise HTTPException(status_code=400, detail="dewarp_method must be: ensemble, vlm, cv")
|
|
|
|
if session_id not in _cache:
|
|
await _load_session_to_cache(session_id)
|
|
|
|
async def _generate():
|
|
steps_run: List[str] = []
|
|
steps_skipped: List[str] = []
|
|
error_step: Optional[str] = None
|
|
|
|
session = await get_session_db(session_id)
|
|
if not session:
|
|
yield await _auto_sse_event("error", "error", {"message": f"Session {session_id} not found"})
|
|
return
|
|
|
|
cached = _get_cached(session_id)
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 1: Deskew
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 1:
|
|
yield await _auto_sse_event("deskew", "start", {})
|
|
try:
|
|
t0 = time.time()
|
|
orig_bgr = cached.get("original_bgr")
|
|
if orig_bgr is None:
|
|
raise ValueError("Original image not loaded")
|
|
|
|
# Method 1: Hough lines
|
|
try:
|
|
deskewed_hough, angle_hough = deskew_image(orig_bgr.copy())
|
|
except Exception:
|
|
deskewed_hough, angle_hough = orig_bgr, 0.0
|
|
|
|
# Method 2: Word alignment
|
|
success_enc, png_orig = cv2.imencode(".png", orig_bgr)
|
|
orig_bytes = png_orig.tobytes() if success_enc else b""
|
|
try:
|
|
deskewed_wa_bytes, angle_wa = deskew_image_by_word_alignment(orig_bytes)
|
|
except Exception:
|
|
deskewed_wa_bytes, angle_wa = orig_bytes, 0.0
|
|
|
|
# Pick best method
|
|
if abs(angle_wa) >= abs(angle_hough) or abs(angle_hough) < 0.1:
|
|
method_used = "word_alignment"
|
|
angle_applied = angle_wa
|
|
wa_arr = np.frombuffer(deskewed_wa_bytes, dtype=np.uint8)
|
|
deskewed_bgr = cv2.imdecode(wa_arr, cv2.IMREAD_COLOR)
|
|
if deskewed_bgr is None:
|
|
deskewed_bgr = deskewed_hough
|
|
method_used = "hough"
|
|
angle_applied = angle_hough
|
|
else:
|
|
method_used = "hough"
|
|
angle_applied = angle_hough
|
|
deskewed_bgr = deskewed_hough
|
|
|
|
success, png_buf = cv2.imencode(".png", deskewed_bgr)
|
|
deskewed_png = png_buf.tobytes() if success else b""
|
|
|
|
deskew_result = {
|
|
"method_used": method_used,
|
|
"rotation_degrees": round(float(angle_applied), 3),
|
|
"duration_seconds": round(time.time() - t0, 2),
|
|
}
|
|
|
|
cached["deskewed_bgr"] = deskewed_bgr
|
|
cached["deskew_result"] = deskew_result
|
|
await update_session_db(
|
|
session_id,
|
|
deskewed_png=deskewed_png,
|
|
deskew_result=deskew_result,
|
|
auto_rotation_degrees=float(angle_applied),
|
|
current_step=3,
|
|
)
|
|
session = await get_session_db(session_id)
|
|
|
|
steps_run.append("deskew")
|
|
yield await _auto_sse_event("deskew", "done", deskew_result)
|
|
except Exception as e:
|
|
logger.error(f"Auto-mode deskew failed for {session_id}: {e}")
|
|
error_step = "deskew"
|
|
yield await _auto_sse_event("deskew", "error", {"message": str(e)})
|
|
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
|
|
return
|
|
else:
|
|
steps_skipped.append("deskew")
|
|
yield await _auto_sse_event("deskew", "skipped", {"reason": "from_step > 1"})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 2: Dewarp
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 2:
|
|
yield await _auto_sse_event("dewarp", "start", {"method": req.dewarp_method})
|
|
try:
|
|
t0 = time.time()
|
|
deskewed_bgr = cached.get("deskewed_bgr")
|
|
if deskewed_bgr is None:
|
|
raise ValueError("Deskewed image not available")
|
|
|
|
if req.dewarp_method == "vlm":
|
|
success_enc, png_buf = cv2.imencode(".png", deskewed_bgr)
|
|
img_bytes = png_buf.tobytes() if success_enc else b""
|
|
vlm_det = await _detect_shear_with_vlm(img_bytes)
|
|
shear_deg = vlm_det["shear_degrees"]
|
|
if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
|
|
dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
|
|
else:
|
|
dewarped_bgr = deskewed_bgr
|
|
dewarp_info = {
|
|
"method": vlm_det["method"],
|
|
"shear_degrees": shear_deg,
|
|
"confidence": vlm_det["confidence"],
|
|
"detections": [vlm_det],
|
|
}
|
|
else:
|
|
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
|
|
|
|
success_enc, png_buf = cv2.imencode(".png", dewarped_bgr)
|
|
dewarped_png = png_buf.tobytes() if success_enc else b""
|
|
|
|
dewarp_result = {
|
|
"method_used": dewarp_info["method"],
|
|
"shear_degrees": dewarp_info["shear_degrees"],
|
|
"confidence": dewarp_info["confidence"],
|
|
"duration_seconds": round(time.time() - t0, 2),
|
|
"detections": dewarp_info.get("detections", []),
|
|
}
|
|
|
|
cached["dewarped_bgr"] = dewarped_bgr
|
|
cached["dewarp_result"] = dewarp_result
|
|
await update_session_db(
|
|
session_id,
|
|
dewarped_png=dewarped_png,
|
|
dewarp_result=dewarp_result,
|
|
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
|
|
current_step=4,
|
|
)
|
|
session = await get_session_db(session_id)
|
|
|
|
steps_run.append("dewarp")
|
|
yield await _auto_sse_event("dewarp", "done", dewarp_result)
|
|
except Exception as e:
|
|
logger.error(f"Auto-mode dewarp failed for {session_id}: {e}")
|
|
error_step = "dewarp"
|
|
yield await _auto_sse_event("dewarp", "error", {"message": str(e)})
|
|
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
|
|
return
|
|
else:
|
|
steps_skipped.append("dewarp")
|
|
yield await _auto_sse_event("dewarp", "skipped", {"reason": "from_step > 2"})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 3: Columns
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 3:
|
|
yield await _auto_sse_event("columns", "start", {})
|
|
try:
|
|
t0 = time.time()
|
|
col_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
|
if col_img is None:
|
|
raise ValueError("Cropped/dewarped image not available")
|
|
|
|
ocr_img = create_ocr_image(col_img)
|
|
h, w = ocr_img.shape[:2]
|
|
|
|
geo_result = detect_column_geometry(ocr_img, col_img)
|
|
if geo_result is None:
|
|
layout_img = create_layout_image(col_img)
|
|
regions = analyze_layout(layout_img, ocr_img)
|
|
cached["_word_dicts"] = None
|
|
cached["_inv"] = None
|
|
cached["_content_bounds"] = None
|
|
else:
|
|
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
|
content_w = right_x - left_x
|
|
cached["_word_dicts"] = word_dicts
|
|
cached["_inv"] = inv
|
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
|
|
|
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
|
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
|
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
|
left_x=left_x, right_x=right_x, inv=inv)
|
|
|
|
columns = [asdict(r) for r in regions]
|
|
column_result = {
|
|
"columns": columns,
|
|
"classification_methods": list({c.get("classification_method", "") for c in columns if c.get("classification_method")}),
|
|
"duration_seconds": round(time.time() - t0, 2),
|
|
}
|
|
|
|
cached["column_result"] = column_result
|
|
await update_session_db(session_id, column_result=column_result,
|
|
row_result=None, word_result=None, current_step=6)
|
|
session = await get_session_db(session_id)
|
|
|
|
steps_run.append("columns")
|
|
yield await _auto_sse_event("columns", "done", {
|
|
"column_count": len(columns),
|
|
"duration_seconds": column_result["duration_seconds"],
|
|
})
|
|
except Exception as e:
|
|
logger.error(f"Auto-mode columns failed for {session_id}: {e}")
|
|
error_step = "columns"
|
|
yield await _auto_sse_event("columns", "error", {"message": str(e)})
|
|
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
|
|
return
|
|
else:
|
|
steps_skipped.append("columns")
|
|
yield await _auto_sse_event("columns", "skipped", {"reason": "from_step > 3"})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 4: Rows
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 4:
|
|
yield await _auto_sse_event("rows", "start", {})
|
|
try:
|
|
t0 = time.time()
|
|
row_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
|
session = await get_session_db(session_id)
|
|
column_result = session.get("column_result") or cached.get("column_result")
|
|
if not column_result or not column_result.get("columns"):
|
|
raise ValueError("Column detection must complete first")
|
|
|
|
col_regions = [
|
|
PageRegion(
|
|
type=c["type"], x=c["x"], y=c["y"],
|
|
width=c["width"], height=c["height"],
|
|
classification_confidence=c.get("classification_confidence", 1.0),
|
|
classification_method=c.get("classification_method", ""),
|
|
)
|
|
for c in column_result["columns"]
|
|
]
|
|
|
|
word_dicts = cached.get("_word_dicts")
|
|
inv = cached.get("_inv")
|
|
content_bounds = cached.get("_content_bounds")
|
|
|
|
if word_dicts is None or inv is None or content_bounds is None:
|
|
ocr_img_tmp = create_ocr_image(row_img)
|
|
geo_result = detect_column_geometry(ocr_img_tmp, row_img)
|
|
if geo_result is None:
|
|
raise ValueError("Column geometry detection failed — cannot detect rows")
|
|
_g, lx, rx, ty, by, word_dicts, inv = geo_result
|
|
cached["_word_dicts"] = word_dicts
|
|
cached["_inv"] = inv
|
|
cached["_content_bounds"] = (lx, rx, ty, by)
|
|
content_bounds = (lx, rx, ty, by)
|
|
|
|
left_x, right_x, top_y, bottom_y = content_bounds
|
|
row_geoms = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
|
|
|
row_list = [
|
|
{
|
|
"index": r.index, "x": r.x, "y": r.y,
|
|
"width": r.width, "height": r.height,
|
|
"word_count": r.word_count,
|
|
"row_type": r.row_type,
|
|
"gap_before": r.gap_before,
|
|
}
|
|
for r in row_geoms
|
|
]
|
|
row_result = {
|
|
"rows": row_list,
|
|
"row_count": len(row_list),
|
|
"content_rows": len([r for r in row_geoms if r.row_type == "content"]),
|
|
"duration_seconds": round(time.time() - t0, 2),
|
|
}
|
|
|
|
cached["row_result"] = row_result
|
|
await update_session_db(session_id, row_result=row_result, current_step=7)
|
|
session = await get_session_db(session_id)
|
|
|
|
steps_run.append("rows")
|
|
yield await _auto_sse_event("rows", "done", {
|
|
"row_count": len(row_list),
|
|
"content_rows": row_result["content_rows"],
|
|
"duration_seconds": row_result["duration_seconds"],
|
|
})
|
|
except Exception as e:
|
|
logger.error(f"Auto-mode rows failed for {session_id}: {e}")
|
|
error_step = "rows"
|
|
yield await _auto_sse_event("rows", "error", {"message": str(e)})
|
|
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
|
|
return
|
|
else:
|
|
steps_skipped.append("rows")
|
|
yield await _auto_sse_event("rows", "skipped", {"reason": "from_step > 4"})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 5: Words (OCR)
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 5:
|
|
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
|
|
try:
|
|
t0 = time.time()
|
|
word_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
|
session = await get_session_db(session_id)
|
|
|
|
column_result = session.get("column_result") or cached.get("column_result")
|
|
row_result = session.get("row_result") or cached.get("row_result")
|
|
|
|
col_regions = [
|
|
PageRegion(
|
|
type=c["type"], x=c["x"], y=c["y"],
|
|
width=c["width"], height=c["height"],
|
|
classification_confidence=c.get("classification_confidence", 1.0),
|
|
classification_method=c.get("classification_method", ""),
|
|
)
|
|
for c in column_result["columns"]
|
|
]
|
|
row_geoms = [
|
|
RowGeometry(
|
|
index=r["index"], x=r["x"], y=r["y"],
|
|
width=r["width"], height=r["height"],
|
|
word_count=r.get("word_count", 0), words=[],
|
|
row_type=r.get("row_type", "content"),
|
|
gap_before=r.get("gap_before", 0),
|
|
)
|
|
for r in row_result["rows"]
|
|
]
|
|
|
|
word_dicts = cached.get("_word_dicts")
|
|
if word_dicts is not None:
|
|
content_bounds = cached.get("_content_bounds")
|
|
top_y = content_bounds[2] if content_bounds else min(r.y for r in row_geoms)
|
|
for row in row_geoms:
|
|
row_y_rel = row.y - top_y
|
|
row_bottom_rel = row_y_rel + row.height
|
|
row.words = [
|
|
w for w in word_dicts
|
|
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
|
]
|
|
row.word_count = len(row.words)
|
|
|
|
ocr_img = create_ocr_image(word_img)
|
|
img_h, img_w = word_img.shape[:2]
|
|
|
|
cells, columns_meta = build_cell_grid(
|
|
ocr_img, col_regions, row_geoms, img_w, img_h,
|
|
ocr_engine=req.ocr_engine, img_bgr=word_img,
|
|
)
|
|
duration = time.time() - t0
|
|
|
|
col_types = {c['type'] for c in columns_meta}
|
|
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
|
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
|
used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine
|
|
|
|
# Apply IPA phonetic fixes directly to cell texts
|
|
fix_cell_phonetics(cells, pronunciation=req.pronunciation)
|
|
|
|
word_result_data = {
|
|
"cells": cells,
|
|
"grid_shape": {
|
|
"rows": n_content_rows,
|
|
"cols": len(columns_meta),
|
|
"total_cells": len(cells),
|
|
},
|
|
"columns_used": columns_meta,
|
|
"layout": "vocab" if is_vocab else "generic",
|
|
"image_width": img_w,
|
|
"image_height": img_h,
|
|
"duration_seconds": round(duration, 2),
|
|
"ocr_engine": used_engine,
|
|
"summary": {
|
|
"total_cells": len(cells),
|
|
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
|
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
|
},
|
|
}
|
|
|
|
has_text_col = 'column_text' in col_types
|
|
if is_vocab or has_text_col:
|
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
|
entries = _fix_character_confusion(entries)
|
|
entries = _fix_phonetic_brackets(entries, pronunciation=req.pronunciation)
|
|
word_result_data["vocab_entries"] = entries
|
|
word_result_data["entries"] = entries
|
|
word_result_data["entry_count"] = len(entries)
|
|
word_result_data["summary"]["total_entries"] = len(entries)
|
|
|
|
await update_session_db(session_id, word_result=word_result_data, current_step=8)
|
|
cached["word_result"] = word_result_data
|
|
session = await get_session_db(session_id)
|
|
|
|
steps_run.append("words")
|
|
yield await _auto_sse_event("words", "done", {
|
|
"total_cells": len(cells),
|
|
"layout": word_result_data["layout"],
|
|
"duration_seconds": round(duration, 2),
|
|
"ocr_engine": used_engine,
|
|
"summary": word_result_data["summary"],
|
|
})
|
|
except Exception as e:
|
|
logger.error(f"Auto-mode words failed for {session_id}: {e}")
|
|
error_step = "words"
|
|
yield await _auto_sse_event("words", "error", {"message": str(e)})
|
|
yield await _auto_sse_event("complete", "error", {"error_step": error_step})
|
|
return
|
|
else:
|
|
steps_skipped.append("words")
|
|
yield await _auto_sse_event("words", "skipped", {"reason": "from_step > 5"})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Step 6: LLM Review (optional)
|
|
# -----------------------------------------------------------------
|
|
if req.from_step <= 6 and not req.skip_llm_review:
|
|
yield await _auto_sse_event("llm_review", "start", {"model": OLLAMA_REVIEW_MODEL})
|
|
try:
|
|
session = await get_session_db(session_id)
|
|
word_result = session.get("word_result") or cached.get("word_result")
|
|
entries = word_result.get("entries") or word_result.get("vocab_entries") or []
|
|
|
|
if not entries:
|
|
yield await _auto_sse_event("llm_review", "skipped", {"reason": "no entries"})
|
|
steps_skipped.append("llm_review")
|
|
else:
|
|
reviewed = await llm_review_entries(entries)
|
|
|
|
session = await get_session_db(session_id)
|
|
word_result_updated = dict(session.get("word_result") or {})
|
|
word_result_updated["entries"] = reviewed
|
|
word_result_updated["vocab_entries"] = reviewed
|
|
word_result_updated["llm_reviewed"] = True
|
|
word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL
|
|
|
|
await update_session_db(session_id, word_result=word_result_updated, current_step=9)
|
|
cached["word_result"] = word_result_updated
|
|
|
|
steps_run.append("llm_review")
|
|
yield await _auto_sse_event("llm_review", "done", {
|
|
"entries_reviewed": len(reviewed),
|
|
"model": OLLAMA_REVIEW_MODEL,
|
|
})
|
|
except Exception as e:
|
|
logger.warning(f"Auto-mode llm_review failed for {session_id} (non-fatal): {e}")
|
|
yield await _auto_sse_event("llm_review", "error", {"message": str(e), "fatal": False})
|
|
steps_skipped.append("llm_review")
|
|
else:
|
|
steps_skipped.append("llm_review")
|
|
reason = "skipped by request" if req.skip_llm_review else "from_step > 6"
|
|
yield await _auto_sse_event("llm_review", "skipped", {"reason": reason})
|
|
|
|
# -----------------------------------------------------------------
|
|
# Final event
|
|
# -----------------------------------------------------------------
|
|
yield await _auto_sse_event("complete", "done", {
|
|
"steps_run": steps_run,
|
|
"steps_skipped": steps_skipped,
|
|
})
|
|
|
|
return StreamingResponse(
|
|
_generate(),
|
|
media_type="text/event-stream",
|
|
headers={
|
|
"Cache-Control": "no-cache",
|
|
"Connection": "keep-alive",
|
|
"X-Accel-Buffering": "no",
|
|
},
|
|
)
|