Files
breakpilot-lehrer/klausur-service/backend/ocr_pipeline_words.py
Benjamin Admin ec287fd12e refactor: split ocr_pipeline_api.py (5426 lines) into 8 modules
Each module is under 1050 lines:
- ocr_pipeline_common.py (354) - shared state, cache, models, helpers
- ocr_pipeline_sessions.py (483) - session CRUD, image serving, doc-type
- ocr_pipeline_geometry.py (1025) - deskew, dewarp, structure, columns
- ocr_pipeline_rows.py (348) - row detection, box-overlay helper
- ocr_pipeline_words.py (876) - word detection (SSE), paddle-direct
- ocr_pipeline_ocr_merge.py (615) - merge helpers, kombi endpoints
- ocr_pipeline_postprocess.py (929) - LLM review, reconstruction, export
- ocr_pipeline_auto.py (705) - auto-mode orchestrator, reprocess

ocr_pipeline_api.py is now a 61-line thin wrapper that re-exports
router, _cache, and test-imported symbols for backward compatibility.
No changes needed in main.py or tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 08:42:00 +01:00

877 lines
33 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
OCR Pipeline Words - Word detection and ground truth endpoints.
Extracted from ocr_pipeline_api.py.
Handles:
- POST /sessions/{session_id}/words — main SSE streaming word detection
- POST /sessions/{session_id}/paddle-direct — PaddleOCR direct endpoint
- POST /sessions/{session_id}/ground-truth/words — save ground truth
- GET /sessions/{session_id}/ground-truth/words — get ground truth
Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import json
import logging
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from cv_vocab_pipeline import (
PageRegion,
RowGeometry,
_cells_to_vocab_entries,
_fix_character_confusion,
_fix_phonetic_brackets,
fix_cell_phonetics,
build_cell_grid_v2,
build_cell_grid_v2_streaming,
create_ocr_image,
detect_column_geometry,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
get_session_db,
get_session_image,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
_get_base_image_png,
_append_pipeline_log,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------
class WordGroundTruthRequest(BaseModel):
    """Request body for saving ground-truth feedback on the word detection step.

    All three fields are stored verbatim under the session's
    ``ground_truth["words"]`` record by ``save_word_ground_truth``.
    """

    # Reviewer verdict on the automatic word detection result.
    is_correct: bool
    # Optional corrected entries supplied by the reviewer; the per-entry
    # schema is not validated here (free-form dicts).
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Optional free-text remarks.
    notes: Optional[str] = None
# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build the word grid for a session and OCR its cells.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'.
            'paddle' always forces ``grid_method='words_first'``.
        pronunciation: 'british' (default) or 'american' — for IPA
            dictionary lookup.
        stream: false (default) for a JSON response, true for SSE streaming.
            Only honoured on the 'v2' path; the 'words_first' path always
            returns JSON.
        skip_heal_gaps: false (default). When true, cells keep exact row
            geometry positions without gap-healing expansion. Better for
            overlay rendering.
        grid_method: 'v2' (default) or 'words_first' — grid construction
            strategy. 'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection
            needed).

    Returns:
        A dict ``{"session_id": ..., **word_result}`` for JSON responses, or
        a ``StreamingResponse`` (``text/event-stream``) when ``stream`` is
        true on the 'v2' path.

    Raises:
        HTTPException: 400 if no cropped/dewarped image is cached, if row
            detection is missing on the 'v2' path, or if the 'words_first'
            path finds no words; 404 if the session is not in the DB.
    """
    # PaddleOCR is full-page remote OCR → force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"

    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")

    if not column_result or not column_result.get("columns"):
        # No column detection — synthesize a single full-page pseudo-column.
        # This enables the overlay pipeline which skips column detection.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": img_w_tmp, "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)

    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects.
    # The words_first method is allowed to run without row detection (see the
    # guard above), so row_result may be None or lack "rows" here; fall back
    # to an empty list instead of crashing on a subscript.
    row_dicts = (row_result or {}).get("rows") or []
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in row_dicts
    ]

    # Cell-First OCR (v2): no full-page word re-population needed.
    # Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
    # We still need word_count > 0 for row filtering in build_cell_grid_v2,
    # so populate from cached words if available (just for counting).
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
        # Word 'top' values are compared against row positions shifted by the
        # content area's top edge (or the topmost row if bounds are unknown).
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        for row in row_geoms:
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            # A word belongs to the row that contains its vertical centre.
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones.
    # Use inner box range (shrunk by border_thickness) so that rows at
    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info(f"detect_words: excluded {excluded} rows inside box zones")

    # --- Words-First path: bottom-up grid from word boxes ---
    if grid_method == "words_first":
        t0 = time.time()
        img_h, img_w = dewarped_bgr.shape[:2]

        # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
        if engine == "paddle":
            from cv_ocr_engines import ocr_region_paddle

            wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
            # PaddleOCR returns absolute coordinates, no content_bounds offset needed
            cached["_paddle_word_dicts"] = wf_word_dicts
        else:
            # Get word_dicts from cache or run Tesseract full-page
            wf_word_dicts = cached.get("_word_dicts")
            if wf_word_dicts is None:
                ocr_img_tmp = create_ocr_image(dewarped_bgr)
                geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
                if geo_result is not None:
                    _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                    cached["_word_dicts"] = wf_word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

        if not wf_word_dicts:
            raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")

        # Convert word coordinates to absolute image coordinates if needed
        # (detect_column_geometry returns words relative to content ROI)
        # PaddleOCR already returns absolute coordinates — skip offset.
        if engine != "paddle":
            content_bounds = cached.get("_content_bounds")
            if content_bounds:
                lx, _rx, ty, _by = content_bounds
                # Build shifted copies so the cached word dicts stay relative.
                wf_word_dicts = [
                    {**w, 'left': w['left'] + lx, 'top': w['top'] + ty}
                    for w in wf_word_dicts
                ]

        # Extract box rects for box-aware column clustering
        box_rects = [
            zone["box"]
            for zone in zones
            if zone.get("zone_type") == "box" and zone.get("box")
        ]

        cells, columns_meta = build_grid_from_words(
            wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
        )
        duration = time.time() - t0

        # Apply IPA phonetic fixes
        fix_cell_phonetics(cells, pronunciation=pronunciation)

        # Add zone_index for backward compat
        for cell in cells:
            cell.setdefault("zone_index", 0)

        col_types = {c['type'] for c in columns_meta}
        is_vocab = bool(col_types & {'column_en', 'column_de'})
        n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
        n_cols = len(columns_meta)
        used_engine = "paddle" if engine == "paddle" else "words_first"

        word_result = {
            "cells": cells,
            "grid_shape": {
                "rows": n_rows,
                "cols": n_cols,
                "total_cells": len(cells),
            },
            "columns_used": columns_meta,
            "layout": "vocab" if is_vocab else "generic",
            "image_width": img_w,
            "image_height": img_h,
            "duration_seconds": round(duration, 2),
            "ocr_engine": used_engine,
            "grid_method": "words_first",
            "summary": {
                "total_cells": len(cells),
                "non_empty_cells": sum(1 for c in cells if c.get("text")),
                "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            },
        }

        if is_vocab or 'column_text' in col_types:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
            word_result["vocab_entries"] = entries
            word_result["entries"] = entries
            word_result["entry_count"] = len(entries)
            word_result["summary"]["total_entries"] = len(entries)
            word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
            word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

        await update_session_db(session_id, word_result=word_result, current_step=8)
        cached["word_result"] = word_result

        logger.info(f"OCR Pipeline: words-first session {session_id}: "
                    f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

        await _append_pipeline_log(session_id, "words", {
            "grid_method": "words_first",
            "total_cells": len(cells),
            "non_empty_cells": word_result["summary"]["non_empty_cells"],
            "ocr_engine": used_engine,
            "layout": word_result["layout"],
        }, duration_ms=int(duration * 1000))

        return {"session_id": session_id, **word_result}

    if stream:
        # Cell-First OCR v2: use batch-then-stream approach instead of
        # per-cell streaming. The parallel ThreadPoolExecutor in
        # build_cell_grid_v2 is much faster than sequential streaming.
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    t0 = time.time()

    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0

    # Add zone_index to each cell (default 0 for backward compatibility)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    # Layout detection
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Count content rows and columns for grid_shape
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len(columns_meta)

    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # Grid result (always generic)
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **word_result,
    }
async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR in an executor, then streams results.

    Unlike the per-cell streaming generator, this calls build_cell_grid_v2
    once (off the event loop) and afterwards emits every cell as its own SSE
    event. While OCR is running, 'keepalive' events are sent so the
    connection is not idle.

    Emitted event types, in order: 'meta', 'preparing', zero or more
    'keepalive', 'columns' (if column metadata is non-empty), one 'cell' per
    cell, and finally 'complete'.

    Args:
        session_id: Session whose word_result is persisted at the end.
        cached: In-memory cache entry of the session; receives "word_result".
        col_regions: Column regions passed to build_cell_grid_v2.
        row_geoms: Row geometries passed to build_cell_grid_v2.
        dewarped_bgr: Page image the OCR image is derived from.
        engine: OCR engine name forwarded as ``ocr_engine``.
        pronunciation: Forwarded to the phonetic fix helpers.
        request: Used to detect client disconnects.
        skip_heal_gaps: Forwarded to build_cell_grid_v2.

    Yields:
        SSE frames of the form ``"data: <json>\\n\\n"``.
    """
    import asyncio

    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Column types that do not contribute grid columns.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"

    # 3. Run batch OCR in the default executor with periodic keepalive events.
    #    The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    #    connections after 30-60s. Send keepalive every 5s to prevent this.
    #    get_running_loop() is the supported call inside a coroutine.
    loop = asyncio.get_running_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    while True:
        try:
            # shield() keeps the OCR future alive when wait_for times out;
            # awaiting an already finished future returns immediately.
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                # NOTE: cancel() cannot interrupt a job that is already
                # running in the executor thread; it only detaches us from it.
                ocr_future.cancel()
                return

    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Emitted event types, in order: 'meta', 'preparing', 'columns' (once,
    before the first cell), one 'cell' per OCR'd cell, and 'complete'.
    After the last cell, rows whose cells are all empty are dropped, the
    result is persisted to the session DB and mirrored into ``cached``.

    Args:
        session_id: Session whose word_result is persisted at the end.
        cached: In-memory cache entry of the session; receives "word_result".
        col_regions: Column regions passed to the streaming grid builder.
        row_geoms: Row geometries passed to the streaming grid builder.
        dewarped_bgr: Page image the OCR image is derived from.
        engine: OCR engine name forwarded as ``ocr_engine``.
        pronunciation: Forwarded to the phonetic fix helpers.
        request: Used to detect client disconnects between cells.

    Yields:
        SSE frames of the form ``"data: <json>\\n\\n"``.
    """
    t0 = time.time()

    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Compute grid shape upfront for the meta event
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    # Determine layout
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Start streaming — first event: meta
    columns_meta = None  # will be set from first yield
    total_cells = n_content_rows * n_cols

    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # Keepalive: send preparing event so proxy doesn't timeout during OCR init
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"

    # Stream cells one by one.
    # NOTE(review): build_cell_grid_v2_streaming is iterated with a plain
    # ``for``, so each OCR step runs on the event loop thread — confirm this
    # is acceptable for concurrent requests.
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0

    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used as part of first cell or update meta
            meta_update = {
                "type": "columns",
                "columns_used": cols_meta,
            }
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    # Convert the number of dropped cells into a row count for the log line.
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    word_result = {
        "cells": all_cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(all_cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
# ---------------------------------------------------------------------------
# PaddleOCR Direct Endpoint
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly.

    Expects orientation/deskew/dewarp/crop to be done already.
    Uses the cropped image (falls back to dewarped, then original).
    The used image is stored as cropped_png so OverlayReconstruction
    can display it as the background.

    Returns:
        A dict ``{"session_id": ..., **word_result}``.

    Raises:
        HTTPException: 404 if the session has no stored image; 400 if the
            image cannot be decoded or PaddleOCR returns no words.
    """
    # Try preprocessed images first (crop > dewarp > original)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")

    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_paddle

    t0 = time.time()
    word_dicts = await ocr_region_paddle(img_bgr, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    # Reuse build_grid_from_words — same function that works in the regular
    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
    # Handles phrase splitting, column clustering, and reading order.
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    duration = time.time() - t0

    # Tag cells as paddle_direct
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})

    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Store preprocessed image as cropped_png so OverlayReconstruction shows it
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    # Keep the in-memory cache in step with the DB, as the other writers in
    # this module do; otherwise a cached session keeps a stale word_result.
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), n_rows, n_cols, duration,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
# ---------------------------------------------------------------------------
# Ground Truth Words Endpoints
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step.

    The feedback is stored under ``ground_truth["words"]`` together with a
    snapshot of the session's current ``word_result`` and a UTC timestamp.

    Returns:
        A dict with the session id and the stored ground-truth record.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    from datetime import timezone

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the stored string keeps the previous naive ISO format.
    saved_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    gt = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": saved_at,
        "word_result": session.get("word_result"),
    }
    ground_truth["words"] = gt

    await update_session_db(session_id, ground_truth=ground_truth)

    # Mirror the change into the in-memory cache when the session is loaded.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    return {"session_id": session_id, "ground_truth": gt}
@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Return the stored word ground truth alongside the automatic result.

    Raises:
        HTTPException: 404 if the session does not exist or has no word
            ground truth saved.
    """
    record = await get_session_db(session_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A missing or empty "ground_truth" field is treated like an empty dict.
    stored = record.get("ground_truth") or {}
    words_gt = stored.get("words")
    if not words_gt:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    response = {"session_id": session_id}
    response["words_gt"] = words_gt
    response["words_auto"] = record.get("word_result")
    return response