Each module is under 1050 lines: - ocr_pipeline_common.py (354) - shared state, cache, models, helpers - ocr_pipeline_sessions.py (483) - session CRUD, image serving, doc-type - ocr_pipeline_geometry.py (1025) - deskew, dewarp, structure, columns - ocr_pipeline_rows.py (348) - row detection, box-overlay helper - ocr_pipeline_words.py (876) - word detection (SSE), paddle-direct - ocr_pipeline_ocr_merge.py (615) - merge helpers, kombi endpoints - ocr_pipeline_postprocess.py (929) - LLM review, reconstruction, export - ocr_pipeline_auto.py (705) - auto-mode orchestrator, reprocess ocr_pipeline_api.py is now a 61-line thin wrapper that re-exports router, _cache, and test-imported symbols for backward compatibility. No changes needed in main.py or tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
877 lines
33 KiB
Python
877 lines
33 KiB
Python
"""
|
||
OCR Pipeline Words - Word detection and ground truth endpoints.
|
||
|
||
Extracted from ocr_pipeline_api.py.
|
||
Handles:
|
||
- POST /sessions/{session_id}/words — main SSE streaming word detection
|
||
- POST /sessions/{session_id}/paddle-direct — PaddleOCR direct endpoint
|
||
- POST /sessions/{session_id}/ground-truth/words — save ground truth
|
||
- GET /sessions/{session_id}/ground-truth/words — get ground truth
|
||
|
||
Lizenz: Apache 2.0
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import time
|
||
from datetime import datetime
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
import cv2
|
||
import numpy as np
|
||
from fastapi import APIRouter, HTTPException, Request
|
||
from fastapi.responses import StreamingResponse
|
||
from pydantic import BaseModel
|
||
|
||
from cv_vocab_pipeline import (
|
||
PageRegion,
|
||
RowGeometry,
|
||
_cells_to_vocab_entries,
|
||
_fix_character_confusion,
|
||
_fix_phonetic_brackets,
|
||
fix_cell_phonetics,
|
||
build_cell_grid_v2,
|
||
build_cell_grid_v2_streaming,
|
||
create_ocr_image,
|
||
detect_column_geometry,
|
||
)
|
||
from cv_words_first import build_grid_from_words
|
||
from ocr_pipeline_session_store import (
|
||
get_session_db,
|
||
get_session_image,
|
||
update_session_db,
|
||
)
|
||
from ocr_pipeline_common import (
|
||
_cache,
|
||
_load_session_to_cache,
|
||
_get_cached,
|
||
_get_base_image_png,
|
||
_append_pipeline_log,
|
||
)
|
||
|
||
# Module-level logger; messages are prefixed with this module's dotted path.
logger = logging.getLogger(__name__)

# Shares the same prefix/tag as the other ocr_pipeline_* routers so all
# endpoints merge into one API surface when included by the app.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------
|
||
|
||
class WordGroundTruthRequest(BaseModel):
    """Request body for POST /sessions/{id}/ground-truth/words.

    Captures reviewer feedback on the automatic word-recognition result so it
    can be stored alongside the session's word_result snapshot.
    """

    # True when the automatic word result was fully correct as-is.
    is_correct: bool
    # Manually corrected entries (same dict shape as word_result entries);
    # presumably only sent when is_correct is False — confirm with the frontend.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Optional free-text reviewer notes.
    notes: Optional[str] = None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Word Detection Endpoint (Step 7)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build word grid from columns × rows, OCR each cell.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming
        skip_heal_gaps: false (default). When true, cells keep exact row geometry
            positions without gap-healing expansion. Better for overlay rendering.
        grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
            'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection needed).

    Returns:
        JSON dict with session_id plus the persisted word_result (cells,
        grid_shape, columns_used, summary, optional vocab entries), or an
        SSE StreamingResponse when stream=true.

    Raises:
        HTTPException 400: crop/dewarp not done, row detection missing
            (v2 only), or no words detected (words_first only).
        HTTPException 404: session does not exist in the DB.
    """
    # PaddleOCR is full-page remote OCR → force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"

    # Lazily re-hydrate the in-memory cache from the DB after a restart.
    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        # No column detection — synthesize a single full-page pseudo-column.
        # This enables the overlay pipeline which skips column detection.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": img_w_tmp, "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
    # Rows are only mandatory for the top-down v2 grid.
    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects.
    # FIX: tolerate a missing/empty row_result. words_first is documented as
    # not requiring row detection (and is force-selected for engine=paddle),
    # but this previously indexed row_result["rows"] unconditionally and
    # raised TypeError when the row step had been skipped.
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in ((row_result or {}).get("rows") or [])
    ]

    # Cell-First OCR (v2): no full-page word re-population needed.
    # Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
    # We still need word_count > 0 for row filtering in build_cell_grid_v2,
    # so populate from cached words if available (just for counting).
    # NOTE(review): this Tesseract geometry pre-pass also runs for
    # engine=paddle even though the paddle branch below re-detects words
    # remotely — possibly redundant work; confirm before changing.
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
        # Word boxes are relative to the content ROI; translate rows into the
        # same coordinate space before assigning words to rows.
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        for row in row_geoms:
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            # A word belongs to the row containing its vertical center.
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones.
    # Use inner box range (shrunk by border_thickness) so that rows at
    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            # Row counts as "inside" when its vertical center lies in a box.
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info(f"detect_words: excluded {excluded} rows inside box zones")

    # --- Words-First path: bottom-up grid from word boxes ---
    if grid_method == "words_first":
        t0 = time.time()
        img_h, img_w = dewarped_bgr.shape[:2]

        # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
        if engine == "paddle":
            from cv_ocr_engines import ocr_region_paddle

            wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
            # PaddleOCR returns absolute coordinates, no content_bounds offset needed
            cached["_paddle_word_dicts"] = wf_word_dicts
        else:
            # Get word_dicts from cache or run Tesseract full-page
            wf_word_dicts = cached.get("_word_dicts")
            if wf_word_dicts is None:
                ocr_img_tmp = create_ocr_image(dewarped_bgr)
                geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
                if geo_result is not None:
                    _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                    cached["_word_dicts"] = wf_word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

        if not wf_word_dicts:
            raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")

        # Convert word coordinates to absolute image coordinates if needed
        # (detect_column_geometry returns words relative to content ROI)
        # PaddleOCR already returns absolute coordinates — skip offset.
        if engine != "paddle":
            content_bounds = cached.get("_content_bounds")
            if content_bounds:
                lx, _rx, ty, _by = content_bounds
                abs_words = []
                for w in wf_word_dicts:
                    abs_words.append({
                        **w,
                        'left': w['left'] + lx,
                        'top': w['top'] + ty,
                    })
                wf_word_dicts = abs_words

        # Extract box rects for box-aware column clustering
        box_rects = []
        for zone in zones:
            if zone.get("zone_type") == "box" and zone.get("box"):
                box_rects.append(zone["box"])

        cells, columns_meta = build_grid_from_words(
            wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
        )
        duration = time.time() - t0

        # Apply IPA phonetic fixes
        fix_cell_phonetics(cells, pronunciation=pronunciation)

        # Add zone_index for backward compat
        for cell in cells:
            cell.setdefault("zone_index", 0)

        col_types = {c['type'] for c in columns_meta}
        is_vocab = bool(col_types & {'column_en', 'column_de'})
        n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
        n_cols = len(columns_meta)
        used_engine = "paddle" if engine == "paddle" else "words_first"

        word_result = {
            "cells": cells,
            "grid_shape": {
                "rows": n_rows,
                "cols": n_cols,
                "total_cells": len(cells),
            },
            "columns_used": columns_meta,
            "layout": "vocab" if is_vocab else "generic",
            "image_width": img_w,
            "image_height": img_h,
            "duration_seconds": round(duration, 2),
            "ocr_engine": used_engine,
            "grid_method": "words_first",
            "summary": {
                "total_cells": len(cells),
                "non_empty_cells": sum(1 for c in cells if c.get("text")),
                "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            },
        }

        # Vocab or single-text-column layouts additionally get row→entry mapping.
        if is_vocab or 'column_text' in col_types:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
            word_result["vocab_entries"] = entries
            word_result["entries"] = entries
            word_result["entry_count"] = len(entries)
            word_result["summary"]["total_entries"] = len(entries)
            word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
            word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

        await update_session_db(session_id, word_result=word_result, current_step=8)
        cached["word_result"] = word_result

        logger.info(f"OCR Pipeline: words-first session {session_id}: "
                    f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

        await _append_pipeline_log(session_id, "words", {
            "grid_method": "words_first",
            "total_cells": len(cells),
            "non_empty_cells": word_result["summary"]["non_empty_cells"],
            "ocr_engine": used_engine,
            "layout": word_result["layout"],
        }, duration_ms=int(duration * 1000))

        return {"session_id": session_id, **word_result}

    if stream:
        # Cell-First OCR v2: use batch-then-stream approach instead of
        # per-cell streaming. The parallel ThreadPoolExecutor in
        # build_cell_grid_v2 is much faster than sequential streaming.
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                # Disable Nginx response buffering so SSE events flush immediately.
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    t0 = time.time()

    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0

    # Add zone_index to each cell (default 0 for backward compatibility)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    # Layout detection
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Count content rows and columns for grid_shape
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len(columns_meta)

    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # Grid result (always generic)
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )

    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **word_result,
    }
|
||
|
||
|
||
async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR (parallel) then streams results.

    Unlike the old per-cell streaming, this uses build_cell_grid_v2 with
    ThreadPoolExecutor for parallel OCR, then emits all cells as SSE events.
    The 'preparing' event keeps the connection alive during OCR processing.

    Event order: meta → preparing → keepalive* → columns → cell* → complete.
    On client disconnect the generator returns early and nothing is persisted.
    """
    import asyncio

    t0 = time.time()
    # Binarized image for Tesseract; BGR original passed alongside for engines
    # that want color input.
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Non-content region types are excluded from the grid-shape estimate.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"

    # 3. Run batch OCR in thread pool with periodic keepalive events.
    #    The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    #    connections after 30-60s. Send keepalive every 5s to prevent this.
    loop = asyncio.get_event_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    # Send keepalive events every 5 seconds while OCR runs.
    # asyncio.shield prevents the wait_for timeout from cancelling the
    # underlying OCR future — only an explicit client disconnect cancels it.
    keepalive_count = 0
    while not ocr_future.done():
        try:
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            keepalive_count += 1
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                ocr_future.cancel()
                return
    else:
        # Loop exited without break: future completed between checks —
        # result() returns immediately here.
        cells, columns_meta = ocr_future.result()

    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Vocab / single-text-column layouts get a row→entry mapping as well.
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|
||
|
||
|
||
async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Event order: meta → preparing → columns → cell* → complete.

    NOTE(review): detect_words' stream path now uses
    _word_batch_stream_generator instead; this sequential per-cell variant
    appears unused within this module — confirm no external caller before
    removing.
    """
    t0 = time.time()

    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Compute grid shape upfront for the meta event
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    # Determine layout
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Start streaming — first event: meta
    columns_meta = None  # will be set from first yield
    total_cells = n_content_rows * n_cols

    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # Keepalive: send preparing event so proxy doesn't timeout during OCR init
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"

    # Stream cells one by one
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0
    # NOTE(review): last_keepalive is assigned but never read below —
    # looks like leftover from an earlier keepalive implementation.
    last_keepalive = time.time()

    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        # Abort (without persisting) as soon as the client goes away.
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used as part of first cell or update meta
            meta_update = {
                "type": "columns",
                "columns_used": cols_meta,
            }
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    # Integer division by column count converts removed-cell count to rows.
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    word_result = {
        "cells": all_cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(all_cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# PaddleOCR Direct Endpoint
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly.

    Expects orientation/deskew/dewarp/crop to be done already.
    Uses the cropped image (falls back to dewarped, then original).
    The used image is stored as cropped_png so OverlayReconstruction
    can display it as the background.
    """
    # Pick the best available preprocessed image: crop > dewarp > original.
    img_png = None
    for variant in ("cropped", "dewarped", "original"):
        img_png = await get_session_image(session_id, variant)
        if img_png:
            break
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_bgr = cv2.imdecode(np.frombuffer(img_png, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")

    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_paddle

    started = time.time()
    word_dicts = await ocr_region_paddle(img_bgr, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    # Reuse build_grid_from_words — same function that works in the regular
    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
    # Handles phrase splitting, column clustering, and reading order.
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    duration = time.time() - started

    # Tag every cell with the engine that produced it.
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"

    n_rows = len({c["row_index"] for c in cells}) if cells else 0
    n_cols = len(columns_meta)
    detected_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(detected_types & {"column_en", "column_de"})

    non_empty = sum(1 for c in cells if c.get("text"))
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": non_empty,
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Store preprocessed image as cropped_png so OverlayReconstruction shows it
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), n_rows, n_cols, duration,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": non_empty,
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Ground Truth Words Endpoints
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Snapshot the reviewer feedback together with the auto result it judges.
    words_gt = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": session.get("word_result"),
    }

    # Merge into the session's per-step ground-truth mapping.
    gt_map = session.get("ground_truth") or {}
    gt_map["words"] = words_gt

    await update_session_db(session_id, ground_truth=gt_map)

    # Keep the in-memory cache consistent with the DB.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = gt_map

    return {"session_id": session_id, "ground_truth": words_gt}
|
||
|
||
|
||
@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Ground truth is stored per step; 404 when the 'words' step has none.
    words_gt = (session.get("ground_truth") or {}).get("words")
    if not words_gt:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    return {
        "session_id": session_id,
        "words_gt": words_gt,
        "words_auto": session.get("word_result"),
    }
|