backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
304 lines
11 KiB
Python
304 lines
11 KiB
Python
"""
|
|
OCR Pipeline Words Stream — SSE streaming generators for word detection.
|
|
|
|
Extracted from ocr_pipeline_words.py.
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
import numpy as np
|
|
from fastapi import Request
|
|
|
|
from cv_vocab_pipeline import (
|
|
PageRegion,
|
|
RowGeometry,
|
|
_cells_to_vocab_entries,
|
|
_fix_character_confusion,
|
|
_fix_phonetic_brackets,
|
|
fix_cell_phonetics,
|
|
build_cell_grid_v2,
|
|
build_cell_grid_v2_streaming,
|
|
create_ocr_image,
|
|
)
|
|
from ocr_pipeline_session_store import update_session_db
|
|
from ocr_pipeline_common import _cache
|
|
|
|
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR (parallel) then streams results.

    Uses build_cell_grid_v2 with ThreadPoolExecutor for parallel OCR,
    then emits all cells as SSE events.

    Args:
        session_id: Pipeline session identifier, used for persistence and logging.
        cached: In-memory session cache; the final ``word_result`` is written into it.
        col_regions: Detected column regions; skip-typed regions are excluded.
        row_geoms: Detected row geometries; only ``content`` rows are counted.
        dewarped_bgr: Dewarped page image (BGR ndarray, as produced upstream).
        engine: Requested OCR engine name, forwarded to build_cell_grid_v2.
        pronunciation: Pronunciation variant for the phonetic fix helpers.
        request: FastAPI request, polled for client disconnects.
        skip_heal_gaps: Forwarded to build_cell_grid_v2 to skip gap healing.

    Yields:
        SSE ``data:`` frames in order: meta, preparing, zero or more
        keepalive frames while OCR runs, optional columns, one frame per
        cell, and finally complete.
    """
    import asyncio

    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Region/row types that never contribute OCR cells.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    # A vocab layout is assumed as soon as an EN or DE column is present.
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"

    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # FIX: use get_running_loop() — get_event_loop() inside a coroutine is
    # deprecated since Python 3.10; get_running_loop() is the supported API
    # and is guaranteed correct here because this coroutine is running.
    loop = asyncio.get_running_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    # Send keepalive events every 5 seconds while OCR runs
    keepalive_count = 0
    while not ocr_future.done():
        try:
            # shield() prevents the 5s timeout from cancelling the OCR job.
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            keepalive_count += 1
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                ocr_future.cancel()
                return
    else:
        # Loop condition turned false between iterations: the future
        # completed during a keepalive cycle, so the result is ready.
        cells, columns_meta = ocr_future.result()

    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

    # 4. Apply IPA phonetic fixes (in place, before cells are streamed)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            # confidence == 0 is treated as "no reading", not low confidence
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        # "entries" duplicates "vocab_entries" — both keys are persisted.
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|
|
|
|
|
|
async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Unlike the batch variant, this drives build_cell_grid_v2_streaming and
    emits each cell the moment it is produced, then filters all-empty rows,
    applies phonetic fixes, persists the result, and sends a complete event.

    Args:
        session_id: Pipeline session identifier, used for persistence and logging.
        cached: In-memory session cache; the final ``word_result`` is written into it.
        col_regions: Detected column regions; skip-typed regions are excluded.
        row_geoms: Detected row geometries; only ``content`` rows are counted.
        dewarped_bgr: Dewarped page image (BGR ndarray, as produced upstream).
        engine: Requested OCR engine name, forwarded to the streaming grid builder.
        pronunciation: Pronunciation variant for the phonetic fix helpers.
        request: FastAPI request, polled for client disconnects between cells.

    Yields:
        SSE ``data:`` frames in order: meta, preparing, columns (once, on the
        first cell), one frame per cell, and finally complete.
    """
    t0 = time.time()

    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    # Region/row types that never contribute OCR cells.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    # A vocab layout is assumed as soon as an EN or DE column is present.
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    columns_meta = None
    total_cells = n_content_rows * n_cols

    # Announce grid shape up front so the client can render a skeleton.
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"

    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0
    # NOTE(review): last_keepalive is assigned but never read below —
    # presumably a leftover from a keepalive mechanism; confirm and remove.
    last_keepalive = time.time()

    # build_cell_grid_v2_streaming is consumed with a plain ``for``, so each
    # step runs synchronously on the event loop until the next cell is ready.
    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        # Emit column metadata exactly once, alongside the first cell.
        if columns_meta is None:
            columns_meta = cols_meta
            meta_update = {"type": "columns", "columns_used": cols_meta}
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done
    duration = time.time() - t0
    if columns_meta is None:
        # No cells were produced at all, so the columns event never fired.
        columns_meta = []

    # Remove all-empty rows: keep only cells whose row has at least one
    # cell with non-whitespace text.
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    # Cells removed divided by column count gives whole rows dropped;
    # max(..., 1) guards against division by zero when n_cols == 0.
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")

    # Engine actually used is taken from the first cell; fall back to the
    # requested engine when nothing was recognized.
    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Phonetic fixes are applied only now — the cells already streamed to the
    # client above were raw; only the persisted word_result carries the fixes.
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    word_result = {
        "cells": all_cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(all_cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            # confidence == 0 is treated as "no reading", not low confidence
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        # "entries" duplicates "vocab_entries" — both keys are persisted.
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|