backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
394 lines
14 KiB
Python
394 lines
14 KiB
Python
"""
|
|
OCR Pipeline Words Detect — main word detection endpoint (Step 7).
|
|
|
|
Extracted from ocr_pipeline_words.py. Contains the ``detect_words``
|
|
endpoint which handles both v2 and words_first grid methods.
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import json  # NOTE(review): appears unused in this module — confirm before removing
import logging
import time
from typing import Any, Dict, List

import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse

# Core CV pipeline primitives: region/row geometry types plus grid, image
# preparation and phonetics helpers.
from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    create_ocr_image,
    detect_column_geometry,
)
from cv_words_first import build_grid_from_words
# Session persistence (DB-backed) vs. in-process cache helpers below.
from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _append_pipeline_log,
)
from ocr_pipeline_words_stream import (
    _word_batch_stream_generator,
)

logger = logging.getLogger(__name__)

# All routes in this module mount under the shared OCR-pipeline API prefix.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------
|
|
|
|
@router.post("/sessions/{session_id}/words")
|
|
async def detect_words(
|
|
session_id: str,
|
|
request: Request,
|
|
engine: str = "auto",
|
|
pronunciation: str = "british",
|
|
stream: bool = False,
|
|
skip_heal_gaps: bool = False,
|
|
grid_method: str = "v2",
|
|
):
|
|
"""Build word grid from columns x rows, OCR each cell.
|
|
|
|
Query params:
|
|
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
|
|
pronunciation: 'british' (default) or 'american'
|
|
stream: false (default) for JSON response, true for SSE streaming
|
|
skip_heal_gaps: false (default). When true, cells keep exact row geometry.
|
|
grid_method: 'v2' (default) or 'words_first'
|
|
"""
|
|
# PaddleOCR is full-page remote OCR -> force words_first grid method
|
|
if engine == "paddle" and grid_method != "words_first":
|
|
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
|
|
grid_method = "words_first"
|
|
|
|
if session_id not in _cache:
|
|
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
|
await _load_session_to_cache(session_id)
|
|
cached = _get_cached(session_id)
|
|
|
|
dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
|
if dewarped_bgr is None:
|
|
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
|
|
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
|
|
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
|
|
|
|
session = await get_session_db(session_id)
|
|
if not session:
|
|
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
|
|
|
column_result = session.get("column_result")
|
|
row_result = session.get("row_result")
|
|
if not column_result or not column_result.get("columns"):
|
|
img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
|
|
column_result = {
|
|
"columns": [{
|
|
"type": "column_text",
|
|
"x": 0, "y": 0,
|
|
"width": img_w_tmp, "height": img_h_tmp,
|
|
"classification_confidence": 1.0,
|
|
"classification_method": "full_page_fallback",
|
|
}],
|
|
"zones": [],
|
|
"duration_seconds": 0,
|
|
}
|
|
logger.info("detect_words: no column_result -- using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
|
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
|
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
|
|
|
# Convert column dicts back to PageRegion objects
|
|
col_regions = [
|
|
PageRegion(
|
|
type=c["type"],
|
|
x=c["x"], y=c["y"],
|
|
width=c["width"], height=c["height"],
|
|
classification_confidence=c.get("classification_confidence", 1.0),
|
|
classification_method=c.get("classification_method", ""),
|
|
)
|
|
for c in column_result["columns"]
|
|
]
|
|
|
|
# Convert row dicts back to RowGeometry objects
|
|
row_geoms = [
|
|
RowGeometry(
|
|
index=r["index"],
|
|
x=r["x"], y=r["y"],
|
|
width=r["width"], height=r["height"],
|
|
word_count=r.get("word_count", 0),
|
|
words=[],
|
|
row_type=r.get("row_type", "content"),
|
|
gap_before=r.get("gap_before", 0),
|
|
)
|
|
for r in row_result["rows"]
|
|
]
|
|
|
|
# Populate word counts from cached words
|
|
word_dicts = cached.get("_word_dicts")
|
|
if word_dicts is None:
|
|
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
|
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
|
if geo_result is not None:
|
|
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
|
cached["_word_dicts"] = word_dicts
|
|
cached["_inv"] = inv
|
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
|
|
|
if word_dicts:
|
|
content_bounds = cached.get("_content_bounds")
|
|
if content_bounds:
|
|
_lx, _rx, top_y, _by = content_bounds
|
|
else:
|
|
top_y = min(r.y for r in row_geoms) if row_geoms else 0
|
|
|
|
for row in row_geoms:
|
|
row_y_rel = row.y - top_y
|
|
row_bottom_rel = row_y_rel + row.height
|
|
row.words = [
|
|
w for w in word_dicts
|
|
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
|
]
|
|
row.word_count = len(row.words)
|
|
|
|
# Exclude rows that fall within box zones
|
|
zones = column_result.get("zones") or []
|
|
box_ranges_inner = []
|
|
for zone in zones:
|
|
if zone.get("zone_type") == "box" and zone.get("box"):
|
|
box = zone["box"]
|
|
bt = max(box.get("border_thickness", 0), 5)
|
|
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
|
|
|
|
if box_ranges_inner:
|
|
def _row_in_box(r):
|
|
center_y = r.y + r.height / 2
|
|
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
|
|
|
|
before_count = len(row_geoms)
|
|
row_geoms = [r for r in row_geoms if not _row_in_box(r)]
|
|
excluded = before_count - len(row_geoms)
|
|
if excluded:
|
|
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
|
|
|
# --- Words-First path ---
|
|
if grid_method == "words_first":
|
|
return await _words_first_path(
|
|
session_id, cached, dewarped_bgr, engine, pronunciation, zones,
|
|
)
|
|
|
|
if stream:
|
|
return StreamingResponse(
|
|
_word_batch_stream_generator(
|
|
session_id, cached, col_regions, row_geoms,
|
|
dewarped_bgr, engine, pronunciation, request,
|
|
skip_heal_gaps=skip_heal_gaps,
|
|
),
|
|
media_type="text/event-stream",
|
|
headers={
|
|
"Cache-Control": "no-cache",
|
|
"Connection": "keep-alive",
|
|
"X-Accel-Buffering": "no",
|
|
},
|
|
)
|
|
|
|
# --- Non-streaming path (grid_method=v2) ---
|
|
return await _v2_path(
|
|
session_id, cached, col_regions, row_geoms,
|
|
dewarped_bgr, engine, pronunciation, skip_heal_gaps,
|
|
)
|
|
|
|
|
|
async def _words_first_path(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: list,
) -> dict:
    """Words-first grid construction path.

    Obtains word boxes (remote PaddleOCR for ``engine == 'paddle'``, otherwise
    local detection with caching), builds a grid directly from word positions
    via ``build_grid_from_words``, persists the result to the session DB and
    cache, and returns the response payload.

    Args:
        session_id: Pipeline session identifier.
        cached: In-process session cache; mutated (word dicts, bounds,
            and the final ``word_result`` are stored back into it).
        dewarped_bgr: Cropped-or-dewarped page image (BGR).
        engine: OCR engine name; 'paddle' triggers full-page remote OCR.
        pronunciation: 'british' or 'american' (phonetics normalisation).
        zones: Zone dicts from column detection; 'box' zones constrain
            the grid builder.

    Returns:
        Dict with ``session_id`` plus the full word result payload.

    Raises:
        HTTPException: 400 when no words could be detected.
    """
    t0 = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    if engine == "paddle":
        # Full-page remote OCR already yields absolute word coordinates.
        from cv_ocr_engines import ocr_region_paddle
        wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
        cached["_paddle_word_dicts"] = wf_word_dicts
    else:
        # Reuse locally detected words if a prior step cached them.
        wf_word_dicts = cached.get("_word_dicts")
        if wf_word_dicts is None:
            ocr_img_tmp = create_ocr_image(dewarped_bgr)
            geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
            if geo_result is not None:
                _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                cached["_word_dicts"] = wf_word_dicts
                cached["_inv"] = inv
                cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if not wf_word_dicts:
        raise HTTPException(status_code=400, detail="No words detected -- cannot build words-first grid")

    # Convert word coordinates to absolute if needed
    # (local detection reports positions relative to the content bounds).
    if engine != "paddle":
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            lx, _rx, ty, _by = content_bounds
            abs_words = []
            for w in wf_word_dicts:
                abs_words.append({**w, 'left': w['left'] + lx, 'top': w['top'] + ty})
            wf_word_dicts = abs_words

    # Box zones are passed through so the grid builder can respect them.
    box_rects = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box_rects.append(zone["box"])

    cells, columns_meta = build_grid_from_words(
        wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
    )
    duration = time.time() - t0

    fix_cell_phonetics(cells, pronunciation=pronunciation)
    # Default every cell to zone 0 unless the grid builder set one.
    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    # A vocab layout is assumed as soon as an EN or DE column was classified.
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    used_engine = "paddle" if engine == "paddle" else "words_first"

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "grid_method": "words_first",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Vocab entries are also produced for plain text columns so the client
    # always receives an entries list when any usable column exists.
    if is_vocab or 'column_text' in col_types:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    # Persist to DB (advancing to step 8) and mirror into the session cache.
    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words-first session {session_id}: "
                f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

    await _append_pipeline_log(session_id, "words", {
        "grid_method": "words_first",
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
|
|
|
|
|
async def _v2_path(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    skip_heal_gaps: bool,
) -> dict:
    """Cell-First OCR v2 non-streaming path.

    Builds the cell grid via ``build_cell_grid_v2``, normalises phonetics,
    persists the result to the session DB/cache and returns the payload.
    """
    started = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    elapsed = time.time() - started

    # Default every cell to zone 0 unless the grid builder set one.
    for cell in cells:
        cell.setdefault("zone_index", 0)

    column_types = {meta['type'] for meta in columns_meta}
    vocab_layout = bool(column_types & {'column_en', 'column_de'})
    content_row_count = sum(1 for geom in row_geoms if geom.row_type == 'content')
    column_count = len(columns_meta)
    # The grid builder records the engine it actually used on each cell.
    effective_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": content_row_count, "cols": column_count, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if vocab_layout else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(elapsed, 2),
        "ocr_engine": effective_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Entries are produced for vocab layouts and for plain text columns.
    if vocab_layout or 'column_text' in column_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        summary = word_result["summary"]
        summary["total_entries"] = len(entries)
        summary["with_english"] = sum(1 for e in entries if e.get("english"))
        summary["with_german"] = sum(1 for e in entries if e.get("german"))

    # Persist to DB (advancing to step 8) and mirror into the session cache.
    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({elapsed:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": effective_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(elapsed * 1000))

    return {"session_id": session_id, **word_result}
|