[split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
393
klausur-service/backend/ocr_pipeline_words_detect.py
Normal file
393
klausur-service/backend/ocr_pipeline_words_detect.py
Normal file
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
OCR Pipeline Words Detect — main word detection endpoint (Step 7).
|
||||
|
||||
Extracted from ocr_pipeline_words.py. Contains the ``detect_words``
|
||||
endpoint which handles both v2 and words_first grid methods.
|
||||
|
||||
Lizenz: Apache 2.0
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
_fix_phonetic_brackets,
|
||||
fix_cell_phonetics,
|
||||
build_cell_grid_v2,
|
||||
create_ocr_image,
|
||||
detect_column_geometry,
|
||||
)
|
||||
from cv_words_first import build_grid_from_words
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
update_session_db,
|
||||
)
|
||||
from ocr_pipeline_common import (
|
||||
_cache,
|
||||
_load_session_to_cache,
|
||||
_get_cached,
|
||||
_append_pipeline_log,
|
||||
)
|
||||
from ocr_pipeline_words_stream import (
|
||||
_word_batch_stream_generator,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Word Detection Endpoint (Step 7)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/words")
|
||||
async def detect_words(
|
||||
session_id: str,
|
||||
request: Request,
|
||||
engine: str = "auto",
|
||||
pronunciation: str = "british",
|
||||
stream: bool = False,
|
||||
skip_heal_gaps: bool = False,
|
||||
grid_method: str = "v2",
|
||||
):
|
||||
"""Build word grid from columns x rows, OCR each cell.
|
||||
|
||||
Query params:
|
||||
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
|
||||
pronunciation: 'british' (default) or 'american'
|
||||
stream: false (default) for JSON response, true for SSE streaming
|
||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry.
|
||||
grid_method: 'v2' (default) or 'words_first'
|
||||
"""
|
||||
# PaddleOCR is full-page remote OCR -> force words_first grid method
|
||||
if engine == "paddle" and grid_method != "words_first":
|
||||
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
|
||||
grid_method = "words_first"
|
||||
|
||||
if session_id not in _cache:
|
||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||
await _load_session_to_cache(session_id)
|
||||
cached = _get_cached(session_id)
|
||||
|
||||
dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
||||
if dewarped_bgr is None:
|
||||
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
|
||||
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
|
||||
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
|
||||
|
||||
session = await get_session_db(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
||||
|
||||
column_result = session.get("column_result")
|
||||
row_result = session.get("row_result")
|
||||
if not column_result or not column_result.get("columns"):
|
||||
img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
|
||||
column_result = {
|
||||
"columns": [{
|
||||
"type": "column_text",
|
||||
"x": 0, "y": 0,
|
||||
"width": img_w_tmp, "height": img_h_tmp,
|
||||
"classification_confidence": 1.0,
|
||||
"classification_method": "full_page_fallback",
|
||||
}],
|
||||
"zones": [],
|
||||
"duration_seconds": 0,
|
||||
}
|
||||
logger.info("detect_words: no column_result -- using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
||||
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||
|
||||
# Convert column dicts back to PageRegion objects
|
||||
col_regions = [
|
||||
PageRegion(
|
||||
type=c["type"],
|
||||
x=c["x"], y=c["y"],
|
||||
width=c["width"], height=c["height"],
|
||||
classification_confidence=c.get("classification_confidence", 1.0),
|
||||
classification_method=c.get("classification_method", ""),
|
||||
)
|
||||
for c in column_result["columns"]
|
||||
]
|
||||
|
||||
# Convert row dicts back to RowGeometry objects
|
||||
row_geoms = [
|
||||
RowGeometry(
|
||||
index=r["index"],
|
||||
x=r["x"], y=r["y"],
|
||||
width=r["width"], height=r["height"],
|
||||
word_count=r.get("word_count", 0),
|
||||
words=[],
|
||||
row_type=r.get("row_type", "content"),
|
||||
gap_before=r.get("gap_before", 0),
|
||||
)
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Populate word counts from cached words
|
||||
word_dicts = cached.get("_word_dicts")
|
||||
if word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if word_dicts:
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
_lx, _rx, top_y, _by = content_bounds
|
||||
else:
|
||||
top_y = min(r.y for r in row_geoms) if row_geoms else 0
|
||||
|
||||
for row in row_geoms:
|
||||
row_y_rel = row.y - top_y
|
||||
row_bottom_rel = row_y_rel + row.height
|
||||
row.words = [
|
||||
w for w in word_dicts
|
||||
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
||||
]
|
||||
row.word_count = len(row.words)
|
||||
|
||||
# Exclude rows that fall within box zones
|
||||
zones = column_result.get("zones") or []
|
||||
box_ranges_inner = []
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
bt = max(box.get("border_thickness", 0), 5)
|
||||
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
|
||||
|
||||
if box_ranges_inner:
|
||||
def _row_in_box(r):
|
||||
center_y = r.y + r.height / 2
|
||||
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
|
||||
|
||||
before_count = len(row_geoms)
|
||||
row_geoms = [r for r in row_geoms if not _row_in_box(r)]
|
||||
excluded = before_count - len(row_geoms)
|
||||
if excluded:
|
||||
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
||||
|
||||
# --- Words-First path ---
|
||||
if grid_method == "words_first":
|
||||
return await _words_first_path(
|
||||
session_id, cached, dewarped_bgr, engine, pronunciation, zones,
|
||||
)
|
||||
|
||||
if stream:
|
||||
return StreamingResponse(
|
||||
_word_batch_stream_generator(
|
||||
session_id, cached, col_regions, row_geoms,
|
||||
dewarped_bgr, engine, pronunciation, request,
|
||||
skip_heal_gaps=skip_heal_gaps,
|
||||
),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no",
|
||||
},
|
||||
)
|
||||
|
||||
# --- Non-streaming path (grid_method=v2) ---
|
||||
return await _v2_path(
|
||||
session_id, cached, col_regions, row_geoms,
|
||||
dewarped_bgr, engine, pronunciation, skip_heal_gaps,
|
||||
)
|
||||
|
||||
|
||||
async def _words_first_path(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: list,
) -> dict:
    """Words-first grid construction path.

    Builds the cell grid directly from detected word boxes (skipping the
    row/column grid), applies phonetic fixes, persists the result to the
    session DB and in-process cache, and returns the word_result payload.
    """
    started = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    if engine == "paddle":
        # Full-page remote OCR: words come straight from PaddleOCR.
        from cv_ocr_engines import ocr_region_paddle
        words = await ocr_region_paddle(dewarped_bgr, region=None)
        cached["_paddle_word_dicts"] = words
    else:
        words = cached.get("_word_dicts")
        if words is None:
            # Lazily run local word-geometry detection and memoize results.
            geometry = detect_column_geometry(create_ocr_image(dewarped_bgr), dewarped_bgr)
            if geometry is not None:
                _geoms, left_x, right_x, top_y, bottom_y, words, inv = geometry
                cached["_word_dicts"] = words
                cached["_inv"] = inv
                cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if not words:
        raise HTTPException(status_code=400, detail="No words detected -- cannot build words-first grid")

    # Local word coordinates are relative to the content bounds; shift them
    # to absolute page coordinates (paddle already returns absolute ones).
    if engine != "paddle":
        bounds = cached.get("_content_bounds")
        if bounds:
            lx, _rx, ty, _by = bounds
            words = [{**w, 'left': w['left'] + lx, 'top': w['top'] + ty} for w in words]

    box_rects = [z["box"] for z in zones if z.get("zone_type") == "box" and z.get("box")]

    cells, columns_meta = build_grid_from_words(
        words, img_w, img_h, box_rects=box_rects or None,
    )
    duration = time.time() - started

    fix_cell_phonetics(cells, pronunciation=pronunciation)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    n_rows = len({c['row_index'] for c in cells}) if cells else 0
    n_cols = len(columns_meta)
    used_engine = "paddle" if engine == "paddle" else "words_first"

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "grid_method": "words_first",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Vocab layouts (and plain text columns) additionally get entry lists.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        summary = word_result["summary"]
        summary["total_entries"] = len(entries)
        summary["with_english"] = sum(1 for e in entries if e.get("english"))
        summary["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words-first session {session_id}: "
                f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

    await _append_pipeline_log(session_id, "words", {
        "grid_method": "words_first",
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
async def _v2_path(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    skip_heal_gaps: bool,
) -> dict:
    """Cell-First OCR v2 non-streaming path.

    Builds the cell grid from column regions x row geometries, OCRs every
    cell, persists the result to the session DB and in-process cache, and
    returns the word_result payload.
    """
    started = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - started

    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    content_rows = sum(1 for r in row_geoms if r.row_type == 'content')
    n_cols = len(columns_meta)
    # Report whichever engine actually produced the first cell.
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    summary = {
        "total_cells": len(cells),
        "non_empty_cells": sum(1 for c in cells if c.get("text")),
        "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
    }
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": summary,
    }

    # Vocab layouts (and plain text columns) additionally get entry lists.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        summary["total_entries"] = len(entries)
        summary["with_english"] = sum(1 for e in entries if e.get("english"))
        summary["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||||
Reference in New Issue
Block a user