feat(ocr-pipeline): generic cell-grid with optional vocab mapping
Extract build_cell_grid() from build_word_grid() as a layout-agnostic foundation. Step 5 now produces a generic cell grid (columns x rows) and auto-detects whether a vocab layout is present. The frontend dynamically switches between the vocab table (EN/DE/Example) and a generic cell table based on the layout type. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -31,8 +31,14 @@ from pydantic import BaseModel
|
||||
from cv_vocab_pipeline import (
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
_fix_character_confusion,
|
||||
_fix_phonetic_brackets,
|
||||
_split_comma_entries,
|
||||
_attach_example_sentences,
|
||||
analyze_layout,
|
||||
analyze_layout_by_words,
|
||||
build_cell_grid,
|
||||
build_word_grid,
|
||||
classify_column_types,
|
||||
create_layout_image,
|
||||
@@ -1075,35 +1081,60 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
|
||||
entries = build_word_grid(
|
||||
# Build generic cell grid
|
||||
cells, columns_meta = build_cell_grid(
|
||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||
pronunciation=pronunciation,
|
||||
)
|
||||
duration = time.time() - t0
|
||||
|
||||
# Build summary
|
||||
summary = {
|
||||
"total_entries": len(entries),
|
||||
"with_english": sum(1 for e in entries if e.get("english")),
|
||||
"with_german": sum(1 for e in entries if e.get("german")),
|
||||
"low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
|
||||
}
|
||||
# Layout detection
|
||||
col_types = {c['type'] for c in columns_meta}
|
||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||
|
||||
# Count content rows and columns for grid_shape
|
||||
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
||||
n_cols = len(columns_meta)
|
||||
|
||||
# Determine which engine was actually used
|
||||
used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
|
||||
used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine
|
||||
|
||||
# Grid result (always generic)
|
||||
word_result = {
|
||||
"entries": entries,
|
||||
"entry_count": len(entries),
|
||||
"cells": cells,
|
||||
"grid_shape": {
|
||||
"rows": n_content_rows,
|
||||
"cols": n_cols,
|
||||
"total_cells": len(cells),
|
||||
},
|
||||
"columns_used": columns_meta,
|
||||
"layout": "vocab" if is_vocab else "generic",
|
||||
"image_width": img_w,
|
||||
"image_height": img_h,
|
||||
"duration_seconds": round(duration, 2),
|
||||
"summary": summary,
|
||||
"ocr_engine": used_engine,
|
||||
"summary": {
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
||||
},
|
||||
}
|
||||
|
||||
# For vocab layout: add post-processed vocab_entries (backwards compat)
|
||||
if is_vocab:
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
entries = _fix_character_confusion(entries)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||
entries = _split_comma_entries(entries)
|
||||
entries = _attach_example_sentences(entries)
|
||||
word_result["vocab_entries"] = entries
|
||||
# Also keep "entries" key for backwards compatibility
|
||||
word_result["entries"] = entries
|
||||
word_result["entry_count"] = len(entries)
|
||||
word_result["summary"]["total_entries"] = len(entries)
|
||||
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
|
||||
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
|
||||
|
||||
# Persist to DB
|
||||
await update_session_db(
|
||||
session_id,
|
||||
@@ -1114,7 +1145,8 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
|
||||
cached["word_result"] = word_result
|
||||
|
||||
logger.info(f"OCR Pipeline: words session {session_id}: "
|
||||
f"{len(entries)} entries ({duration:.2f}s), summary: {summary}")
|
||||
f"layout={word_result['layout']}, "
|
||||
f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
@@ -1232,17 +1264,19 @@ async def _get_rows_overlay(session_id: str) -> Response:
|
||||
|
||||
|
||||
async def _get_words_overlay(session_id: str) -> Response:
|
||||
"""Generate dewarped image with word grid cells drawn on it."""
|
||||
"""Generate dewarped image with cell grid drawn on it."""
|
||||
session = await get_session_db(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
||||
|
||||
word_result = session.get("word_result")
|
||||
if not word_result or not word_result.get("entries"):
|
||||
if not word_result:
|
||||
raise HTTPException(status_code=404, detail="No word data available")
|
||||
|
||||
column_result = session.get("column_result")
|
||||
row_result = session.get("row_result")
|
||||
# Support both new cell-based and legacy entry-based formats
|
||||
cells = word_result.get("cells")
|
||||
if not cells and not word_result.get("entries"):
|
||||
raise HTTPException(status_code=404, detail="No word data available")
|
||||
|
||||
# Load dewarped image
|
||||
dewarped_png = await get_session_image(session_id, "dewarped")
|
||||
@@ -1256,80 +1290,105 @@ async def _get_words_overlay(session_id: str) -> Response:
|
||||
|
||||
img_h, img_w = img.shape[:2]
|
||||
|
||||
# Color map for column types (BGR)
|
||||
col_colors = {
|
||||
"column_en": (255, 180, 0), # Blue
|
||||
"column_de": (0, 200, 0), # Green
|
||||
"column_example": (0, 140, 255), # Orange
|
||||
}
|
||||
|
||||
overlay = img.copy()
|
||||
|
||||
# Build grid from column_result × row_result (the actual cells)
|
||||
columns = []
|
||||
if column_result and column_result.get("columns"):
|
||||
columns = [c for c in column_result["columns"]
|
||||
if c.get("type", "").startswith("column_")]
|
||||
if cells:
|
||||
# New cell-based overlay: color by column index
|
||||
col_palette = [
|
||||
(255, 180, 0), # Blue (BGR)
|
||||
(0, 200, 0), # Green
|
||||
(0, 140, 255), # Orange
|
||||
(200, 100, 200), # Purple
|
||||
(200, 200, 0), # Cyan
|
||||
(100, 200, 200), # Yellow-ish
|
||||
]
|
||||
|
||||
content_rows_data = []
|
||||
if row_result and row_result.get("rows"):
|
||||
content_rows_data = [r for r in row_result["rows"]
|
||||
if r.get("row_type") == "content"]
|
||||
for cell in cells:
|
||||
bbox = cell.get("bbox_px", {})
|
||||
cx = bbox.get("x", 0)
|
||||
cy = bbox.get("y", 0)
|
||||
cw = bbox.get("w", 0)
|
||||
ch = bbox.get("h", 0)
|
||||
if cw <= 0 or ch <= 0:
|
||||
continue
|
||||
|
||||
# Draw grid: column × row cells
|
||||
for col in columns:
|
||||
col_type = col.get("type", "")
|
||||
color = col_colors.get(col_type, (200, 200, 200))
|
||||
cx, cw = col["x"], col["width"]
|
||||
col_idx = cell.get("col_index", 0)
|
||||
color = col_palette[col_idx % len(col_palette)]
|
||||
|
||||
for row in content_rows_data:
|
||||
ry, rh = row["y"], row["height"]
|
||||
# Cell rectangle (exact grid intersection, no padding)
|
||||
cv2.rectangle(img, (cx, ry), (cx + cw, ry + rh), color, 1)
|
||||
# Cell rectangle border
|
||||
cv2.rectangle(img, (cx, cy), (cx + cw, cy + ch), color, 1)
|
||||
# Semi-transparent fill
|
||||
cv2.rectangle(overlay, (cx, ry), (cx + cw, ry + rh), color, -1)
|
||||
cv2.rectangle(overlay, (cx, cy), (cx + cw, cy + ch), color, -1)
|
||||
|
||||
# Place OCR text labels inside grid cells
|
||||
# Build lookup: row_index → entry for fast access
|
||||
entries = word_result["entries"]
|
||||
entry_by_row: Dict[int, Dict] = {}
|
||||
for entry in entries:
|
||||
entry_by_row[entry.get("row_index", -1)] = entry
|
||||
# Cell-ID label (top-left corner)
|
||||
cell_id = cell.get("cell_id", "")
|
||||
cv2.putText(img, cell_id, (cx + 2, cy + 10),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.28, color, 1)
|
||||
|
||||
for row_idx, row in enumerate(content_rows_data):
|
||||
entry = entry_by_row.get(row_idx)
|
||||
if not entry:
|
||||
continue
|
||||
# Text label (bottom of cell)
|
||||
text = cell.get("text", "")
|
||||
if text:
|
||||
conf = cell.get("confidence", 0)
|
||||
if conf >= 70:
|
||||
text_color = (0, 180, 0)
|
||||
elif conf >= 50:
|
||||
text_color = (0, 180, 220)
|
||||
else:
|
||||
text_color = (0, 0, 220)
|
||||
|
||||
conf = entry.get("confidence", 0)
|
||||
if conf >= 70:
|
||||
text_color = (0, 180, 0)
|
||||
elif conf >= 50:
|
||||
text_color = (0, 180, 220)
|
||||
else:
|
||||
text_color = (0, 0, 220)
|
||||
label = text.replace('\n', ' ')[:30]
|
||||
cv2.putText(img, label, (cx + 3, cy + ch - 4),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.35, text_color, 1)
|
||||
else:
|
||||
# Legacy fallback: entry-based overlay (for old sessions)
|
||||
column_result = session.get("column_result")
|
||||
row_result = session.get("row_result")
|
||||
col_colors = {
|
||||
"column_en": (255, 180, 0),
|
||||
"column_de": (0, 200, 0),
|
||||
"column_example": (0, 140, 255),
|
||||
}
|
||||
|
||||
ry, rh = row["y"], row["height"]
|
||||
columns = []
|
||||
if column_result and column_result.get("columns"):
|
||||
columns = [c for c in column_result["columns"]
|
||||
if c.get("type", "").startswith("column_")]
|
||||
|
||||
content_rows_data = []
|
||||
if row_result and row_result.get("rows"):
|
||||
content_rows_data = [r for r in row_result["rows"]
|
||||
if r.get("row_type") == "content"]
|
||||
|
||||
for col in columns:
|
||||
col_type = col.get("type", "")
|
||||
color = col_colors.get(col_type, (200, 200, 200))
|
||||
cx, cw = col["x"], col["width"]
|
||||
for row in content_rows_data:
|
||||
ry, rh = row["y"], row["height"]
|
||||
cv2.rectangle(img, (cx, ry), (cx + cw, ry + rh), color, 1)
|
||||
cv2.rectangle(overlay, (cx, ry), (cx + cw, ry + rh), color, -1)
|
||||
|
||||
# Pick the right text field for this column
|
||||
if col_type == "column_en":
|
||||
text = entry.get("english", "")
|
||||
elif col_type == "column_de":
|
||||
text = entry.get("german", "")
|
||||
elif col_type == "column_example":
|
||||
text = entry.get("example", "")
|
||||
else:
|
||||
text = ""
|
||||
entries = word_result["entries"]
|
||||
entry_by_row: Dict[int, Dict] = {}
|
||||
for entry in entries:
|
||||
entry_by_row[entry.get("row_index", -1)] = entry
|
||||
|
||||
if text:
|
||||
label = text.replace('\n', ' ')[:30]
|
||||
font_scale = 0.35
|
||||
cv2.putText(img, label, (cx + 3, ry + rh - 4),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
|
||||
for row_idx, row in enumerate(content_rows_data):
|
||||
entry = entry_by_row.get(row_idx)
|
||||
if not entry:
|
||||
continue
|
||||
conf = entry.get("confidence", 0)
|
||||
text_color = (0, 180, 0) if conf >= 70 else (0, 180, 220) if conf >= 50 else (0, 0, 220)
|
||||
ry, rh = row["y"], row["height"]
|
||||
for col in columns:
|
||||
col_type = col.get("type", "")
|
||||
cx, cw = col["x"], col["width"]
|
||||
field = {"column_en": "english", "column_de": "german", "column_example": "example"}.get(col_type, "")
|
||||
text = entry.get(field, "") if field else ""
|
||||
if text:
|
||||
label = text.replace('\n', ' ')[:30]
|
||||
cv2.putText(img, label, (cx + 3, ry + rh - 4),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.35, text_color, 1)
|
||||
|
||||
# Blend overlay at 10% opacity
|
||||
cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)
|
||||
|
||||
Reference in New Issue
Block a user