feat: cell-first OCR + document type detection + dynamic pipeline steps
Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation, eliminating neighbour bleeding (e.g. "to", "ps" in marker columns). Uses ThreadPoolExecutor for parallel Tesseract calls. Document type detection: Classifies pages as vocab_table, full_text, or generic_table using projection profiles (<2s, no OCR needed). Frontend dynamically skips columns/rows steps for full-text pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from pydantic import BaseModel
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
OLLAMA_REVIEW_MODEL,
|
||||
DocumentTypeResult,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
@@ -43,6 +44,8 @@ from cv_vocab_pipeline import (
|
||||
analyze_layout_by_words,
|
||||
build_cell_grid,
|
||||
build_cell_grid_streaming,
|
||||
build_cell_grid_v2,
|
||||
build_cell_grid_v2_streaming,
|
||||
build_word_grid,
|
||||
classify_column_types,
|
||||
create_layout_image,
|
||||
@@ -50,6 +53,7 @@ from cv_vocab_pipeline import (
|
||||
deskew_image,
|
||||
deskew_image_by_word_alignment,
|
||||
detect_column_geometry,
|
||||
detect_document_type,
|
||||
detect_row_geometry,
|
||||
expand_narrow_columns,
|
||||
_apply_shear,
|
||||
@@ -759,6 +763,54 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
|
||||
return {"session_id": session_id, "ground_truth": gt}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document Type Detection (between Dewarp and Columns)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/detect-type")
|
||||
async def detect_type(session_id: str):
|
||||
"""Detect document type (vocab_table, full_text, generic_table).
|
||||
|
||||
Should be called after dewarp (clean image available).
|
||||
Stores result in session for frontend to decide pipeline flow.
|
||||
"""
|
||||
if session_id not in _cache:
|
||||
await _load_session_to_cache(session_id)
|
||||
cached = _get_cached(session_id)
|
||||
|
||||
dewarped_bgr = cached.get("dewarped_bgr")
|
||||
if dewarped_bgr is None:
|
||||
raise HTTPException(status_code=400, detail="Dewarp must be completed first")
|
||||
|
||||
t0 = time.time()
|
||||
ocr_img = create_ocr_image(dewarped_bgr)
|
||||
result = detect_document_type(ocr_img, dewarped_bgr)
|
||||
duration = time.time() - t0
|
||||
|
||||
result_dict = {
|
||||
"doc_type": result.doc_type,
|
||||
"confidence": result.confidence,
|
||||
"pipeline": result.pipeline,
|
||||
"skip_steps": result.skip_steps,
|
||||
"features": result.features,
|
||||
"duration_seconds": round(duration, 2),
|
||||
}
|
||||
|
||||
# Persist to DB
|
||||
await update_session_db(
|
||||
session_id,
|
||||
doc_type=result.doc_type,
|
||||
doc_type_result=result_dict,
|
||||
)
|
||||
|
||||
cached["doc_type_result"] = result_dict
|
||||
|
||||
logger.info(f"OCR Pipeline: detect-type session {session_id}: "
|
||||
f"{result.doc_type} (confidence={result.confidence}, {duration:.2f}s)")
|
||||
|
||||
return {"session_id": session_id, **result_dict}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Column Detection Endpoints (Step 3)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1196,8 +1248,10 @@ async def detect_words(
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Re-populate row.words from cached full-page Tesseract words.
|
||||
# Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
|
||||
# Cell-First OCR (v2): no full-page word re-population needed.
|
||||
# Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
|
||||
# We still need word_count > 0 for row filtering in build_cell_grid_v2,
|
||||
# so populate from cached words if available (just for counting).
|
||||
word_dicts = cached.get("_word_dicts")
|
||||
if word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
@@ -1209,8 +1263,6 @@ async def detect_words(
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if word_dicts:
|
||||
# words['top'] is relative to content-ROI top_y.
|
||||
# row.y is absolute. Convert: row_y_rel = row.y - top_y.
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
_lx, _rx, top_y, _by = content_bounds
|
||||
@@ -1240,15 +1292,15 @@ async def detect_words(
|
||||
},
|
||||
)
|
||||
|
||||
# --- Non-streaming path (unchanged) ---
|
||||
# --- Non-streaming path ---
|
||||
t0 = time.time()
|
||||
|
||||
# Create binarized OCR image (for Tesseract)
|
||||
ocr_img = create_ocr_image(dewarped_bgr)
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
# Build generic cell grid
|
||||
cells, columns_meta = build_cell_grid(
|
||||
# Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
|
||||
cells, columns_meta = build_cell_grid_v2(
|
||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||
)
|
||||
@@ -1358,7 +1410,7 @@ async def _word_stream_generator(
|
||||
all_cells: List[Dict[str, Any]] = []
|
||||
cell_idx = 0
|
||||
|
||||
for cell, cols_meta, total in build_cell_grid_streaming(
|
||||
for cell, cols_meta, total in build_cell_grid_v2_streaming(
|
||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user