feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation,
eliminating neighbour bleeding (e.g. "to", "ps" in marker columns).
Uses ThreadPoolExecutor for parallel Tesseract calls.

Document type detection: Classifies pages as vocab_table, full_text,
or generic_table using projection profiles (<2s, no OCR needed).
Frontend dynamically skips columns/rows steps for full-text pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions

View File

@@ -32,6 +32,7 @@ from pydantic import BaseModel
from cv_vocab_pipeline import (
OLLAMA_REVIEW_MODEL,
DocumentTypeResult,
PageRegion,
RowGeometry,
_cells_to_vocab_entries,
@@ -43,6 +44,8 @@ from cv_vocab_pipeline import (
analyze_layout_by_words,
build_cell_grid,
build_cell_grid_streaming,
build_cell_grid_v2,
build_cell_grid_v2_streaming,
build_word_grid,
classify_column_types,
create_layout_image,
@@ -50,6 +53,7 @@ from cv_vocab_pipeline import (
deskew_image,
deskew_image_by_word_alignment,
detect_column_geometry,
detect_document_type,
detect_row_geometry,
expand_narrow_columns,
_apply_shear,
@@ -759,6 +763,54 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
return {"session_id": session_id, "ground_truth": gt}
# ---------------------------------------------------------------------------
# Document Type Detection (between Dewarp and Columns)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/detect-type")
async def detect_type(session_id: str):
    """Classify the dewarped page and record the outcome for the session.

    The session is loaded into the in-memory cache on demand. The dewarped
    image is turned into an OCR image and both are handed to
    ``detect_document_type``. The resulting fields (doc_type, confidence,
    pipeline, skip_steps, features) plus the measured duration are written
    to the session DB, mirrored into the cache under ``doc_type_result``
    and returned together with the session id.

    Raises:
        HTTPException: status 400 when the cached session holds no
            ``dewarped_bgr`` image, i.e. dewarp has not been run yet.
    """
    # Lazily hydrate the cache so the endpoint also works for sessions
    # that are not currently held in memory.
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    session_state = _get_cached(session_id)

    page_bgr = session_state.get("dewarped_bgr")
    if page_bgr is None:
        raise HTTPException(status_code=400, detail="Dewarp must be completed first")

    # Time image preparation and classification together.
    started_at = time.time()
    result = detect_document_type(create_ocr_image(page_bgr), page_bgr)
    duration = time.time() - started_at

    payload = {
        "doc_type": result.doc_type,
        "confidence": result.confidence,
        "pipeline": result.pipeline,
        "skip_steps": result.skip_steps,
        "features": result.features,
        "duration_seconds": round(duration, 2),
    }

    # Write to the DB first, then mirror the same dict into the cache.
    await update_session_db(
        session_id,
        doc_type=result.doc_type,
        doc_type_result=payload,
    )
    session_state["doc_type_result"] = payload

    logger.info(f"OCR Pipeline: detect-type session {session_id}: "
                f"{result.doc_type} (confidence={result.confidence}, {duration:.2f}s)")

    response = {"session_id": session_id}
    response.update(payload)
    return response
# ---------------------------------------------------------------------------
# Column Detection Endpoints (Step 3)
# ---------------------------------------------------------------------------
@@ -1196,8 +1248,10 @@ async def detect_words(
for r in row_result["rows"]
]
# Re-populate row.words from cached full-page Tesseract words.
# Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
# Cell-First OCR (v2): cell text no longer comes from full-page words.
# Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
# build_cell_grid_v2 still filters rows on word_count > 0, so the cached
# words (if available) are attached to the rows purely for counting.
word_dicts = cached.get("_word_dicts")
if word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
@@ -1209,8 +1263,6 @@ async def detect_words(
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if word_dicts:
# words['top'] is relative to content-ROI top_y.
# row.y is absolute. Convert: row_y_rel = row.y - top_y.
content_bounds = cached.get("_content_bounds")
if content_bounds:
_lx, _rx, top_y, _by = content_bounds
@@ -1240,15 +1292,15 @@ async def detect_words(
},
)
# --- Non-streaming path (unchanged) ---
# --- Non-streaming path ---
t0 = time.time()
# Create binarized OCR image (for Tesseract)
ocr_img = create_ocr_image(dewarped_bgr)
img_h, img_w = dewarped_bgr.shape[:2]
# Build generic cell grid
cells, columns_meta = build_cell_grid(
# Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
cells, columns_meta = build_cell_grid_v2(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=engine, img_bgr=dewarped_bgr,
)
@@ -1358,7 +1410,7 @@ async def _word_stream_generator(
all_cells: List[Dict[str, Any]] = []
cell_idx = 0
for cell, cols_meta, total in build_cell_grid_streaming(
for cell, cols_meta, total in build_cell_grid_v2_streaming(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=engine, img_bgr=dewarped_bgr,
):