feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation, eliminating neighbour bleeding (e.g. "to", "ps" in marker columns). Uses ThreadPoolExecutor for parallel Tesseract calls. Document type detection: Classifies pages as vocab_table, full_text, or generic_table using projection profiles (<2s, no OCR needed). Frontend dynamically skips columns/rows steps for full-text pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -32,6 +32,7 @@ from pydantic import BaseModel

 from cv_vocab_pipeline import (
    OLLAMA_REVIEW_MODEL,
+    DocumentTypeResult,
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
@@ -43,6 +44,8 @@ from cv_vocab_pipeline import (
    analyze_layout_by_words,
    build_cell_grid,
    build_cell_grid_streaming,
+    build_cell_grid_v2,
+    build_cell_grid_v2_streaming,
    build_word_grid,
    classify_column_types,
    create_layout_image,
@@ -50,6 +53,7 @@ from cv_vocab_pipeline import (
    deskew_image,
    deskew_image_by_word_alignment,
    detect_column_geometry,
+    detect_document_type,
    detect_row_geometry,
    expand_narrow_columns,
    _apply_shear,
@@ -759,6 +763,54 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
    return {"session_id": session_id, "ground_truth": gt}


+# ---------------------------------------------------------------------------
+# Document Type Detection (between Dewarp and Columns)
+# ---------------------------------------------------------------------------
+
+@router.post("/sessions/{session_id}/detect-type")
+async def detect_type(session_id: str):
+    """Detect document type (vocab_table, full_text, generic_table).
+
+    Requires the dewarped image; responds with HTTP 400 if it is missing.
+    Stores the result (DB + session cache) so the frontend can pick the pipeline flow.
+    """
+    if session_id not in _cache:  # not cached yet: load the session first
+        await _load_session_to_cache(session_id)
+    cached = _get_cached(session_id)
+
+    dewarped_bgr = cached.get("dewarped_bgr")
+    if dewarped_bgr is None:  # the dewarp step has not produced an image yet
+        raise HTTPException(status_code=400, detail="Dewarp must be completed first")
+
+    t0 = time.time()  # timing covers OCR-image creation plus detection
+    ocr_img = create_ocr_image(dewarped_bgr)
+    result = detect_document_type(ocr_img, dewarped_bgr)
+    duration = time.time() - t0
+
+    result_dict = {  # plain-dict copy of the result, reused for DB, cache and response
+        "doc_type": result.doc_type,
+        "confidence": result.confidence,
+        "pipeline": result.pipeline,
+        "skip_steps": result.skip_steps,
+        "features": result.features,
+        "duration_seconds": round(duration, 2),
+    }
+
+    # Persist both the bare doc_type and the full result dict via update_session_db
+    await update_session_db(
+        session_id,
+        doc_type=result.doc_type,
+        doc_type_result=result_dict,
+    )
+
+    cached["doc_type_result"] = result_dict  # cache is updated only after the DB write
+
+    logger.info(f"OCR Pipeline: detect-type session {session_id}: "
+                f"{result.doc_type} (confidence={result.confidence}, {duration:.2f}s)")
+
+    return {"session_id": session_id, **result_dict}
+
+
 # ---------------------------------------------------------------------------
 # Column Detection Endpoints (Step 3)
 # ---------------------------------------------------------------------------
@@ -1196,8 +1248,10 @@ async def detect_words(
        for r in row_result["rows"]
    ]

-    # Re-populate row.words from cached full-page Tesseract words.
-    # Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
+    # Cell-First OCR (v2): each cell is cropped and OCR'd in isolation, so the
+    # full-page words are no longer used for text lookup (no neighbour bleeding).
+    # build_cell_grid_v2 still needs word_count > 0 for row filtering, so the
+    # words are fetched below just for counting (cached, or recomputed if missing).
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
@@ -1209,8 +1263,6 @@ async def detect_words(
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
-        # words['top'] is relative to content-ROI top_y.
-        # row.y is absolute. Convert: row_y_rel = row.y - top_y.
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
@@ -1240,15 +1292,15 @@ async def detect_words(
            },
        )

-    # --- Non-streaming path (unchanged) ---
+    # --- Non-streaming path ---
    t0 = time.time()

    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

-    # Build generic cell grid
-    cells, columns_meta = build_cell_grid(
+    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
+    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    )
@@ -1358,7 +1410,7 @@ async def _word_stream_generator(
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0

-    for cell, cols_meta, total in build_cell_grid_streaming(
+    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):