feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation, eliminating bleed-through from neighbouring cells (e.g. stray "to" or "ps" appearing in marker columns). Uses ThreadPoolExecutor to run the Tesseract calls in parallel.

Document type detection: Classifies pages as vocab_table, full_text, or generic_table using projection profiles (takes under 2 s; no OCR needed). The frontend dynamically skips the columns/rows steps for full-text pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions
--- a/klausur-service/backend/ocr_pipeline_session_store.py
+++ b/klausur-service/backend/ocr_pipeline_session_store.py
@@ -64,7 +64,9 @@ async def init_ocr_pipeline_tables():
        await conn.execute("""
            ALTER TABLE ocr_pipeline_sessions
            ADD COLUMN IF NOT EXISTS clean_png BYTEA,
-            ADD COLUMN IF NOT EXISTS handwriting_removal_meta JSONB
+            ADD COLUMN IF NOT EXISTS handwriting_removal_meta JSONB,
+            ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50),
+            ADD COLUMN IF NOT EXISTS doc_type_result JSONB
        """)


@@ -88,6 +90,7 @@ async def create_session_db(
            RETURNING id, name, filename, status, current_step,
                      deskew_result, dewarp_result, column_result, row_result,
                      word_result, ground_truth, auto_shear_degrees,
+                      doc_type, doc_type_result,
                      created_at, updated_at
        """, uuid.UUID(session_id), name, filename, original_png)

@@ -102,6 +105,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
            SELECT id, name, filename, status, current_step,
                   deskew_result, dewarp_result, column_result, row_result,
                   word_result, ground_truth, auto_shear_degrees,
+                   doc_type, doc_type_result,
                   created_at, updated_at
            FROM ocr_pipeline_sessions WHERE id = $1
        """, uuid.UUID(session_id))
@@ -146,9 +150,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
        'clean_png', 'handwriting_removal_meta',
        'deskew_result', 'dewarp_result', 'column_result', 'row_result',
        'word_result', 'ground_truth', 'auto_shear_degrees',
+        'doc_type', 'doc_type_result',
    }

-    jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta'}
+    jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result'}

    for key, value in kwargs.items():
        if key in allowed_fields:
@@ -174,6 +179,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
            RETURNING id, name, filename, status, current_step,
                      deskew_result, dewarp_result, column_result, row_result,
                      word_result, ground_truth, auto_shear_degrees,
+                      doc_type, doc_type_result,
                      created_at, updated_at
        """, *values)

@@ -229,7 +235,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
            result[key] = result[key].isoformat()

    # JSONB → parsed (asyncpg returns str for JSONB)
-    for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth']:
+    for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result']:
        if key in result and result[key] is not None:
            if isinstance(result[key], str):
                result[key] = json.loads(result[key])