Add OCR Kombi Pipeline: modular 11-step architecture with multi-page support
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 20s

Phase 1 of the clean architecture refactor: Replaces the 751-line ocr-overlay
monolith with a modular pipeline. Each step gets its own component file.

Frontend: /ai/ocr-kombi route with 11 steps (Upload, Orientation, PageSplit,
Deskew, Dewarp, ContentCrop, OCR, Structure, GridBuild, GridReview, GroundTruth).
Session list supports document grouping for multi-page uploads.

Backend: New ocr_kombi/ module with multi-page PDF upload (splits PDF into N
sessions with shared document_group_id). DB migration adds document_group_id
and page_number columns.

Old /ai/ocr-overlay remains fully functional for A/B testing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 15:55:28 +01:00
parent d26233b5b3
commit d26a9f60ab
25 changed files with 1935 additions and 7 deletions

View File

@@ -76,7 +76,16 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS parent_session_id UUID REFERENCES ocr_pipeline_sessions(id) ON DELETE CASCADE,
ADD COLUMN IF NOT EXISTS box_index INT,
ADD COLUMN IF NOT EXISTS grid_editor_result JSONB,
ADD COLUMN IF NOT EXISTS structure_result JSONB
ADD COLUMN IF NOT EXISTS structure_result JSONB,
ADD COLUMN IF NOT EXISTS document_group_id UUID,
ADD COLUMN IF NOT EXISTS page_number INT
""")
# Index for document group lookups
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_ocr_sessions_document_group
ON ocr_pipeline_sessions (document_group_id)
WHERE document_group_id IS NOT NULL
""")
@@ -91,21 +100,26 @@ async def create_session_db(
original_png: bytes,
parent_session_id: Optional[str] = None,
box_index: Optional[int] = None,
document_group_id: Optional[str] = None,
page_number: Optional[int] = None,
) -> Dict[str, Any]:
"""Create a new OCR pipeline session.
Args:
parent_session_id: If set, this is a sub-session for a box region.
box_index: 0-based index of the box this sub-session represents.
document_group_id: Groups multi-page uploads into one document.
page_number: 1-based page index within the document group.
"""
pool = await get_pool()
parent_uuid = uuid.UUID(parent_session_id) if parent_session_id else None
group_uuid = uuid.UUID(document_group_id) if document_group_id else None
async with pool.acquire() as conn:
row = await conn.fetchrow("""
INSERT INTO ocr_pipeline_sessions (
id, name, filename, original_png, status, current_step,
parent_session_id, box_index
) VALUES ($1, $2, $3, $4, 'active', 1, $5, $6)
parent_session_id, box_index, document_group_id, page_number
) VALUES ($1, $2, $3, $4, 'active', 1, $5, $6, $7, $8)
RETURNING id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
@@ -114,9 +128,10 @@ async def create_session_db(
document_category, pipeline_log,
grid_editor_result, structure_result,
parent_session_id, box_index,
document_group_id, page_number,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png,
parent_uuid, box_index)
parent_uuid, box_index, group_uuid, page_number)
return _row_to_dict(row)
@@ -134,6 +149,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
document_category, pipeline_log,
grid_editor_result, structure_result,
parent_session_id, box_index,
document_group_id, page_number,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
""", uuid.UUID(session_id))
@@ -186,6 +202,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'document_category', 'pipeline_log',
'grid_editor_result', 'structure_result',
'parent_session_id', 'box_index',
'document_group_id', 'page_number',
}
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log', 'grid_editor_result', 'structure_result'}
@@ -217,8 +234,9 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
grid_editor_result,
grid_editor_result, structure_result,
parent_session_id, box_index,
document_group_id, page_number,
created_at, updated_at
""", *values)
@@ -243,6 +261,7 @@ async def list_sessions_db(
SELECT id, name, filename, status, current_step,
document_category, doc_type,
parent_session_id, box_index,
document_group_id, page_number,
created_at, updated_at
FROM ocr_pipeline_sessions
{where}
@@ -261,6 +280,7 @@ async def get_sub_sessions(parent_session_id: str) -> List[Dict[str, Any]]:
SELECT id, name, filename, status, current_step,
document_category, doc_type,
parent_session_id, box_index,
document_group_id, page_number,
created_at, updated_at
FROM ocr_pipeline_sessions
WHERE parent_session_id = $1
@@ -270,6 +290,24 @@ async def get_sub_sessions(parent_session_id: str) -> List[Dict[str, Any]]:
return [_row_to_dict(row) for row in rows]
async def get_document_group_sessions(document_group_id: str) -> List[Dict[str, Any]]:
    """Fetch every session that belongs to one multi-page document group.

    Multi-page uploads are split into one session per page; all pages share
    a ``document_group_id``. This returns the summary columns for each page,
    sorted so page 1 comes first.

    Args:
        document_group_id: UUID (string form) shared by all pages of the
            uploaded document.

    Returns:
        One dict per page session, ordered by ``page_number`` ascending.
    """
    group_uuid = uuid.UUID(document_group_id)
    query = """
            SELECT id, name, filename, status, current_step,
                   document_category, doc_type,
                   parent_session_id, box_index,
                   document_group_id, page_number,
                   created_at, updated_at
            FROM ocr_pipeline_sessions
            WHERE document_group_id = $1
            ORDER BY page_number ASC
        """
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        records = await connection.fetch(query, group_uuid)
    return [_row_to_dict(record) for record in records]
async def list_ground_truth_sessions_db() -> List[Dict[str, Any]]:
"""List sessions that have a build_grid_reference in ground_truth."""
pool = await get_pool()
@@ -324,7 +362,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result = dict(row)
# UUID → string
for key in ['id', 'session_id', 'parent_session_id']:
for key in ['id', 'session_id', 'parent_session_id', 'document_group_id']:
if key in result and result[key] is not None:
result[key] = str(result[key])