feat(ocr-pipeline): add row detection step with horizontal gap analysis
Add Step 4 (row detection) between column detection and word recognition. Row detection uses horizontal projection profiles and whitespace-gap analysis (the same method used for column detection). It also includes header/footer classification via gap-size heuristics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -79,7 +79,7 @@ async def create_session_db(
|
||||
id, name, filename, original_png, status, current_step
|
||||
) VALUES ($1, $2, $3, $4, 'active', 1)
|
||||
RETURNING id, name, filename, status, current_step,
|
||||
deskew_result, dewarp_result, column_result,
|
||||
deskew_result, dewarp_result, column_result, row_result,
|
||||
ground_truth, auto_shear_degrees,
|
||||
created_at, updated_at
|
||||
""", uuid.UUID(session_id), name, filename, original_png)
|
||||
@@ -93,7 +93,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
|
||||
async with pool.acquire() as conn:
|
||||
row = await conn.fetchrow("""
|
||||
SELECT id, name, filename, status, current_step,
|
||||
deskew_result, dewarp_result, column_result,
|
||||
deskew_result, dewarp_result, column_result, row_result,
|
||||
ground_truth, auto_shear_degrees,
|
||||
created_at, updated_at
|
||||
FROM ocr_pipeline_sessions WHERE id = $1
|
||||
@@ -135,11 +135,11 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
||||
allowed_fields = {
|
||||
'name', 'filename', 'status', 'current_step',
|
||||
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
|
||||
'deskew_result', 'dewarp_result', 'column_result',
|
||||
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
|
||||
'ground_truth', 'auto_shear_degrees',
|
||||
}
|
||||
|
||||
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'ground_truth'}
|
||||
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth'}
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if key in allowed_fields:
|
||||
@@ -163,7 +163,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
||||
SET {', '.join(fields)}
|
||||
WHERE id = ${param_idx}
|
||||
RETURNING id, name, filename, status, current_step,
|
||||
deskew_result, dewarp_result, column_result,
|
||||
deskew_result, dewarp_result, column_result, row_result,
|
||||
ground_truth, auto_shear_degrees,
|
||||
created_at, updated_at
|
||||
""", *values)
|
||||
@@ -220,7 +220,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
|
||||
result[key] = result[key].isoformat()
|
||||
|
||||
# JSONB → parsed (asyncpg returns str for JSONB)
|
||||
for key in ['deskew_result', 'dewarp_result', 'column_result', 'ground_truth']:
|
||||
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth']:
|
||||
if key in result and result[key] is not None:
|
||||
if isinstance(result[key], str):
|
||||
result[key] = json.loads(result[key])
|
||||
|
||||
Reference in New Issue
Block a user