feat: improved OCR pipeline session manager with categories, thumbnails, pipeline logging
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 39s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 20s

- Add document_category (10 types) and pipeline_log JSONB columns
- Session list: thumbnails, copyable IDs, category/doc_type badges
- Inline category dropdown, bulk delete, pipeline step logging
- New endpoints: thumbnail, delete-all, pipeline-log, categories
- Cleared all 22 old test sessions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-05 09:44:38 +01:00
parent a58dfca1d8
commit 293e7914d8
4 changed files with 411 additions and 77 deletions

View File

@@ -66,7 +66,9 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS clean_png BYTEA,
ADD COLUMN IF NOT EXISTS handwriting_removal_meta JSONB,
ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50),
ADD COLUMN IF NOT EXISTS doc_type_result JSONB
ADD COLUMN IF NOT EXISTS doc_type_result JSONB,
ADD COLUMN IF NOT EXISTS document_category VARCHAR(50),
ADD COLUMN IF NOT EXISTS pipeline_log JSONB
""")
@@ -91,6 +93,7 @@ async def create_session_db(
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png)
@@ -106,6 +109,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
""", uuid.UUID(session_id))
@@ -151,9 +155,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
'word_result', 'ground_truth', 'auto_shear_degrees',
'doc_type', 'doc_type_result',
'document_category', 'pipeline_log',
}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result'}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -180,6 +185,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
created_at, updated_at
""", *values)
@@ -194,6 +200,7 @@ async def list_sessions_db(limit: int = 50) -> List[Dict[str, Any]]:
async with pool.acquire() as conn:
rows = await conn.fetch("""
SELECT id, name, filename, status, current_step,
document_category, doc_type,
created_at, updated_at
FROM ocr_pipeline_sessions
ORDER BY created_at DESC
@@ -213,6 +220,18 @@ async def delete_session_db(session_id: str) -> bool:
return result == "DELETE 1"
async def delete_all_sessions_db() -> int:
    """Remove every row from ``ocr_pipeline_sessions``.

    Returns:
        The row count taken from the last whitespace-separated token of
        the status string returned by ``conn.execute`` (e.g. ``"DELETE 5"``
        yields ``5``), or ``0`` when that token is missing or is not an
        integer.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute("DELETE FROM ocr_pipeline_sessions")
        # The status looks like "DELETE <n>"; the count is the final token.
        try:
            deleted = int(status.split()[-1])
        except (ValueError, IndexError):
            # Empty or unexpected status text: report nothing deleted.
            deleted = 0
        return deleted
# =============================================================================
# HELPER
# =============================================================================
@@ -235,7 +254,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result']:
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])