feat: Box-Zonen durch gesamte Pipeline + Sub-Sessions fuer Box-Inhalt
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
- Rote semi-transparente Box-Markierung in allen Overlays (Spalten, Zeilen, Woerter)
- Zeilenerkennung: Combined-Image-Ansatz schliesst Box-Bereiche aus
- Woerter-Erkennung: Zeilen innerhalb von Box-Zonen werden gefiltert
- Sub-Sessions: parent_session_id/box_index in DB-Schema
- POST /sessions/{id}/create-box-sessions erstellt Sub-Sessions aus Box-Regionen
- Session-Info zeigt Sub-Sessions bzw. Parent-Verknuepfung
- Sessions-Liste blendet Sub-Sessions per Default aus
- Rekonstruktion: Fabric-JSON merged Sub-Session-Zellen an Box-Positionen
- Save-Reconstruction routet box{N}_* Updates an Sub-Sessions
- GET /sessions/{id}/vocab-entries/merged fuer zusammengefuehrte Eintraege
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,7 +72,9 @@ async def init_ocr_pipeline_tables():
|
||||
ADD COLUMN IF NOT EXISTS oriented_png BYTEA,
|
||||
ADD COLUMN IF NOT EXISTS cropped_png BYTEA,
|
||||
ADD COLUMN IF NOT EXISTS orientation_result JSONB,
|
||||
ADD COLUMN IF NOT EXISTS crop_result JSONB
|
||||
ADD COLUMN IF NOT EXISTS crop_result JSONB,
|
||||
ADD COLUMN IF NOT EXISTS parent_session_id UUID REFERENCES ocr_pipeline_sessions(id) ON DELETE CASCADE,
|
||||
ADD COLUMN IF NOT EXISTS box_index INT
|
||||
""")
|
||||
|
||||
|
||||
@@ -85,22 +87,33 @@ async def create_session_db(
|
||||
name: str,
|
||||
filename: str,
|
||||
original_png: bytes,
|
||||
parent_session_id: Optional[str] = None,
|
||||
box_index: Optional[int] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Create a new OCR pipeline session."""
|
||||
"""Create a new OCR pipeline session.
|
||||
|
||||
Args:
|
||||
parent_session_id: If set, this is a sub-session for a box region.
|
||||
box_index: 0-based index of the box this sub-session represents.
|
||||
"""
|
||||
pool = await get_pool()
|
||||
parent_uuid = uuid.UUID(parent_session_id) if parent_session_id else None
|
||||
async with pool.acquire() as conn:
|
||||
row = await conn.fetchrow("""
|
||||
INSERT INTO ocr_pipeline_sessions (
|
||||
id, name, filename, original_png, status, current_step
|
||||
) VALUES ($1, $2, $3, $4, 'active', 1)
|
||||
id, name, filename, original_png, status, current_step,
|
||||
parent_session_id, box_index
|
||||
) VALUES ($1, $2, $3, $4, 'active', 1, $5, $6)
|
||||
RETURNING id, name, filename, status, current_step,
|
||||
orientation_result, crop_result,
|
||||
deskew_result, dewarp_result, column_result, row_result,
|
||||
word_result, ground_truth, auto_shear_degrees,
|
||||
doc_type, doc_type_result,
|
||||
document_category, pipeline_log,
|
||||
parent_session_id, box_index,
|
||||
created_at, updated_at
|
||||
""", uuid.UUID(session_id), name, filename, original_png)
|
||||
""", uuid.UUID(session_id), name, filename, original_png,
|
||||
parent_uuid, box_index)
|
||||
|
||||
return _row_to_dict(row)
|
||||
|
||||
@@ -116,6 +129,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
|
||||
word_result, ground_truth, auto_shear_degrees,
|
||||
doc_type, doc_type_result,
|
||||
document_category, pipeline_log,
|
||||
parent_session_id, box_index,
|
||||
created_at, updated_at
|
||||
FROM ocr_pipeline_sessions WHERE id = $1
|
||||
""", uuid.UUID(session_id))
|
||||
@@ -166,6 +180,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
||||
'word_result', 'ground_truth', 'auto_shear_degrees',
|
||||
'doc_type', 'doc_type_result',
|
||||
'document_category', 'pipeline_log',
|
||||
'parent_session_id', 'box_index',
|
||||
}
|
||||
|
||||
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
|
||||
@@ -197,6 +212,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
||||
word_result, ground_truth, auto_shear_degrees,
|
||||
doc_type, doc_type_result,
|
||||
document_category, pipeline_log,
|
||||
parent_session_id, box_index,
|
||||
created_at, updated_at
|
||||
""", *values)
|
||||
|
||||
@@ -205,18 +221,45 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
||||
return None
|
||||
|
||||
|
||||
async def list_sessions_db(limit: int = 50) -> List[Dict[str, Any]]:
|
||||
"""List all sessions (metadata only, no images)."""
|
||||
async def list_sessions_db(
|
||||
limit: int = 50,
|
||||
include_sub_sessions: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""List sessions (metadata only, no images).
|
||||
|
||||
By default, sub-sessions (those with parent_session_id) are excluded.
|
||||
Pass include_sub_sessions=True to include them.
|
||||
"""
|
||||
pool = await get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
where = "" if include_sub_sessions else "WHERE parent_session_id IS NULL"
|
||||
rows = await conn.fetch(f"""
|
||||
SELECT id, name, filename, status, current_step,
|
||||
document_category, doc_type,
|
||||
parent_session_id, box_index,
|
||||
created_at, updated_at
|
||||
FROM ocr_pipeline_sessions
|
||||
{where}
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""", limit)
|
||||
|
||||
return [_row_to_dict(row) for row in rows]
|
||||
|
||||
|
||||
async def get_sub_sessions(parent_session_id: str) -> List[Dict[str, Any]]:
|
||||
"""Get all sub-sessions for a parent session, ordered by box_index."""
|
||||
pool = await get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch("""
|
||||
SELECT id, name, filename, status, current_step,
|
||||
document_category, doc_type,
|
||||
parent_session_id, box_index,
|
||||
created_at, updated_at
|
||||
FROM ocr_pipeline_sessions
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""", limit)
|
||||
WHERE parent_session_id = $1
|
||||
ORDER BY box_index ASC
|
||||
""", uuid.UUID(parent_session_id))
|
||||
|
||||
return [_row_to_dict(row) for row in rows]
|
||||
|
||||
@@ -255,7 +298,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
|
||||
result = dict(row)
|
||||
|
||||
# UUID → string
|
||||
for key in ['id', 'session_id']:
|
||||
for key in ['id', 'session_id', 'parent_session_id']:
|
||||
if key in result and result[key] is not None:
|
||||
result[key] = str(result[key])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user