Restructure: Move ocr_pipeline + labeling + crop into ocr/ package
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,362 +1,4 @@
|
||||
"""
|
||||
OCR Pipeline Reconstruction — save edits, Fabric JSON export, merged entries, PDF/DOCX export.
|
||||
|
||||
Extracted from ocr_pipeline_postprocess.py.
|
||||
|
||||
Lizenz: Apache 2.0
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
get_sub_sessions,
|
||||
update_session_db,
|
||||
)
|
||||
from ocr_pipeline_common import _cache
|
||||
|
||||
# Router that every endpoint in this module attaches to, plus the
# module-level logger used by the handlers below.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])

logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 9: Reconstruction + Fabric JSON export
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/reconstruction")
async def save_reconstruction(session_id: str, request: Request):
    """Save edited cell texts from the reconstruction step.

    Request body: ``{"cells": [{"cell_id": ..., "text": ...}, ...]}``.
    Cell ids prefixed with ``box{N}_`` are routed to the N-th sub-session;
    all other ids update the main session's ``word_result``.  Advances the
    session to step 10 and refreshes the in-process cache.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: session has no word_result yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    body = await request.json()
    cell_updates = body.get("cells", [])

    if not cell_updates:
        # Nothing to change — still advance the workflow step.
        await update_session_db(session_id, current_step=10)
        return {"session_id": session_id, "updated": 0}

    # Build update map: cell_id -> new text
    update_map = {c["cell_id"]: c["text"] for c in cell_updates}

    # Separate sub-session updates (cell_ids prefixed with "box{N}_")
    sub_updates: Dict[int, Dict[str, str]] = {}  # box_index -> {original_cell_id: text}
    main_updates: Dict[str, str] = {}
    for cell_id, text in update_map.items():
        m = re.match(r'^box(\d+)_(.+)$', cell_id)
        if m:
            bi = int(m.group(1))
            original_id = m.group(2)
            sub_updates.setdefault(bi, {})[original_id] = text
        else:
            main_updates[cell_id] = text

    # Update main session cells
    cells = word_result.get("cells", [])
    updated_count = 0
    for cell in cells:
        if cell["cell_id"] in main_updates:
            cell["text"] = main_updates[cell["cell_id"]]
            cell["status"] = "edited"
            updated_count += 1
    word_result["cells"] = cells

    # Also update vocab_entries if present
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if entries:
        for entry in entries:
            row_idx = entry.get("row_index", -1)
            for col_idx, field_name in enumerate(["english", "german", "example"]):
                # Cells may be keyed with either a zero-padded ("R01_C0") or a
                # plain ("R1_C0") row id; try both.  BUGFIX: use membership
                # tests instead of `get(a) or get(b)` so an edit that sets a
                # cell to the empty string ("clear this cell") is not dropped
                # as falsy.
                cell_id = f"R{row_idx:02d}_C{col_idx}"
                cell_id_alt = f"R{row_idx}_C{col_idx}"
                if cell_id in main_updates:
                    entry[field_name] = main_updates[cell_id]
                elif cell_id_alt in main_updates:
                    entry[field_name] = main_updates[cell_id_alt]

    word_result["vocab_entries"] = entries
    if "entries" in word_result:
        word_result["entries"] = entries

    await update_session_db(session_id, word_result=word_result, current_step=10)

    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    # Route sub-session updates
    sub_updated = 0
    if sub_updates:
        subs = await get_sub_sessions(session_id)
        sub_by_index = {s.get("box_index"): s["id"] for s in subs}
        for bi, updates in sub_updates.items():
            sub_id = sub_by_index.get(bi)
            if not sub_id:
                continue
            sub_session = await get_session_db(sub_id)
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result")
            if not sub_word:
                continue
            sub_cells = sub_word.get("cells", [])
            for cell in sub_cells:
                if cell["cell_id"] in updates:
                    cell["text"] = updates[cell["cell_id"]]
                    cell["status"] = "edited"
                    sub_updated += 1
            sub_word["cells"] = sub_cells
            await update_session_db(sub_id, word_result=sub_word)
            if sub_id in _cache:
                _cache[sub_id]["word_result"] = sub_word

    total_updated = updated_count + sub_updated
    logger.info(f"Reconstruction saved for session {session_id}: "
                f"{updated_count} main + {sub_updated} sub-session cells updated")

    return {
        "session_id": session_id,
        "updated": total_updated,
        "main_updated": updated_count,
        "sub_updated": sub_updated,
    }
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/fabric-json")
async def get_fabric_json(session_id: str):
    """Return cell grid as Fabric.js-compatible JSON for the canvas editor."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    merged_cells = list(word_result.get("cells", []))
    canvas_w = word_result.get("image_width", 800)
    canvas_h = word_result.get("image_height", 600)

    # Fold in cells from every sub-session, translating each bbox from its
    # box-local coordinates to the parent image via the matching "box" zone.
    sub_sessions = await get_sub_sessions(session_id)
    if sub_sessions:
        zones = (session.get("column_result") or {}).get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in sub_sessions:
            child = await get_session_db(sub["id"])
            if not child:
                continue
            child_word = child.get("word_result")
            if not child_word or not child_word.get("cells"):
                continue

            idx = sub.get("box_index", 0)
            if idx < len(box_zones):
                origin = box_zones[idx]["box"]
                off_x, off_y = origin["x"], origin["y"]
            else:
                # No zone recorded for this box index: keep local coordinates.
                off_x, off_y = 0, 0

            for raw in child_word["cells"]:
                translated = dict(raw)
                translated["cell_id"] = f"box{idx}_{translated.get('cell_id', '')}"
                translated["source"] = f"box_{idx}"
                bbox = translated.get("bbox_px", {})
                if bbox:
                    bbox = dict(bbox)
                    bbox["x"] = bbox.get("x", 0) + off_x
                    bbox["y"] = bbox.get("y", 0) + off_y
                    translated["bbox_px"] = bbox
                merged_cells.append(translated)

    from services.layout_reconstruction_service import cells_to_fabric_json
    return cells_to_fabric_json(merged_cells, canvas_w, canvas_h)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vocab entries merged + PDF/DOCX export
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.get("/sessions/{session_id}/vocab-entries/merged")
async def get_merged_vocab_entries(session_id: str):
    """Return vocab entries from main session + all sub-sessions, sorted by Y position."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    main_word = session.get("word_result") or {}
    merged = list(main_word.get("vocab_entries") or main_word.get("entries") or [])
    for entry in merged:
        entry.setdefault("source", "main")

    sub_sessions = await get_sub_sessions(session_id)
    if sub_sessions:
        zones = (session.get("column_result") or {}).get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in sub_sessions:
            child = await get_session_db(sub["id"])
            if not child:
                continue
            child_word = child.get("word_result") or {}
            child_entries = child_word.get("vocab_entries") or child_word.get("entries") or []

            idx = sub.get("box_index", 0)
            # Vertical origin of this box in the parent image (0 if unknown).
            origin_y = box_zones[idx]["box"]["y"] if idx < len(box_zones) else 0

            for entry in child_entries:
                tagged = dict(entry)
                tagged["source"] = f"box_{idx}"
                tagged["source_y"] = origin_y
                merged.append(tagged)

    def _sort_key(entry):
        # Main-session entries order by row alone; sub-session entries order
        # by their box's vertical position first, then by row within the box.
        if entry.get("source", "main") == "main":
            return entry.get("row_index", 0) * 100
        return entry.get("source_y", 0) * 100 + entry.get("row_index", 0)

    merged.sort(key=_sort_key)

    return {
        "session_id": session_id,
        "entries": merged,
        "total": len(merged),
        "sources": list({entry.get("source", "main") for entry in merged}),
    }
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/export/pdf")
async def export_reconstruction_pdf(session_id: str):
    """Export the reconstructed cell grid as a PDF table.

    Builds a header row from ``columns_used`` (falling back to "Col {i}")
    plus one row per grid row, then renders it with reportlab and streams
    the result as an attachment.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: no word_result, or nothing to export.
        HTTPException 501: reportlab is not installed.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)

    # Build table data: header row + n_rows x n_cols of cell texts.
    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]
    table_data: list[list[str]] = [header]

    # Index cells by id once instead of scanning the cell list per grid
    # position (the original lookup was O(rows * cols * cells)).
    cells_by_id = {c.get("cell_id"): c for c in cells}
    for r in range(n_rows):
        row_texts = []
        for ci in range(n_cols):
            cell = cells_by_id.get(f"R{r:02d}_C{ci}")
            row_texts.append(cell.get("text", "") if cell else "")
        table_data.append(row_texts)

    # Validate before touching reportlab so an empty grid is reported as 400
    # even when the optional dependency is missing.
    if not table_data or not table_data[0]:
        raise HTTPException(status_code=400, detail="No data to export")

    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.lib import colors
        from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="reportlab not installed") from exc

    import io as _io

    buf = _io.BytesIO()
    doc = SimpleDocTemplate(buf, pagesize=A4)
    t = Table(table_data)
    t.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#0d9488')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        # NOTE(review): 'WORDWRAP' is not a documented reportlab TableStyle
        # command — confirm the pinned reportlab version accepts it.
        ('WORDWRAP', (0, 0), (-1, -1), True),
    ]))
    doc.build([t])
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/pdf",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.pdf"'},
    )
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/export/docx")
async def export_reconstruction_docx(session_id: str):
    """Export the reconstructed cell grid as a DOCX table.

    Builds a header row from ``columns_used`` (falling back to "Col {i}")
    plus one row per grid row, renders it with python-docx and streams the
    document as an attachment.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: session has no word_result yet.
        HTTPException 501: python-docx is not installed.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)

    try:
        from docx import Document
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="python-docx not installed") from exc

    import io as _io

    doc = Document()
    doc.add_heading(f'Rekonstruktion -- Session {session_id[:8]}', level=1)

    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]

    # BUGFIX: size the table to fit the header as well — if columns_used has
    # more entries than grid_shape.cols, the original indexed past the last
    # table column and raised IndexError.
    table = doc.add_table(rows=1 + n_rows, cols=max(n_cols, len(header), 1))
    table.style = 'Table Grid'

    for ci, h in enumerate(header):
        table.rows[0].cells[ci].text = h

    # Index cells by id once instead of scanning the cell list per grid
    # position (the original lookup was O(rows * cols * cells)).
    cells_by_id = {c.get("cell_id"): c for c in cells}
    for r in range(n_rows):
        for ci in range(n_cols):
            cell = cells_by_id.get(f"R{r:02d}_C{ci}")
            table.rows[r + 1].cells[ci].text = cell.get("text", "") if cell else ""

    buf = _io.BytesIO()
    doc.save(buf)
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.docx"'},
    )
|
||||
# Backward-compat shim -- module moved to ocr/pipeline/reconstruction.py
import sys as _sys
import importlib as _importlib

# Swap this module object out of sys.modules for the relocated one, so old
# import paths transparently resolve to ocr.pipeline.reconstruction.
_sys.modules[__name__] = _importlib.import_module("ocr.pipeline.reconstruction")
|
||||
|
||||
Reference in New Issue
Block a user