""" OCR Pipeline Reconstruction — save edits, Fabric JSON export, merged entries, PDF/DOCX export. Extracted from ocr_pipeline_postprocess.py. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Dict from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse from ocr_pipeline_session_store import ( get_session_db, get_sub_sessions, update_session_db, ) from ocr_pipeline_common import _cache logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"]) # --------------------------------------------------------------------------- # Step 9: Reconstruction + Fabric JSON export # --------------------------------------------------------------------------- @router.post("/sessions/{session_id}/reconstruction") async def save_reconstruction(session_id: str, request: Request): """Save edited cell texts from reconstruction step.""" session = await get_session_db(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") word_result = session.get("word_result") if not word_result: raise HTTPException(status_code=400, detail="No word result found") body = await request.json() cell_updates = body.get("cells", []) if not cell_updates: await update_session_db(session_id, current_step=10) return {"session_id": session_id, "updated": 0} # Build update map: cell_id -> new text update_map = {c["cell_id"]: c["text"] for c in cell_updates} # Separate sub-session updates (cell_ids prefixed with "box{N}_") sub_updates: Dict[int, Dict[str, str]] = {} # box_index -> {original_cell_id: text} main_updates: Dict[str, str] = {} for cell_id, text in update_map.items(): m = re.match(r'^box(\d+)_(.+)$', cell_id) if m: bi = int(m.group(1)) original_id = m.group(2) sub_updates.setdefault(bi, {})[original_id] = text else: main_updates[cell_id] = text # Update main session cells cells = 
word_result.get("cells", []) updated_count = 0 for cell in cells: if cell["cell_id"] in main_updates: cell["text"] = main_updates[cell["cell_id"]] cell["status"] = "edited" updated_count += 1 word_result["cells"] = cells # Also update vocab_entries if present entries = word_result.get("vocab_entries") or word_result.get("entries") or [] if entries: for entry in entries: row_idx = entry.get("row_index", -1) for col_idx, field_name in enumerate(["english", "german", "example"]): cell_id = f"R{row_idx:02d}_C{col_idx}" cell_id_alt = f"R{row_idx}_C{col_idx}" new_text = main_updates.get(cell_id) or main_updates.get(cell_id_alt) if new_text is not None: entry[field_name] = new_text word_result["vocab_entries"] = entries if "entries" in word_result: word_result["entries"] = entries await update_session_db(session_id, word_result=word_result, current_step=10) if session_id in _cache: _cache[session_id]["word_result"] = word_result # Route sub-session updates sub_updated = 0 if sub_updates: subs = await get_sub_sessions(session_id) sub_by_index = {s.get("box_index"): s["id"] for s in subs} for bi, updates in sub_updates.items(): sub_id = sub_by_index.get(bi) if not sub_id: continue sub_session = await get_session_db(sub_id) if not sub_session: continue sub_word = sub_session.get("word_result") if not sub_word: continue sub_cells = sub_word.get("cells", []) for cell in sub_cells: if cell["cell_id"] in updates: cell["text"] = updates[cell["cell_id"]] cell["status"] = "edited" sub_updated += 1 sub_word["cells"] = sub_cells await update_session_db(sub_id, word_result=sub_word) if sub_id in _cache: _cache[sub_id]["word_result"] = sub_word total_updated = updated_count + sub_updated logger.info(f"Reconstruction saved for session {session_id}: " f"{updated_count} main + {sub_updated} sub-session cells updated") return { "session_id": session_id, "updated": total_updated, "main_updated": updated_count, "sub_updated": sub_updated, } 
async def _load_session_with_words(session_id: str) -> tuple:
    """Fetch a session and its word result, or raise the matching HTTP error.

    Returns:
        ``(session, word_result)``.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            word result.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")
    return session, word_result


def _box_zones(session: dict) -> list:
    """Return the session's column-result zones of type "box" that carry a box."""
    column_result = session.get("column_result") or {}
    zones = column_result.get("zones") or []
    return [z for z in zones if z.get("zone_type") == "box" and z.get("box")]


def _box_offset(box_zones: list, box_index) -> tuple:
    """Return the ``(x, y)`` of box zone *box_index*, or ``(0, 0)`` if there is none.

    A missing, non-integer, negative or out-of-range index yields ``(0, 0)``
    instead of raising or silently indexing from the end of the list.
    """
    if isinstance(box_index, int) and 0 <= box_index < len(box_zones):
        box = box_zones[box_index]["box"]
        return box.get("x", 0), box.get("y", 0)
    return 0, 0


def _grid_table(word_result: dict) -> tuple:
    """Lay the word result's cells out as a header plus a rows x cols text grid.

    Returns:
        ``(header, rows, n_cols)``. *header* holds one label per entry of
        "columns_used" (falling back to "Col i" per grid column), *rows* is a
        list of ``n_rows`` lists with ``n_cols`` strings each, and *n_cols* is
        the column count from "grid_shape".
    """
    cells = word_result.get("cells") or []
    columns_used = word_result.get("columns_used") or []
    grid_shape = word_result.get("grid_shape") or {}
    n_rows = grid_shape.get("rows") or 0
    n_cols = grid_shape.get("cols") or 0

    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]

    # Index the cells once; setdefault keeps the first cell per ID.
    text_by_id: dict = {}
    for cell in cells:
        text_by_id.setdefault(cell.get("cell_id"), cell.get("text", ""))

    rows: list[list[str]] = []
    for r in range(n_rows):
        row_texts = []
        for ci in range(n_cols):
            padded_id = f"R{r:02d}_C{ci}"
            if padded_id in text_by_id:
                value = text_by_id[padded_id]
            else:
                # Accept the unpadded ID form too ("R3_C1").
                value = text_by_id.get(f"R{r}_C{ci}", "")
            row_texts.append("" if value is None else str(value))
        rows.append(row_texts)
    return header, rows, n_cols


@router.get("/sessions/{session_id}/reconstruction/fabric-json")
async def get_fabric_json(session_id: str):
    """Return the cell grid as Fabric.js-compatible JSON for the canvas editor.

    Cells of sub-sessions are appended to the main session's cells with their
    "cell_id" prefixed by "box{N}_", a "source" of "box_{N}", and their
    "bbox_px" x/y shifted by the matching box zone's x/y.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            word result.
    """
    session, word_result = await _load_session_with_words(session_id)

    cells = list(word_result.get("cells") or [])
    img_w = word_result.get("image_width", 800)
    img_h = word_result.get("image_height", 600)

    # Merge sub-session cells at box positions
    subs = await get_sub_sessions(session_id)
    if subs:
        box_zones = _box_zones(session)

        for sub in subs:
            sub_session = await get_session_db(sub["id"])
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result")
            if not sub_word or not sub_word.get("cells"):
                continue

            bi = sub.get("box_index", 0)
            box_x, box_y = _box_offset(box_zones, bi)

            for cell in sub_word["cells"]:
                # Work on copies so the stored sub-session is left untouched.
                cell_copy = dict(cell)
                cell_copy["cell_id"] = f"box{bi}_{cell_copy.get('cell_id', '')}"
                cell_copy["source"] = f"box_{bi}"
                bbox = cell_copy.get("bbox_px", {})
                if bbox:
                    bbox = dict(bbox)
                    bbox["x"] = bbox.get("x", 0) + box_x
                    bbox["y"] = bbox.get("y", 0) + box_y
                    cell_copy["bbox_px"] = bbox
                cells.append(cell_copy)

    from services.layout_reconstruction_service import cells_to_fabric_json
    fabric_json = cells_to_fabric_json(cells, img_w, img_h)

    return fabric_json


# ---------------------------------------------------------------------------
# Vocab entries merged + PDF/DOCX export
# ---------------------------------------------------------------------------

@router.get("/sessions/{session_id}/vocab-entries/merged")
async def get_merged_vocab_entries(session_id: str):
    """Return vocab entries from the main session plus all sub-sessions, sorted.

    Main entries get a "source" of "main" unless they already have one.
    Sub-session entries get "source" "box_{N}" and "source_y" set to the y of
    the matching box zone (0 if there is none).

    Returns:
        A dict with "session_id", "entries", "total" and "sources" (the
        distinct sources in order of first appearance).

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result") or {}
    # Copy each entry so tagging it does not mutate the stored session data.
    entries = [
        dict(e)
        for e in (word_result.get("vocab_entries") or word_result.get("entries") or [])
    ]
    for e in entries:
        e.setdefault("source", "main")

    subs = await get_sub_sessions(session_id)
    if subs:
        box_zones = _box_zones(session)

        for sub in subs:
            sub_session = await get_session_db(sub["id"])
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result") or {}
            sub_entries = sub_word.get("vocab_entries") or sub_word.get("entries") or []

            bi = sub.get("box_index", 0)
            _, box_y = _box_offset(box_zones, bi)

            for e in sub_entries:
                e_copy = dict(e)
                e_copy["source"] = f"box_{bi}"
                e_copy["source_y"] = box_y
                entries.append(e_copy)

    def _sort_key(e):
        # NOTE(review): main entries are keyed by row_index * 100 and box
        # entries by source_y * 100 + row_index, so row indices are compared
        # against box y values — confirm this ordering is intended.
        if e.get("source", "main") == "main":
            return e.get("row_index", 0) * 100
        return e.get("source_y", 0) * 100 + e.get("row_index", 0)

    entries.sort(key=_sort_key)

    return {
        "session_id": session_id,
        "entries": entries,
        "total": len(entries),
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        "sources": list(dict.fromkeys(e.get("source", "main") for e in entries)),
    }


@router.get("/sessions/{session_id}/reconstruction/export/pdf")
async def export_reconstruction_pdf(session_id: str):
    """Export the reconstructed cell grid as a PDF table.

    Returns:
        A streaming "application/pdf" response offered as an attachment.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            word result or no columns to export, 501 if reportlab is missing.
    """
    _, word_result = await _load_session_with_words(session_id)

    # Build table data: header row followed by rows x columns
    header, rows, _ = _grid_table(word_result)
    table_data: list[list[str]] = [header, *rows]

    # Keep the try narrow: only a missing dependency should map to 501.
    try:
        import io as _io
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import A4
        from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="reportlab not installed") from exc

    if not header:
        raise HTTPException(status_code=400, detail="No data to export")

    buf = _io.BytesIO()
    doc = SimpleDocTemplate(buf, pagesize=A4)

    t = Table(table_data)
    t.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#0d9488')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('WORDWRAP', (0, 0), (-1, -1), True),
    ]))
    doc.build([t])
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/pdf",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.pdf"'},
    )


@router.get("/sessions/{session_id}/reconstruction/export/docx")
async def export_reconstruction_docx(session_id: str):
    """Export the reconstructed cell grid as a DOCX table.

    Returns:
        A streaming DOCX response offered as an attachment.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            word result, 501 if python-docx is missing.
    """
    _, word_result = await _load_session_with_words(session_id)
    header, rows, n_cols = _grid_table(word_result)

    # Keep the try narrow: only a missing dependency should map to 501.
    try:
        import io as _io
        from docx import Document
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="python-docx not installed") from exc

    doc = Document()
    doc.add_heading(f'Rekonstruktion -- Session {session_id[:8]}', level=1)

    # The table must be wide enough for the header as well as the grid;
    # "columns_used" can list more columns than grid_shape reports.
    table = doc.add_table(rows=1 + len(rows), cols=max(n_cols, len(header), 1))
    table.style = 'Table Grid'

    header_cells = table.rows[0].cells
    for ci, h in enumerate(header):
        header_cells[ci].text = h

    for r, row_texts in enumerate(rows, start=1):
        # Fetch the row's cells once per row rather than once per column.
        row_cells = table.rows[r].cells
        for ci, text in enumerate(row_texts):
            row_cells[ci].text = text

    buf = _io.BytesIO()
    doc.save(buf)
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.docx"'},
    )