Restructure: Move ocr_pipeline + labeling + crop into ocr/ package
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,362 +1,4 @@
|
||||
"""
|
||||
OCR Pipeline Reconstruction — save edits, Fabric JSON export, merged entries, PDF/DOCX export.
|
||||
|
||||
Extracted from ocr_pipeline_postprocess.py.
|
||||
|
||||
Lizenz: Apache 2.0
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
get_sub_sessions,
|
||||
update_session_db,
|
||||
)
|
||||
from ocr_pipeline_common import _cache
|
||||
|
||||
# Router that every endpoint in this module attaches to, plus the
# module-level logger used by the handlers below.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])

logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 9: Reconstruction + Fabric JSON export
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/reconstruction")
async def save_reconstruction(session_id: str, request: Request):
    """Save edited cell texts from the reconstruction step.

    Request body: ``{"cells": [{"cell_id": ..., "text": ...}, ...]}``.
    Cell ids prefixed with ``box{N}_`` are routed to the N-th sub-session;
    all other ids update the main session's ``word_result``.  Advances the
    session to step 10 and refreshes the in-process cache.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: session has no word_result yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    body = await request.json()
    cell_updates = body.get("cells", [])

    if not cell_updates:
        # Nothing to change — still advance the workflow step.
        await update_session_db(session_id, current_step=10)
        return {"session_id": session_id, "updated": 0}

    # Build update map: cell_id -> new text
    update_map = {c["cell_id"]: c["text"] for c in cell_updates}

    # Separate sub-session updates (cell_ids prefixed with "box{N}_")
    sub_updates: Dict[int, Dict[str, str]] = {}  # box_index -> {original_cell_id: text}
    main_updates: Dict[str, str] = {}
    for cell_id, text in update_map.items():
        m = re.match(r'^box(\d+)_(.+)$', cell_id)
        if m:
            bi = int(m.group(1))
            original_id = m.group(2)
            sub_updates.setdefault(bi, {})[original_id] = text
        else:
            main_updates[cell_id] = text

    # Update main session cells
    cells = word_result.get("cells", [])
    updated_count = 0
    for cell in cells:
        if cell["cell_id"] in main_updates:
            cell["text"] = main_updates[cell["cell_id"]]
            cell["status"] = "edited"
            updated_count += 1
    word_result["cells"] = cells

    # Also update vocab_entries if present
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    if entries:
        for entry in entries:
            row_idx = entry.get("row_index", -1)
            for col_idx, field_name in enumerate(["english", "german", "example"]):
                # Cells may be keyed with either a zero-padded ("R01_C0") or a
                # plain ("R1_C0") row id; try both.  BUGFIX: use membership
                # tests instead of `get(a) or get(b)` so an edit that sets a
                # cell to the empty string ("clear this cell") is not dropped
                # as falsy.
                cell_id = f"R{row_idx:02d}_C{col_idx}"
                cell_id_alt = f"R{row_idx}_C{col_idx}"
                if cell_id in main_updates:
                    entry[field_name] = main_updates[cell_id]
                elif cell_id_alt in main_updates:
                    entry[field_name] = main_updates[cell_id_alt]

    word_result["vocab_entries"] = entries
    if "entries" in word_result:
        word_result["entries"] = entries

    await update_session_db(session_id, word_result=word_result, current_step=10)

    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    # Route sub-session updates
    sub_updated = 0
    if sub_updates:
        subs = await get_sub_sessions(session_id)
        sub_by_index = {s.get("box_index"): s["id"] for s in subs}
        for bi, updates in sub_updates.items():
            sub_id = sub_by_index.get(bi)
            if not sub_id:
                continue
            sub_session = await get_session_db(sub_id)
            if not sub_session:
                continue
            sub_word = sub_session.get("word_result")
            if not sub_word:
                continue
            sub_cells = sub_word.get("cells", [])
            for cell in sub_cells:
                if cell["cell_id"] in updates:
                    cell["text"] = updates[cell["cell_id"]]
                    cell["status"] = "edited"
                    sub_updated += 1
            sub_word["cells"] = sub_cells
            await update_session_db(sub_id, word_result=sub_word)
            if sub_id in _cache:
                _cache[sub_id]["word_result"] = sub_word

    total_updated = updated_count + sub_updated
    logger.info(f"Reconstruction saved for session {session_id}: "
                f"{updated_count} main + {sub_updated} sub-session cells updated")

    return {
        "session_id": session_id,
        "updated": total_updated,
        "main_updated": updated_count,
        "sub_updated": sub_updated,
    }
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/fabric-json")
async def get_fabric_json(session_id: str):
    """Return cell grid as Fabric.js-compatible JSON for the canvas editor."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    merged_cells = list(word_result.get("cells", []))
    canvas_w = word_result.get("image_width", 800)
    canvas_h = word_result.get("image_height", 600)

    # Fold in cells from every sub-session, translating each bbox from its
    # box-local coordinates to the parent image via the matching "box" zone.
    sub_sessions = await get_sub_sessions(session_id)
    if sub_sessions:
        zones = (session.get("column_result") or {}).get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in sub_sessions:
            child = await get_session_db(sub["id"])
            if not child:
                continue
            child_word = child.get("word_result")
            if not child_word or not child_word.get("cells"):
                continue

            idx = sub.get("box_index", 0)
            if idx < len(box_zones):
                origin = box_zones[idx]["box"]
                off_x, off_y = origin["x"], origin["y"]
            else:
                # No zone recorded for this box index: keep local coordinates.
                off_x, off_y = 0, 0

            for raw in child_word["cells"]:
                translated = dict(raw)
                translated["cell_id"] = f"box{idx}_{translated.get('cell_id', '')}"
                translated["source"] = f"box_{idx}"
                bbox = translated.get("bbox_px", {})
                if bbox:
                    bbox = dict(bbox)
                    bbox["x"] = bbox.get("x", 0) + off_x
                    bbox["y"] = bbox.get("y", 0) + off_y
                    translated["bbox_px"] = bbox
                merged_cells.append(translated)

    from services.layout_reconstruction_service import cells_to_fabric_json
    return cells_to_fabric_json(merged_cells, canvas_w, canvas_h)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vocab entries merged + PDF/DOCX export
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.get("/sessions/{session_id}/vocab-entries/merged")
async def get_merged_vocab_entries(session_id: str):
    """Return vocab entries from main session + all sub-sessions, sorted by Y position."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    main_word = session.get("word_result") or {}
    merged = list(main_word.get("vocab_entries") or main_word.get("entries") or [])
    for entry in merged:
        entry.setdefault("source", "main")

    sub_sessions = await get_sub_sessions(session_id)
    if sub_sessions:
        zones = (session.get("column_result") or {}).get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in sub_sessions:
            child = await get_session_db(sub["id"])
            if not child:
                continue
            child_word = child.get("word_result") or {}
            child_entries = child_word.get("vocab_entries") or child_word.get("entries") or []

            idx = sub.get("box_index", 0)
            # Vertical origin of this box in the parent image (0 if unknown).
            origin_y = box_zones[idx]["box"]["y"] if idx < len(box_zones) else 0

            for entry in child_entries:
                tagged = dict(entry)
                tagged["source"] = f"box_{idx}"
                tagged["source_y"] = origin_y
                merged.append(tagged)

    def _sort_key(entry):
        # Main-session entries order by row alone; sub-session entries order
        # by their box's vertical position first, then by row within the box.
        if entry.get("source", "main") == "main":
            return entry.get("row_index", 0) * 100
        return entry.get("source_y", 0) * 100 + entry.get("row_index", 0)

    merged.sort(key=_sort_key)

    return {
        "session_id": session_id,
        "entries": merged,
        "total": len(merged),
        "sources": list({entry.get("source", "main") for entry in merged}),
    }
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/export/pdf")
async def export_reconstruction_pdf(session_id: str):
    """Export the reconstructed cell grid as a PDF table.

    Builds a header row from ``columns_used`` (falling back to "Col {i}")
    plus one row per grid row, then renders it with reportlab and streams
    the result as an attachment.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: no word_result, or nothing to export.
        HTTPException 501: reportlab is not installed.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)

    # Build table data: header row + n_rows x n_cols of cell texts.
    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]
    table_data: list[list[str]] = [header]

    # Index cells by id once instead of scanning the cell list per grid
    # position (the original lookup was O(rows * cols * cells)).
    cells_by_id = {c.get("cell_id"): c for c in cells}
    for r in range(n_rows):
        row_texts = []
        for ci in range(n_cols):
            cell = cells_by_id.get(f"R{r:02d}_C{ci}")
            row_texts.append(cell.get("text", "") if cell else "")
        table_data.append(row_texts)

    # Validate before touching reportlab so an empty grid is reported as 400
    # even when the optional dependency is missing.
    if not table_data or not table_data[0]:
        raise HTTPException(status_code=400, detail="No data to export")

    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.lib import colors
        from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="reportlab not installed") from exc

    import io as _io

    buf = _io.BytesIO()
    doc = SimpleDocTemplate(buf, pagesize=A4)
    t = Table(table_data)
    t.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#0d9488')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        # NOTE(review): 'WORDWRAP' is not a documented reportlab TableStyle
        # command — confirm the pinned reportlab version accepts it.
        ('WORDWRAP', (0, 0), (-1, -1), True),
    ]))
    doc.build([t])
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/pdf",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.pdf"'},
    )
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/export/docx")
async def export_reconstruction_docx(session_id: str):
    """Export the reconstructed cell grid as a DOCX table.

    Builds a header row from ``columns_used`` (falling back to "Col {i}")
    plus one row per grid row, renders it with python-docx and streams the
    document as an attachment.

    Raises:
        HTTPException 404: unknown session id.
        HTTPException 400: session has no word_result yet.
        HTTPException 501: python-docx is not installed.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
    if not word_result:
        raise HTTPException(status_code=400, detail="No word result found")

    cells = word_result.get("cells", [])
    columns_used = word_result.get("columns_used", [])
    grid_shape = word_result.get("grid_shape", {})
    n_rows = grid_shape.get("rows", 0)
    n_cols = grid_shape.get("cols", 0)

    try:
        from docx import Document
    except ImportError as exc:
        raise HTTPException(status_code=501, detail="python-docx not installed") from exc

    import io as _io

    doc = Document()
    doc.add_heading(f'Rekonstruktion -- Session {session_id[:8]}', level=1)

    header = [c.get("label", c.get("type", f"Col {i}")) for i, c in enumerate(columns_used)]
    if not header:
        header = [f"Col {i}" for i in range(n_cols)]

    # BUGFIX: size the table to fit the header as well — if columns_used has
    # more entries than grid_shape.cols, the original indexed past the last
    # table column and raised IndexError.
    table = doc.add_table(rows=1 + n_rows, cols=max(n_cols, len(header), 1))
    table.style = 'Table Grid'

    for ci, h in enumerate(header):
        table.rows[0].cells[ci].text = h

    # Index cells by id once instead of scanning the cell list per grid
    # position (the original lookup was O(rows * cols * cells)).
    cells_by_id = {c.get("cell_id"): c for c in cells}
    for r in range(n_rows):
        for ci in range(n_cols):
            cell = cells_by_id.get(f"R{r:02d}_C{ci}")
            table.rows[r + 1].cells[ci].text = cell.get("text", "") if cell else ""

    buf = _io.BytesIO()
    doc.save(buf)
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f'attachment; filename="reconstruction_{session_id}.docx"'},
    )
|
||||
# Backward-compat shim -- module moved to ocr/pipeline/reconstruction.py
import sys as _sys
import importlib as _importlib

# Swap this module object out of sys.modules for the relocated one, so old
# import paths transparently resolve to ocr.pipeline.reconstruction.
_sys.modules[__name__] = _importlib.import_module("ocr.pipeline.reconstruction")
|
||||
|
||||
Reference in New Issue
Block a user