Restructure: Move ocr_pipeline + labeling + crop into ocr/ package
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,189 +1,4 @@
|
||||
"""
|
||||
Sub-session creation for multi-page spreads.
|
||||
|
||||
Used by both the page-split and crop steps when a double-page scan is detected.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import uuid as uuid_mod
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from page_crop import detect_and_crop_page
|
||||
from ocr_pipeline_session_store import (
|
||||
create_session_db,
|
||||
get_sub_sessions,
|
||||
update_session_db,
|
||||
)
|
||||
from orientation_crop_helpers import get_cache_ref
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def create_page_sub_sessions(
    parent_session_id: str,
    parent_cached: dict,
    full_img_bgr: np.ndarray,
    page_splits: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Split a multi-page spread into one pre-cropped sub-session per page.

    Idempotent: when sub-sessions already exist for ``parent_session_id``
    they are summarized and returned instead of being recreated.

    Args:
        parent_session_id: Session id of the spread being split.
        parent_cached: Cached parent session data; ``name`` and
            ``filename`` are read with fallbacks.
        full_img_bgr: Full spread image (BGR channel order).
        page_splits: One rect dict per detected page with keys
            ``page_index``, ``x``, ``y``, ``width``, ``height``.

    Returns:
        One summary dict per sub-session (id, name, page index,
        source rect, cropped size, detected format).
    """
    # Reuse sub-sessions from an earlier call so repeated invocations
    # never duplicate database rows.
    existing = await get_sub_sessions(parent_session_id)
    if existing:
        return [
            {"id": sub["id"], "name": sub["name"], "page_index": sub.get("box_index", idx)}
            for idx, sub in enumerate(existing)
        ]

    base_name = parent_cached.get("name", "Scan")
    base_filename = parent_cached.get("filename", "scan.png")

    created: List[Dict[str, Any]] = []

    for rect in page_splits:
        page_idx = rect["page_index"]
        left, top = rect["x"], rect["y"]
        width, height = rect["width"], rect["height"]

        # Slice this page's region out of the spread; .copy() detaches
        # it from the parent array.
        region_bgr = full_img_bgr[top:top + height, left:left + width].copy()

        # Remove the page's own borders before persisting it.
        cropped_bgr, crop_info = detect_and_crop_page(region_bgr)

        # PNG-encode; fall back to empty bytes if encoding fails.
        encoded_ok, buf = cv2.imencode(".png", cropped_bgr)
        if encoded_ok:
            cropped_png = buf.tobytes()
        else:
            cropped_png = b""

        sub_id = str(uuid_mod.uuid4())
        sub_name = f"{base_name} — Seite {page_idx + 1}"

        await create_session_db(
            session_id=sub_id,
            name=sub_name,
            filename=base_filename,
            original_png=cropped_png,
        )

        # The image is already cropped, so store cropped == original and
        # advance the sub-session past the crop step (step 5).
        await update_session_db(
            sub_id,
            cropped_png=cropped_png,
            crop_result=crop_info,
            current_step=5,
        )

        crop_h, crop_w = cropped_bgr.shape[:2]
        created.append({
            "id": sub_id,
            "name": sub_name,
            "page_index": page_idx,
            "source_rect": rect,
            "cropped_size": {"width": crop_w, "height": crop_h},
            "detected_format": crop_info.get("detected_format"),
        })

        logger.info(
            "Page sub-session %s: page %d, region x=%d w=%d -> cropped %dx%d",
            sub_id, page_idx + 1, left, width, crop_w, crop_h,
        )

    return created
|
||||
|
||||
|
||||
async def create_page_sub_sessions_full(
    parent_session_id: str,
    parent_cached: dict,
    full_img_bgr: np.ndarray,
    page_splits: List[Dict[str, Any]],
    start_step: int = 2,
) -> List[Dict[str, Any]]:
    """Create one RAW (uncropped) sub-session per page for full processing.

    Unlike ``create_page_sub_sessions`` (used by the crop step), these
    sub-sessions store the *uncropped* page region and start at
    ``start_step`` (default 2 = ready for deskew; 1 if orientation still
    needed). Each page goes through its own pipeline independently,
    which is essential for book spreads where each page has a different tilt.

    Args:
        parent_session_id: Session id of the spread being split.
        parent_cached: Cached parent session data; ``name`` and
            ``filename`` are read with fallbacks.
        full_img_bgr: Full spread image (BGR channel order).
        page_splits: One rect dict per detected page with keys
            ``page_index``, ``x``, ``y``, ``width``, ``height``.
        start_step: Pipeline step each sub-session begins at.

    Returns:
        One summary dict per sub-session (id, name, page index,
        source rect, image size).
    """
    cache = get_cache_ref()

    # Idempotent: hand back previously created sub-sessions untouched.
    existing = await get_sub_sessions(parent_session_id)
    if existing:
        return [
            {"id": sub["id"], "name": sub["name"], "page_index": sub.get("box_index", idx)}
            for idx, sub in enumerate(existing)
        ]

    base_name = parent_cached.get("name", "Scan")
    base_filename = parent_cached.get("filename", "scan.png")

    created: List[Dict[str, Any]] = []

    for rect in page_splits:
        page_idx = rect["page_index"]
        left, top = rect["x"], rect["y"]
        width, height = rect["width"], rect["height"]

        # Extract the RAW page region — deliberately NO cropping here;
        # each sub-session runs its own crop step after deskew + dewarp.
        region_bgr = full_img_bgr[top:top + height, left:left + width].copy()

        # PNG-encode; fall back to empty bytes if encoding fails.
        encoded_ok, buf = cv2.imencode(".png", region_bgr)
        if encoded_ok:
            region_png = buf.tobytes()
        else:
            region_png = b""

        sub_id = str(uuid_mod.uuid4())
        sub_name = f"{base_name} — Seite {page_idx + 1}"

        await create_session_db(
            session_id=sub_id,
            name=sub_name,
            filename=base_filename,
            original_png=region_png,
        )

        # start_step=2 -> ready for deskew (orientation already done on spread)
        # start_step=1 -> needs its own orientation (split from original image)
        await update_session_db(sub_id, current_step=start_step)

        # Seed the in-memory cache so the pipeline can run immediately
        # without re-decoding the stored PNG.
        cache[sub_id] = {
            "id": sub_id,
            "filename": base_filename,
            "name": sub_name,
            "original_bgr": region_bgr,
            "oriented_bgr": None,
            "cropped_bgr": None,
            "deskewed_bgr": None,
            "dewarped_bgr": None,
            "orientation_result": None,
            "crop_result": None,
            "deskew_result": None,
            "dewarp_result": None,
            "ground_truth": {},
            "current_step": start_step,
        }

        raw_h, raw_w = region_bgr.shape[:2]
        created.append({
            "id": sub_id,
            "name": sub_name,
            "page_index": page_idx,
            "source_rect": rect,
            "image_size": {"width": raw_w, "height": raw_h},
        })

        logger.info(
            "Page sub-session %s (full pipeline): page %d, region x=%d w=%d -> %dx%d",
            sub_id, page_idx + 1, left, width, raw_w, raw_h,
        )

    return created
|
||||
# Backward-compat shim -- module moved to ocr/pipeline/page_sub_sessions.py
# Replacing this module's sys.modules entry makes the old import path
# resolve to the relocated module, so existing callers keep working.
import importlib as _il
import sys as _sysmod

_sysmod.modules[__name__] = _il.import_module("ocr.pipeline.page_sub_sessions")
|
||||
|
||||
Reference in New Issue
Block a user