diff --git a/klausur-service/backend/orientation_crop_api.py b/klausur-service/backend/orientation_crop_api.py index 67a0fde..cee1c64 100644 --- a/klausur-service/backend/orientation_crop_api.py +++ b/klausur-service/backend/orientation_crop_api.py @@ -9,7 +9,8 @@ These endpoints were extracted from the main pipeline to keep files manageable. import logging import time -from typing import Any, Dict, Optional +import uuid as uuid_mod +from typing import Any, Dict, List, Optional import cv2 import numpy as np @@ -17,10 +18,12 @@ from fastapi import APIRouter, HTTPException from pydantic import BaseModel from cv_vocab_pipeline import detect_and_fix_orientation -from page_crop import detect_and_crop_page +from page_crop import detect_and_crop_page, detect_page_splits from ocr_pipeline_session_store import ( + create_session_db, get_session_db, get_session_image, + get_sub_sessions, update_session_db, ) @@ -170,6 +173,10 @@ async def auto_crop(session_id: str): Reads the dewarped image (post-deskew + dewarp, so the page is straight). Falls back to oriented → original if earlier steps were skipped. + + If the image is a multi-page spread (e.g. book on scanner), it will + automatically split into separate sub-sessions per page, crop each + individually, and return the split info. 
""" cached = await _ensure_cached(session_id) @@ -184,10 +191,68 @@ async def auto_crop(session_id: str): t0 = time.time() + # --- Multi-page detection --- + page_splits = detect_page_splits(img_bgr) + + if page_splits and len(page_splits) >= 2: + # Multi-page spread detected — create sub-sessions + sub_sessions = await _create_page_sub_sessions( + session_id, cached, img_bgr, page_splits, + ) + duration = time.time() - t0 + + crop_info: Dict[str, Any] = { + "crop_applied": True, + "multi_page": True, + "page_count": len(page_splits), + "page_splits": page_splits, + "duration_seconds": round(duration, 2), + } + cached["crop_result"] = crop_info + + # Store the first page as the main cropped image for backward compat + first_page = page_splits[0] + first_bgr = img_bgr[ + first_page["y"]:first_page["y"] + first_page["height"], + first_page["x"]:first_page["x"] + first_page["width"], + ].copy() + first_cropped, _ = detect_and_crop_page(first_bgr) + cached["cropped_bgr"] = first_cropped + + ok, png_buf = cv2.imencode(".png", first_cropped) + await update_session_db( + session_id, + cropped_png=png_buf.tobytes() if ok else b"", + crop_result=crop_info, + current_step=5, + ) + + logger.info( + "OCR Pipeline: crop session %s: multi-page split into %d pages in %.2fs", + session_id, len(page_splits), duration, + ) + + await _append_pipeline_log(session_id, "crop", { + "multi_page": True, + "page_count": len(page_splits), + }, duration_ms=int(duration * 1000)) + + h, w = first_cropped.shape[:2] + return { + "session_id": session_id, + **crop_info, + "image_width": w, + "image_height": h, + "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped", + "sub_sessions": sub_sessions, + } + + # --- Single page (normal) --- cropped_bgr, crop_info = detect_and_crop_page(img_bgr) duration = time.time() - t0 crop_info["duration_seconds"] = round(duration, 2) + crop_info["multi_page"] = False # Encode cropped image success, png_buf = cv2.imencode(".png", 
async def _create_page_sub_sessions(
    parent_session_id: str,
    parent_cached: dict,
    full_img_bgr: np.ndarray,
    page_splits: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Create one sub-session per detected page of a multi-page spread.

    Every page rectangle is cut out of *full_img_bgr*, individually
    border-cropped, PNG-encoded, and persisted as its own session linked
    to the parent via ``parent_session_id`` so the remaining pipeline
    steps can run on each page independently.
    """
    # Idempotency guard: if sub-sessions already exist for this parent,
    # return them instead of creating duplicates.
    existing = await get_sub_sessions(parent_session_id)
    if existing:
        return [
            {"id": s["id"], "name": s["name"], "page_index": s.get("box_index", i)}
            for i, s in enumerate(existing)
        ]

    parent_name = parent_cached.get("name", "Scan")
    parent_filename = parent_cached.get("filename", "scan.png")

    created: List[Dict[str, Any]] = []

    for page in page_splits:
        page_idx = page["page_index"]
        left, top = page["x"], page["y"]
        width, height = page["width"], page["height"]

        # Cut the page region out of the full spread.
        region = full_img_bgr[top:top + height, left:left + width].copy()

        # Remove this page's own borders as well.
        cropped_page, page_crop_info = detect_and_crop_page(region)

        # PNG-encode; an encoder failure degrades to an empty payload.
        encoded_ok, buf = cv2.imencode(".png", cropped_page)
        page_png = buf.tobytes() if encoded_ok else b""

        sub_id = str(uuid_mod.uuid4())
        sub_name = f"{parent_name} — Seite {page_idx + 1}"

        await create_session_db(
            session_id=sub_id,
            name=sub_name,
            filename=parent_filename,
            original_png=page_png,
            parent_session_id=parent_session_id,
            box_index=page_idx,
        )

        # The stored original is already cropped, so mark the crop step done.
        await update_session_db(
            sub_id,
            cropped_png=page_png,
            crop_result=page_crop_info,
            current_step=5,
        )

        crop_h, crop_w = cropped_page.shape[:2]
        created.append({
            "id": sub_id,
            "name": sub_name,
            "page_index": page_idx,
            "source_rect": page,
            "cropped_size": {"width": crop_w, "height": crop_h},
            "detected_format": page_crop_info.get("detected_format"),
        })

        logger.info(
            "Page sub-session %s: page %d, region x=%d w=%d -> cropped %dx%d",
            sub_id, page_idx + 1, left, width, crop_w, crop_h,
        )

    return created
def detect_page_splits(
    img_bgr: np.ndarray,
    min_gap_frac: float = 0.008,
    max_pages: int = 4,
) -> list:
    """Detect if the image is a multi-page spread and return split rectangles.

    Looks for wide low-ink vertical valleys (spine/gutter areas) in the
    central region of the image, which indicate multiple pages lying side
    by side (e.g. an open book on a scanner).

    Args:
        img_bgr: BGR image (as produced by cv2).
        min_gap_frac: minimum gap width, as a fraction of image width,
            for a valley to count as a page separator.
        max_pages: upper bound on the number of pages returned; at most
            ``max_pages - 1`` split points are used (default 4 pages,
            matching the previous hard-coded limit).

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}`` ordered
        left to right, or an empty list if only one page is detected.
    """
    h, w = img_bgr.shape[:2]

    # Only clearly landscape images can be side-by-side spreads:
    # require width >= 1.15 * height.
    # (Fixed: the old comment claimed "width > height * 0.85", which
    # contradicted this check.)
    if w < h * 1.15:
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # Vertical projection: mean ink density per column, normalized to [0, 1].
    v_proj = np.mean(binary, axis=0) / 255.0

    # Smooth with a boxcar kernel (width = 0.5% of image width, min 5, odd).
    kern = max(5, w // 200)
    if kern % 2 == 0:
        kern += 1
    v_smooth = np.convolve(v_proj, np.ones(kern) / kern, mode="same")

    peak = float(np.max(v_smooth))
    if peak < 0.005:
        # Essentially blank image — nothing to split.
        return []

    # Look for valleys in the center region (25-75% of width).
    gap_thresh = peak * 0.15  # valley must be < 15% of peak density
    center_lo = int(w * 0.25)
    center_hi = int(w * 0.75)
    min_gap_px = max(5, int(w * min_gap_frac))

    # Find contiguous low-density (gap) runs within the center region.
    gaps: list = []
    in_gap = False
    gap_start = 0
    for x in range(center_lo, center_hi):
        if v_smooth[x] < gap_thresh:
            if not in_gap:
                gap_start = x
                in_gap = True
        else:
            if in_gap:
                gap_w = x - gap_start
                if gap_w >= min_gap_px:
                    gaps.append({"x": gap_start, "width": gap_w,
                                 "center": gap_start + gap_w // 2})
                in_gap = False
    if in_gap:
        # Run extends to the edge of the search window.
        gap_w = center_hi - gap_start
        if gap_w >= min_gap_px:
            gaps.append({"x": gap_start, "width": gap_w,
                         "center": gap_start + gap_w // 2})

    if not gaps:
        return []

    # Widest gaps are the most likely spines; use up to max_pages - 1 of them.
    gaps.sort(key=lambda g: g["width"], reverse=True)
    split_points = sorted(g["center"] for g in gaps[:max(1, max_pages - 1)])

    # Build page rectangles between consecutive split points.
    pages: list = []
    prev_x = 0
    for i, sx in enumerate(split_points):
        pages.append({"x": prev_x, "y": 0, "width": sx - prev_x,
                      "height": h, "page_index": i})
        prev_x = sx
    pages.append({"x": prev_x, "y": 0, "width": w - prev_x,
                  "height": h, "page_index": len(split_points)})

    # Filter out tiny slivers (< 15% of total width) — usually noise.
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-index after filtering so page_index stays contiguous.
    for i, p in enumerate(pages):
        p["page_index"] = i

    logger.info(
        "Page split detected: %d pages, gap widths=%s, split_points=%s",
        len(pages),
        [g["width"] for g in gaps[:len(split_points)]],
        split_points,
    )
    return pages