klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
450 lines
15 KiB
Python
450 lines
15 KiB
Python
"""
|
|
OCR Pipeline Sessions CRUD — session create, read, update, delete, box sessions.
|
|
|
|
Extracted from ocr_pipeline_sessions.py for modularity.
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import uuid
|
|
from typing import Any, Dict, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from fastapi import APIRouter, File, Form, HTTPException, Query, UploadFile
|
|
|
|
from cv_vocab_pipeline import render_image_high_res, render_pdf_high_res
|
|
from ocr_pipeline_common import (
|
|
VALID_DOCUMENT_CATEGORIES,
|
|
UpdateSessionRequest,
|
|
_cache,
|
|
)
|
|
from ocr_pipeline_session_store import (
|
|
create_session_db,
|
|
delete_all_sessions_db,
|
|
delete_session_db,
|
|
get_session_db,
|
|
get_session_image,
|
|
get_sub_sessions,
|
|
list_sessions_db,
|
|
update_session_db,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Session Management Endpoints
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@router.get("/sessions")
async def list_sessions(include_sub_sessions: bool = False):
    """Return all OCR pipeline sessions.

    Sub-sessions (per-box regions created by column detection) are
    excluded unless the caller opts in via ``?include_sub_sessions=true``.
    """
    found = await list_sessions_db(include_sub_sessions=include_sub_sessions)
    return {"sessions": found}
|
|
|
|
|
|
@router.post("/sessions")
async def create_session(
    file: UploadFile = File(...),
    name: Optional[str] = Form(None),
):
    """Upload a PDF or image file and create a pipeline session.

    For multi-page PDFs (> 1 page), each page becomes its own session
    grouped under a ``document_group_id``. The response includes a
    ``pages`` array with one entry per page/session.

    Raises:
        HTTPException: 400 if the PDF cannot be opened or the file
            cannot be rendered; 500 if PNG encoding fails.
    """
    file_data = await file.read()
    filename = file.filename or "upload"
    content_type = file.content_type or ""

    # Detect PDFs by content type OR extension — browsers sometimes send
    # a generic content type for drag-and-drop uploads.
    is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf")
    session_name = name or filename

    # --- Multi-page PDF handling ---
    if is_pdf:
        try:
            import fitz  # PyMuPDF — imported lazily; only needed for PDFs

            pdf_doc = fitz.open(stream=file_data, filetype="pdf")
            page_count = pdf_doc.page_count
            pdf_doc.close()
        except Exception as e:
            # Chain the cause so the PyMuPDF traceback is preserved in logs.
            raise HTTPException(status_code=400, detail=f"Could not read PDF: {e}") from e

        if page_count > 1:
            return await _create_multi_page_sessions(
                file_data, filename, session_name, page_count,
            )

    # --- Single page (image or 1-page PDF) ---
    session_id = str(uuid.uuid4())

    try:
        if is_pdf:
            img_bgr = render_pdf_high_res(file_data, page_number=0, zoom=3.0)
        else:
            img_bgr = render_image_high_res(file_data)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not process file: {e}") from e

    # Encode original as PNG bytes for database persistence
    success, png_buf = cv2.imencode(".png", img_bgr)
    if not success:
        raise HTTPException(status_code=500, detail="Failed to encode image")

    original_png = png_buf.tobytes()

    # Persist to DB
    await create_session_db(
        session_id=session_id,
        name=session_name,
        filename=filename,
        original_png=original_png,
    )

    # Cache BGR array for immediate processing (saves a PNG re-decode on
    # the first pipeline step); later-step slots start empty.
    _cache[session_id] = {
        "id": session_id,
        "filename": filename,
        "name": session_name,
        "original_bgr": img_bgr,
        "oriented_bgr": None,
        "cropped_bgr": None,
        "deskewed_bgr": None,
        "dewarped_bgr": None,
        "orientation_result": None,
        "crop_result": None,
        "deskew_result": None,
        "dewarp_result": None,
        "ground_truth": {},
        "current_step": 1,
    }

    logger.info(f"OCR Pipeline: created session {session_id} from (unknown) "
                f"({img_bgr.shape[1]}x{img_bgr.shape[0]})")

    return {
        "session_id": session_id,
        "filename": filename,
        "name": session_name,
        "image_width": img_bgr.shape[1],
        "image_height": img_bgr.shape[0],
        "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
    }
|
|
|
|
|
|
async def _create_multi_page_sessions(
    pdf_data: bytes,
    filename: str,
    base_name: str,
    page_count: int,
) -> dict:
    """Create one session per PDF page, grouped by document_group_id.

    Each page is rendered at high resolution, persisted to the DB as a
    PNG, and cached in memory for immediate processing. Pages that fail
    to render or encode are skipped (best-effort), so the returned
    ``pages`` array may be shorter than ``page_count``.

    Args:
        pdf_data: Raw PDF file bytes.
        filename: Original upload filename (shared by all page sessions).
        base_name: Display name; each page becomes "<base_name> — Seite N".
        page_count: Page count as reported by PyMuPDF in the caller.

    Returns:
        Response dict containing ``session_id`` (first successfully
        created page, or None if every page failed), ``document_group_id``,
        ``page_count`` and the ``pages`` array.
    """
    document_group_id = str(uuid.uuid4())
    pages = []

    for page_idx in range(page_count):
        session_id = str(uuid.uuid4())
        # "Seite" is German for "page" — label shown in the session list UI.
        page_name = f"{base_name} — Seite {page_idx + 1}"

        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number=page_idx, zoom=3.0)
        except Exception as e:
            # Best-effort: skip unrenderable pages instead of failing the upload.
            logger.warning(f"Failed to render PDF page {page_idx + 1}: {e}")
            continue

        ok, png_buf = cv2.imencode(".png", img_bgr)
        if not ok:
            # Encoding failure silently drops the page (no log by design here).
            continue
        page_png = png_buf.tobytes()

        # Persist the page as its own session, linked via document_group_id.
        await create_session_db(
            session_id=session_id,
            name=page_name,
            filename=filename,
            original_png=page_png,
            document_group_id=document_group_id,
            page_number=page_idx + 1,
        )

        # Cache the decoded BGR array so pipeline steps start without
        # re-decoding the PNG. Field layout mirrors create_session().
        _cache[session_id] = {
            "id": session_id,
            "filename": filename,
            "name": page_name,
            "original_bgr": img_bgr,
            "oriented_bgr": None,
            "cropped_bgr": None,
            "deskewed_bgr": None,
            "dewarped_bgr": None,
            "orientation_result": None,
            "crop_result": None,
            "deskew_result": None,
            "dewarp_result": None,
            "ground_truth": {},
            "current_step": 1,
        }

        h, w = img_bgr.shape[:2]
        pages.append({
            "session_id": session_id,
            "name": page_name,
            "page_number": page_idx + 1,
            "image_width": w,
            "image_height": h,
            "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
        })

        logger.info(
            f"OCR Pipeline: created page session {session_id} "
            f"(page {page_idx + 1}/{page_count}) from (unknown) ({w}x{h})"
        )

    # Include session_id pointing to first page for backwards compatibility
    # (frontends that expect a single session_id will navigate to page 1)
    first_session_id = pages[0]["session_id"] if pages else None

    return {
        "session_id": first_session_id,
        "document_group_id": document_group_id,
        "filename": filename,
        "name": base_name,
        "page_count": page_count,
        "pages": pages,
    }
|
|
|
|
|
|
@router.get("/sessions/{session_id}")
async def get_session_info(session_id: str):
    """Get session info including deskew/dewarp/column results for step navigation.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Get image dimensions from the original PNG. A missing image or a
    # failed decode falls back to 0x0 rather than raising.
    img_w, img_h = 0, 0
    original_png = await get_session_image(session_id, "original")
    if original_png:
        arr = np.frombuffer(original_png, dtype=np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        # BUG FIX: the old one-liner applied the conditional only to
        # img.shape[0]; when imdecode returned None it crashed with
        # AttributeError on img.shape[1].
        if img is not None:
            img_w, img_h = img.shape[1], img.shape[0]

    result = {
        "session_id": session["id"],
        "filename": session.get("filename", ""),
        "name": session.get("name", ""),
        "image_width": img_w,
        "image_height": img_h,
        "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
        "current_step": session.get("current_step", 1),
        "document_category": session.get("document_category"),
        "doc_type": session.get("doc_type"),
    }

    # Copy optional per-step results straight through when present
    # (falsy/empty results are intentionally omitted from the response).
    for key in (
        "orientation_result",
        "crop_result",
        "deskew_result",
        "dewarp_result",
        "column_result",
        "row_result",
        "word_result",
        "doc_type_result",
        "structure_result",
    ):
        if session.get(key):
            result[key] = session[key]

    if session.get("grid_editor_result"):
        # Include summary only to keep response small
        gr = session["grid_editor_result"]
        result["grid_editor_result"] = {
            "summary": gr.get("summary", {}),
            "zones_count": len(gr.get("zones", [])),
            "edited": gr.get("edited", False),
        }
    if session.get("ground_truth"):
        result["ground_truth"] = session["ground_truth"]

    # Box sub-session info (zone_type='box' from column detection — NOT page-split)
    if session.get("parent_session_id"):
        result["parent_session_id"] = session["parent_session_id"]
        result["box_index"] = session.get("box_index")
    else:
        # Check for box sub-sessions (column detection creates these)
        subs = await get_sub_sessions(session_id)
        if subs:
            result["sub_sessions"] = [
                {"id": s["id"], "name": s.get("name"), "box_index": s.get("box_index")}
                for s in subs
            ]

    return result
|
|
|
|
|
|
@router.put("/sessions/{session_id}")
async def update_session(session_id: str, req: UpdateSessionRequest):
    """Update session name and/or document category."""
    changes: Dict[str, Any] = {}

    if req.name is not None:
        changes["name"] = req.name

    if req.document_category is not None:
        # Reject unknown categories before touching the DB.
        if req.document_category not in VALID_DOCUMENT_CATEGORIES:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid category '{req.document_category}'. Valid: {sorted(VALID_DOCUMENT_CATEGORIES)}",
            )
        changes["document_category"] = req.document_category

    if not changes:
        raise HTTPException(status_code=400, detail="Nothing to update")

    if not await update_session_db(session_id, **changes):
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    return {"session_id": session_id, **changes}
|
|
|
|
|
|
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a session."""
    # Evict the in-memory entry regardless of whether a DB row exists.
    _cache.pop(session_id, None)
    if not await delete_session_db(session_id):
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    return {"session_id": session_id, "deleted": True}
|
|
|
|
|
|
@router.delete("/sessions")
async def delete_all_sessions():
    """Delete ALL sessions (cleanup)."""
    # Clear the image cache first, then wipe the DB.
    _cache.clear()
    removed = await delete_all_sessions_db()
    return {"deleted_count": removed}
|
|
|
|
|
|
@router.post("/sessions/{session_id}/create-box-sessions")
async def create_box_sessions(session_id: str):
    """Create sub-sessions for each detected box region.

    Crops box regions from the cropped/dewarped image and creates
    independent sub-sessions that can be processed through the pipeline.

    Idempotent: if sub-sessions already exist for this session, the
    existing ones are returned and no new crops are made.

    Raises:
        HTTPException: 404 if the session does not exist; 400 if column
            detection has not run or no base image is stored; 500 if the
            stored PNG cannot be decoded.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Boxes come from the column-detection step, so that result is required.
    column_result = session.get("column_result")
    if not column_result:
        raise HTTPException(status_code=400, detail="Column detection must be completed first")

    zones = column_result.get("zones") or []
    box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
    if not box_zones:
        return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"}

    # Check for existing sub-sessions (makes the endpoint idempotent)
    existing = await get_sub_sessions(session_id)
    if existing:
        return {
            "session_id": session_id,
            "sub_sessions": [{"id": s["id"], "box_index": s.get("box_index")} for s in existing],
            "message": f"{len(existing)} sub-session(s) already exist",
        }

    # Load base image — prefer "cropped", fall back to "dewarped".
    # NOTE(review): the preference order here is assumed intentional
    # (box coordinates presumably refer to the cropped image) — confirm
    # against the column-detection step.
    base_png = await get_session_image(session_id, "cropped")
    if not base_png:
        base_png = await get_session_image(session_id, "dewarped")
    if not base_png:
        raise HTTPException(status_code=400, detail="No base image available")

    arr = np.frombuffer(base_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")

    parent_name = session.get("name", "Session")
    created = []

    for i, zone in enumerate(box_zones):
        # assumes box x/y/width/height are integer pixel coordinates —
        # float values would break the array slicing below; TODO confirm.
        box = zone["box"]
        bx, by = box["x"], box["y"]
        bw, bh = box["width"], box["height"]

        # Crop box region with small padding, clamped to image bounds
        pad = 5
        y1 = max(0, by - pad)
        y2 = min(img.shape[0], by + bh + pad)
        x1 = max(0, bx - pad)
        x2 = min(img.shape[1], bx + bw + pad)
        crop = img[y1:y2, x1:x2]

        # Encode as PNG for DB persistence; skip the box on failure.
        success, png_buf = cv2.imencode(".png", crop)
        if not success:
            logger.warning(f"Failed to encode box {i} crop for session {session_id}")
            continue

        sub_id = str(uuid.uuid4())
        sub_name = f"{parent_name} — Box {i + 1}"

        # Persist the crop as a child session linked via parent_session_id.
        await create_session_db(
            session_id=sub_id,
            name=sub_name,
            filename=session.get("filename", "box-crop.png"),
            original_png=png_buf.tobytes(),
            parent_session_id=session_id,
            box_index=i,
        )

        # Cache the BGR for immediate processing
        # Promote original to cropped so column/row/word detection finds it
        box_bgr = crop.copy()
        _cache[sub_id] = {
            "id": sub_id,
            "filename": session.get("filename", "box-crop.png"),
            "name": sub_name,
            "parent_session_id": session_id,
            "original_bgr": box_bgr,
            "oriented_bgr": None,
            "cropped_bgr": box_bgr,
            "deskewed_bgr": None,
            "dewarped_bgr": None,
            "orientation_result": None,
            "crop_result": None,
            "deskew_result": None,
            "dewarp_result": None,
            "ground_truth": {},
            "current_step": 1,
        }

        created.append({
            "id": sub_id,
            "name": sub_name,
            "box_index": i,
            "box": box,
            "image_width": crop.shape[1],
            "image_height": crop.shape[0],
        })

        logger.info(f"Created box sub-session {sub_id} for session {session_id} "
                    f"(box {i}, {crop.shape[1]}x{crop.shape[0]})")

    return {
        "session_id": session_id,
        "sub_sessions": created,
        "total": len(created),
    }
|