Multi-page PDF support: create one session per page
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 27s
CI / test-go-edu-search (push) Successful in 39s
CI / test-python-klausur (push) Failing after 2m36s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 35s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 27s
CI / test-go-edu-search (push) Successful in 39s
CI / test-python-klausur (push) Failing after 2m36s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 35s
When uploading a PDF with more than one page to the OCR pipeline, each page now gets its own session (grouped by document_group_id). Previously only page 1 was processed. The response includes a pages array with all session IDs so the frontend can navigate between them.

Single-page PDFs and images continue to work as before.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -71,13 +71,36 @@ async def create_session(
|
|||||||
file: UploadFile = File(...),
|
file: UploadFile = File(...),
|
||||||
name: Optional[str] = Form(None),
|
name: Optional[str] = Form(None),
|
||||||
):
|
):
|
||||||
"""Upload a PDF or image file and create a pipeline session."""
|
"""Upload a PDF or image file and create a pipeline session.
|
||||||
|
|
||||||
|
For multi-page PDFs (> 1 page), each page becomes its own session
|
||||||
|
grouped under a ``document_group_id``. The response includes a
|
||||||
|
``pages`` array with one entry per page/session.
|
||||||
|
"""
|
||||||
file_data = await file.read()
|
file_data = await file.read()
|
||||||
filename = file.filename or "upload"
|
filename = file.filename or "upload"
|
||||||
content_type = file.content_type or ""
|
content_type = file.content_type or ""
|
||||||
|
|
||||||
session_id = str(uuid.uuid4())
|
|
||||||
is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf")
|
is_pdf = content_type == "application/pdf" or filename.lower().endswith(".pdf")
|
||||||
|
session_name = name or filename
|
||||||
|
|
||||||
|
# --- Multi-page PDF handling ---
|
||||||
|
if is_pdf:
|
||||||
|
try:
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
pdf_doc = fitz.open(stream=file_data, filetype="pdf")
|
||||||
|
page_count = pdf_doc.page_count
|
||||||
|
pdf_doc.close()
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=400, detail=f"Could not read PDF: {e}")
|
||||||
|
|
||||||
|
if page_count > 1:
|
||||||
|
return await _create_multi_page_sessions(
|
||||||
|
file_data, filename, session_name, page_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Single page (image or 1-page PDF) ---
|
||||||
|
session_id = str(uuid.uuid4())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if is_pdf:
|
if is_pdf:
|
||||||
@@ -93,7 +116,6 @@ async def create_session(
|
|||||||
raise HTTPException(status_code=500, detail="Failed to encode image")
|
raise HTTPException(status_code=500, detail="Failed to encode image")
|
||||||
|
|
||||||
original_png = png_buf.tobytes()
|
original_png = png_buf.tobytes()
|
||||||
session_name = name or filename
|
|
||||||
|
|
||||||
# Persist to DB
|
# Persist to DB
|
||||||
await create_session_db(
|
await create_session_db(
|
||||||
@@ -134,6 +156,81 @@ async def create_session(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _create_multi_page_sessions(
    pdf_data: bytes,
    filename: str,
    base_name: str,
    page_count: int,
) -> dict:
    """Create one pipeline session per PDF page, grouped by a shared ID.

    Every page is rendered, PNG-encoded, persisted via ``create_session_db``
    and registered in the in-memory ``_cache``. All sessions created by one
    call share a freshly generated ``document_group_id``.

    Pages that fail to render or encode are logged and skipped, so
    ``pages`` may contain fewer entries than ``page_count``.

    Args:
        pdf_data: Raw bytes of the uploaded PDF.
        filename: Original upload filename, stored with every session.
        base_name: Display name of the document; each page session is named
            ``"<base_name> — Seite <n>"``.
        page_count: Number of pages in the PDF, as determined by the caller.

    Returns:
        A dict with ``document_group_id``, ``filename``, ``name``,
        ``page_count`` (the total passed in, not the number of sessions
        created) and ``pages`` (one entry per successfully created session).

    Raises:
        HTTPException: 400 if not a single page could be turned into a
            session, instead of reporting success with an empty ``pages``.
    """
    document_group_id = str(uuid.uuid4())
    pages = []

    # Iterate with the 1-based page number used in names, logs and the DB.
    for page_number in range(1, page_count + 1):
        session_id = str(uuid.uuid4())
        page_name = f"{base_name} — Seite {page_number}"

        try:
            # render_pdf_high_res is called with the zero-based page index.
            img_bgr = render_pdf_high_res(
                pdf_data, page_number=page_number - 1, zoom=3.0,
            )
        except Exception as e:
            # Best effort: one broken page must not abort the whole upload.
            logger.warning("Failed to render PDF page %d: %s", page_number, e)
            continue

        ok, png_buf = cv2.imencode(".png", img_bgr)
        if not ok:
            # Previously skipped silently; log it like a render failure.
            logger.warning("Failed to encode PDF page %d as PNG", page_number)
            continue
        page_png = png_buf.tobytes()

        await create_session_db(
            session_id=session_id,
            name=page_name,
            filename=filename,
            original_png=page_png,
            document_group_id=document_group_id,
            page_number=page_number,
        )

        # NOTE(review): every page's full-resolution image stays in _cache;
        # for large PDFs this may use a lot of memory — confirm eviction.
        _cache[session_id] = {
            "id": session_id,
            "filename": filename,
            "name": page_name,
            "original_bgr": img_bgr,
            "oriented_bgr": None,
            "cropped_bgr": None,
            "deskewed_bgr": None,
            "dewarped_bgr": None,
            "orientation_result": None,
            "crop_result": None,
            "deskew_result": None,
            "dewarp_result": None,
            "ground_truth": {},
            "current_step": 1,
        }

        h, w = img_bgr.shape[:2]
        pages.append({
            "session_id": session_id,
            "name": page_name,
            "page_number": page_number,
            "image_width": w,
            "image_height": h,
            "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original",
        })

        logger.info(
            "OCR Pipeline: created page session %s (page %d/%d) from %s (%dx%d)",
            session_id, page_number, page_count, filename, w, h,
        )

    if not pages:
        # Nothing usable was produced; a success response with an empty
        # pages list would leave the client with no session to open.
        raise HTTPException(
            status_code=400,
            detail="Could not render any page of the PDF",
        )

    return {
        "document_group_id": document_group_id,
        "filename": filename,
        "name": base_name,
        "page_count": page_count,
        "pages": pages,
    }
|
||||||
|
|
||||||
|
|
||||||
@router.get("/sessions/{session_id}")
|
@router.get("/sessions/{session_id}")
|
||||||
async def get_session_info(session_id: str):
|
async def get_session_info(session_id: str):
|
||||||
"""Get session info including deskew/dewarp/column results for step navigation."""
|
"""Get session info including deskew/dewarp/column results for step navigation."""
|
||||||
|
|||||||
Reference in New Issue
Block a user