Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 20s
Phase 1 of the clean architecture refactor: Replaces the 751-line ocr-overlay monolith with a modular pipeline. Each step gets its own component file. Frontend: /ai/ocr-kombi route with 11 steps (Upload, Orientation, PageSplit, Deskew, Dewarp, ContentCrop, OCR, Structure, GridBuild, GridReview, GroundTruth). Session list supports document grouping for multi-page uploads. Backend: New ocr_kombi/ module with multi-page PDF upload (splits PDF into N sessions with shared document_group_id). DB migration adds document_group_id and page_number columns. Old /ai/ocr-overlay remains fully functional for A/B testing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
133 lines
3.9 KiB
Python
133 lines
3.9 KiB
Python
"""
|
|
Step 1: Upload — handles single images and multi-page PDFs.
|
|
|
|
Multi-page PDFs are split into individual PNG pages, each getting its own
|
|
session linked by a shared document_group_id.
|
|
"""
|
|
|
|
import io
|
|
import uuid
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
|
|
|
from ocr_pipeline_session_store import create_session_db, get_document_group_sessions
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
# Router on which the upload and document-group endpoints in this module are registered.
router = APIRouter()
|
|
|
|
|
|
def _pdf_to_pngs(pdf_bytes: bytes) -> list[bytes]:
    """Render every page of a PDF into a PNG byte buffer.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        One PNG-encoded buffer per page, in document order. The list is
        empty when the document contains no pages.

    Raises:
        HTTPException: 500 when PyMuPDF is not installed; 400 when PyMuPDF
            rejects the bytes while opening them as a PDF.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail="PDF-Verarbeitung nicht verfuegbar (PyMuPDF fehlt)"
        ) from exc

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError) as exc:
        # The caller may hand over bytes that are not a valid PDF (e.g. a
        # wrongly named file). Report that as a client error instead of an
        # unhandled 500.
        # NOTE(review): PyMuPDF's open errors are RuntimeError subclasses in
        # current releases; ValueError is included for older ones — confirm
        # against the pinned version.
        raise HTTPException(
            status_code=400,
            detail="PDF konnte nicht gelesen werden"
        ) from exc

    try:
        # Render at 300 DPI for OCR quality; the matrix is identical for all
        # pages, so build it once. 72 is the PDF default resolution.
        zoom = 300 / 72
        mat = fitz.Matrix(zoom, zoom)
        return [page.get_pixmap(matrix=mat).tobytes("png") for page in doc]
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
|
|
|
|
|
|
@router.post("/upload")
|
|
async def upload_document(
|
|
file: UploadFile = File(...),
|
|
name: Optional[str] = Form(None),
|
|
document_category: Optional[str] = Form(None),
|
|
):
|
|
"""Upload a single image or multi-page PDF.
|
|
|
|
Single image: Creates 1 session with document_group_id + page_number=1.
|
|
Multi-page PDF: Creates N sessions with shared document_group_id,
|
|
page_number 1..N, and titles "Title — S. X".
|
|
"""
|
|
t0 = time.time()
|
|
file_bytes = await file.read()
|
|
filename = file.filename or "upload"
|
|
base_title = name or filename.rsplit(".", 1)[0]
|
|
|
|
is_pdf = (
|
|
filename.lower().endswith(".pdf")
|
|
or file.content_type == "application/pdf"
|
|
or file_bytes[:4] == b"%PDF"
|
|
)
|
|
|
|
group_id = str(uuid.uuid4())
|
|
created_sessions = []
|
|
|
|
if is_pdf:
|
|
pages = _pdf_to_pngs(file_bytes)
|
|
if not pages:
|
|
raise HTTPException(status_code=400, detail="PDF enthaelt keine Seiten")
|
|
|
|
for i, png_bytes in enumerate(pages, start=1):
|
|
session_id = str(uuid.uuid4())
|
|
page_title = f"{base_title} — S. {i}" if len(pages) > 1 else base_title
|
|
session = await create_session_db(
|
|
session_id=session_id,
|
|
name=page_title,
|
|
filename=filename,
|
|
original_png=png_bytes,
|
|
document_group_id=group_id,
|
|
page_number=i,
|
|
)
|
|
created_sessions.append({
|
|
"session_id": session["id"],
|
|
"name": session["name"],
|
|
"page_number": i,
|
|
})
|
|
else:
|
|
# Single image
|
|
session_id = str(uuid.uuid4())
|
|
session = await create_session_db(
|
|
session_id=session_id,
|
|
name=base_title,
|
|
filename=filename,
|
|
original_png=file_bytes,
|
|
document_group_id=group_id,
|
|
page_number=1,
|
|
)
|
|
created_sessions.append({
|
|
"session_id": session["id"],
|
|
"name": session["name"],
|
|
"page_number": 1,
|
|
})
|
|
|
|
duration = round(time.time() - t0, 2)
|
|
logger.info(
|
|
"Upload complete: %d page(s), group=%s, %.2fs",
|
|
len(created_sessions), group_id, duration,
|
|
)
|
|
|
|
return {
|
|
"document_group_id": group_id,
|
|
"page_count": len(created_sessions),
|
|
"sessions": created_sessions,
|
|
"duration_seconds": duration,
|
|
}
|
|
|
|
|
|
@router.get("/documents/{group_id}")
|
|
async def get_document_group(group_id: str):
|
|
"""Get all sessions in a document group, sorted by page_number."""
|
|
sessions = await get_document_group_sessions(group_id)
|
|
if not sessions:
|
|
raise HTTPException(status_code=404, detail="Dokumentgruppe nicht gefunden")
|
|
return {
|
|
"document_group_id": group_id,
|
|
"page_count": len(sessions),
|
|
"sessions": sessions,
|
|
}
|