Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
The sed replacement had left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
490
klausur-service/backend/vocab_worksheet_upload_api.py
Normal file
490
klausur-service/backend/vocab_worksheet_upload_api.py
Normal file
@@ -0,0 +1,490 @@
|
||||
"""
|
||||
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
|
||||
|
||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||
|
||||
Routes (no prefix — included into the main /api/v1/vocab router):
|
||||
POST /sessions/{session_id}/upload-pdf-info
|
||||
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||
GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||
POST /sessions/{session_id}/process-single-page/{page_number}
|
||||
POST /sessions/{session_id}/process-pages
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from vocab_worksheet_models import SessionStatus, VocabularyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Local storage path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional heavy dependencies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
|
||||
OCR_PIPELINE_AVAILABLE = True
|
||||
except ImportError:
|
||||
np = None # type: ignore[assignment]
|
||||
OCR_PIPELINE_AVAILABLE = False
|
||||
logger.warning("OCR pipeline imports not available in upload module")
|
||||
|
||||
# Sub-module imports (already split out)
|
||||
from vocab_worksheet_generation import (
|
||||
convert_pdf_page_to_image,
|
||||
convert_pdf_to_images,
|
||||
get_pdf_page_count,
|
||||
)
|
||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
||||
|
||||
try:
|
||||
from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
|
||||
except ImportError:
|
||||
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
|
||||
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# In-memory session store (shared with main module)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_sessions():
    """Return the shared in-memory session store owned by vocab_worksheet_api.

    The import happens lazily inside the function body to avoid a circular
    import with the main module, which includes this router.
    """
    import vocab_worksheet_api
    return vocab_worksheet_api._sessions
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Router (no prefix — will be included into the main vocab router)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
upload_router = APIRouter()  # no prefix; included into the main /api/v1/vocab router
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# POST /sessions/{session_id}/upload-pdf-info
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and return page count plus per-page rotation hints.

    Intended as a preview step before processing so the user can select
    which pages to run through extraction.
    """
    logger.info(f"PDF info request for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]

    # Only PDFs are accepted here — checked by extension OR declared MIME type.
    ext = (file.filename or '').rsplit('.', 1)[-1].lower()
    content_type = file.content_type or ''

    if not (ext == 'pdf' or content_type == 'application/pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    pdf_bytes = await file.read()

    # Persist the upload under the session's storage directory.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as fh:
        fh.write(pdf_bytes)

    page_count = get_pdf_page_count(pdf_bytes)

    # Keep the raw bytes in the session so later endpoints can re-render pages.
    session["pdf_data"] = pdf_bytes
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Pre-compute per-page orientation so thumbnails are shown upright.
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
                bgr = render_pdf_high_res(pdf_bytes, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(bgr)
                if rotation:
                    page_rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                # Best effort: a failed detection just leaves the page unrotated.
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Raises:
        HTTPException: 404 for an unknown session, 400 for a missing PDF or
            out-of-range page number, 500 when rendering fails.
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Bug fix: validate the page index up front (same check as /pdf-page-image)
    # so a bad page number yields a clear 400 instead of a fitz IndexError
    # surfacing as a generic 500 "PDF rendering failed".
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        pdf_page = doc[page_number]
        # Re-apply any rotation recorded by the OCR orientation pass.
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            pdf_page.set_rotation(rot)
        pix = pdf_page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_data = pix.tobytes("png")
        doc.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(io.BytesIO(png_data), media_type="image/png")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# POST /sessions/{session_id}/process-single-page/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.

    Returns a JSON dict with success flag, the page's vocabulary entries,
    counts, the detected rotation, and (when available) a scan-quality report.
    On a per-page extraction failure this returns success=False with an error
    message instead of raising, so the frontend can continue with other pages.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    # Sessions live only in process memory — a restart invalidates them.
    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Derive pipeline-level variable names for the quality report
    # (0 means "unset" for max_cols/min_conf, mapped to None = use pipeline default).
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40 # default; overridden by pipeline when quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            # Render at zoom=3.0 for OCR quality (higher than the preview endpoints).
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # NOTE(review): pipeline entries are assumed to be dicts already
            # carrying 'source_page' (the dedup below relies on it) — confirm
            # against vocab_worksheet_ocr._run_ocr_pipeline_for_page.
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            # Soft failure: report the error for this page, keep the session usable.
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        # Normalize entries to plain dicts (pydantic model, object, or mapping).
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        # NOTE(review): confidence is a hard-coded placeholder here, not a
        # pipeline-derived value — confirm whether the frontend relies on it.
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }

    # Add scan quality report + active steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq

    return result
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# POST /sessions/{session_id}/process-pages (DEPRECATED)
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    # Fix: the original annotation `List[int] = None` is an invalid implicit
    # Optional; `Optional` is already imported from typing at module top.
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages

    Returns:
        Summary dict with per-page success/failure lists, total vocabulary
        count, average extraction confidence, and the new session status.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was uploaded.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            # Record the failure and continue with the remaining pages.
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session (NOTE: unlike /process-single-page, this REPLACES the
    # session's vocabulary instead of merging per page)
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result
|
||||
Reference in New Issue
Block a user