Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s

grid/ package (16 files):
  grid/build/   — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/  — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compatibility shim modules keep the old import paths working. Internal imports within the new packages are relative. The RAG code is untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:30:20 +02:00
parent 098a2ff092
commit 59c400b9aa
58 changed files with 8803 additions and 8659 deletions

View File

@@ -0,0 +1,490 @@
"""
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Routes (no prefix — included into the main /api/v1/vocab router):
POST /sessions/{session_id}/upload-pdf-info
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
GET /sessions/{session_id}/pdf-page-image/{page_number}
POST /sessions/{session_id}/process-single-page/{page_number}
POST /sessions/{session_id}/process-pages
"""
import io
import logging
import os
import uuid
from typing import List, Optional

from fastapi import APIRouter, File, HTTPException, Query, UploadFile
from fastapi.responses import StreamingResponse

from .models import SessionStatus
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Local storage path
# ---------------------------------------------------------------------------
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# ---------------------------------------------------------------------------
# Optional heavy dependencies
# ---------------------------------------------------------------------------
try:
import numpy as np
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
OCR_PIPELINE_AVAILABLE = True
except ImportError:
np = None # type: ignore[assignment]
OCR_PIPELINE_AVAILABLE = False
logger.warning("OCR pipeline imports not available in upload module")
# Sub-module imports (already split out)
from .generation import (
convert_pdf_page_to_image,
convert_pdf_to_images,
get_pdf_page_count,
)
from .extraction import extract_vocabulary_from_image
try:
from .ocr import _run_ocr_pipeline_for_page
except ImportError:
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------
def _get_sessions():
    """Return the shared in-memory session dict owned by the main API module.

    The import happens lazily, inside the function, to avoid a circular
    import between this module and ``.api`` at load time.
    """
    from .api import _sessions

    return _sessions
# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()
# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and return its page count plus per-page orientation info.

    Use this before processing so the user can select pages. The raw PDF
    bytes are kept in the session dict for the thumbnail/processing
    endpoints, and a copy is written to local storage.

    Raises:
        HTTPException 404: unknown session.
        HTTPException 400: non-PDF upload, empty file, or unreadable PDF.
    """
    logger.info(f"PDF info request for session {session_id}")
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    # Validate file type (accept either the file extension or the MIME type)
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''
    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Save the PDF to local storage so other endpoints can re-open it
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(content)

    # A corrupt/unreadable PDF should yield a 400, not an unhandled 500
    try:
        page_count = get_pdf_page_count(content)
    except Exception as e:
        logger.warning(f"Could not read PDF for session {session_id}: {e}")
        raise HTTPException(status_code=400, detail="Invalid or corrupted PDF file")

    # Store PDF data in the session for later processing
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Detect orientation for each page so thumbnails are shown correctly
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(img_bgr)
                if rotation:
                    page_rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                # Best-effort: a failed detection just means no correction
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }
# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Raises:
        HTTPException 404: unknown session.
        HTTPException 400: no PDF uploaded or page number out of range.
        HTTPException 500: PDF rendering failure.
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Validate the page number up front (consistent with /pdf-page-image) so
    # an out-of-range request yields a 400 instead of a 500 from fitz.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz

        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz

        doc = fitz.open(stream=pdf_data, filetype="pdf")
        pdf_page = doc[page_number]
        # Re-apply the orientation correction detected during OCR processing.
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            pdf_page.set_rotation(rot)
        pixmap = pdf_page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_data = pixmap.tobytes("png")
        doc.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(io.BytesIO(png_data), media_type="image/png")
# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
    Falls back to LLM vision extraction when the OCR pipeline is unavailable.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page. Pipeline failures are
    reported as a success=False payload (HTTP 200), not as a 5xx error.

    Raises:
        HTTPException 404: session missing from the in-memory store.
        HTTPException 400: no PDF uploaded or page number out of range.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Derive pipeline-level variable names for the quality report.
    # 0 means "unlimited"/"auto", which the pipeline expects as None.
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            # Render at zoom=3.0 to match the admin OCR pipeline's input quality
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            # Pipeline errors are returned to the caller as a structured
            # failure payload so the frontend can continue with other pages.
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        # Normalize each entry to a plain dict (entries may be pydantic
        # models, simple objects, or already dict-like) and tag it with its
        # 1-based source page plus a stable id.
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }
    # Add scan quality report + active steps info so the frontend can show
    # which tuning steps were actually applied for this page.
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq
    return result
# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF via LLM vision extraction.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages

    Raises:
        HTTPException 404: unknown session.
        HTTPException 400: no PDF uploaded or a requested page is out of range.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page

    # Validate requested pages up front (consistent with /process-single-page)
    # so a bad index yields a 400 instead of failing deep inside conversion.
    if any(p < 0 or p >= page_count for p in pages):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []
    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence
            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)
            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages
    return result