Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

A sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,490 @@
"""
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Routes (no prefix — included into the main /api/v1/vocab router):
POST /sessions/{session_id}/upload-pdf-info
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
GET /sessions/{session_id}/pdf-page-image/{page_number}
POST /sessions/{session_id}/process-single-page/{page_number}
POST /sessions/{session_id}/process-pages
"""
import io
import logging
import os
import uuid
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from fastapi.responses import StreamingResponse
from vocab_worksheet_models import SessionStatus, VocabularyEntry
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Local storage path
# ---------------------------------------------------------------------------
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# ---------------------------------------------------------------------------
# Optional heavy dependencies
# ---------------------------------------------------------------------------
try:
import numpy as np
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
OCR_PIPELINE_AVAILABLE = True
except ImportError:
np = None # type: ignore[assignment]
OCR_PIPELINE_AVAILABLE = False
logger.warning("OCR pipeline imports not available in upload module")
# Sub-module imports (already split out)
from vocab_worksheet_generation import (
convert_pdf_page_to_image,
convert_pdf_to_images,
get_pdf_page_count,
)
from vocab_worksheet_extraction import extract_vocabulary_from_image
try:
from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
except ImportError:
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------
def _get_sessions():
    """Accessor for the shared in-memory session dict owned by vocab_worksheet_api.

    Imported lazily on each call to avoid a circular import at module load time.
    """
    import vocab_worksheet_api
    return vocab_worksheet_api._sessions
# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()
# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Accept a PDF upload and report its page count plus per-page rotations.

    Meant to be called before processing so the user can pick pages from
    correctly oriented thumbnails.
    """
    logger.info(f"PDF info request for session {session_id}")
    store = _get_sessions()
    if session_id not in store:
        raise HTTPException(status_code=404, detail="Session not found")
    session = store[session_id]

    # Accept by extension OR declared MIME type; reject only when both are wrong.
    suffix = file.filename.split('.')[-1].lower() if file.filename else ''
    mime = file.content_type or ''
    if suffix != 'pdf' and mime != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    raw = await file.read()

    # Persist the upload alongside the session's other artifacts.
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)
    stored_pdf = os.path.join(target_dir, "source.pdf")
    with open(stored_pdf, 'wb') as fh:
        fh.write(raw)

    num_pages = get_pdf_page_count(raw)

    # Keep raw bytes in the session so later endpoints can re-render pages.
    session["pdf_data"] = raw
    session["pdf_path"] = stored_pdf
    session["pdf_page_count"] = num_pages
    session["status"] = "pdf_uploaded"

    # Best-effort orientation detection so thumbnails are shown upright;
    # failures on individual pages are logged and skipped.
    rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(num_pages):
            try:
                bgr = render_pdf_high_res(raw, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(bgr)
                if rotation:
                    rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = rotations

    return {
        "session_id": session_id,
        "page_count": num_pages,
        "filename": file.filename,
        "page_rotations": rotations,
    }
# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Raises:
        HTTPException: 404 unknown session, 400 no PDF / bad page number,
            500 when fitz rendering fails.
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    # Validate the page index up front (consistent with /pdf-page-image);
    # previously an out-of-range page surfaced as a 500 from the render block.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]
    raw_pdf = session.get("pdf_data")
    if not raw_pdf:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    total = session.get("pdf_page_count", 1)
    if not (0 <= page_number < total):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {total} pages (0-indexed).")
    try:
        import fitz
        doc = fitz.open(stream=raw_pdf, filetype="pdf")
        pdf_page = doc[page_number]
        # Re-apply any rotation found during OCR orientation detection
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            pdf_page.set_rotation(rot)
        pixmap = pdf_page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_data = pixmap.tobytes("png")
        doc.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
    Falls back to LLM vision extraction when the OCR pipeline imports are
    unavailable in this deployment.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page; side effects update the
    session's accumulated vocabulary, page rotation map, and status.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
    if session_id not in _get_sessions():
        # German user-facing message: "session not in memory, recreate and re-upload"
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    # Derive pipeline-level variable names for the quality report;
    # 0 means "no override" for max_cols and min_conf (mapped to None).
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None
    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            # zoom=3.0 matches the admin OCR pipeline's render resolution
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            # Pipeline errors are reported as success=False rather than HTTP 500,
            # so the frontend can continue with the remaining pages.
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        # Normalize entries to plain dicts (pydantic model, generic object, or mapping)
        # and make sure each entry carries its source page and an id.
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)
    # German log: "N vocabulary entries extracted"
    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg
    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value
    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        # NOTE(review): fixed placeholder confidence for the success path —
        # presumably the pipeline's real confidence should go here; confirm.
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }
    # Add scan quality report + active steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq
    return result
# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages

    Raises:
        HTTPException: 404 unknown session, 400 no PDF or no valid page numbers.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")
    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page
    else:
        # Drop out-of-range indices early instead of failing deep inside rendering
        pages = [p for p in pages if 0 <= p < page_count]
        if not pages:
            raise HTTPException(status_code=400, detail=f"Invalid page numbers. PDF has {page_count} pages (0-indexed).")
    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)
    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []
    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence
            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)
            # German log: "N vocabulary entries extracted"
            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")
    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0
    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value
    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path
    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages
    return result