feat(klausur-service): Add Tesseract OCR, DSFA RAG, TrOCR, grid detection and vocab session store
New modules:
- tesseract_vocab_extractor.py: Bounding-box OCR with multi-PSM pipeline
- grid_detection_service.py: CV-based grid/table detection for worksheets
- vocab_session_store.py: PostgreSQL persistence for vocab sessions
- trocr_api.py: TrOCR handwriting recognition endpoint
- dsfa_rag_api.py + dsfa_corpus_ingestion.py: DSFA RAG corpus search

Changes:
- Dockerfile: Install tesseract-ocr + deu/eng language packs
- requirements.txt: Add PyMuPDF, pytesseract, Pillow
- main.py: Register new routers, init DB pools + Qdrant collections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,6 +20,7 @@ This is the main entry point. All functionality is organized in modular packages
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import asyncpg
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
@@ -36,7 +37,19 @@ from admin_api import router as admin_router
|
||||
from zeugnis_api import router as zeugnis_router
|
||||
from training_api import router as training_router
|
||||
from mail.api import router as mail_router
|
||||
from trocr_api import router as trocr_router
|
||||
try:
|
||||
from trocr_api import router as trocr_router
|
||||
except ImportError:
|
||||
trocr_router = None
|
||||
from vocab_worksheet_api import router as vocab_router, set_db_pool as set_vocab_db_pool, _init_vocab_table, _load_all_sessions, DATABASE_URL as VOCAB_DATABASE_URL
|
||||
try:
|
||||
from dsfa_rag_api import router as dsfa_rag_router, set_db_pool as set_dsfa_db_pool
|
||||
from dsfa_corpus_ingestion import DSFAQdrantService, DATABASE_URL as DSFA_DATABASE_URL
|
||||
except ImportError:
|
||||
dsfa_rag_router = None
|
||||
set_dsfa_db_pool = None
|
||||
DSFAQdrantService = None
|
||||
DSFA_DATABASE_URL = None
|
||||
|
||||
# BYOEH Qdrant initialization
|
||||
from qdrant_service import init_qdrant_collection
|
||||
@@ -51,12 +64,42 @@ async def lifespan(app: FastAPI):
|
||||
"""Application lifespan manager for startup and shutdown events."""
|
||||
print("Klausur-Service starting...")
|
||||
|
||||
# Initialize database pool for Vocab Sessions
|
||||
vocab_db_pool = None
|
||||
try:
|
||||
vocab_db_pool = await asyncpg.create_pool(VOCAB_DATABASE_URL, min_size=2, max_size=5)
|
||||
set_vocab_db_pool(vocab_db_pool)
|
||||
await _init_vocab_table()
|
||||
await _load_all_sessions()
|
||||
print(f"Vocab sessions database initialized")
|
||||
except Exception as e:
|
||||
print(f"Warning: Vocab sessions database initialization failed: {e}")
|
||||
|
||||
# Initialize database pool for DSFA RAG
|
||||
dsfa_db_pool = None
|
||||
if DSFA_DATABASE_URL and set_dsfa_db_pool:
|
||||
try:
|
||||
dsfa_db_pool = await asyncpg.create_pool(DSFA_DATABASE_URL, min_size=2, max_size=10)
|
||||
set_dsfa_db_pool(dsfa_db_pool)
|
||||
print(f"DSFA database pool initialized: {DSFA_DATABASE_URL}")
|
||||
except Exception as e:
|
||||
print(f"Warning: DSFA database pool initialization failed: {e}")
|
||||
|
||||
# Initialize Qdrant collection for BYOEH
|
||||
try:
|
||||
await init_qdrant_collection()
|
||||
print("Qdrant BYOEH collection initialized")
|
||||
except Exception as e:
|
||||
print(f"Warning: Qdrant initialization failed: {e}")
|
||||
print(f"Warning: Qdrant BYOEH initialization failed: {e}")
|
||||
|
||||
# Initialize Qdrant collection for DSFA RAG
|
||||
if DSFAQdrantService:
|
||||
try:
|
||||
dsfa_qdrant = DSFAQdrantService()
|
||||
await dsfa_qdrant.ensure_collection()
|
||||
print("Qdrant DSFA corpus collection initialized")
|
||||
except Exception as e:
|
||||
print(f"Warning: Qdrant DSFA initialization failed: {e}")
|
||||
|
||||
# Ensure EH upload directory exists
|
||||
os.makedirs(EH_UPLOAD_DIR, exist_ok=True)
|
||||
@@ -65,6 +108,16 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
print("Klausur-Service shutting down...")
|
||||
|
||||
# Close Vocab sessions database pool
|
||||
if vocab_db_pool:
|
||||
await vocab_db_pool.close()
|
||||
print("Vocab sessions database pool closed")
|
||||
|
||||
# Close DSFA database pool
|
||||
if dsfa_db_pool:
|
||||
await dsfa_db_pool.close()
|
||||
print("DSFA database pool closed")
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="Klausur-Service",
|
||||
@@ -94,7 +147,11 @@ app.include_router(admin_router) # NiBiS Ingestion
|
||||
app.include_router(zeugnis_router) # Zeugnis Rights-Aware Crawler
|
||||
app.include_router(training_router) # Training Management
|
||||
app.include_router(mail_router) # Unified Inbox Mail
|
||||
app.include_router(trocr_router) # TrOCR Handwriting OCR
|
||||
if trocr_router:
|
||||
app.include_router(trocr_router) # TrOCR Handwriting OCR
|
||||
app.include_router(vocab_router) # Vocabulary Worksheet Generator
|
||||
if dsfa_rag_router:
|
||||
app.include_router(dsfa_rag_router) # DSFA RAG Corpus Search
|
||||
|
||||
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user