feat(klausur-service): Add Tesseract OCR, DSFA RAG, TrOCR, grid detection and vocab session store

New modules:
- tesseract_vocab_extractor.py: Bounding-box OCR with multi-PSM pipeline
- grid_detection_service.py: CV-based grid/table detection for worksheets
- vocab_session_store.py: PostgreSQL persistence for vocab sessions
- trocr_api.py: TrOCR handwriting recognition endpoint
- dsfa_rag_api.py + dsfa_corpus_ingestion.py: DSFA RAG corpus search

Changes:
- Dockerfile: Install tesseract-ocr + deu/eng language packs
- requirements.txt: Add PyMuPDF, pytesseract, Pillow
- main.py: Register new routers, init DB pools + Qdrant collections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
BreakPilot Dev
2026-02-10 00:00:19 +01:00
parent 46cb873190
commit 53219e3eaf
9 changed files with 3829 additions and 4 deletions

View File

@@ -20,6 +20,7 @@ This is the main entry point. All functionality is organized in modular packages
import os
from contextlib import asynccontextmanager
import asyncpg
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
@@ -36,7 +37,19 @@ from admin_api import router as admin_router
from zeugnis_api import router as zeugnis_router
from training_api import router as training_router
from mail.api import router as mail_router
from trocr_api import router as trocr_router
try:
from trocr_api import router as trocr_router
except ImportError:
trocr_router = None
from vocab_worksheet_api import router as vocab_router, set_db_pool as set_vocab_db_pool, _init_vocab_table, _load_all_sessions, DATABASE_URL as VOCAB_DATABASE_URL
try:
from dsfa_rag_api import router as dsfa_rag_router, set_db_pool as set_dsfa_db_pool
from dsfa_corpus_ingestion import DSFAQdrantService, DATABASE_URL as DSFA_DATABASE_URL
except ImportError:
dsfa_rag_router = None
set_dsfa_db_pool = None
DSFAQdrantService = None
DSFA_DATABASE_URL = None
# BYOEH Qdrant initialization
from qdrant_service import init_qdrant_collection
@@ -51,12 +64,42 @@ async def lifespan(app: FastAPI):
"""Application lifespan manager for startup and shutdown events."""
print("Klausur-Service starting...")
# Initialize database pool for Vocab Sessions
vocab_db_pool = None
try:
vocab_db_pool = await asyncpg.create_pool(VOCAB_DATABASE_URL, min_size=2, max_size=5)
set_vocab_db_pool(vocab_db_pool)
await _init_vocab_table()
await _load_all_sessions()
print(f"Vocab sessions database initialized")
except Exception as e:
print(f"Warning: Vocab sessions database initialization failed: {e}")
# Initialize database pool for DSFA RAG
dsfa_db_pool = None
if DSFA_DATABASE_URL and set_dsfa_db_pool:
try:
dsfa_db_pool = await asyncpg.create_pool(DSFA_DATABASE_URL, min_size=2, max_size=10)
set_dsfa_db_pool(dsfa_db_pool)
print(f"DSFA database pool initialized: {DSFA_DATABASE_URL}")
except Exception as e:
print(f"Warning: DSFA database pool initialization failed: {e}")
# Initialize Qdrant collection for BYOEH
try:
await init_qdrant_collection()
print("Qdrant BYOEH collection initialized")
except Exception as e:
print(f"Warning: Qdrant initialization failed: {e}")
print(f"Warning: Qdrant BYOEH initialization failed: {e}")
# Initialize Qdrant collection for DSFA RAG
if DSFAQdrantService:
try:
dsfa_qdrant = DSFAQdrantService()
await dsfa_qdrant.ensure_collection()
print("Qdrant DSFA corpus collection initialized")
except Exception as e:
print(f"Warning: Qdrant DSFA initialization failed: {e}")
# Ensure EH upload directory exists
os.makedirs(EH_UPLOAD_DIR, exist_ok=True)
@@ -65,6 +108,16 @@ async def lifespan(app: FastAPI):
print("Klausur-Service shutting down...")
# Close Vocab sessions database pool
if vocab_db_pool:
await vocab_db_pool.close()
print("Vocab sessions database pool closed")
# Close DSFA database pool
if dsfa_db_pool:
await dsfa_db_pool.close()
print("DSFA database pool closed")
app = FastAPI(
title="Klausur-Service",
@@ -94,7 +147,11 @@ app.include_router(admin_router) # NiBiS Ingestion
app.include_router(zeugnis_router) # Zeugnis Rights-Aware Crawler
app.include_router(training_router) # Training Management
app.include_router(mail_router) # Unified Inbox Mail
app.include_router(trocr_router) # TrOCR Handwriting OCR
if trocr_router:
app.include_router(trocr_router) # TrOCR Handwriting OCR
app.include_router(vocab_router) # Vocabulary Worksheet Generator
if dsfa_rag_router:
app.include_router(dsfa_rag_router) # DSFA RAG Corpus Search
# =============================================