# Dockerfile (46 lines, 1.0 KiB)
#
# New modules:
#   - tesseract_vocab_extractor.py: Bounding-box OCR with multi-PSM pipeline
#   - grid_detection_service.py: CV-based grid/table detection for worksheets
#   - vocab_session_store.py: PostgreSQL persistence for vocab sessions
#   - trocr_api.py: TrOCR handwriting recognition endpoint
#   - dsfa_rag_api.py + dsfa_corpus_ingestion.py: DSFA RAG corpus search
#
# Changes:
#   - Dockerfile: Install tesseract-ocr + deu/eng language packs
#   - requirements.txt: Add PyMuPDF, pytesseract, Pillow
#   - main.py: Register new routers, init DB pools + Qdrant collections
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# Build stage for the React frontend. The production stage copies
# /frontend/dist out of this stage, so nothing else from here ships.
FROM node:20-alpine AS frontend-builder

WORKDIR /frontend

# Copy only the package manifests first so the dependency layer stays cached
# until package.json / package-lock.json change.
COPY frontend/package*.json ./

# Prefer a reproducible, lockfile-exact install. Fall back to `npm install`
# when the build context ships no package-lock.json, so the build still works.
RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi

# Bring in the remaining frontend sources and run the project's build script.
COPY frontend/ ./
RUN npm run build

# Production stage: Python runtime for the backend plus the prebuilt frontend.
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies in one layer and remove the apt lists in that
# same layer so they never end up in the image:
#   - curl: required by the HEALTHCHECK below
#   - tesseract-ocr + deu/eng language packs: OCR for bounding-box extraction
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged account for the running service. A fixed numeric UID/GID lets
# orchestrators verify the container is non-root; the home directory gives
# libraries a writable per-user cache location.
RUN groupadd --gid 10001 app \
    && useradd --uid 10001 --gid app --create-home --shell /usr/sbin/nologin app

# Install Python dependencies before copying the source so this layer is only
# rebuilt when requirements.txt changes.
COPY backend/requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy backend code, owned by the service account.
COPY --chown=app:app backend/ ./

# Copy built frontend to the expected path.
COPY --chown=app:app --from=frontend-builder /frontend/dist ./frontend/dist

# Create the uploads directory and make it (and /app itself) writable by the
# service account.
# NOTE(review): a pre-existing volume or bind mount on /app/uploads that is
# owned by root must be re-owned to UID 10001 — confirm against the deployment.
RUN mkdir -p /app/uploads && chown app:app /app /app/uploads

# Drop root for everything that runs in the container from here on.
USER app

# Port the uvicorn server below listens on (documentation only).
EXPOSE 8086

# Health check: -f fails on HTTP errors, -sS stays quiet unless something
# goes wrong, which keeps the health log readable.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -fsS http://localhost:8086/health || exit 1

# Run the application (exec form, so uvicorn is PID 1 and receives SIGTERM).
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8086"]