feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
37
document-crawler/Dockerfile
Normal file
37
document-crawler/Dockerfile
Normal file
@@ -0,0 +1,37 @@
|
||||
# Document Crawler - Auto-Onboarding Service
|
||||
# Base image: Python 3.11, "slim" variant (the apt-get step further down
# relies on it being apt-based).
FROM python:3.11-slim
|
||||
|
||||
# Image metadata: both labels are set by a single LABEL instruction.
LABEL maintainer="BreakPilot <dev@breakpilot.app>" \
      description="Document Crawler & Auto-Onboarding Service"
|
||||
|
||||
# All following relative paths (COPY destinations, RUN, CMD) resolve under /app.
WORKDIR /app
|
||||
|
||||
# curl is the only OS package added; the HEALTHCHECK further down relies on it.
# The apt package index is removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Python dependencies are installed from requirements.txt before the rest of
# the source is copied, so this layer is rebuilt only when that file changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt
|
||||
|
||||
# Copy application
|
||||
# Copies the entire build context into the working directory.
# NOTE(review): confirm a .dockerignore excludes VCS data, local env files and
# caches — none is visible from this file.
COPY . .
|
||||
|
||||
# Default runtime configuration, set with a single ENV instruction.
# Any of these can be overridden when the container is started.
# NOTE(review): DATABASE_URL embeds a username and password, which end up in
# the image metadata; consider supplying it at runtime instead of baking it in.
ENV PORT=8098 \
    DATABASE_URL=postgresql://breakpilot:breakpilot123@bp-core-postgres:5432/breakpilot_db \
    LLM_GATEWAY_URL=http://ai-compliance-sdk:8090 \
    DSMS_GATEWAY_URL=http://dsms-gateway:8082 \
    CRAWL_BASE_PATH=/data/crawl \
    MAX_FILE_SIZE_MB=50
|
||||
|
||||
# Document the listening port (TCP is the default protocol; spelled out here).
EXPOSE 8098/tcp
|
||||
|
||||
# Health probe: a check fails when GET /health on port 8098 does not return a
# successful HTTP status. Any curl failure is mapped to exit code 1.
HEALTHCHECK \
    --interval=30s \
    --timeout=10s \
    --start-period=15s \
    --retries=3 \
    CMD curl --fail http://localhost:8098/health || exit 1
|
||||
|
||||
# Run application
|
||||
# Start the ASGI app object `app` from module `main` with uvicorn, listening
# on all interfaces. The port is hard-coded to 8098 here; the PORT variable
# set earlier in this file is not referenced by this command.
# NOTE(review): no USER instruction is visible in this file, so the process
# presumably runs as the base image's default user (root) — confirm whether a
# dedicated non-root user can be used given the mounted crawl path.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8098"]
|
||||
Reference in New Issue
Block a user