feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -206,6 +206,39 @@ services:
|
||||
networks:
|
||||
- breakpilot-network
|
||||
|
||||
# =========================================================
# DOCUMENT CRAWLER & AUTO-ONBOARDING
# =========================================================
# Compose service built from ./document-crawler. It publishes port 8098,
# reads crawl input from a read-only bind mount, and is wired to the
# Postgres, LLM gateway and DSMS gateway endpoints via environment variables.
document-crawler:
  build:
    context: ./document-crawler
    dockerfile: Dockerfile
  container_name: bp-compliance-document-crawler
  # NOTE(review): pinned to arm64; amd64 hosts would need emulation or an
  # override — confirm this matches the other services in this file.
  platform: linux/arm64
  ports:
    - "8098:8098"
  environment:
    # Numeric-looking values are quoted so they reach the container as strings.
    PORT: "8098"
    # Each ${VAR:-default} falls back to the default when VAR is unset or empty.
    # NOTE(review): this includes a default password baked into the file —
    # confirm that is acceptable outside local development.
    DATABASE_URL: postgresql://${POSTGRES_USER:-breakpilot}:${POSTGRES_PASSWORD:-breakpilot123}@bp-core-postgres:5432/${POSTGRES_DB:-breakpilot_db}
    LLM_GATEWAY_URL: http://ai-compliance-sdk:8090
    DSMS_GATEWAY_URL: http://dsms-gateway:8082
    CRAWL_BASE_PATH: /data/crawl
    MAX_FILE_SIZE_MB: "50"
  volumes:
    # Host directory mounted read-only (:ro) at the CRAWL_BASE_PATH location.
    - /tmp/breakpilot-crawl-data:/data/crawl:ro
  depends_on:
    # Start only after the core-health-check service has exited successfully.
    core-health-check:
      condition: service_completed_successfully
  healthcheck:
    # NOTE(review): requires curl inside the image — confirm the Dockerfile
    # installs it.
    test: ["CMD", "curl", "-f", "http://127.0.0.1:8098/health"]
    interval: 30s
    timeout: 10s
    start_period: 15s
    retries: 3
  restart: unless-stopped
  networks:
    - breakpilot-network
|
||||
|
||||
# =========================================================
|
||||
# DOCUMENTATION
|
||||
# =========================================================
|
||||
|
||||
Reference in New Issue
Block a user