feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

@@ -206,6 +206,39 @@ services:
networks:
- breakpilot-network
# =========================================================
# DOCUMENT CRAWLER & AUTO-ONBOARDING
# =========================================================
# Document crawler service: built from ./document-crawler and exposed on port 8098.
document-crawler:
  build:
    context: ./document-crawler
    dockerfile: Dockerfile
  container_name: bp-compliance-document-crawler
  # NOTE(review): platform is pinned to ARM64; presumably other host
  # architectures need emulation or a different value - confirm this is intended.
  platform: linux/arm64
  ports:
    # Quoted so the host:container mapping is always read as a string.
    - "8098:8098"
  environment:
    # Numeric-looking values are quoted so they stay explicit strings.
    PORT: "8098"
    # NOTE(review): when POSTGRES_PASSWORD is unset this falls back to the
    # hard-coded default "breakpilot123"; override it outside local development.
    DATABASE_URL: postgresql://${POSTGRES_USER:-breakpilot}:${POSTGRES_PASSWORD:-breakpilot123}@bp-core-postgres:5432/${POSTGRES_DB:-breakpilot_db}
    LLM_GATEWAY_URL: http://ai-compliance-sdk:8090
    DSMS_GATEWAY_URL: http://dsms-gateway:8082
    # Must match the container-side path of the volume mounted below.
    CRAWL_BASE_PATH: /data/crawl
    MAX_FILE_SIZE_MB: "50"
  volumes:
    # Host directory mounted read-only (":ro") at the crawl base path.
    - /tmp/breakpilot-crawl-data:/data/crawl:ro
  depends_on:
    # Start only after the core-health-check service has exited successfully.
    core-health-check:
      condition: service_completed_successfully
  healthcheck:
    # NOTE(review): requires curl inside the image - confirm the Dockerfile
    # installs it, otherwise the container will always report unhealthy.
    test: ["CMD", "curl", "-f", "http://127.0.0.1:8098/health"]
    interval: 30s
    timeout: 10s
    start_period: 15s
    retries: 3
  restart: unless-stopped
  networks:
    - breakpilot-network
# =========================================================
# DOCUMENTATION
# =========================================================