feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
21
document-crawler/config.py
Normal file
21
document-crawler/config.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Environment-based settings for Document Crawler service."""
|
||||
|
||||
import os
|
||||
|
||||
|
||||
def _env_int(var: str, default: str) -> int:
    """Read *var* from the environment and coerce it to int."""
    return int(os.getenv(var, default))


class Settings:
    """Service configuration, resolved once from the environment at import time."""

    # Port the FastAPI server binds to.
    PORT: int = _env_int("PORT", "8098")

    # Postgres DSN for the shared breakpilot database.
    # NOTE(review): the fallback embeds dev credentials; fine for local compose,
    # but production deployments must set DATABASE_URL explicitly.
    DATABASE_URL: str = os.getenv(
        "DATABASE_URL",
        "postgresql://breakpilot:breakpilot123@bp-core-postgres:5432/breakpilot_db"
    )

    # Sibling-service endpoints (defaults are docker-compose hostnames).
    LLM_GATEWAY_URL: str = os.getenv("LLM_GATEWAY_URL", "http://ai-compliance-sdk:8090")
    DSMS_GATEWAY_URL: str = os.getenv("DSMS_GATEWAY_URL", "http://dsms-gateway:8082")

    # Root directory the crawler scans for candidate documents.
    CRAWL_BASE_PATH: str = os.getenv("CRAWL_BASE_PATH", "/data/crawl")

    # File-size cap: configured in megabytes, derived once in bytes.
    MAX_FILE_SIZE_MB: int = _env_int("MAX_FILE_SIZE_MB", "50")
    MAX_FILE_SIZE_BYTES: int = MAX_FILE_SIZE_MB * 1024 * 1024

    # Maximum characters of extracted text forwarded to the LLM classifier.
    LLM_TEXT_LIMIT: int = _env_int("LLM_TEXT_LIMIT", "3000")

    # Extensions the extractor pipeline accepts.
    SUPPORTED_EXTENSIONS: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]


# Module-level singleton imported by the rest of the service.
settings = Settings()
|
||||
Reference in New Issue
Block a user