feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
63
document-crawler/main.py
Normal file
63
document-crawler/main.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""
|
||||
Document Crawler & Auto-Onboarding Service
|
||||
Scans local filesystems for compliance documents, classifies them via LLM,
|
||||
archives to IPFS, and generates compliance gap analysis reports.
|
||||
"""
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from db import get_pool, close_pool, run_migration
|
||||
from api.sources import router as sources_router
|
||||
from api.jobs import router as jobs_router
|
||||
from api.documents import router as documents_router
|
||||
from api.reports import router as reports_router
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan handler: open resources on startup, release on shutdown.

    Startup: warms the shared DB connection pool and applies pending
    schema migrations before the app starts serving requests.
    Shutdown: closes the pool.

    Fix: the pool close is now in a ``finally`` block so it runs even when
    an exception propagates through the serving phase (the original skipped
    ``close_pool()`` in that case, leaking connections).
    """
    # Startup
    await get_pool()
    await run_migration()
    try:
        yield
    finally:
        # Shutdown — always release the pool, even on abnormal exit.
        await close_pool()
|
||||
|
||||
|
||||
# Application object; lifecycle (DB pool + migrations) is handled by `lifespan`.
app = FastAPI(
    title="Document Crawler",
    description="Auto-Onboarding: Filesystem scanning, LLM classification, IPFS archival, gap analysis",
    version="1.0.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers under the CORS spec (credentialed responses may not
# use "*") — confirm whether credentials are actually required here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register routers — every router is mounted under the same versioned prefix.
for _router in (sources_router, jobs_router, documents_router, reports_router):
    app.include_router(_router, prefix="/api/v1/crawler")
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness/readiness probe.

    Verifies database connectivity by running a trivial ``SELECT 1``
    through the shared pool. Always responds HTTP 200: the payload's
    ``status`` field distinguishes ``healthy`` from ``degraded`` (with
    the error text attached in the latter case).
    """
    try:
        db = await get_pool()
        async with db.acquire() as conn:
            await conn.fetchval("SELECT 1")
    except Exception as exc:  # boundary handler: report failure, never crash the probe
        return {"status": "degraded", "error": str(exc)}
    return {"status": "healthy", "service": "document-crawler"}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Dev/standalone entry point: serve the app directly via uvicorn on the
    # crawler's assigned port (8098). Import is deferred so the module can be
    # imported by an external ASGI server without requiring uvicorn.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8098)
|
||||
Reference in New Issue
Block a user