feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.
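As a quick smoke test (illustrative snippet, not part of this commit; it assumes the service is already running locally and that the requests package is available), the new health endpoint can be exercised like this:

    import requests

    # Probe the health endpoint exposed by document-crawler/main.py on port 8098.
    resp = requests.get("http://localhost:8098/health", timeout=5)
    # Expected: {"status": "healthy", "service": "document-crawler"} when the database
    # is reachable, or {"status": "degraded", "error": "..."} otherwise.
    print(resp.json())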

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Benjamin Boenisch committed 2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

document-crawler/main.py Normal file

@@ -0,0 +1,63 @@
"""
Document Crawler & Auto-Onboarding Service
Scans local filesystems for compliance documents, classifies them via LLM,
archives to IPFS, and generates compliance gap analysis reports.
"""

from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from db import get_pool, close_pool, run_migration
from api.sources import router as sources_router
from api.jobs import router as jobs_router
from api.documents import router as documents_router
from api.reports import router as reports_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    await get_pool()
    await run_migration()
    yield
    # Shutdown
    await close_pool()


app = FastAPI(
    title="Document Crawler",
    description="Auto-Onboarding: Filesystem scanning, LLM classification, IPFS archival, gap analysis",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register routers
app.include_router(sources_router, prefix="/api/v1/crawler")
app.include_router(jobs_router, prefix="/api/v1/crawler")
app.include_router(documents_router, prefix="/api/v1/crawler")
app.include_router(reports_router, prefix="/api/v1/crawler")


@app.get("/health")
async def health():
    try:
        pool = await get_pool()
        async with pool.acquire() as conn:
            await conn.fetchval("SELECT 1")
        return {"status": "healthy", "service": "document-crawler"}
    except Exception as e:
        return {"status": "degraded", "error": str(e)}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8098)
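
The keyword fallback classifier mentioned in the commit message lives in one of the other 33 changed files and is not shown in this excerpt. Purely as an illustration of that fallback path (all keyword, category, and function names below are assumptions, not taken from the actual code), it could look roughly like this:

    # Hypothetical keyword-based fallback used when LLM classification is unavailable.
    # The keyword-to-category mapping shown here is illustrative only.
    KEYWORD_CATEGORIES = {
        "information security policy": "policy",
        "audit report": "audit_evidence",
        "data processing agreement": "contract",
    }

    def classify_by_keywords(text: str) -> str:
        lowered = text.lower()
        for keyword, category in KEYWORD_CATEGORIES.items():
            if keyword in lowered:
                return category
        return "unclassified"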