feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.
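As a quick smoke test (illustrative snippet, not part of this commit; it assumes the service is already running locally and that the requests package is available), the new health endpoint can be exercised like this:

    import requests

    # Probe the health endpoint exposed by document-crawler/main.py on port 8098.
    resp = requests.get("http://localhost:8098/health", timeout=5)
    # Expected: {"status": "healthy", "service": "document-crawler"} when the database
    # is reachable, or {"status": "degraded", "error": "..."} otherwise.
    print(resp.json())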

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Benjamin Boenisch committed 2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

document-crawler/main.py Normal file

@@ -0,0 +1,63 @@
"""
Document Crawler & Auto-Onboarding Service
Scans local filesystems for compliance documents, classifies them via LLM,
archives to IPFS, and generates compliance gap analysis reports.
"""

from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from db import get_pool, close_pool, run_migration
from api.sources import router as sources_router
from api.jobs import router as jobs_router
from api.documents import router as documents_router
from api.reports import router as reports_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    await get_pool()
    await run_migration()
    yield
    # Shutdown
    await close_pool()


app = FastAPI(
    title="Document Crawler",
    description="Auto-Onboarding: Filesystem scanning, LLM classification, IPFS archival, gap analysis",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register routers
app.include_router(sources_router, prefix="/api/v1/crawler")
app.include_router(jobs_router, prefix="/api/v1/crawler")
app.include_router(documents_router, prefix="/api/v1/crawler")
app.include_router(reports_router, prefix="/api/v1/crawler")


@app.get("/health")
async def health():
    try:
        pool = await get_pool()
        async with pool.acquire() as conn:
            await conn.fetchval("SELECT 1")
        return {"status": "healthy", "service": "document-crawler"}
    except Exception as e:
        return {"status": "degraded", "error": str(e)}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8098)
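
The keyword fallback classifier mentioned in the commit message lives in one of the other 33 changed files and is not shown in this excerpt. Purely as an illustration of that fallback path (all keyword, category, and function names below are assumptions, not taken from the actual code), it could look roughly like this:

    # Hypothetical keyword-based fallback used when LLM classification is unavailable.
    # The keyword-to-category mapping shown here is illustrative only.
    KEYWORD_CATEGORIES = {
        "information security policy": "policy",
        "audit report": "audit_evidence",
        "data processing agreement": "contract",
    }

    def classify_by_keywords(text: str) -> str:
        lowered = text.lower()
        for keyword, category in KEYWORD_CATEGORIES.items():
            if keyword in lowered:
                return category
        return "unclassified"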