Files
breakpilot-compliance/document-crawler/main.py
Benjamin Boenisch 364d2c69ff feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00

64 lines
1.7 KiB
Python

"""
Document Crawler & Auto-Onboarding Service
Scans local filesystems for compliance documents, classifies them via LLM,
archives to IPFS, and generates compliance gap analysis reports.
"""
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from db import get_pool, close_pool, run_migration
from api.sources import router as sources_router
from api.jobs import router as jobs_router
from api.documents import router as documents_router
from api.reports import router as reports_router
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage startup and shutdown of the service's database resources.

    Startup: awaits ``get_pool()`` and then ``run_migration()``.
    Shutdown: awaits ``close_pool()``.

    ``close_pool()`` runs in a ``finally`` clause so the pool is released
    even when the migration fails or an exception is thrown into the
    generator while the application is running.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    # Startup. If obtaining the pool itself fails there is nothing to release,
    # so the try/finally only begins once get_pool() has succeeded.
    await get_pool()
    try:
        await run_migration()
        yield
    finally:
        # Shutdown (also reached on startup/migration failure).
        await close_pool()
# ASGI application object; startup and shutdown are delegated to `lifespan`.
app = FastAPI(
    lifespan=lifespan,
    title="Document Crawler",
    version="1.0.0",
    description="Auto-Onboarding: Filesystem scanning, LLM classification, IPFS archival, gap analysis",
)
# Cross-origin policy: any origin, method and header is accepted, and
# credentialed requests are enabled.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination — confirm this is intended outside development.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
# Mount each API router under the shared crawler prefix.
# Registration order: sources, jobs, documents, reports.
for _router in (sources_router, jobs_router, documents_router, reports_router):
    app.include_router(_router, prefix="/api/v1/crawler")
@app.get("/health")
async def health():
    """Report service health by probing the database with ``SELECT 1``.

    Returns:
        ``{"status": "healthy", "service": "document-crawler"}`` when the
        probe succeeds. When any exception is raised while obtaining the
        pool, acquiring a connection or running the query, returns
        ``{"status": "degraded", "service": "document-crawler", "error": <message>}``.
        The endpoint itself never raises for a failed probe.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    try:
        pool = await get_pool()
        async with pool.acquire() as conn:
            await conn.fetchval("SELECT 1")
    except Exception as e:
        # Deliberately broad: a health probe must answer rather than crash.
        # The failure is logged so it is not silently swallowed.
        logging.getLogger(__name__).exception("Health check database probe failed")
        return {
            "status": "degraded",
            "service": "document-crawler",
            "error": str(e),
        }
    return {"status": "healthy", "service": "document-crawler"}
if __name__ == "__main__":
    # Script entry point: uvicorn is imported only when run directly.
    import uvicorn

    # Listen on all interfaces, port 8098.
    uvicorn.run(app, port=8098, host="0.0.0.0")