feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

47
document-crawler/db.py Normal file
View File

@@ -0,0 +1,47 @@
"""asyncpg pool management for Document Crawler."""
import asyncio
from pathlib import Path

import asyncpg

from config import settings
# Shared connection pool, created lazily on first use.
_pool: asyncpg.Pool | None = None

# Guards lazy pool creation: without it, two coroutines hitting get_pool()
# during startup could each await create_pool() and one pool would leak.
_pool_lock = asyncio.Lock()


async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, creating it on first call.

    Safe under concurrent callers: the first coroutine to take the lock
    builds the pool; later callers (and the fast path) reuse it.

    Returns:
        The process-wide ``asyncpg.Pool`` (2-10 connections) bound to
        ``settings.DATABASE_URL``.
    """
    global _pool
    if _pool is None:
        async with _pool_lock:
            # Re-check after awaiting the lock: another coroutine may
            # have finished creating the pool while we waited.
            if _pool is None:
                _pool = await asyncpg.create_pool(
                    settings.DATABASE_URL,
                    min_size=2,
                    max_size=10,
                )
    return _pool
async def close_pool():
    """Close the shared pool and reset it so a later get_pool() recreates it.

    No-op when the pool was never created (or is already closed).
    """
    global _pool
    if _pool is None:
        return
    # Close first, then forget the reference, matching the acquire order.
    await _pool.close()
    _pool = None
async def run_migration():
    """Apply the crawler schema migration (014) once, at service startup.

    Idempotent: returns silently when the migration file is absent, or when
    the 'crawler_sources' table already exists (used as the applied marker).

    Raises:
        asyncpg.PostgresError: if executing the migration SQL fails.
    """
    migration_path = Path(__file__).parent / "migrations" / "014_crawler_tables.sql"
    if not migration_path.exists():
        # Missing file is treated as "nothing to do", not an error.
        return
    pool = await get_pool()
    async with pool.acquire() as conn:
        # The presence of 'crawler_sources' marks the migration as applied.
        already_applied = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'crawler_sources')"
        )
        if already_applied:
            return
        # Explicit UTF-8: migration SQL must not depend on the platform's
        # default locale encoding.
        sql = migration_path.read_text(encoding="utf-8")
        await conn.execute(sql)