feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
document-crawler/db.py — new file, 47 lines added
@@ -0,0 +1,47 @@
|
||||
"""asyncpg pool management for Document Crawler."""
|
||||
|
||||
import asyncpg
|
||||
from config import settings
|
||||
|
||||
import asyncio

# Lazily-created, process-wide connection pool. Guarded by _pool_lock so
# concurrent first callers cannot each create (and leak) a pool.
_pool: asyncpg.Pool | None = None
_pool_lock = asyncio.Lock()


async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, creating it on first use.

    Uses double-checked locking: the cheap ``is None`` test avoids taking
    the lock on the hot path, and the re-check under the lock guarantees
    exactly one pool is created even when several coroutines race here
    during startup.

    Returns:
        The singleton ``asyncpg.Pool`` connected to ``settings.DATABASE_URL``.
    """
    global _pool
    if _pool is None:
        async with _pool_lock:
            # Re-check: another coroutine may have created the pool while
            # we were waiting on the lock.
            if _pool is None:
                _pool = await asyncpg.create_pool(
                    settings.DATABASE_URL,
                    min_size=2,
                    max_size=10,
                )
    return _pool
|
||||
|
||||
|
||||
async def close_pool():
    """Close the shared connection pool, if one exists.

    Safe to call repeatedly: after the first successful close this is a
    no-op until ``get_pool()`` lazily creates a new pool. ``_pool`` is
    reset only after ``close()`` succeeds, so a failed close leaves the
    reference intact.
    """
    global _pool
    if _pool is None:
        return
    await _pool.close()
    _pool = None
|
||||
|
||||
|
||||
async def run_migration():
    """Apply the crawler schema migration (014_crawler_tables.sql) on startup.

    Idempotent best-effort runner:
      * silently returns if the migration file is missing (e.g. an image
        shipped without the migrations directory);
      * skips execution when the ``crawler_sources`` table already exists,
        taken as evidence the migration was applied previously.

    Raises:
        Whatever ``asyncpg`` raises if executing the SQL fails.
    """
    import os

    migration_path = os.path.join(
        os.path.dirname(__file__), "migrations", "014_crawler_tables.sql"
    )
    if not os.path.exists(migration_path):
        # Nothing to apply in this deployment.
        return

    pool = await get_pool()
    async with pool.acquire() as conn:
        # Heuristic "already applied" check: presence of the first table the
        # migration creates. NOTE(review): table_name is not schema-qualified,
        # so a same-named table in any schema suppresses the migration —
        # confirm a single-schema database is assumed.
        exists = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'crawler_sources')"
        )
        if exists:
            return

        # Explicit encoding: the platform default may not be UTF-8, and
        # migration SQL may contain non-ASCII characters (comments, data).
        with open(migration_path, encoding="utf-8") as f:
            sql = f.read()
        await conn.execute(sql)
|
||||
Reference in New Issue
Block a user