feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

47
document-crawler/db.py Normal file
View File

@@ -0,0 +1,47 @@
"""asyncpg pool management for Document Crawler."""
import asyncio
from pathlib import Path

import asyncpg

from config import settings
# Shared connection pool, created lazily on first use.
_pool: asyncpg.Pool | None = None

# Guards lazy pool creation: without it, two coroutines hitting get_pool()
# during startup could each await create_pool() and one pool would leak.
_pool_lock = asyncio.Lock()


async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, creating it on first call.

    Safe under concurrent callers: the first coroutine to take the lock
    builds the pool; later callers (and the fast path) reuse it.

    Returns:
        The process-wide ``asyncpg.Pool`` (2-10 connections) bound to
        ``settings.DATABASE_URL``.
    """
    global _pool
    if _pool is None:
        async with _pool_lock:
            # Re-check after awaiting the lock: another coroutine may
            # have finished creating the pool while we waited.
            if _pool is None:
                _pool = await asyncpg.create_pool(
                    settings.DATABASE_URL,
                    min_size=2,
                    max_size=10,
                )
    return _pool
async def close_pool():
    """Close the shared pool and reset it so a later get_pool() recreates it.

    No-op when the pool was never created (or is already closed).
    """
    global _pool
    if _pool is None:
        return
    # Close first, then forget the reference, matching the acquire order.
    await _pool.close()
    _pool = None
async def run_migration():
    """Apply the crawler schema migration (014) once, at service startup.

    Idempotent: returns silently when the migration file is absent, or when
    the 'crawler_sources' table already exists (used as the applied marker).

    Raises:
        asyncpg.PostgresError: if executing the migration SQL fails.
    """
    migration_path = Path(__file__).parent / "migrations" / "014_crawler_tables.sql"
    if not migration_path.exists():
        # Missing file is treated as "nothing to do", not an error.
        return
    pool = await get_pool()
    async with pool.acquire() as conn:
        # The presence of 'crawler_sources' marks the migration as applied.
        already_applied = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'crawler_sources')"
        )
        if already_applied:
            return
        # Explicit UTF-8: migration SQL must not depend on the platform's
        # default locale encoding.
        sql = migration_path.read_text(encoding="utf-8")
        await conn.execute(sql)