feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
93
document-crawler/migrations/014_crawler_tables.sql
Normal file
93
document-crawler/migrations/014_crawler_tables.sql
Normal file
@@ -0,0 +1,93 @@
|
||||
-- Migration 014: Document Crawler & Auto-Onboarding tables
-- Creates 4 tables: crawl source management, job tracking,
-- document storage, and onboarding reports.

BEGIN;
-- 1. Crawl Sources — configurable directories to scan.
-- One row per directory root the crawler should walk; the crawler honours
-- file_extensions / exclude_patterns / max_depth when traversing.
CREATE TABLE IF NOT EXISTS crawler_sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    name VARCHAR(255) NOT NULL,
    -- transport used to reach the directory; extend the CHECK below when new types are added
    source_type VARCHAR(50) NOT NULL DEFAULT 'local', -- local, smb
    path TEXT NOT NULL,
    -- JSON array of extensions to include (lowercase, with leading dot)
    file_extensions JSONB NOT NULL DEFAULT '[".pdf", ".docx", ".xlsx", ".pptx"]',
    max_depth INT NOT NULL DEFAULT 5,
    -- JSON array of path patterns to skip during traversal
    exclude_patterns JSONB NOT NULL DEFAULT '[]',
    enabled BOOLEAN NOT NULL DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- enforce the documented value set instead of relying on a comment
    CONSTRAINT crawler_sources_source_type_check
        CHECK (source_type IN ('local', 'smb')),
    -- a negative traversal depth is meaningless
    CONSTRAINT crawler_sources_max_depth_check
        CHECK (max_depth >= 0)
);

CREATE INDEX IF NOT EXISTS idx_crawler_sources_tenant ON crawler_sources(tenant_id);
-- 2. Crawl Jobs — each crawl execution.
-- One row per run of a source; counters are updated as the crawl progresses.
CREATE TABLE IF NOT EXISTS crawler_jobs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    -- jobs are meaningless without their source, so cascade on delete
    source_id UUID NOT NULL REFERENCES crawler_sources(id) ON DELETE CASCADE,
    status VARCHAR(50) NOT NULL DEFAULT 'pending', -- pending, running, completed, failed, cancelled
    job_type VARCHAR(50) NOT NULL DEFAULT 'full', -- full, delta
    files_found INT NOT NULL DEFAULT 0,
    files_processed INT NOT NULL DEFAULT 0,
    files_new INT NOT NULL DEFAULT 0,
    files_changed INT NOT NULL DEFAULT 0,
    files_skipped INT NOT NULL DEFAULT 0,
    files_error INT NOT NULL DEFAULT 0,
    error_message TEXT,
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- enforce the documented state machine values instead of relying on comments
    CONSTRAINT crawler_jobs_status_check
        CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')),
    CONSTRAINT crawler_jobs_job_type_check
        CHECK (job_type IN ('full', 'delta'))
);

CREATE INDEX IF NOT EXISTS idx_crawler_jobs_tenant ON crawler_jobs(tenant_id);
CREATE INDEX IF NOT EXISTS idx_crawler_jobs_source ON crawler_jobs(source_id);
CREATE INDEX IF NOT EXISTS idx_crawler_jobs_status ON crawler_jobs(status);
-- 3. Crawled Documents — every file discovered.
-- One row per (tenant, source, path); re-crawls update the existing row
-- (last_seen_at / version_count) rather than inserting duplicates.
CREATE TABLE IF NOT EXISTS crawler_documents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    source_id UUID NOT NULL REFERENCES crawler_sources(id) ON DELETE CASCADE,
    -- job that most recently touched this document
    job_id UUID NOT NULL REFERENCES crawler_jobs(id) ON DELETE CASCADE,
    file_path TEXT NOT NULL,
    file_name VARCHAR(500) NOT NULL,
    file_extension VARCHAR(20) NOT NULL,
    file_size_bytes BIGINT NOT NULL DEFAULT 0,
    file_hash VARCHAR(64), -- SHA-256 (hex)
    extracted_text TEXT,
    extraction_status VARCHAR(50) NOT NULL DEFAULT 'pending', -- pending, completed, failed
    -- LLM / keyword-fallback classification results
    classification VARCHAR(100),
    classification_confidence FLOAT,
    classification_reasoning TEXT,
    -- true when a human has overridden the automatic classification
    classification_corrected BOOLEAN NOT NULL DEFAULT false,
    -- IPFS archival state
    archived BOOLEAN NOT NULL DEFAULT false,
    ipfs_cid VARCHAR(200),
    archived_at TIMESTAMPTZ,
    first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    version_count INT NOT NULL DEFAULT 1,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- enforce the documented value set instead of relying on a comment
    CONSTRAINT crawler_documents_extraction_status_check
        CHECK (extraction_status IN ('pending', 'completed', 'failed')),
    -- named (was anonymous) so constraint violations are greppable
    CONSTRAINT crawler_documents_tenant_source_path_key
        UNIQUE (tenant_id, source_id, file_path)
);

CREATE INDEX IF NOT EXISTS idx_crawler_documents_tenant ON crawler_documents(tenant_id);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_source ON crawler_documents(source_id);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_classification ON crawler_documents(classification);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_hash ON crawler_documents(file_hash);
-- 4. Onboarding Reports — summary reports with gap analysis.
-- Snapshot produced after a crawl: classification counts, detected
-- compliance gaps, and an overall score.
CREATE TABLE IF NOT EXISTS crawler_onboarding_reports (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    -- keep the report even if its job row is purged
    job_id UUID REFERENCES crawler_jobs(id) ON DELETE SET NULL,
    total_documents_found INT NOT NULL DEFAULT 0,
    -- JSON object: classification label -> document count
    classification_breakdown JSONB NOT NULL DEFAULT '{}',
    -- JSON array of missing/expected document categories
    gaps JSONB NOT NULL DEFAULT '[]',
    compliance_score FLOAT NOT NULL DEFAULT 0.0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawler_reports_tenant ON crawler_onboarding_reports(tenant_id);

COMMIT;
Reference in New Issue
Block a user