-- Migration 014: Document Crawler & Auto-Onboarding tables
-- 4 tables for crawl source management, job tracking, document storage, and reports.
--
-- Every statement uses IF NOT EXISTS, so re-running the migration is a no-op for
-- objects that already exist (an existing table is NOT altered to match this file).
-- gen_random_uuid() is built into PostgreSQL 13+; older servers need the pgcrypto
-- extension installed beforehand.

BEGIN;

-- 1. Crawl Sources — configurable directories to scan
CREATE TABLE IF NOT EXISTS crawler_sources (
    id               UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id        UUID         NOT NULL,
    name             VARCHAR(255) NOT NULL,
    source_type      VARCHAR(50)  NOT NULL DEFAULT 'local',  -- local, smb
    path             TEXT         NOT NULL,
    file_extensions  JSONB        NOT NULL DEFAULT '[".pdf", ".docx", ".xlsx", ".pptx"]',  -- JSON array of extensions
    max_depth        INT          NOT NULL DEFAULT 5,
    exclude_patterns JSONB        NOT NULL DEFAULT '[]',     -- JSON array; empty by default
    enabled          BOOLEAN      NOT NULL DEFAULT true,
    created_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawler_sources_tenant
    ON crawler_sources (tenant_id);

-- 2. Crawl Jobs — each crawl execution
-- Deleting a source cascades to its jobs.
CREATE TABLE IF NOT EXISTS crawler_jobs (
    id              UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id       UUID        NOT NULL,
    source_id       UUID        NOT NULL REFERENCES crawler_sources (id) ON DELETE CASCADE,
    status          VARCHAR(50) NOT NULL DEFAULT 'pending',  -- pending, running, completed, failed, cancelled
    job_type        VARCHAR(50) NOT NULL DEFAULT 'full',     -- full, delta
    files_found     INT         NOT NULL DEFAULT 0,
    files_processed INT         NOT NULL DEFAULT 0,
    files_new       INT         NOT NULL DEFAULT 0,
    files_changed   INT         NOT NULL DEFAULT 0,
    files_skipped   INT         NOT NULL DEFAULT 0,
    files_error     INT         NOT NULL DEFAULT 0,
    error_message   TEXT,
    started_at      TIMESTAMPTZ,
    completed_at    TIMESTAMPTZ,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawler_jobs_tenant
    ON crawler_jobs (tenant_id);
CREATE INDEX IF NOT EXISTS idx_crawler_jobs_source
    ON crawler_jobs (source_id);
CREATE INDEX IF NOT EXISTS idx_crawler_jobs_status
    ON crawler_jobs (status);

-- 3. Crawled Documents — every file discovered
-- One row per (tenant_id, source_id, file_path); deleting the owning source or
-- job cascades to its document rows.
CREATE TABLE IF NOT EXISTS crawler_documents (
    id                        UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id                 UUID         NOT NULL,
    source_id                 UUID         NOT NULL REFERENCES crawler_sources (id) ON DELETE CASCADE,
    job_id                    UUID         NOT NULL REFERENCES crawler_jobs (id) ON DELETE CASCADE,
    file_path                 TEXT         NOT NULL,
    file_name                 VARCHAR(500) NOT NULL,
    file_extension            VARCHAR(20)  NOT NULL,
    file_size_bytes           BIGINT       NOT NULL DEFAULT 0,
    file_hash                 VARCHAR(64),                            -- SHA-256
    extracted_text            TEXT,
    extraction_status         VARCHAR(50)  NOT NULL DEFAULT 'pending', -- pending, completed, failed
    classification            VARCHAR(100),
    classification_confidence FLOAT,
    classification_reasoning  TEXT,
    classification_corrected  BOOLEAN      NOT NULL DEFAULT false,
    archived                  BOOLEAN      NOT NULL DEFAULT false,
    ipfs_cid                  VARCHAR(200),
    archived_at               TIMESTAMPTZ,
    first_seen_at             TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    last_seen_at              TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    version_count             INT          NOT NULL DEFAULT 1,
    created_at                TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    updated_at                TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, source_id, file_path)
);

CREATE INDEX IF NOT EXISTS idx_crawler_documents_tenant
    ON crawler_documents (tenant_id);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_source
    ON crawler_documents (source_id);
-- PostgreSQL does not index the referencing side of a foreign key on its own;
-- without this, every crawler_jobs delete must scan crawler_documents to cascade.
CREATE INDEX IF NOT EXISTS idx_crawler_documents_job
    ON crawler_documents (job_id);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_classification
    ON crawler_documents (classification);
CREATE INDEX IF NOT EXISTS idx_crawler_documents_hash
    ON crawler_documents (file_hash);

-- 4. Onboarding Reports — summary reports with gap analysis
-- job_id is nullable: a report outlives its job (ON DELETE SET NULL).
CREATE TABLE IF NOT EXISTS crawler_onboarding_reports (
    id                       UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id                UUID        NOT NULL,
    job_id                   UUID        REFERENCES crawler_jobs (id) ON DELETE SET NULL,
    total_documents_found    INT         NOT NULL DEFAULT 0,
    classification_breakdown JSONB       NOT NULL DEFAULT '{}',  -- JSON object; empty by default
    gaps                     JSONB       NOT NULL DEFAULT '[]',  -- JSON array; empty by default
    compliance_score         FLOAT       NOT NULL DEFAULT 0.0,
    created_at               TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawler_reports_tenant
    ON crawler_onboarding_reports (tenant_id);
-- Supports the ON DELETE SET NULL lookup from crawler_jobs and report-by-job queries.
CREATE INDEX IF NOT EXISTS idx_crawler_reports_job
    ON crawler_onboarding_reports (job_id);

COMMIT;