"""Crawl job management + trigger endpoints.""" import asyncio import json import os import uuid from datetime import datetime, timezone from fastapi import APIRouter, HTTPException, Header, BackgroundTasks from pydantic import BaseModel from db import get_pool from config import settings from crawlers.filesystem_crawler import FilesystemCrawler from extractors.dispatcher import extract_text from classifiers.llm_classifier import classify_document router = APIRouter(tags=["jobs"]) class JobCreate(BaseModel): source_id: str job_type: str = "full" # full or delta async def _run_crawl_job(job_id: str, source_id: str, tenant_id: str, job_type: str): """Background task that executes a crawl job.""" pool = await get_pool() async with pool.acquire() as conn: source = await conn.fetchrow( "SELECT * FROM crawler_sources WHERE id = $1", uuid.UUID(source_id) ) if not source: await conn.execute( "UPDATE crawler_jobs SET status = 'failed', error_message = 'Source not found', completed_at = NOW() WHERE id = $1", uuid.UUID(job_id), ) return # Mark job as running await conn.execute( "UPDATE crawler_jobs SET status = 'running', started_at = NOW() WHERE id = $1", uuid.UUID(job_id), ) # Resolve path source_path = source["path"] if not os.path.isabs(source_path): source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path) exts = json.loads(source["file_extensions"]) if isinstance(source["file_extensions"], str) else source["file_extensions"] excludes = json.loads(source["exclude_patterns"]) if isinstance(source["exclude_patterns"], str) else source["exclude_patterns"] crawler = FilesystemCrawler( base_path=source_path, file_extensions=exts, max_depth=source["max_depth"], exclude_patterns=excludes, ) files = crawler.crawl() stats = { "files_found": len(files), "files_processed": 0, "files_new": 0, "files_changed": 0, "files_skipped": 0, "files_error": 0, } for crawled in files: try: async with pool.acquire() as conn: # Check for existing document (delta detection) existing = await conn.fetchrow( "SELECT id, file_hash FROM crawler_documents WHERE tenant_id = $1 AND source_id = $2 AND file_path = $3", uuid.UUID(tenant_id), uuid.UUID(source_id), crawled.file_path, ) if existing: if job_type == "delta" and existing["file_hash"] == crawled.file_hash: # Unchanged — skip await conn.execute( "UPDATE crawler_documents SET last_seen_at = NOW() WHERE id = $1", existing["id"], ) stats["files_skipped"] += 1 stats["files_processed"] += 1 continue elif existing["file_hash"] != crawled.file_hash: stats["files_changed"] += 1 else: stats["files_skipped"] += 1 stats["files_processed"] += 1 continue else: stats["files_new"] += 1 # Extract text extraction_status = "completed" extracted_text = "" try: extracted_text = extract_text(crawled.file_path, crawled.file_extension) except Exception: extraction_status = "failed" # Classify classification_result = {"classification": None, "confidence": None, "reasoning": None} if extracted_text: classification_result = await classify_document( extracted_text, crawled.file_name, tenant_id ) if existing: # Update existing await conn.execute( """UPDATE crawler_documents SET job_id = $1, file_size_bytes = $2, file_hash = $3, extracted_text = $4, extraction_status = $5, classification = $6, classification_confidence = $7, classification_reasoning = $8, classification_corrected = false, last_seen_at = NOW(), version_count = version_count + 1, updated_at = NOW() WHERE id = $9""", uuid.UUID(job_id), crawled.file_size_bytes, crawled.file_hash, extracted_text, extraction_status, classification_result["classification"], classification_result["confidence"], classification_result["reasoning"], existing["id"], ) else: # Insert new await conn.execute( """INSERT INTO crawler_documents (tenant_id, source_id, job_id, file_path, file_name, file_extension, file_size_bytes, file_hash, extracted_text, extraction_status, classification, classification_confidence, classification_reasoning) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13)""", uuid.UUID(tenant_id), uuid.UUID(source_id), uuid.UUID(job_id), crawled.file_path, crawled.file_name, crawled.file_extension, crawled.file_size_bytes, crawled.file_hash, extracted_text, extraction_status, classification_result["classification"], classification_result["confidence"], classification_result["reasoning"], ) stats["files_processed"] += 1 except Exception: stats["files_error"] += 1 stats["files_processed"] += 1 # Update job progress async with pool.acquire() as conn: await conn.execute( """UPDATE crawler_jobs SET files_found=$2, files_processed=$3, files_new=$4, files_changed=$5, files_skipped=$6, files_error=$7 WHERE id = $1""", uuid.UUID(job_id), stats["files_found"], stats["files_processed"], stats["files_new"], stats["files_changed"], stats["files_skipped"], stats["files_error"], ) # Mark completed async with pool.acquire() as conn: await conn.execute( "UPDATE crawler_jobs SET status = 'completed', completed_at = NOW() WHERE id = $1", uuid.UUID(job_id), ) @router.post("/jobs", status_code=201) async def create_job( body: JobCreate, background_tasks: BackgroundTasks, x_tenant_id: str = Header(...), ): pool = await get_pool() async with pool.acquire() as conn: # Verify source exists source = await conn.fetchrow( "SELECT id FROM crawler_sources WHERE id = $1 AND tenant_id = $2", uuid.UUID(body.source_id), uuid.UUID(x_tenant_id), ) if not source: raise HTTPException(404, "Source not found") # Check no job already running for this source running = await conn.fetchval( "SELECT EXISTS(SELECT 1 FROM crawler_jobs WHERE source_id = $1 AND status = 'running')", uuid.UUID(body.source_id), ) if running: raise HTTPException(409, "A job is already running for this source") row = await conn.fetchrow( """INSERT INTO crawler_jobs (tenant_id, source_id, job_type) VALUES ($1, $2, $3) RETURNING *""", uuid.UUID(x_tenant_id), uuid.UUID(body.source_id), body.job_type, ) job_id = str(row["id"]) background_tasks.add_task(_run_crawl_job, job_id, body.source_id, x_tenant_id, body.job_type) return dict(row) @router.get("/jobs") async def list_jobs(x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch( """SELECT j.*, s.name as source_name FROM crawler_jobs j JOIN crawler_sources s ON j.source_id = s.id WHERE j.tenant_id = $1 ORDER BY j.created_at DESC LIMIT 50""", uuid.UUID(x_tenant_id), ) return [dict(r) for r in rows] @router.get("/jobs/{job_id}") async def get_job(job_id: str, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT * FROM crawler_jobs WHERE id = $1 AND tenant_id = $2", uuid.UUID(job_id), uuid.UUID(x_tenant_id), ) if not row: raise HTTPException(404, "Job not found") return dict(row) @router.post("/jobs/{job_id}/cancel") async def cancel_job(job_id: str, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: result = await conn.execute( "UPDATE crawler_jobs SET status = 'cancelled', completed_at = NOW() WHERE id = $1 AND tenant_id = $2 AND status IN ('pending', 'running')", uuid.UUID(job_id), uuid.UUID(x_tenant_id), ) if result == "UPDATE 0": raise HTTPException(404, "Job not found or not cancellable") return {"status": "cancelled"}