"""Document list, reclassify, archive endpoints.""" import uuid from datetime import datetime, timezone from fastapi import APIRouter, HTTPException, Header, Query from pydantic import BaseModel from db import get_pool from archiver.dsms_client import archive_document router = APIRouter(tags=["documents"]) class ClassifyUpdate(BaseModel): classification: str class ArchiveBatch(BaseModel): document_ids: list[str] @router.get("/documents") async def list_documents( x_tenant_id: str = Header(...), classification: str | None = Query(None), extraction_status: str | None = Query(None), archived: bool | None = Query(None), limit: int = Query(100, le=500), offset: int = Query(0), ): pool = await get_pool() conditions = ["d.tenant_id = $1"] params: list = [uuid.UUID(x_tenant_id)] idx = 2 if classification: conditions.append(f"d.classification = ${idx}") params.append(classification) idx += 1 if extraction_status: conditions.append(f"d.extraction_status = ${idx}") params.append(extraction_status) idx += 1 if archived is not None: conditions.append(f"d.archived = ${idx}") params.append(archived) idx += 1 where = " AND ".join(conditions) async with pool.acquire() as conn: total = await conn.fetchval( f"SELECT COUNT(*) FROM crawler_documents d WHERE {where}", *params ) rows = await conn.fetch( f"""SELECT d.id, d.file_name, d.file_extension, d.file_size_bytes, d.classification, d.classification_confidence, d.classification_corrected, d.extraction_status, d.archived, d.ipfs_cid, d.first_seen_at, d.last_seen_at, d.version_count, s.name as source_name FROM crawler_documents d JOIN crawler_sources s ON d.source_id = s.id WHERE {where} ORDER BY d.created_at DESC LIMIT ${idx} OFFSET ${idx+1}""", *params, limit, offset, ) return {"total": total, "documents": [dict(r) for r in rows]} @router.get("/documents/{doc_id}") async def get_document(doc_id: str, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( """SELECT d.*, s.name as source_name FROM crawler_documents d JOIN crawler_sources s ON d.source_id = s.id WHERE d.id = $1 AND d.tenant_id = $2""", uuid.UUID(doc_id), uuid.UUID(x_tenant_id), ) if not row: raise HTTPException(404, "Document not found") result = dict(row) # Include a text preview (first 500 chars) if result.get("extracted_text"): result["text_preview"] = result["extracted_text"][:500] return result @router.put("/documents/{doc_id}/classify") async def classify_document_manually( doc_id: str, body: ClassifyUpdate, x_tenant_id: str = Header(...) 
): pool = await get_pool() async with pool.acquire() as conn: result = await conn.execute( """UPDATE crawler_documents SET classification = $3, classification_corrected = true, updated_at = NOW() WHERE id = $1 AND tenant_id = $2""", uuid.UUID(doc_id), uuid.UUID(x_tenant_id), body.classification, ) if result == "UPDATE 0": raise HTTPException(404, "Document not found") return {"status": "updated", "classification": body.classification, "corrected": True} @router.post("/documents/{doc_id}/archive") async def archive_single_document(doc_id: str, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT * FROM crawler_documents WHERE id = $1 AND tenant_id = $2", uuid.UUID(doc_id), uuid.UUID(x_tenant_id), ) if not row: raise HTTPException(404, "Document not found") if row["archived"]: return {"status": "already_archived", "ipfs_cid": row["ipfs_cid"]} try: result = await archive_document( file_path=row["file_path"], file_name=row["file_name"], document_type=row["classification"] or "unknown", document_id=str(row["id"]), ) cid = result.get("cid") except Exception as e: raise HTTPException(502, f"Archival failed: {e}") async with pool.acquire() as conn: await conn.execute( "UPDATE crawler_documents SET archived = true, ipfs_cid = $2, archived_at = NOW(), updated_at = NOW() WHERE id = $1", uuid.UUID(doc_id), cid, ) return {"status": "archived", "ipfs_cid": cid} @router.post("/documents/archive-batch") async def archive_batch(body: ArchiveBatch, x_tenant_id: str = Header(...)): results = [] for did in body.document_ids: try: r = await archive_single_document(did, x_tenant_id) results.append({"id": did, **r}) except HTTPException as e: results.append({"id": did, "status": "error", "error": e.detail}) return {"results": results}
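

# --- Usage sketch (illustrative; import path, host, and all values below are
# assumptions, not part of this module) ---
#
# Mounting the router in the service's FastAPI app might look like:
#
#     from fastapi import FastAPI
#     from api.documents import router as documents_router  # hypothetical path
#
#     app = FastAPI()
#     app.include_router(documents_router)
#
# Example requests against a local instance (tenant/document ids and the
# classification value are placeholders):
#
#     curl -H "X-Tenant-Id: <tenant-uuid>" \
#          "http://localhost:8000/documents?classification=invoice&limit=20"
#
#     curl -X POST -H "X-Tenant-Id: <tenant-uuid>" \
#          -H "Content-Type: application/json" \
#          -d '{"document_ids": ["<doc-uuid>"]}' \
#          http://localhost:8000/documents/archive-batch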