feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
"""Document list, reclassify, archive endpoints."""
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, HTTPException, Header, Query
from pydantic import BaseModel
from db import get_pool
from archiver.dsms_client import archive_document
router = APIRouter(tags=["documents"])  # all routes in this module share the "documents" tag
class ClassifyUpdate(BaseModel):
    """Request body for manually overriding a document's classification."""
    classification: str  # new classification label to store
class ArchiveBatch(BaseModel):
    """Request body for POST /documents/archive-batch."""
    document_ids: list[str]  # document UUIDs (as strings) to archive
@router.get("/documents")
async def list_documents(
    x_tenant_id: str = Header(...),
    classification: str | None = Query(None),
    extraction_status: str | None = Query(None),
    archived: bool | None = Query(None),
    limit: int = Query(100, le=500),
    offset: int = Query(0, ge=0),  # negative offsets are a Postgres error; reject at the edge
):
    """List a tenant's crawled documents with optional filters and paging.

    The classification / extraction_status / archived filters are ANDed
    together; results are ordered newest-first and paged via limit/offset.

    Returns:
        {"total": <matching row count>, "documents": [<row dicts>]}

    Raises:
        HTTPException 400: X-Tenant-ID header is not a valid UUID.
    """
    # Validate the tenant header up front; otherwise uuid.UUID raises
    # ValueError and surfaces to the client as an unhandled 500.
    try:
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    # Build the WHERE clause dynamically. Only numbered placeholders are
    # interpolated into the SQL text; all values travel as bound parameters,
    # so there is no injection risk.
    conditions = ["d.tenant_id = $1"]
    params: list = [tenant_uuid]
    idx = 2
    if classification:
        conditions.append(f"d.classification = ${idx}")
        params.append(classification)
        idx += 1
    if extraction_status:
        conditions.append(f"d.extraction_status = ${idx}")
        params.append(extraction_status)
        idx += 1
    # Explicit None check: archived=False is a meaningful filter value.
    if archived is not None:
        conditions.append(f"d.archived = ${idx}")
        params.append(archived)
        idx += 1
    where = " AND ".join(conditions)
    async with pool.acquire() as conn:
        total = await conn.fetchval(
            f"SELECT COUNT(*) FROM crawler_documents d WHERE {where}", *params
        )
        rows = await conn.fetch(
            f"""SELECT d.id, d.file_name, d.file_extension, d.file_size_bytes,
            d.classification, d.classification_confidence,
            d.classification_corrected, d.extraction_status,
            d.archived, d.ipfs_cid, d.first_seen_at, d.last_seen_at,
            d.version_count, s.name as source_name
            FROM crawler_documents d
            JOIN crawler_sources s ON d.source_id = s.id
            WHERE {where}
            ORDER BY d.created_at DESC
            LIMIT ${idx} OFFSET ${idx+1}""",
            *params, limit, offset,
        )
    return {"total": total, "documents": [dict(r) for r in rows]}
@router.get("/documents/{doc_id}")
async def get_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Return the full record for one document, scoped to the tenant.

    When extracted text is present, a "text_preview" key holding the first
    500 characters is added to the response.

    Raises:
        HTTPException 400: doc_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: no such document for this tenant.
    """
    # Parse both UUIDs before touching the database so malformed input is a
    # 400, not an unhandled ValueError (500).
    try:
        doc_uuid = uuid.UUID(doc_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid document or tenant ID")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """SELECT d.*, s.name as source_name
            FROM crawler_documents d
            JOIN crawler_sources s ON d.source_id = s.id
            WHERE d.id = $1 AND d.tenant_id = $2""",
            doc_uuid, tenant_uuid,
        )
    if not row:
        raise HTTPException(404, "Document not found")
    result = dict(row)
    # Include a text preview (first 500 chars) so the UI can render a
    # snippet without shipping the whole extracted text separately.
    if result.get("extracted_text"):
        result["text_preview"] = result["extracted_text"][:500]
    return result
@router.put("/documents/{doc_id}/classify")
async def classify_document_manually(
    doc_id: str, body: ClassifyUpdate, x_tenant_id: str = Header(...)
):
    """Manually override a document's classification.

    Sets classification_corrected = true so downstream consumers can tell a
    human override apart from the automatic classifier's output.

    Raises:
        HTTPException 400: doc_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: no row matched the id/tenant pair.
    """
    # Validate UUIDs first; a malformed path/header segment must be a 400,
    # not an unhandled ValueError (500).
    try:
        doc_uuid = uuid.UUID(doc_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid document or tenant ID")
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            """UPDATE crawler_documents SET
            classification = $3, classification_corrected = true, updated_at = NOW()
            WHERE id = $1 AND tenant_id = $2""",
            doc_uuid, tenant_uuid, body.classification,
        )
    # asyncpg's execute() returns the command tag, e.g. "UPDATE 1";
    # "UPDATE 0" means nothing matched.
    if result == "UPDATE 0":
        raise HTTPException(404, "Document not found")
    return {"status": "updated", "classification": body.classification, "corrected": True}
@router.post("/documents/{doc_id}/archive")
async def archive_single_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Archive one document to IPFS via the DSMS archiver.

    Idempotent: a document that is already archived returns its existing
    CID without re-archiving.

    Raises:
        HTTPException 400: doc_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: no such document for this tenant.
        HTTPException 502: the archiver call failed.
    """
    # Validate UUIDs before any DB work; malformed input is a 400, not a 500.
    try:
        doc_uuid = uuid.UUID(doc_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid document or tenant ID")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_documents WHERE id = $1 AND tenant_id = $2",
            doc_uuid, tenant_uuid,
        )
    if not row:
        raise HTTPException(404, "Document not found")
    if row["archived"]:
        return {"status": "already_archived", "ipfs_cid": row["ipfs_cid"]}
    # The archiver call does network I/O; it runs outside any held DB
    # connection so the pool isn't starved while the upload is in flight.
    try:
        result = await archive_document(
            file_path=row["file_path"],
            file_name=row["file_name"],
            document_type=row["classification"] or "unknown",
            document_id=str(row["id"]),
        )
        cid = result.get("cid")
    except Exception as e:
        # Chain the cause so server logs keep the underlying archiver error.
        raise HTTPException(502, f"Archival failed: {e}") from e
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE crawler_documents SET archived = true, ipfs_cid = $2, archived_at = NOW(), updated_at = NOW() WHERE id = $1",
            doc_uuid, cid,
        )
    return {"status": "archived", "ipfs_cid": cid}
@router.post("/documents/archive-batch")
async def archive_batch(body: ArchiveBatch, x_tenant_id: str = Header(...)):
    """Archive several documents in one request.

    Each document is processed independently: an HTTP-level failure for one
    document is recorded in its result entry and does not abort the batch.
    Returns {"results": [<per-document outcome dicts>]}.
    """
    outcomes = []
    for document_id in body.document_ids:
        try:
            single = await archive_single_document(document_id, x_tenant_id)
        except HTTPException as exc:
            # Capture the per-document failure and keep going.
            entry = {"id": document_id, "status": "error", "error": exc.detail}
        else:
            entry = {"id": document_id, **single}
        outcomes.append(entry)
    return {"results": outcomes}