feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
99
document-crawler/api/reports.py
Normal file
99
document-crawler/api/reports.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Onboarding report + gap analysis endpoints."""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from fastapi import APIRouter, HTTPException, Header
|
||||
from pydantic import BaseModel
|
||||
|
||||
from db import get_pool
|
||||
from gap_analysis.analyzer import generate_gap_analysis
|
||||
|
||||
router = APIRouter(tags=["reports"])
|
||||
|
||||
|
||||
class ReportGenerate(BaseModel):
    """Request body for POST /reports/generate."""

    # Optional crawl job to associate with the report; parsed with
    # uuid.UUID() in generate_report, so it must be a UUID string when set.
    job_id: str | None = None
    # Company profiles the gap analysis evaluates coverage against.
    company_profiles: list[str] = ["universal", "data_processor", "ai_user"]
|
||||
|
||||
|
||||
@router.post("/reports/generate", status_code=201)
async def generate_report(body: ReportGenerate, x_tenant_id: str = Header(...)):
    """Generate and persist an onboarding report with gap analysis.

    Counts the tenant's classified documents, runs gap analysis against the
    requested company profiles, inserts a report row, and returns that row
    augmented with the analysis summary fields.

    Raises:
        HTTPException: 400 if the tenant id or job id is not a valid UUID.
    """
    # Validate UUIDs up front: a malformed header/body value should be a
    # 400 client error, not an unhandled ValueError (HTTP 500).
    try:
        tid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid tenant id")
    try:
        jid = uuid.UUID(body.job_id) if body.job_id else None
    except ValueError:
        raise HTTPException(400, "Invalid job id")

    pool = await get_pool()

    async with pool.acquire() as conn:
        # Count documents by classification for this tenant
        rows = await conn.fetch(
            """SELECT classification, COUNT(*) as cnt
            FROM crawler_documents
            WHERE tenant_id = $1 AND classification IS NOT NULL
            GROUP BY classification""",
            tid,
        )
        classification_counts = {r["classification"]: r["cnt"] for r in rows}

        total_docs = await conn.fetchval(
            "SELECT COUNT(*) FROM crawler_documents WHERE tenant_id = $1", tid
        )

    # Run gap analysis (pure computation; no connection needed)
    analysis = generate_gap_analysis(classification_counts, body.company_profiles)

    # Store report
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_onboarding_reports
            (tenant_id, job_id, total_documents_found, classification_breakdown, gaps, compliance_score)
            VALUES ($1, $2, $3, $4, $5, $6)
            RETURNING *""",
            tid, jid, total_docs,
            json.dumps(classification_counts),
            json.dumps(analysis["gaps"]),
            analysis["compliance_score"],
        )

    # Enrich the stored row with computed summary fields for the response.
    result = dict(row)
    result["gap_summary"] = analysis["gap_summary"]
    result["covered"] = analysis["covered"]
    result["total_required"] = analysis["total_required"]
    return result
|
||||
|
||||
|
||||
@router.get("/reports")
async def list_reports(x_tenant_id: str = Header(...)):
    """Return the 20 most recent onboarding reports for the tenant.

    Raises:
        HTTPException: 400 if the tenant id header is not a valid UUID.
    """
    # Validate the header UUID so a bad value yields 400, not a 500.
    try:
        tid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid tenant id")

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM crawler_onboarding_reports WHERE tenant_id = $1 ORDER BY created_at DESC LIMIT 20",
            tid,
        )
    return [dict(r) for r in rows]
|
||||
|
||||
|
||||
@router.get("/reports/{report_id}")
async def get_report(report_id: str, x_tenant_id: str = Header(...)):
    """Fetch one onboarding report, with stored JSON parsed and a gap summary.

    Raises:
        HTTPException: 400 if either id is not a valid UUID; 404 if no report
            with that id exists for the tenant.
    """
    # Validate both path and header UUIDs so malformed input is a 400
    # client error instead of an unhandled ValueError (HTTP 500).
    try:
        rid = uuid.UUID(report_id)
        tid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid id")

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_onboarding_reports WHERE id = $1 AND tenant_id = $2",
            rid, tid,
        )
    if not row:
        raise HTTPException(404, "Report not found")

    result = dict(row)
    # Parse stored JSON (driver may return JSON/JSONB columns as text)
    if isinstance(result.get("gaps"), str):
        result["gaps"] = json.loads(result["gaps"])
    if isinstance(result.get("classification_breakdown"), str):
        result["classification_breakdown"] = json.loads(result["classification_breakdown"])

    # Add computed summary: gap counts bucketed by severity
    gaps = result.get("gaps", [])
    result["gap_summary"] = {
        "critical": sum(1 for g in gaps if g.get("severity") == "CRITICAL"),
        "high": sum(1 for g in gaps if g.get("severity") == "HIGH"),
        "medium": sum(1 for g in gaps if g.get("severity") == "MEDIUM"),
    }
    return result
|
||||
Reference in New Issue
Block a user