Files
Benjamin Boenisch 364d2c69ff feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00

100 lines
3.4 KiB
Python

"""Onboarding report + gap analysis endpoints."""
import json
import uuid

from fastapi import APIRouter, Header, HTTPException
from pydantic import BaseModel, Field

from db import get_pool
from gap_analysis.analyzer import generate_gap_analysis
# Shared router for all report/gap-analysis endpoints; mounted by the app.
router = APIRouter(tags=["reports"])
class ReportGenerate(BaseModel):
    """Request body for POST /reports/generate."""

    # Optional crawl job to associate the report with (UUID string).
    job_id: str | None = None
    # Company profiles to evaluate during gap analysis. Use a
    # default_factory so each model instance gets its own fresh list
    # rather than sharing one mutable class-level default.
    company_profiles: list[str] = Field(
        default_factory=lambda: ["universal", "data_processor", "ai_user"]
    )
@router.post("/reports/generate", status_code=201)
async def generate_report(body: ReportGenerate, x_tenant_id: str = Header(...)):
    """Generate and persist an onboarding report for a tenant.

    Counts the tenant's classified crawler documents, runs gap analysis
    against the requested company profiles, stores the report row, and
    returns it enriched with computed summary fields.

    Raises:
        HTTPException 400: if the tenant id or job id is not a valid UUID
            (previously an uncaught ValueError surfaced as a 500).
    """
    # Validate external identifiers up front so malformed input yields a
    # clean 400 instead of an internal server error.
    try:
        tid = uuid.UUID(x_tenant_id)
        jid = uuid.UUID(body.job_id) if body.job_id else None
    except ValueError:
        raise HTTPException(400, "Invalid UUID")

    pool = await get_pool()
    # One connection for the whole request: the original acquired the pool
    # twice (once for the counts, once for the insert) for no benefit.
    async with pool.acquire() as conn:
        # Count documents by classification for this tenant.
        rows = await conn.fetch(
            """SELECT classification, COUNT(*) as cnt
               FROM crawler_documents
               WHERE tenant_id = $1 AND classification IS NOT NULL
               GROUP BY classification""",
            tid,
        )
        classification_counts = {r["classification"]: r["cnt"] for r in rows}
        total_docs = await conn.fetchval(
            "SELECT COUNT(*) FROM crawler_documents WHERE tenant_id = $1", tid
        )
        # Gap analysis is pure computation on the counts; its output is
        # persisted (gaps, score) and partially echoed in the response.
        analysis = generate_gap_analysis(classification_counts, body.company_profiles)
        row = await conn.fetchrow(
            """INSERT INTO crawler_onboarding_reports
               (tenant_id, job_id, total_documents_found, classification_breakdown, gaps, compliance_score)
               VALUES ($1, $2, $3, $4, $5, $6)
               RETURNING *""",
            tid, jid, total_docs,
            json.dumps(classification_counts),
            json.dumps(analysis["gaps"]),
            analysis["compliance_score"],
        )

    result = dict(row)
    # Attach computed fields that are not persisted on the report row.
    result["gap_summary"] = analysis["gap_summary"]
    result["covered"] = analysis["covered"]
    result["total_required"] = analysis["total_required"]
    return result
@router.get("/reports")
async def list_reports(x_tenant_id: str = Header(...)):
    """Return the 20 most recent onboarding reports for the tenant.

    Raises:
        HTTPException 400: if the tenant id header is not a valid UUID
            (previously an uncaught ValueError surfaced as a 500).
    """
    try:
        tid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid UUID")

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM crawler_onboarding_reports WHERE tenant_id = $1 ORDER BY created_at DESC LIMIT 20",
            tid,
        )
    return [dict(r) for r in rows]
@router.get("/reports/{report_id}")
async def get_report(report_id: str, x_tenant_id: str = Header(...)):
    """Fetch one report, normalizing stored JSON and adding a gap summary.

    Raises:
        HTTPException 400: if the report id or tenant id is not a valid
            UUID (previously an uncaught ValueError surfaced as a 500).
        HTTPException 404: if no matching report exists for this tenant.
    """
    try:
        rid = uuid.UUID(report_id)
        tid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid UUID")

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_onboarding_reports WHERE id = $1 AND tenant_id = $2",
            rid, tid,
        )
    # Release the connection before post-processing / raising.
    if not row:
        raise HTTPException(404, "Report not found")

    result = dict(row)
    # The driver may hand these columns back as raw JSON strings;
    # decode them so the response carries structured data.
    for key in ("gaps", "classification_breakdown"):
        if isinstance(result.get(key), str):
            result[key] = json.loads(result[key])

    # Computed severity roll-up; not stored on the row.
    gaps = result.get("gaps", [])
    result["gap_summary"] = {
        "critical": sum(1 for g in gaps if g.get("severity") == "CRITICAL"),
        "high": sum(1 for g in gaps if g.get("severity") == "HIGH"),
        "medium": sum(1 for g in gaps if g.get("severity") == "MEDIUM"),
    }
    return result