Files
breakpilot-compliance/document-crawler/api/sources.py
Benjamin Boenisch 364d2c69ff feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00

149 lines
5.0 KiB
Python

"""Crawl source CRUD endpoints."""
import json
import os
import uuid
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
from db import get_pool
from config import settings
router = APIRouter(tags=["sources"])
class SourceCreate(BaseModel):
    """Request body for creating a crawl source.

    Defaults match the crawler's supported document types; pydantic
    deep-copies mutable defaults, so the list literals are safe here.
    """
    name: str
    # Only "local" is handled by the current crawler; kept as a free-form
    # string for forward compatibility (e.g. future "smb"/"s3" types).
    source_type: str = "local"
    # Filesystem path; may be relative (resolved against CRAWL_BASE_PATH
    # by the /test endpoint below).
    path: str
    file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]
    # Maximum directory recursion depth for the crawler.
    max_depth: int = 5
    # Glob/substring patterns to skip while crawling.
    exclude_patterns: list[str] = []
    enabled: bool = True
class SourceUpdate(BaseModel):
    """Request body for partially updating a crawl source.

    Every field is optional; only fields explicitly set (not None) are
    written by the PUT handler. Note that `source_type` is intentionally
    not updatable.
    """
    name: str | None = None
    path: str | None = None
    file_extensions: list[str] | None = None
    max_depth: int | None = None
    exclude_patterns: list[str] | None = None
    enabled: bool | None = None
@router.get("/sources")
async def list_sources(x_tenant_id: str = Header(...)):
    """List all crawl sources for the tenant, newest first.

    Raises:
        HTTPException 400: if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously a malformed header bubbled up as an unhandled 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC",
            tenant_id,
        )
    return [dict(r) for r in rows]
@router.post("/sources", status_code=201)
async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)):
    """Create a new crawl source for the tenant.

    List-valued fields are stored as JSON strings (the table columns are
    text/jsonb — see the str-vs-list handling in the /test endpoint).

    Raises:
        HTTPException 400: if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Fail fast with a 400 instead of an unhandled-ValueError 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_sources
            (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            RETURNING *""",
            tenant_id,
            body.name,
            body.source_type,
            body.path,
            json.dumps(body.file_extensions),
            body.max_depth,
            json.dumps(body.exclude_patterns),
            body.enabled,
        )
    return dict(row)
@router.put("/sources/{source_id}")
async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)):
    """Partially update a crawl source; only non-None fields are written.

    Returns the updated row, or the unchanged row if no fields were set.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if the source does not exist for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Malformed IDs previously surfaced as unhandled 500s.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
        if not existing:
            raise HTTPException(404, "Source not found")
        updates = {}
        if body.name is not None:
            updates["name"] = body.name
        if body.path is not None:
            updates["path"] = body.path
        if body.file_extensions is not None:
            updates["file_extensions"] = json.dumps(body.file_extensions)
        if body.max_depth is not None:
            updates["max_depth"] = body.max_depth
        if body.exclude_patterns is not None:
            updates["exclude_patterns"] = json.dumps(body.exclude_patterns)
        if body.enabled is not None:
            updates["enabled"] = body.enabled
        if updates:
            # Column names come from the hard-coded whitelist above, never
            # from user input, so interpolating them into SQL is safe.
            # Values are bound as $3.. after id ($1) and tenant_id ($2).
            set_clause = ", ".join(f"{k} = ${i+3}" for i, k in enumerate(updates))
            sql = f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() WHERE id = $1 AND tenant_id = $2 RETURNING *"
            row = await conn.fetchrow(sql, sid, tenant_id, *updates.values())
            return dict(row)
        return dict(existing)
@router.delete("/sources/{source_id}", status_code=204)
async def delete_source(source_id: str, x_tenant_id: str = Header(...)):
    """Delete a crawl source; 204 on success.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if nothing was deleted for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Reject malformed IDs instead of letting ValueError become a 500.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
    # asyncpg returns the command tag, e.g. "DELETE 0" when no row matched.
    if result == "DELETE 0":
        raise HTTPException(404, "Source not found")
@router.post("/sources/{source_id}/test")
async def test_source(source_id: str, x_tenant_id: str = Header(...)):
    """Test connectivity to a crawl source.

    For local sources, checks that the (resolved) path exists inside the
    container and counts matching files in the top-level directory only.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if the source does not exist for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Malformed IDs previously surfaced as unhandled 500s.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
    if not row:
        raise HTTPException(404, "Source not found")
    # For local sources, check if the path exists inside the container,
    # resolving relative paths against CRAWL_BASE_PATH.
    source_path = row["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)
    exists = os.path.isdir(source_path)
    file_count = 0
    if exists:
        raw_exts = row["file_extensions"]
        # Column may come back as a JSON string or an already-decoded list.
        if isinstance(raw_exts, str):
            raw_exts = json.loads(raw_exts)
        # Normalize to a lowercase set: the file's extension is lower-cased
        # below, so stored extensions like ".PDF" must be lower-cased too,
        # and set membership is O(1).
        exts = {e.lower() for e in raw_exts}
        for _root, _dirs, files in os.walk(source_path):
            for f in files:
                _, ext = os.path.splitext(f)
                if ext.lower() in exts:
                    file_count += 1
            break  # only top-level for test
    return {
        "reachable": exists,
        "path_resolved": source_path,
        "sample_file_count": file_count,
        "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden",
    }