"""Crawl source CRUD endpoints.""" import json import os import uuid from fastapi import APIRouter, HTTPException, Header from pydantic import BaseModel from db import get_pool from config import settings router = APIRouter(tags=["sources"]) class SourceCreate(BaseModel): name: str source_type: str = "local" path: str file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"] max_depth: int = 5 exclude_patterns: list[str] = [] enabled: bool = True class SourceUpdate(BaseModel): name: str | None = None path: str | None = None file_extensions: list[str] | None = None max_depth: int | None = None exclude_patterns: list[str] | None = None enabled: bool | None = None @router.get("/sources") async def list_sources(x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch( "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC", uuid.UUID(x_tenant_id), ) return [dict(r) for r in rows] @router.post("/sources", status_code=201) async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( """INSERT INTO crawler_sources (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *""", uuid.UUID(x_tenant_id), body.name, body.source_type, body.path, json.dumps(body.file_extensions), body.max_depth, json.dumps(body.exclude_patterns), body.enabled, ) return dict(row) @router.put("/sources/{source_id}") async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: existing = await conn.fetchrow( "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2", uuid.UUID(source_id), uuid.UUID(x_tenant_id), ) if not existing: raise HTTPException(404, "Source not found") updates = {} if body.name is not None: updates["name"] = body.name if body.path is not None: updates["path"] = body.path if body.file_extensions is not None: updates["file_extensions"] = json.dumps(body.file_extensions) if body.max_depth is not None: updates["max_depth"] = body.max_depth if body.exclude_patterns is not None: updates["exclude_patterns"] = json.dumps(body.exclude_patterns) if body.enabled is not None: updates["enabled"] = body.enabled if updates: set_clause = ", ".join(f"{k} = ${i+3}" for i, k in enumerate(updates)) sql = f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() WHERE id = $1 AND tenant_id = $2 RETURNING *" row = await conn.fetchrow( sql, uuid.UUID(source_id), uuid.UUID(x_tenant_id), *updates.values() ) return dict(row) return dict(existing) @router.delete("/sources/{source_id}", status_code=204) async def delete_source(source_id: str, x_tenant_id: str = Header(...)): pool = await get_pool() async with pool.acquire() as conn: result = await conn.execute( "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2", uuid.UUID(source_id), uuid.UUID(x_tenant_id), ) if result == "DELETE 0": raise HTTPException(404, "Source not found") @router.post("/sources/{source_id}/test") async def test_source(source_id: str, x_tenant_id: str = Header(...)): """Test connectivity to a crawl source.""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow( "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2", uuid.UUID(source_id), uuid.UUID(x_tenant_id), ) if not row: raise HTTPException(404, "Source not found") # For local sources, check if the path exists inside the container source_path = row["path"] # Resolve relative to CRAWL_BASE_PATH if not os.path.isabs(source_path): source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path) exists = os.path.isdir(source_path) file_count = 0 if exists: exts = json.loads(row["file_extensions"]) if isinstance(row["file_extensions"], str) else row["file_extensions"] for root, dirs, files in os.walk(source_path): for f in files: _, ext = os.path.splitext(f) if ext.lower() in exts: file_count += 1 break # only top-level for test return { "reachable": exists, "path_resolved": source_path, "sample_file_count": file_count, "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden", }