New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
149 lines
5.0 KiB
Python
149 lines
5.0 KiB
Python
"""Crawl source CRUD endpoints."""
|
|
|
|
import json
|
|
import os
|
|
import uuid
|
|
from fastapi import APIRouter, HTTPException, Header
|
|
from pydantic import BaseModel
|
|
|
|
from db import get_pool
|
|
from config import settings
|
|
|
|
router = APIRouter(tags=["sources"])
|
|
|
|
|
|
class SourceCreate(BaseModel):
    """Request body for registering a new crawl source."""

    # Human-readable label for the source.
    name: str
    # Kind of source; only "local" filesystem sources are handled in this file.
    source_type: str = "local"
    # Directory to crawl — absolute, or relative to settings.CRAWL_BASE_PATH
    # (resolved in test_source).
    path: str
    # Only files with these extensions are considered (compared lowercase).
    file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]
    # Maximum crawl recursion depth — presumably enforced by the crawler,
    # not in this module. TODO(review): confirm against the crawler code.
    max_depth: int = 5
    # Patterns to skip while crawling (stored as JSON; semantics defined
    # by the crawler, not here).
    exclude_patterns: list[str] = []
    # Disabled sources are stored but (presumably) skipped by crawl runs.
    enabled: bool = True
|
|
|
|
|
|
class SourceUpdate(BaseModel):
    """Partial-update body for a crawl source.

    Every field is optional; ``None`` (or omitted) means "leave unchanged"
    — see update_source, which only applies non-null fields.
    Note: source_type is intentionally not updatable.
    """

    name: str | None = None
    path: str | None = None
    file_extensions: list[str] | None = None
    max_depth: int | None = None
    exclude_patterns: list[str] | None = None
    enabled: bool | None = None
|
|
|
|
|
|
@router.get("/sources")
async def list_sources(x_tenant_id: str = Header(...)):
    """Return all crawl sources for the tenant, newest first.

    Raises:
        HTTPException: 400 if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously a malformed header surfaced as an unhandled 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC",
            tenant_uuid,
        )
        return [dict(r) for r in rows]
|
|
|
|
|
|
@router.post("/sources", status_code=201)
async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)):
    """Insert a new crawl source for the tenant and return the stored row.

    Raises:
        HTTPException: 400 if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously a malformed header surfaced as an unhandled 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_sources
            (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            RETURNING *""",
            tenant_uuid,
            body.name,
            body.source_type,
            body.path,
            # List-valued columns are persisted as JSON text; test_source
            # decodes them again with json.loads.
            json.dumps(body.file_extensions),
            body.max_depth,
            json.dumps(body.exclude_patterns),
            body.enabled,
        )
        return dict(row)
|
|
|
|
|
|
@router.put("/sources/{source_id}")
async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)):
    """Partially update a crawl source; only non-null body fields are applied.

    Returns the updated row, or the current row unchanged if the body
    contained no updatable fields.

    Raises:
        HTTPException: 400 for a malformed source id or tenant header,
            404 if no source with that id exists for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously malformed UUIDs surfaced as an unhandled 500.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")

    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_uuid,
        )
        if not existing:
            raise HTTPException(404, "Source not found")

        # Collect only the fields the client actually supplied (non-null).
        # List-valued columns are stored as JSON text, mirroring create_source.
        json_fields = {"file_extensions", "exclude_patterns"}
        updates = {}
        for field in ("name", "path", "file_extensions", "max_depth",
                      "exclude_patterns", "enabled"):
            value = getattr(body, field)
            if value is not None:
                updates[field] = json.dumps(value) if field in json_fields else value

        if not updates:
            # Nothing to change — hand back the row as it stands.
            return dict(existing)

        # Safe f-string SQL: column names come from the fixed tuple above,
        # never from user input; all VALUES are bound as $3..$n.
        set_clause = ", ".join(f"{col} = ${i + 3}" for i, col in enumerate(updates))
        sql = (
            f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() "
            "WHERE id = $1 AND tenant_id = $2 RETURNING *"
        )
        row = await conn.fetchrow(sql, sid, tenant_uuid, *updates.values())
        return dict(row)
|
|
|
|
|
|
@router.delete("/sources/{source_id}", status_code=204)
async def delete_source(source_id: str, x_tenant_id: str = Header(...)):
    """Delete a crawl source; 204 on success.

    Raises:
        HTTPException: 400 for a malformed source id or tenant header,
            404 if no source with that id exists for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously malformed UUIDs surfaced as an unhandled 500.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_uuid,
        )
        # The driver returns the command tag (e.g. "DELETE 1");
        # "DELETE 0" means no row matched this id/tenant pair.
        if result == "DELETE 0":
            raise HTTPException(404, "Source not found")
|
|
|
|
|
|
@router.post("/sources/{source_id}/test")
async def test_source(source_id: str, x_tenant_id: str = Header(...)):
    """Test connectivity to a crawl source.

    For local sources this checks that the configured directory exists
    inside the container and counts matching files in the top-level
    directory only (no recursion — this is just a quick reachability probe).

    Raises:
        HTTPException: 400 for a malformed source id or tenant header,
            404 if no source with that id exists for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously malformed UUIDs surfaced as an unhandled 500.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_uuid,
        )
    # Connection released here — the filesystem walk below can be slow and
    # must not hold a pooled connection hostage.
    if not row:
        raise HTTPException(404, "Source not found")

    # For local sources, check if the path exists inside the container.
    source_path = row["path"]
    # Relative paths are resolved against the configured crawl root.
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)

    exists = os.path.isdir(source_path)
    file_count = 0
    if exists:
        # file_extensions may come back as JSON text or an already-decoded
        # list, depending on the driver's codec for the column.
        raw_exts = row["file_extensions"]
        exts = json.loads(raw_exts) if isinstance(raw_exts, str) else raw_exts
        # os.walk yields the top directory first; we only sample that level.
        for _root, _dirs, files in os.walk(source_path):
            file_count = sum(
                1 for name in files
                if os.path.splitext(name)[1].lower() in exts
            )
            break  # only top-level for test

    return {
        "reachable": exists,
        "path_resolved": source_path,
        "sample_file_count": file_count,
        "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden",
    }
|