Files
breakpilot-compliance/document-crawler/api/sources.py
Benjamin Boenisch 364d2c69ff feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00

149 lines
5.0 KiB
Python

"""Crawl source CRUD endpoints."""
import json
import os
import uuid
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
from db import get_pool
from config import settings
router = APIRouter(tags=["sources"])
class SourceCreate(BaseModel):
    """Request body for creating a crawl source.

    Defaults match the crawler's supported document types; pydantic
    deep-copies mutable defaults, so the list literals are safe here.
    """
    name: str
    # Only "local" is handled by the current crawler; kept as a free-form
    # string for forward compatibility (e.g. future "smb"/"s3" types).
    source_type: str = "local"
    # Filesystem path; may be relative (resolved against CRAWL_BASE_PATH
    # by the /test endpoint below).
    path: str
    file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]
    # Maximum directory recursion depth for the crawler.
    max_depth: int = 5
    # Glob/substring patterns to skip while crawling.
    exclude_patterns: list[str] = []
    enabled: bool = True
class SourceUpdate(BaseModel):
    """Request body for partially updating a crawl source.

    Every field is optional; only fields explicitly set (not None) are
    written by the PUT handler. Note that `source_type` is intentionally
    not updatable.
    """
    name: str | None = None
    path: str | None = None
    file_extensions: list[str] | None = None
    max_depth: int | None = None
    exclude_patterns: list[str] | None = None
    enabled: bool | None = None
@router.get("/sources")
async def list_sources(x_tenant_id: str = Header(...)):
    """List all crawl sources for the tenant, newest first.

    Raises:
        HTTPException 400: if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Previously a malformed header bubbled up as an unhandled 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC",
            tenant_id,
        )
    return [dict(r) for r in rows]
@router.post("/sources", status_code=201)
async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)):
    """Create a new crawl source for the tenant.

    List-valued fields are stored as JSON strings (the table columns are
    text/jsonb — see the str-vs-list handling in the /test endpoint).

    Raises:
        HTTPException 400: if the X-Tenant-ID header is not a valid UUID.
    """
    try:
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Fail fast with a 400 instead of an unhandled-ValueError 500.
        raise HTTPException(400, "Invalid X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_sources
            (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            RETURNING *""",
            tenant_id,
            body.name,
            body.source_type,
            body.path,
            json.dumps(body.file_extensions),
            body.max_depth,
            json.dumps(body.exclude_patterns),
            body.enabled,
        )
    return dict(row)
@router.put("/sources/{source_id}")
async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)):
    """Partially update a crawl source; only non-None fields are written.

    Returns the updated row, or the unchanged row if no fields were set.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if the source does not exist for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Malformed IDs previously surfaced as unhandled 500s.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
        if not existing:
            raise HTTPException(404, "Source not found")
        updates = {}
        if body.name is not None:
            updates["name"] = body.name
        if body.path is not None:
            updates["path"] = body.path
        if body.file_extensions is not None:
            updates["file_extensions"] = json.dumps(body.file_extensions)
        if body.max_depth is not None:
            updates["max_depth"] = body.max_depth
        if body.exclude_patterns is not None:
            updates["exclude_patterns"] = json.dumps(body.exclude_patterns)
        if body.enabled is not None:
            updates["enabled"] = body.enabled
        if updates:
            # Column names come from the hard-coded whitelist above, never
            # from user input, so interpolating them into SQL is safe.
            # Values are bound as $3.. after id ($1) and tenant_id ($2).
            set_clause = ", ".join(f"{k} = ${i+3}" for i, k in enumerate(updates))
            sql = f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() WHERE id = $1 AND tenant_id = $2 RETURNING *"
            row = await conn.fetchrow(sql, sid, tenant_id, *updates.values())
            return dict(row)
        return dict(existing)
@router.delete("/sources/{source_id}", status_code=204)
async def delete_source(source_id: str, x_tenant_id: str = Header(...)):
    """Delete a crawl source; 204 on success.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if nothing was deleted for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Reject malformed IDs instead of letting ValueError become a 500.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute(
            "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
    # asyncpg returns the command tag, e.g. "DELETE 0" when no row matched.
    if result == "DELETE 0":
        raise HTTPException(404, "Source not found")
@router.post("/sources/{source_id}/test")
async def test_source(source_id: str, x_tenant_id: str = Header(...)):
    """Test connectivity to a crawl source.

    For local sources, checks that the (resolved) path exists inside the
    container and counts matching files in the top-level directory only.

    Raises:
        HTTPException 400: if source_id or X-Tenant-ID is not a valid UUID.
        HTTPException 404: if the source does not exist for this tenant.
    """
    try:
        sid = uuid.UUID(source_id)
        tenant_id = uuid.UUID(x_tenant_id)
    except ValueError:
        # Malformed IDs previously surfaced as unhandled 500s.
        raise HTTPException(400, "Invalid source id or X-Tenant-ID header")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tenant_id,
        )
    if not row:
        raise HTTPException(404, "Source not found")
    # For local sources, check if the path exists inside the container,
    # resolving relative paths against CRAWL_BASE_PATH.
    source_path = row["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)
    exists = os.path.isdir(source_path)
    file_count = 0
    if exists:
        raw_exts = row["file_extensions"]
        # Column may come back as a JSON string or an already-decoded list.
        if isinstance(raw_exts, str):
            raw_exts = json.loads(raw_exts)
        # Normalize to a lowercase set: the file's extension is lower-cased
        # below, so stored extensions like ".PDF" must be lower-cased too,
        # and set membership is O(1).
        exts = {e.lower() for e in raw_exts}
        for _root, _dirs, files in os.walk(source_path):
            for f in files:
                _, ext = os.path.splitext(f)
                if ext.lower() in exts:
                    file_count += 1
            break  # only top-level for test
    return {
        "reachable": exists,
        "path_resolved": source_path,
        "sample_file_count": file_count,
        "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden",
    }