feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
document-crawler/api/__init__.py
Normal file
0
document-crawler/api/__init__.py
Normal file
152
document-crawler/api/documents.py
Normal file
152
document-crawler/api/documents.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""Document list, reclassify, archive endpoints."""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from fastapi import APIRouter, HTTPException, Header, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from db import get_pool
|
||||
from archiver.dsms_client import archive_document
|
||||
|
||||
router = APIRouter(tags=["documents"])
|
||||
|
||||
|
||||
class ClassifyUpdate(BaseModel):
    """Request body for manually overriding a document's classification."""

    # New classification label; applying it marks the document as human-corrected.
    classification: str
|
||||
|
||||
|
||||
class ArchiveBatch(BaseModel):
    """Request body for archiving several documents in one call."""

    # UUIDs (as strings) of the documents to archive.
    document_ids: list[str]
|
||||
|
||||
|
||||
@router.get("/documents")
async def list_documents(
    x_tenant_id: str = Header(...),
    classification: str | None = Query(None),
    extraction_status: str | None = Query(None),
    archived: bool | None = Query(None),
    limit: int = Query(100, le=500),
    offset: int = Query(0),
):
    """List the tenant's documents with optional filters and paging."""
    pool = await get_pool()

    # Always scope to the tenant ($1); optional filters are appended with
    # sequential asyncpg placeholders so values are never interpolated.
    predicates = ["d.tenant_id = $1"]
    params: list = [uuid.UUID(x_tenant_id)]

    for column, value in (
        ("d.classification", classification),
        ("d.extraction_status", extraction_status),
    ):
        if value:
            params.append(value)
            predicates.append(f"{column} = ${len(params)}")
    if archived is not None:
        params.append(archived)
        predicates.append(f"d.archived = ${len(params)}")

    where = " AND ".join(predicates)
    # LIMIT/OFFSET take the next two placeholder slots after the filters.
    next_idx = len(params) + 1

    async with pool.acquire() as conn:
        total = await conn.fetchval(
            f"SELECT COUNT(*) FROM crawler_documents d WHERE {where}", *params
        )
        rows = await conn.fetch(
            f"""SELECT d.id, d.file_name, d.file_extension, d.file_size_bytes,
            d.classification, d.classification_confidence,
            d.classification_corrected, d.extraction_status,
            d.archived, d.ipfs_cid, d.first_seen_at, d.last_seen_at,
            d.version_count, s.name as source_name
            FROM crawler_documents d
            JOIN crawler_sources s ON d.source_id = s.id
            WHERE {where}
            ORDER BY d.created_at DESC
            LIMIT ${next_idx} OFFSET ${next_idx + 1}""",
            *params, limit, offset,
        )

    return {"total": total, "documents": [dict(r) for r in rows]}
|
||||
|
||||
|
||||
@router.get("/documents/{doc_id}")
async def get_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Fetch one document (tenant-scoped) with a short extracted-text preview."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """SELECT d.*, s.name as source_name
            FROM crawler_documents d
            JOIN crawler_sources s ON d.source_id = s.id
            WHERE d.id = $1 AND d.tenant_id = $2""",
            uuid.UUID(doc_id), uuid.UUID(x_tenant_id),
        )

    if row is None:
        raise HTTPException(404, "Document not found")

    document = dict(row)
    # Expose only the first 500 characters of extracted text as a preview.
    full_text = document.get("extracted_text")
    if full_text:
        document["text_preview"] = full_text[:500]
    return document
|
||||
|
||||
|
||||
@router.put("/documents/{doc_id}/classify")
async def classify_document_manually(
    doc_id: str, body: ClassifyUpdate, x_tenant_id: str = Header(...)
):
    """Override a document's classification and flag it as human-corrected."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        tag = await conn.execute(
            """UPDATE crawler_documents SET
            classification = $3, classification_corrected = true, updated_at = NOW()
            WHERE id = $1 AND tenant_id = $2""",
            uuid.UUID(doc_id), uuid.UUID(x_tenant_id), body.classification,
        )
    # asyncpg reports the affected row count via the command tag, e.g. "UPDATE 0".
    if tag == "UPDATE 0":
        raise HTTPException(404, "Document not found")
    return {"status": "updated", "classification": body.classification, "corrected": True}
|
||||
|
||||
|
||||
@router.post("/documents/{doc_id}/archive")
async def archive_single_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Archive one document to IPFS via the DSMS client and record the CID.

    Idempotent: returns early with the existing CID if already archived.
    Raises 404 when the document is not found for the tenant and 502 when
    the archival call fails.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_documents WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(doc_id), uuid.UUID(x_tenant_id),
        )
    if not row:
        raise HTTPException(404, "Document not found")
    if row["archived"]:
        # Already archived — report the stored CID instead of re-archiving.
        return {"status": "already_archived", "ipfs_cid": row["ipfs_cid"]}

    try:
        result = await archive_document(
            file_path=row["file_path"],
            file_name=row["file_name"],
            document_type=row["classification"] or "unknown",
            document_id=str(row["id"]),
        )
        cid = result.get("cid")
    except Exception as e:
        # Chain the original exception so the root cause survives in tracebacks.
        raise HTTPException(502, f"Archival failed: {e}") from e

    # Persist the archival result (doc_id was already tenant-verified above).
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE crawler_documents SET archived = true, ipfs_cid = $2, archived_at = NOW(), updated_at = NOW() WHERE id = $1",
            uuid.UUID(doc_id), cid,
        )

    return {"status": "archived", "ipfs_cid": cid}
|
||||
|
||||
|
||||
@router.post("/documents/archive-batch")
async def archive_batch(body: ArchiveBatch, x_tenant_id: str = Header(...)):
    """Archive many documents sequentially, collecting per-document outcomes."""
    outcomes = []
    for document_id in body.document_ids:
        try:
            outcome = await archive_single_document(document_id, x_tenant_id)
        except HTTPException as exc:
            # A failure for one document does not abort the batch.
            outcomes.append({"id": document_id, "status": "error", "error": exc.detail})
        else:
            outcomes.append({"id": document_id, **outcome})
    return {"results": outcomes}
|
||||
249
document-crawler/api/jobs.py
Normal file
249
document-crawler/api/jobs.py
Normal file
@@ -0,0 +1,249 @@
|
||||
"""Crawl job management + trigger endpoints."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from fastapi import APIRouter, HTTPException, Header, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
|
||||
from db import get_pool
|
||||
from config import settings
|
||||
from crawlers.filesystem_crawler import FilesystemCrawler
|
||||
from extractors.dispatcher import extract_text
|
||||
from classifiers.llm_classifier import classify_document
|
||||
|
||||
router = APIRouter(tags=["jobs"])
|
||||
|
||||
|
||||
class JobCreate(BaseModel):
    """Request body for starting a crawl job."""

    # UUID string of the crawler_sources row to crawl.
    source_id: str
    # "full" re-processes every file; "delta" skips files whose hash is unchanged.
    job_type: str = "full"  # full or delta
|
||||
|
||||
|
||||
async def _run_crawl_job(job_id: str, source_id: str, tenant_id: str, job_type: str):
    """Background task that executes a crawl job.

    Walks the source's filesystem path, extracts text from each discovered
    file, classifies it, and upserts a crawler_documents row. Per-file
    counters are written back to the crawler_jobs row at the end.
    """
    pool = await get_pool()

    async with pool.acquire() as conn:
        source = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1", uuid.UUID(source_id)
        )
        if not source:
            # Source vanished between job creation and execution: fail the job.
            await conn.execute(
                "UPDATE crawler_jobs SET status = 'failed', error_message = 'Source not found', completed_at = NOW() WHERE id = $1",
                uuid.UUID(job_id),
            )
            return

        # Mark job as running
        await conn.execute(
            "UPDATE crawler_jobs SET status = 'running', started_at = NOW() WHERE id = $1",
            uuid.UUID(job_id),
        )

    # Resolve path — relative source paths are anchored at CRAWL_BASE_PATH.
    source_path = source["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)

    # JSON-typed columns may come back as strings depending on the codec setup.
    exts = json.loads(source["file_extensions"]) if isinstance(source["file_extensions"], str) else source["file_extensions"]
    excludes = json.loads(source["exclude_patterns"]) if isinstance(source["exclude_patterns"], str) else source["exclude_patterns"]

    crawler = FilesystemCrawler(
        base_path=source_path,
        file_extensions=exts,
        max_depth=source["max_depth"],
        exclude_patterns=excludes,
    )

    files = crawler.crawl()

    # Per-job counters; persisted to crawler_jobs after the loop.
    stats = {
        "files_found": len(files),
        "files_processed": 0,
        "files_new": 0,
        "files_changed": 0,
        "files_skipped": 0,
        "files_error": 0,
    }

    for crawled in files:
        try:
            async with pool.acquire() as conn:
                # Check for existing document (delta detection)
                existing = await conn.fetchrow(
                    "SELECT id, file_hash FROM crawler_documents WHERE tenant_id = $1 AND source_id = $2 AND file_path = $3",
                    uuid.UUID(tenant_id), uuid.UUID(source_id), crawled.file_path,
                )

                if existing:
                    if job_type == "delta" and existing["file_hash"] == crawled.file_hash:
                        # Unchanged — skip, but refresh the sighting timestamp.
                        await conn.execute(
                            "UPDATE crawler_documents SET last_seen_at = NOW() WHERE id = $1",
                            existing["id"],
                        )
                        stats["files_skipped"] += 1
                        stats["files_processed"] += 1
                        continue
                    elif existing["file_hash"] != crawled.file_hash:
                        stats["files_changed"] += 1
                    else:
                        # Full crawl but content unchanged: skip re-processing.
                        # NOTE(review): unlike the delta-skip branch above, this
                        # path does not refresh last_seen_at — confirm intended.
                        stats["files_skipped"] += 1
                        stats["files_processed"] += 1
                        continue
                else:
                    stats["files_new"] += 1

                # Extract text — a failed extraction is recorded, not fatal.
                extraction_status = "completed"
                extracted_text = ""
                try:
                    extracted_text = extract_text(crawled.file_path, crawled.file_extension)
                except Exception:
                    extraction_status = "failed"

                # Classify — empty/missing text yields a null classification.
                classification_result = {"classification": None, "confidence": None, "reasoning": None}
                if extracted_text:
                    classification_result = await classify_document(
                        extracted_text, crawled.file_name, tenant_id
                    )

                if existing:
                    # Update existing row, bumping the version counter.
                    await conn.execute(
                        """UPDATE crawler_documents SET
                        job_id = $1, file_size_bytes = $2, file_hash = $3,
                        extracted_text = $4, extraction_status = $5,
                        classification = $6, classification_confidence = $7,
                        classification_reasoning = $8, classification_corrected = false,
                        last_seen_at = NOW(), version_count = version_count + 1,
                        updated_at = NOW()
                        WHERE id = $9""",
                        uuid.UUID(job_id), crawled.file_size_bytes, crawled.file_hash,
                        extracted_text, extraction_status,
                        classification_result["classification"],
                        classification_result["confidence"],
                        classification_result["reasoning"],
                        existing["id"],
                    )
                else:
                    # Insert new row.
                    await conn.execute(
                        """INSERT INTO crawler_documents
                        (tenant_id, source_id, job_id, file_path, file_name, file_extension,
                        file_size_bytes, file_hash, extracted_text, extraction_status,
                        classification, classification_confidence, classification_reasoning)
                        VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13)""",
                        uuid.UUID(tenant_id), uuid.UUID(source_id), uuid.UUID(job_id),
                        crawled.file_path, crawled.file_name, crawled.file_extension,
                        crawled.file_size_bytes, crawled.file_hash,
                        extracted_text, extraction_status,
                        classification_result["classification"],
                        classification_result["confidence"],
                        classification_result["reasoning"],
                    )

                stats["files_processed"] += 1

        except Exception:
            # Never let one bad file abort the whole crawl.
            stats["files_error"] += 1
            stats["files_processed"] += 1

    # Update job progress
    async with pool.acquire() as conn:
        await conn.execute(
            """UPDATE crawler_jobs SET
            files_found=$2, files_processed=$3, files_new=$4,
            files_changed=$5, files_skipped=$6, files_error=$7
            WHERE id = $1""",
            uuid.UUID(job_id),
            stats["files_found"], stats["files_processed"],
            stats["files_new"], stats["files_changed"],
            stats["files_skipped"], stats["files_error"],
        )

    # Mark completed
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE crawler_jobs SET status = 'completed', completed_at = NOW() WHERE id = $1",
            uuid.UUID(job_id),
        )
|
||||
|
||||
|
||||
@router.post("/jobs", status_code=201)
async def create_job(
    body: JobCreate,
    background_tasks: BackgroundTasks,
    x_tenant_id: str = Header(...),
):
    """Create a crawl-job row and launch the crawl as a background task."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        # The source must exist and belong to the calling tenant.
        source = await conn.fetchrow(
            "SELECT id FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(body.source_id), uuid.UUID(x_tenant_id),
        )
        if source is None:
            raise HTTPException(404, "Source not found")

        # Refuse concurrent crawls of the same source.
        already_running = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM crawler_jobs WHERE source_id = $1 AND status = 'running')",
            uuid.UUID(body.source_id),
        )
        if already_running:
            raise HTTPException(409, "A job is already running for this source")

        row = await conn.fetchrow(
            """INSERT INTO crawler_jobs (tenant_id, source_id, job_type)
            VALUES ($1, $2, $3) RETURNING *""",
            uuid.UUID(x_tenant_id), uuid.UUID(body.source_id), body.job_type,
        )

    # Kick off the crawl after responding; FastAPI runs it post-response.
    background_tasks.add_task(
        _run_crawl_job, str(row["id"]), body.source_id, x_tenant_id, body.job_type
    )
    return dict(row)
|
||||
|
||||
|
||||
@router.get("/jobs")
async def list_jobs(x_tenant_id: str = Header(...)):
    """Return the 50 most recent crawl jobs for the tenant."""
    pool = await get_pool()
    query = """SELECT j.*, s.name as source_name
    FROM crawler_jobs j JOIN crawler_sources s ON j.source_id = s.id
    WHERE j.tenant_id = $1 ORDER BY j.created_at DESC LIMIT 50"""
    async with pool.acquire() as conn:
        job_rows = await conn.fetch(query, uuid.UUID(x_tenant_id))
    return [dict(job) for job in job_rows]
|
||||
|
||||
|
||||
@router.get("/jobs/{job_id}")
async def get_job(job_id: str, x_tenant_id: str = Header(...)):
    """Fetch a single crawl job, scoped to the tenant."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        job = await conn.fetchrow(
            "SELECT * FROM crawler_jobs WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(job_id), uuid.UUID(x_tenant_id),
        )
    if job is None:
        raise HTTPException(404, "Job not found")
    return dict(job)
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/cancel")
async def cancel_job(job_id: str, x_tenant_id: str = Header(...)):
    """Mark a pending or running job as cancelled."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        tag = await conn.execute(
            "UPDATE crawler_jobs SET status = 'cancelled', completed_at = NOW() WHERE id = $1 AND tenant_id = $2 AND status IN ('pending', 'running')",
            uuid.UUID(job_id), uuid.UUID(x_tenant_id),
        )
    # Command tag "UPDATE 0" means no cancellable row matched.
    if tag == "UPDATE 0":
        raise HTTPException(404, "Job not found or not cancellable")
    return {"status": "cancelled"}
|
||||
99
document-crawler/api/reports.py
Normal file
99
document-crawler/api/reports.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Onboarding report + gap analysis endpoints."""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from fastapi import APIRouter, HTTPException, Header
|
||||
from pydantic import BaseModel
|
||||
|
||||
from db import get_pool
|
||||
from gap_analysis.analyzer import generate_gap_analysis
|
||||
|
||||
router = APIRouter(tags=["reports"])
|
||||
|
||||
|
||||
class ReportGenerate(BaseModel):
    """Request body for generating an onboarding / gap-analysis report."""

    # Optional crawl job to associate the report with.
    job_id: str | None = None
    # Company profiles whose required-document sets drive the gap analysis.
    # (Pydantic copies mutable defaults, so the list default is safe.)
    company_profiles: list[str] = ["universal", "data_processor", "ai_user"]
|
||||
|
||||
|
||||
@router.post("/reports/generate", status_code=201)
async def generate_report(body: ReportGenerate, x_tenant_id: str = Header(...)):
    """Run gap analysis over the tenant's classified documents and store a report.

    Counts documents per classification, runs the compliance gap analysis for
    the requested company profiles, persists the report row, and returns it
    with the analysis summary attached.
    """
    pool = await get_pool()
    tid = uuid.UUID(x_tenant_id)

    async with pool.acquire() as conn:
        # Count documents by classification for this tenant
        rows = await conn.fetch(
            """SELECT classification, COUNT(*) as cnt
            FROM crawler_documents
            WHERE tenant_id = $1 AND classification IS NOT NULL
            GROUP BY classification""",
            tid,
        )
        classification_counts = {r["classification"]: r["cnt"] for r in rows}

        total_docs = await conn.fetchval(
            "SELECT COUNT(*) FROM crawler_documents WHERE tenant_id = $1", tid
        )

    # Run gap analysis
    analysis = generate_gap_analysis(classification_counts, body.company_profiles)

    # Store report
    async with pool.acquire() as conn:
        jid = uuid.UUID(body.job_id) if body.job_id else None
        row = await conn.fetchrow(
            """INSERT INTO crawler_onboarding_reports
            (tenant_id, job_id, total_documents_found, classification_breakdown, gaps, compliance_score)
            VALUES ($1, $2, $3, $4, $5, $6)
            RETURNING *""",
            tid, jid, total_docs,
            json.dumps(classification_counts),
            json.dumps(analysis["gaps"]),
            analysis["compliance_score"],
        )

    result = dict(row)
    # Consistency fix: GET /reports/{id} returns these fields as parsed JSON;
    # return parsed objects here too instead of raw JSON strings from the row.
    for field in ("gaps", "classification_breakdown"):
        if isinstance(result.get(field), str):
            result[field] = json.loads(result[field])
    result["gap_summary"] = analysis["gap_summary"]
    result["covered"] = analysis["covered"]
    result["total_required"] = analysis["total_required"]
    return result
|
||||
|
||||
|
||||
@router.get("/reports")
async def list_reports(x_tenant_id: str = Header(...)):
    """Return the tenant's 20 most recent onboarding reports."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        report_rows = await conn.fetch(
            "SELECT * FROM crawler_onboarding_reports WHERE tenant_id = $1 ORDER BY created_at DESC LIMIT 20",
            uuid.UUID(x_tenant_id),
        )
    return [dict(report) for report in report_rows]
|
||||
|
||||
|
||||
@router.get("/reports/{report_id}")
async def get_report(report_id: str, x_tenant_id: str = Header(...)):
    """Return a stored report with parsed JSON fields and a gap-severity summary."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_onboarding_reports WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(report_id), uuid.UUID(x_tenant_id),
        )
    if row is None:
        raise HTTPException(404, "Report not found")

    report = dict(row)
    # JSON columns may come back as raw strings; normalize to Python objects.
    for field in ("gaps", "classification_breakdown"):
        if isinstance(report.get(field), str):
            report[field] = json.loads(report[field])

    # Add computed severity summary over the parsed gap list.
    gaps = report.get("gaps", [])
    report["gap_summary"] = {
        label: sum(1 for g in gaps if g.get("severity") == severity)
        for label, severity in (
            ("critical", "CRITICAL"),
            ("high", "HIGH"),
            ("medium", "MEDIUM"),
        )
    }
    return report
|
||||
148
document-crawler/api/sources.py
Normal file
148
document-crawler/api/sources.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""Crawl source CRUD endpoints."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from fastapi import APIRouter, HTTPException, Header
|
||||
from pydantic import BaseModel
|
||||
|
||||
from db import get_pool
|
||||
from config import settings
|
||||
|
||||
router = APIRouter(tags=["sources"])
|
||||
|
||||
|
||||
class SourceCreate(BaseModel):
    """Request body for registering a new crawl source."""

    name: str
    # Source kind; defaults to "local" filesystem.
    source_type: str = "local"
    # Absolute path, or a path relative to CRAWL_BASE_PATH (resolved at crawl time).
    path: str
    # File types to pick up. Pydantic copies mutable defaults, so the list
    # defaults here are safe (not shared between instances).
    file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]
    # Maximum directory recursion depth for the crawler.
    max_depth: int = 5
    # Path patterns to skip while crawling.
    exclude_patterns: list[str] = []
    enabled: bool = True
|
||||
|
||||
|
||||
class SourceUpdate(BaseModel):
    """Partial-update body for a crawl source; only non-None fields are written."""

    name: str | None = None
    path: str | None = None
    file_extensions: list[str] | None = None
    max_depth: int | None = None
    exclude_patterns: list[str] | None = None
    enabled: bool | None = None
|
||||
|
||||
|
||||
@router.get("/sources")
async def list_sources(x_tenant_id: str = Header(...)):
    """Return all crawl sources for the tenant, newest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        source_rows = await conn.fetch(
            "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC",
            uuid.UUID(x_tenant_id),
        )
    return [dict(source) for source in source_rows]
|
||||
|
||||
|
||||
@router.post("/sources", status_code=201)
async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)):
    """Create a crawl source; list-valued fields are stored as JSON text."""
    pool = await get_pool()
    values = (
        uuid.UUID(x_tenant_id),
        body.name,
        body.source_type,
        body.path,
        json.dumps(body.file_extensions),
        body.max_depth,
        json.dumps(body.exclude_patterns),
        body.enabled,
    )
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_sources
            (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            RETURNING *""",
            *values,
        )
    return dict(row)
|
||||
|
||||
|
||||
@router.put("/sources/{source_id}")
async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)):
    """Partially update a crawl source; only fields the client sent are written."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(source_id), uuid.UUID(x_tenant_id),
        )
        if existing is None:
            raise HTTPException(404, "Source not found")

        # Collect (column, value) pairs for the provided fields, in a fixed order.
        changes: list[tuple[str, object]] = []
        if body.name is not None:
            changes.append(("name", body.name))
        if body.path is not None:
            changes.append(("path", body.path))
        if body.file_extensions is not None:
            changes.append(("file_extensions", json.dumps(body.file_extensions)))
        if body.max_depth is not None:
            changes.append(("max_depth", body.max_depth))
        if body.exclude_patterns is not None:
            changes.append(("exclude_patterns", json.dumps(body.exclude_patterns)))
        if body.enabled is not None:
            changes.append(("enabled", body.enabled))

        if not changes:
            # Nothing to change: echo back the current row.
            return dict(existing)

        # $1/$2 are id and tenant_id, so value placeholders start at $3.
        set_clause = ", ".join(f"{col} = ${pos + 3}" for pos, (col, _) in enumerate(changes))
        sql = f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() WHERE id = $1 AND tenant_id = $2 RETURNING *"
        row = await conn.fetchrow(
            sql,
            uuid.UUID(source_id),
            uuid.UUID(x_tenant_id),
            *(value for _, value in changes),
        )
        return dict(row)
|
||||
|
||||
|
||||
@router.delete("/sources/{source_id}", status_code=204)
async def delete_source(source_id: str, x_tenant_id: str = Header(...)):
    """Delete a crawl source; 404 when it does not exist for this tenant."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        tag = await conn.execute(
            "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(source_id), uuid.UUID(x_tenant_id),
        )
    # Command tag "DELETE 0" means no row matched.
    if tag == "DELETE 0":
        raise HTTPException(404, "Source not found")
|
||||
|
||||
|
||||
@router.post("/sources/{source_id}/test")
async def test_source(source_id: str, x_tenant_id: str = Header(...)):
    """Test connectivity to a crawl source."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(source_id), uuid.UUID(x_tenant_id),
        )
    if row is None:
        raise HTTPException(404, "Source not found")

    # Local sources: check that the path exists inside the container.
    # Relative paths are anchored at CRAWL_BASE_PATH.
    source_path = row["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)

    exists = os.path.isdir(source_path)
    file_count = 0
    if exists:
        raw_exts = row["file_extensions"]
        exts = json.loads(raw_exts) if isinstance(raw_exts, str) else raw_exts
        for _root, _dirs, names in os.walk(source_path):
            # Count matching files, then stop: the probe only samples the
            # top directory level.
            file_count = sum(
                1 for name in names if os.path.splitext(name)[1].lower() in exts
            )
            break

    return {
        "reachable": exists,
        "path_resolved": source_path,
        "sample_file_count": file_count,
        "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden",
    }
|
||||
Reference in New Issue
Block a user