backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
387 lines
13 KiB
Python
387 lines
13 KiB
Python
"""
|
|
EduSearch Seeds CRUD Routes.
|
|
|
|
List, get, create, update, delete, and bulk import for seed URLs.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
from typing import Optional, List
|
|
from datetime import datetime
|
|
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
import asyncpg
|
|
|
|
from .edu_search_models import (
|
|
CategoryResponse,
|
|
SeedCreate,
|
|
SeedUpdate,
|
|
SeedResponse,
|
|
SeedsListResponse,
|
|
BulkImportRequest,
|
|
BulkImportResponse,
|
|
)
|
|
|
|
# Module-level logger for this routes module.
logger = logging.getLogger(__name__)

# All routes here are grouped under the "edu-search" tag in the OpenAPI docs.
router = APIRouter(tags=["edu-search"])

# Database connection pool, created lazily on first use (see get_db_pool).
_pool: Optional[asyncpg.Pool] = None
|
|
|
|
|
|
async def get_db_pool() -> asyncpg.Pool:
    """Get or create the shared database connection pool.

    The pool is created lazily from the ``DATABASE_URL`` environment
    variable and cached in the module-level ``_pool``.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is not configured.
    """
    global _pool
    if _pool is None:
        database_url = os.environ.get("DATABASE_URL")
        if not database_url:
            raise RuntimeError("DATABASE_URL nicht konfiguriert - bitte via Vault oder Umgebungsvariable setzen")
        pool = await asyncpg.create_pool(database_url, min_size=2, max_size=10)
        # Concurrent requests may both pass the `_pool is None` check before
        # the awaited create_pool() completes; re-check so only one pool is
        # kept and the loser is closed instead of leaked.
        if _pool is None:
            _pool = pool
        else:
            await pool.close()
    return _pool
|
|
|
|
|
|
@router.get("/categories", response_model=List[CategoryResponse])
|
|
async def list_categories():
|
|
"""List all seed categories."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch("""
|
|
SELECT id, name, display_name, description, icon, sort_order, is_active
|
|
FROM edu_search_categories
|
|
WHERE is_active = TRUE
|
|
ORDER BY sort_order
|
|
""")
|
|
return [
|
|
CategoryResponse(
|
|
id=str(row["id"]),
|
|
name=row["name"],
|
|
display_name=row["display_name"],
|
|
description=row["description"],
|
|
icon=row["icon"],
|
|
sort_order=row["sort_order"],
|
|
is_active=row["is_active"],
|
|
)
|
|
for row in rows
|
|
]
|
|
|
|
|
|
@router.get("/seeds", response_model=SeedsListResponse)
|
|
async def list_seeds(
|
|
category: Optional[str] = Query(None, description="Filter by category name"),
|
|
state: Optional[str] = Query(None, description="Filter by state code"),
|
|
enabled: Optional[bool] = Query(None, description="Filter by enabled status"),
|
|
search: Optional[str] = Query(None, description="Search in name/url"),
|
|
page: int = Query(1, ge=1),
|
|
page_size: int = Query(50, ge=1, le=200),
|
|
):
|
|
"""List seeds with optional filtering and pagination."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
# Build WHERE clause
|
|
conditions = []
|
|
params = []
|
|
param_idx = 1
|
|
|
|
if category:
|
|
conditions.append(f"c.name = ${param_idx}")
|
|
params.append(category)
|
|
param_idx += 1
|
|
|
|
if state:
|
|
conditions.append(f"s.state = ${param_idx}")
|
|
params.append(state)
|
|
param_idx += 1
|
|
|
|
if enabled is not None:
|
|
conditions.append(f"s.enabled = ${param_idx}")
|
|
params.append(enabled)
|
|
param_idx += 1
|
|
|
|
if search:
|
|
conditions.append(f"(s.name ILIKE ${param_idx} OR s.url ILIKE ${param_idx})")
|
|
params.append(f"%{search}%")
|
|
param_idx += 1
|
|
|
|
where_clause = " AND ".join(conditions) if conditions else "TRUE"
|
|
|
|
# Count total
|
|
count_query = f"""
|
|
SELECT COUNT(*) FROM edu_search_seeds s
|
|
LEFT JOIN edu_search_categories c ON s.category_id = c.id
|
|
WHERE {where_clause}
|
|
"""
|
|
total = await conn.fetchval(count_query, *params)
|
|
|
|
# Get paginated results
|
|
offset = (page - 1) * page_size
|
|
params.extend([page_size, offset])
|
|
|
|
query = f"""
|
|
SELECT
|
|
s.id, s.url, s.name, s.description,
|
|
c.name as category, c.display_name as category_display_name,
|
|
s.source_type, s.scope, s.state, s.trust_boost, s.enabled,
|
|
s.crawl_depth, s.crawl_frequency, s.last_crawled_at,
|
|
s.last_crawl_status, s.last_crawl_docs, s.total_documents,
|
|
s.created_at, s.updated_at
|
|
FROM edu_search_seeds s
|
|
LEFT JOIN edu_search_categories c ON s.category_id = c.id
|
|
WHERE {where_clause}
|
|
ORDER BY c.sort_order, s.name
|
|
LIMIT ${param_idx} OFFSET ${param_idx + 1}
|
|
"""
|
|
|
|
rows = await conn.fetch(query, *params)
|
|
|
|
seeds = [_row_to_seed_response(row) for row in rows]
|
|
|
|
return SeedsListResponse(
|
|
seeds=seeds,
|
|
total=total,
|
|
page=page,
|
|
page_size=page_size,
|
|
)
|
|
|
|
|
|
@router.get("/seeds/{seed_id}", response_model=SeedResponse)
|
|
async def get_seed(seed_id: str):
|
|
"""Get a single seed by ID."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
row = await conn.fetchrow("""
|
|
SELECT
|
|
s.id, s.url, s.name, s.description,
|
|
c.name as category, c.display_name as category_display_name,
|
|
s.source_type, s.scope, s.state, s.trust_boost, s.enabled,
|
|
s.crawl_depth, s.crawl_frequency, s.last_crawled_at,
|
|
s.last_crawl_status, s.last_crawl_docs, s.total_documents,
|
|
s.created_at, s.updated_at
|
|
FROM edu_search_seeds s
|
|
LEFT JOIN edu_search_categories c ON s.category_id = c.id
|
|
WHERE s.id = $1
|
|
""", seed_id)
|
|
|
|
if not row:
|
|
raise HTTPException(status_code=404, detail="Seed nicht gefunden")
|
|
|
|
return _row_to_seed_response(row)
|
|
|
|
|
|
@router.post("/seeds", response_model=SeedResponse, status_code=201)
|
|
async def create_seed(seed: SeedCreate):
|
|
"""Create a new seed URL."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
category_id = None
|
|
if seed.category_name:
|
|
category_id = await conn.fetchval(
|
|
"SELECT id FROM edu_search_categories WHERE name = $1",
|
|
seed.category_name
|
|
)
|
|
|
|
try:
|
|
row = await conn.fetchrow("""
|
|
INSERT INTO edu_search_seeds (
|
|
url, name, description, category_id, source_type, scope,
|
|
state, trust_boost, enabled, crawl_depth, crawl_frequency
|
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
|
RETURNING id, created_at, updated_at
|
|
""",
|
|
seed.url, seed.name, seed.description, category_id,
|
|
seed.source_type, seed.scope, seed.state, seed.trust_boost,
|
|
seed.enabled, seed.crawl_depth, seed.crawl_frequency
|
|
)
|
|
except asyncpg.UniqueViolationError:
|
|
raise HTTPException(status_code=409, detail="URL existiert bereits")
|
|
|
|
return SeedResponse(
|
|
id=str(row["id"]),
|
|
url=seed.url,
|
|
name=seed.name,
|
|
description=seed.description,
|
|
category=seed.category_name,
|
|
category_display_name=None,
|
|
source_type=seed.source_type,
|
|
scope=seed.scope,
|
|
state=seed.state,
|
|
trust_boost=seed.trust_boost,
|
|
enabled=seed.enabled,
|
|
crawl_depth=seed.crawl_depth,
|
|
crawl_frequency=seed.crawl_frequency,
|
|
last_crawled_at=None,
|
|
last_crawl_status=None,
|
|
last_crawl_docs=0,
|
|
total_documents=0,
|
|
created_at=row["created_at"],
|
|
updated_at=row["updated_at"],
|
|
)
|
|
|
|
|
|
@router.put("/seeds/{seed_id}", response_model=SeedResponse)
|
|
async def update_seed(seed_id: str, seed: SeedUpdate):
|
|
"""Update an existing seed."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
updates = []
|
|
params = []
|
|
param_idx = 1
|
|
|
|
if seed.url is not None:
|
|
updates.append(f"url = ${param_idx}")
|
|
params.append(seed.url)
|
|
param_idx += 1
|
|
|
|
if seed.name is not None:
|
|
updates.append(f"name = ${param_idx}")
|
|
params.append(seed.name)
|
|
param_idx += 1
|
|
|
|
if seed.description is not None:
|
|
updates.append(f"description = ${param_idx}")
|
|
params.append(seed.description)
|
|
param_idx += 1
|
|
|
|
if seed.category_name is not None:
|
|
category_id = await conn.fetchval(
|
|
"SELECT id FROM edu_search_categories WHERE name = $1",
|
|
seed.category_name
|
|
)
|
|
updates.append(f"category_id = ${param_idx}")
|
|
params.append(category_id)
|
|
param_idx += 1
|
|
|
|
if seed.source_type is not None:
|
|
updates.append(f"source_type = ${param_idx}")
|
|
params.append(seed.source_type)
|
|
param_idx += 1
|
|
|
|
if seed.scope is not None:
|
|
updates.append(f"scope = ${param_idx}")
|
|
params.append(seed.scope)
|
|
param_idx += 1
|
|
|
|
if seed.state is not None:
|
|
updates.append(f"state = ${param_idx}")
|
|
params.append(seed.state)
|
|
param_idx += 1
|
|
|
|
if seed.trust_boost is not None:
|
|
updates.append(f"trust_boost = ${param_idx}")
|
|
params.append(seed.trust_boost)
|
|
param_idx += 1
|
|
|
|
if seed.enabled is not None:
|
|
updates.append(f"enabled = ${param_idx}")
|
|
params.append(seed.enabled)
|
|
param_idx += 1
|
|
|
|
if seed.crawl_depth is not None:
|
|
updates.append(f"crawl_depth = ${param_idx}")
|
|
params.append(seed.crawl_depth)
|
|
param_idx += 1
|
|
|
|
if seed.crawl_frequency is not None:
|
|
updates.append(f"crawl_frequency = ${param_idx}")
|
|
params.append(seed.crawl_frequency)
|
|
param_idx += 1
|
|
|
|
if not updates:
|
|
raise HTTPException(status_code=400, detail="Keine Felder zum Aktualisieren")
|
|
|
|
updates.append("updated_at = NOW()")
|
|
params.append(seed_id)
|
|
|
|
query = f"""
|
|
UPDATE edu_search_seeds
|
|
SET {", ".join(updates)}
|
|
WHERE id = ${param_idx}
|
|
RETURNING id
|
|
"""
|
|
|
|
result = await conn.fetchrow(query, *params)
|
|
if not result:
|
|
raise HTTPException(status_code=404, detail="Seed nicht gefunden")
|
|
|
|
# Return updated seed
|
|
return await get_seed(seed_id)
|
|
|
|
|
|
@router.delete("/seeds/{seed_id}")
|
|
async def delete_seed(seed_id: str):
|
|
"""Delete a seed."""
|
|
pool = await get_db_pool()
|
|
async with pool.acquire() as conn:
|
|
result = await conn.execute(
|
|
"DELETE FROM edu_search_seeds WHERE id = $1",
|
|
seed_id
|
|
)
|
|
if result == "DELETE 0":
|
|
raise HTTPException(status_code=404, detail="Seed nicht gefunden")
|
|
|
|
return {"status": "deleted", "id": seed_id}
|
|
|
|
|
|
@router.post("/seeds/bulk-import", response_model=BulkImportResponse)
|
|
async def bulk_import_seeds(request: BulkImportRequest):
|
|
"""Bulk import seeds (skip duplicates)."""
|
|
pool = await get_db_pool()
|
|
imported = 0
|
|
skipped = 0
|
|
errors = []
|
|
|
|
async with pool.acquire() as conn:
|
|
# Pre-fetch all category IDs
|
|
categories = {}
|
|
rows = await conn.fetch("SELECT id, name FROM edu_search_categories")
|
|
for row in rows:
|
|
categories[row["name"]] = row["id"]
|
|
|
|
for seed in request.seeds:
|
|
try:
|
|
category_id = categories.get(seed.category_name) if seed.category_name else None
|
|
|
|
await conn.execute("""
|
|
INSERT INTO edu_search_seeds (
|
|
url, name, description, category_id, source_type, scope,
|
|
state, trust_boost, enabled, crawl_depth, crawl_frequency
|
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
|
ON CONFLICT (url) DO NOTHING
|
|
""",
|
|
seed.url, seed.name, seed.description, category_id,
|
|
seed.source_type, seed.scope, seed.state, seed.trust_boost,
|
|
seed.enabled, seed.crawl_depth, seed.crawl_frequency
|
|
)
|
|
imported += 1
|
|
except asyncpg.UniqueViolationError:
|
|
skipped += 1
|
|
except Exception as e:
|
|
errors.append(f"{seed.url}: {str(e)}")
|
|
|
|
return BulkImportResponse(imported=imported, skipped=skipped, errors=errors)
|
|
|
|
|
|
def _row_to_seed_response(row) -> SeedResponse:
    """Map a joined seeds/categories database row onto a SeedResponse."""
    # Columns that transfer onto the response model unchanged.
    passthrough = (
        "url", "name", "description", "category", "category_display_name",
        "source_type", "scope", "state", "enabled", "crawl_depth",
        "crawl_frequency", "last_crawled_at", "last_crawl_status",
        "created_at", "updated_at",
    )
    payload = {key: row[key] for key in passthrough}
    # Fields needing conversion or NULL normalization.
    payload["id"] = str(row["id"])
    payload["trust_boost"] = float(row["trust_boost"])
    payload["last_crawl_docs"] = row["last_crawl_docs"] or 0
    payload["total_documents"] = row["total_documents"] or 0
    return SeedResponse(**payload)
|