Files
breakpilot-lehrer/backend-lehrer/llm_gateway/routes/edu_search_seeds.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

711 lines
24 KiB
Python

"""
EduSearch Seeds API Routes.
CRUD operations for managing education search crawler seed URLs.
Direct database access to PostgreSQL.
"""
import logging
import os
from datetime import datetime, timezone
from typing import List, Optional
from uuid import UUID

import asyncpg
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, Field, HttpUrl
# Module-level logger for this routes module.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under the /edu-search prefix.
router = APIRouter(prefix="/edu-search", tags=["edu-search"])

# Database connection pool (created lazily on first use, shared by all requests).
_pool: Optional[asyncpg.Pool] = None


async def get_db_pool() -> asyncpg.Pool:
    """Get or create database connection pool.

    Reads DATABASE_URL from the environment on first call and caches the
    resulting asyncpg pool in the module-level ``_pool``.

    Returns:
        The shared asyncpg connection pool (min_size=2, max_size=10).

    Raises:
        RuntimeError: if DATABASE_URL is not configured.

    NOTE(review): the lazy creation is not guarded against concurrent first
    calls — two coroutines racing here could each create a pool; confirm
    whether first use is serialized at application startup.
    """
    global _pool
    if _pool is None:
        database_url = os.environ.get("DATABASE_URL")
        if not database_url:
            raise RuntimeError("DATABASE_URL nicht konfiguriert - bitte via Vault oder Umgebungsvariable setzen")
        _pool = await asyncpg.create_pool(database_url, min_size=2, max_size=10)
    return _pool
# =============================================================================
# Pydantic Models
# =============================================================================
class CategoryResponse(BaseModel):
    """Category response model.

    Mirrors one row of the ``edu_search_categories`` table as returned by
    the ``GET /categories`` endpoint.
    """
    id: str                            # category UUID, serialized as string
    name: str                          # machine name (e.g. "federal")
    display_name: str                  # human-readable label for UIs
    description: Optional[str] = None
    icon: Optional[str] = None
    sort_order: int                    # ascending display order
    is_active: bool                    # inactive categories are filtered out by the query
class SeedBase(BaseModel):
    """Base seed model for creation/update.

    Field defaults and bounds are enforced by pydantic before any database
    access; ``category_name`` is resolved to a ``category_id`` server-side.
    """
    url: str = Field(..., max_length=500)
    name: str = Field(..., max_length=255)
    description: Optional[str] = None
    category_name: Optional[str] = Field(None, description="Category name (federal, states, etc.)")
    source_type: str = Field("GOV", description="GOV, EDU, UNI, etc.")
    scope: str = Field("FEDERAL", description="FEDERAL, STATE, etc.")
    state: Optional[str] = Field(None, max_length=5, description="State code (BW, BY, etc.)")
    trust_boost: float = Field(0.50, ge=0.0, le=1.0)  # ranking boost, constrained to [0, 1]
    enabled: bool = True  # disabled seeds are excluded from the crawler export
    crawl_depth: int = Field(2, ge=1, le=5)  # link-following depth, constrained to 1..5
    crawl_frequency: str = Field("weekly", description="hourly, daily, weekly, monthly")
class SeedCreate(SeedBase):
    """Seed creation model (identical to SeedBase; exists for API clarity)."""
    pass
class SeedUpdate(BaseModel):
    """Seed update model (all fields optional).

    Fields left as None are not touched by the update endpoint, so a value
    cannot be cleared to NULL through this model.
    """
    url: Optional[str] = Field(None, max_length=500)
    name: Optional[str] = Field(None, max_length=255)
    description: Optional[str] = None
    category_name: Optional[str] = None
    source_type: Optional[str] = None
    scope: Optional[str] = None
    state: Optional[str] = Field(None, max_length=5)
    trust_boost: Optional[float] = Field(None, ge=0.0, le=1.0)
    enabled: Optional[bool] = None
    crawl_depth: Optional[int] = Field(None, ge=1, le=5)
    crawl_frequency: Optional[str] = None
class SeedResponse(BaseModel):
    """Seed response model.

    Joined view of ``edu_search_seeds`` with its category's name and
    display name (both None when the seed has no category).
    """
    id: str
    url: str
    name: str
    description: Optional[str] = None
    category: Optional[str] = None               # category machine name from the JOIN
    category_display_name: Optional[str] = None  # category label from the JOIN
    source_type: str
    scope: str
    state: Optional[str] = None
    trust_boost: float
    enabled: bool
    crawl_depth: int
    crawl_frequency: str
    # Crawl bookkeeping, written back via the crawl-status endpoints.
    last_crawled_at: Optional[datetime] = None
    last_crawl_status: Optional[str] = None
    last_crawl_docs: int = 0
    total_documents: int = 0
    created_at: datetime
    updated_at: datetime
class SeedsListResponse(BaseModel):
    """List response with pagination info."""
    seeds: List[SeedResponse]
    total: int      # total rows matching the filters, not just this page
    page: int       # 1-based page number echoed from the request
    page_size: int
class StatsResponse(BaseModel):
    """Crawl statistics response."""
    total_seeds: int
    enabled_seeds: int
    total_documents: int
    seeds_by_category: dict            # category name -> seed count
    seeds_by_state: dict               # state code (or 'federal') -> seed count
    last_crawl_time: Optional[datetime] = None  # MAX(last_crawled_at); None if never crawled
class BulkImportRequest(BaseModel):
    """Bulk import request: list of seeds to insert (duplicates are skipped)."""
    seeds: List[SeedCreate]
class BulkImportResponse(BaseModel):
    """Bulk import response."""
    imported: int       # rows actually inserted
    skipped: int        # rows skipped as duplicates (existing URL)
    errors: List[str]   # "url: message" entries for rows that failed
# =============================================================================
# API Endpoints
# =============================================================================
@router.get("/categories", response_model=List[CategoryResponse])
async def list_categories():
    """List all seed categories.

    Returns the active categories ordered by their configured sort order.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch("""
            SELECT id, name, display_name, description, icon, sort_order, is_active
            FROM edu_search_categories
            WHERE is_active = TRUE
            ORDER BY sort_order
        """)
        result = []
        for record in records:
            result.append(CategoryResponse(
                id=str(record["id"]),
                name=record["name"],
                display_name=record["display_name"],
                description=record["description"],
                icon=record["icon"],
                sort_order=record["sort_order"],
                is_active=record["is_active"],
            ))
        return result
@router.get("/seeds", response_model=SeedsListResponse)
async def list_seeds(
    category: Optional[str] = Query(None, description="Filter by category name"),
    state: Optional[str] = Query(None, description="Filter by state code"),
    enabled: Optional[bool] = Query(None, description="Filter by enabled status"),
    search: Optional[str] = Query(None, description="Search in name/url"),
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
):
    """List seeds with optional filtering and pagination.

    All filters are optional and combined with AND; ``search`` performs a
    case-insensitive substring match against both name and URL.

    Returns:
        SeedsListResponse with the requested page and the total match count.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Build WHERE clause with numbered asyncpg placeholders ($1, $2, ...).
        # Only the filter VALUES are parameterized; the clause text itself is
        # assembled from fixed fragments, so no injection risk here.
        conditions = []
        params = []
        param_idx = 1
        if category:
            conditions.append(f"c.name = ${param_idx}")
            params.append(category)
            param_idx += 1
        if state:
            conditions.append(f"s.state = ${param_idx}")
            params.append(state)
            param_idx += 1
        if enabled is not None:
            conditions.append(f"s.enabled = ${param_idx}")
            params.append(enabled)
            param_idx += 1
        if search:
            # One placeholder reused for both columns -> only one parameter.
            conditions.append(f"(s.name ILIKE ${param_idx} OR s.url ILIKE ${param_idx})")
            params.append(f"%{search}%")
            param_idx += 1
        where_clause = " AND ".join(conditions) if conditions else "TRUE"
        # Count total matches (before pagination params are appended).
        count_query = f"""
            SELECT COUNT(*) FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE {where_clause}
        """
        total = await conn.fetchval(count_query, *params)
        # Get paginated results; LIMIT/OFFSET become the next two placeholders.
        offset = (page - 1) * page_size
        params.extend([page_size, offset])
        query = f"""
            SELECT
                s.id, s.url, s.name, s.description,
                c.name as category, c.display_name as category_display_name,
                s.source_type, s.scope, s.state, s.trust_boost, s.enabled,
                s.crawl_depth, s.crawl_frequency, s.last_crawled_at,
                s.last_crawl_status, s.last_crawl_docs, s.total_documents,
                s.created_at, s.updated_at
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE {where_clause}
            ORDER BY c.sort_order, s.name
            LIMIT ${param_idx} OFFSET ${param_idx + 1}
        """
        rows = await conn.fetch(query, *params)
        seeds = [
            SeedResponse(
                id=str(row["id"]),
                url=row["url"],
                name=row["name"],
                description=row["description"],
                category=row["category"],
                category_display_name=row["category_display_name"],
                source_type=row["source_type"],
                scope=row["scope"],
                state=row["state"],
                # NUMERIC columns arrive as Decimal; coerce for the float field.
                trust_boost=float(row["trust_boost"]),
                enabled=row["enabled"],
                crawl_depth=row["crawl_depth"],
                crawl_frequency=row["crawl_frequency"],
                last_crawled_at=row["last_crawled_at"],
                last_crawl_status=row["last_crawl_status"],
                # Counters may be NULL in the database; normalize to 0.
                last_crawl_docs=row["last_crawl_docs"] or 0,
                total_documents=row["total_documents"] or 0,
                created_at=row["created_at"],
                updated_at=row["updated_at"],
            )
            for row in rows
        ]
        return SeedsListResponse(
            seeds=seeds,
            total=total,
            page=page,
            page_size=page_size,
        )
@router.get("/seeds/{seed_id}", response_model=SeedResponse)
async def get_seed(seed_id: str):
    """Get a single seed by ID.

    Args:
        seed_id: UUID of the seed, received as a path-parameter string.

    Raises:
        HTTPException: 404 if the ID is not a valid UUID or no seed exists.
    """
    # Fix: validate the path parameter up front. A malformed UUID would
    # otherwise reach Postgres and surface as an unhandled database error
    # (HTTP 500) instead of a clean 404.
    try:
        UUID(seed_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Seed nicht gefunden")
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            SELECT
                s.id, s.url, s.name, s.description,
                c.name as category, c.display_name as category_display_name,
                s.source_type, s.scope, s.state, s.trust_boost, s.enabled,
                s.crawl_depth, s.crawl_frequency, s.last_crawled_at,
                s.last_crawl_status, s.last_crawl_docs, s.total_documents,
                s.created_at, s.updated_at
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE s.id = $1
        """, seed_id)
        if not row:
            raise HTTPException(status_code=404, detail="Seed nicht gefunden")
        return SeedResponse(
            id=str(row["id"]),
            url=row["url"],
            name=row["name"],
            description=row["description"],
            category=row["category"],
            category_display_name=row["category_display_name"],
            source_type=row["source_type"],
            scope=row["scope"],
            state=row["state"],
            # NUMERIC columns arrive as Decimal; coerce for the float field.
            trust_boost=float(row["trust_boost"]),
            enabled=row["enabled"],
            crawl_depth=row["crawl_depth"],
            crawl_frequency=row["crawl_frequency"],
            last_crawled_at=row["last_crawled_at"],
            last_crawl_status=row["last_crawl_status"],
            # Counters may be NULL in the database; normalize to 0.
            last_crawl_docs=row["last_crawl_docs"] or 0,
            total_documents=row["total_documents"] or 0,
            created_at=row["created_at"],
            updated_at=row["updated_at"],
        )
@router.post("/seeds", response_model=SeedResponse, status_code=201)
async def create_seed(seed: SeedCreate):
    """Create a new seed URL.

    Args:
        seed: validated seed payload; ``category_name`` is resolved to a
            ``category_id`` against ``edu_search_categories``.

    Raises:
        HTTPException: 409 if a seed with the same URL already exists.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Resolve the optional category name to its ID.
        # NOTE(review): an unknown category name silently stores NULL;
        # consider rejecting with HTTP 400 instead — confirm with API users.
        category_id = None
        if seed.category_name:
            category_id = await conn.fetchval(
                "SELECT id FROM edu_search_categories WHERE name = $1",
                seed.category_name
            )
        try:
            row = await conn.fetchrow("""
                INSERT INTO edu_search_seeds (
                    url, name, description, category_id, source_type, scope,
                    state, trust_boost, enabled, crawl_depth, crawl_frequency
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
                RETURNING id, created_at, updated_at
            """,
                seed.url, seed.name, seed.description, category_id,
                seed.source_type, seed.scope, seed.state, seed.trust_boost,
                seed.enabled, seed.crawl_depth, seed.crawl_frequency
            )
        except asyncpg.UniqueViolationError:
            raise HTTPException(status_code=409, detail="URL existiert bereits")
        # Echo the request back, augmented with the DB-generated fields.
        return SeedResponse(
            id=str(row["id"]),
            url=seed.url,
            name=seed.name,
            description=seed.description,
            # Fix: only report the category if the name actually resolved;
            # previously an unknown category name was echoed back even though
            # the database stored NULL.
            category=seed.category_name if category_id is not None else None,
            category_display_name=None,
            source_type=seed.source_type,
            scope=seed.scope,
            state=seed.state,
            trust_boost=seed.trust_boost,
            enabled=seed.enabled,
            crawl_depth=seed.crawl_depth,
            crawl_frequency=seed.crawl_frequency,
            last_crawled_at=None,
            last_crawl_status=None,
            last_crawl_docs=0,
            total_documents=0,
            created_at=row["created_at"],
            updated_at=row["updated_at"],
        )
@router.put("/seeds/{seed_id}", response_model=SeedResponse)
async def update_seed(seed_id: str, seed: SeedUpdate):
    """Update an existing seed.

    Only fields provided with a non-None value are written; omitted fields
    keep their current database value (so a field cannot be cleared to NULL
    through this endpoint).

    Raises:
        HTTPException: 400 if no updatable field was provided,
            404 if the ID is malformed or no seed exists.
    """
    # Fix: reject malformed UUIDs up front — they would otherwise surface as
    # a database error (HTTP 500) instead of a 404.
    try:
        UUID(seed_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Seed nicht gefunden")
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Data-driven SET-clause construction replaces twelve copy-pasted
        # if-blocks; placeholder numbers follow the params list length.
        field_values = [
            ("url", seed.url),
            ("name", seed.name),
            ("description", seed.description),
            ("source_type", seed.source_type),
            ("scope", seed.scope),
            ("state", seed.state),
            ("trust_boost", seed.trust_boost),
            ("enabled", seed.enabled),
            ("crawl_depth", seed.crawl_depth),
            ("crawl_frequency", seed.crawl_frequency),
        ]
        updates = []
        params = []
        for column, value in field_values:
            if value is not None:
                params.append(value)
                updates.append(f"{column} = ${len(params)}")
        if seed.category_name is not None:
            # Resolve category name to its ID.
            # NOTE(review): an unknown name resolves to NULL and silently
            # clears the category; consider returning HTTP 400 instead.
            category_id = await conn.fetchval(
                "SELECT id FROM edu_search_categories WHERE name = $1",
                seed.category_name
            )
            params.append(category_id)
            updates.append(f"category_id = ${len(params)}")
        if not updates:
            raise HTTPException(status_code=400, detail="Keine Felder zum Aktualisieren")
        updates.append("updated_at = NOW()")
        params.append(seed_id)
        query = f"""
            UPDATE edu_search_seeds
            SET {", ".join(updates)}
            WHERE id = ${len(params)}
            RETURNING id
        """
        result = await conn.fetchrow(query, *params)
        if not result:
            raise HTTPException(status_code=404, detail="Seed nicht gefunden")
    # Re-read through get_seed so the response includes the joined category.
    return await get_seed(seed_id)
@router.delete("/seeds/{seed_id}")
async def delete_seed(seed_id: str):
    """Delete a seed.

    Raises:
        HTTPException: 404 if the ID is malformed or no seed exists.
    """
    # Fix: validate the UUID first — a malformed ID would otherwise reach
    # Postgres and surface as an unhandled error (HTTP 500) instead of 404.
    try:
        UUID(seed_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Seed nicht gefunden")
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # asyncpg returns the command tag; "DELETE 0" means no row matched.
        result = await conn.execute(
            "DELETE FROM edu_search_seeds WHERE id = $1",
            seed_id
        )
        if result == "DELETE 0":
            raise HTTPException(status_code=404, detail="Seed nicht gefunden")
        return {"status": "deleted", "id": seed_id}
@router.post("/seeds/bulk-import", response_model=BulkImportResponse)
async def bulk_import_seeds(request: BulkImportRequest):
    """Bulk import seeds (skip duplicates).

    Returns:
        BulkImportResponse with counts of inserted and skipped rows plus
        per-row error messages.
    """
    pool = await get_db_pool()
    imported = 0
    skipped = 0
    errors = []
    async with pool.acquire() as conn:
        # Pre-fetch all category IDs to avoid one lookup per seed.
        rows = await conn.fetch("SELECT id, name FROM edu_search_categories")
        categories = {row["name"]: row["id"] for row in rows}
        for seed in request.seeds:
            try:
                category_id = categories.get(seed.category_name) if seed.category_name else None
                result = await conn.execute("""
                    INSERT INTO edu_search_seeds (
                        url, name, description, category_id, source_type, scope,
                        state, trust_boost, enabled, crawl_depth, crawl_frequency
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
                    ON CONFLICT (url) DO NOTHING
                """,
                    seed.url, seed.name, seed.description, category_id,
                    seed.source_type, seed.scope, seed.state, seed.trust_boost,
                    seed.enabled, seed.crawl_depth, seed.crawl_frequency
                )
                # Bug fix: ON CONFLICT DO NOTHING suppresses the unique
                # violation, so the old UniqueViolationError branch was dead
                # and duplicates were counted as imported (skipped was always
                # 0). Inspect the command tag instead: "INSERT 0 0" means the
                # URL already existed, "INSERT 0 1" means a row was written.
                if result == "INSERT 0 0":
                    skipped += 1
                else:
                    imported += 1
            except Exception as e:
                errors.append(f"{seed.url}: {str(e)}")
    return BulkImportResponse(imported=imported, skipped=skipped, errors=errors)
@router.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get crawl statistics.

    Aggregates seed counts overall, per category, and per state, plus the
    most recent crawl timestamp.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Headline counters.
        seed_count = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds")
        enabled_count = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds WHERE enabled = TRUE")
        document_count = await conn.fetchval("SELECT COALESCE(SUM(total_documents), 0) FROM edu_search_seeds")

        # Seed count per category (categories without seeds report 0).
        by_category = {}
        for record in await conn.fetch("""
            SELECT c.name, COUNT(s.id) as count
            FROM edu_search_categories c
            LEFT JOIN edu_search_seeds s ON c.id = s.category_id
            GROUP BY c.name
        """):
            by_category[record["name"]] = record["count"]

        # Seed count per state; stateless seeds are bucketed as 'federal'.
        by_state = {}
        for record in await conn.fetch("""
            SELECT COALESCE(state, 'federal') as state, COUNT(*) as count
            FROM edu_search_seeds
            GROUP BY state
        """):
            by_state[record["state"]] = record["count"]

        # Most recent crawl across all seeds (None if nothing crawled yet).
        newest_crawl = await conn.fetchval(
            "SELECT MAX(last_crawled_at) FROM edu_search_seeds"
        )
        return StatsResponse(
            total_seeds=seed_count,
            enabled_seeds=enabled_count,
            total_documents=document_count,
            seeds_by_category=by_category,
            seeds_by_state=by_state,
            last_crawl_time=newest_crawl,
        )
# Export for external use (edu-search-service)
@router.get("/seeds/export/for-crawler")
async def export_seeds_for_crawler():
    """Export enabled seeds in format suitable for crawler.

    Returns:
        dict with compact seed entries (highest trust first), the total
        count, and a timezone-aware UTC export timestamp.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch("""
            SELECT
                s.url, s.trust_boost, s.source_type, s.scope, s.state,
                s.crawl_depth, c.name as category
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE s.enabled = TRUE
            ORDER BY s.trust_boost DESC
        """)
        return {
            "seeds": [
                {
                    "url": row["url"],
                    "trust": float(row["trust_boost"]),
                    "source": row["source_type"],
                    "scope": row["scope"],
                    "state": row["state"],
                    "depth": row["crawl_depth"],
                    "category": row["category"],
                }
                for row in rows
            ],
            "total": len(rows),
            # Fix: datetime.utcnow() is deprecated (Python 3.12+) and yields
            # a naive timestamp; emit an explicitly UTC-aware ISO string.
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
# =============================================================================
# Crawl Status Feedback (from edu-search-service)
# =============================================================================
class CrawlStatusUpdate(BaseModel):
    """Crawl status update from edu-search-service.

    Seeds are identified by URL here (not by ID) because the crawler only
    knows the URLs it was given.
    """
    seed_url: str = Field(..., description="The seed URL that was crawled")
    status: str = Field(..., description="Crawl status: success, error, partial")
    documents_crawled: int = Field(0, ge=0, description="Number of documents crawled")
    error_message: Optional[str] = Field(None, description="Error message if status is error")
    # NOTE(review): crawl_duration_seconds is only logged, never persisted.
    crawl_duration_seconds: float = Field(0.0, ge=0.0, description="Duration of the crawl in seconds")
class CrawlStatusResponse(BaseModel):
    """Response for crawl status update."""
    success: bool
    seed_url: str   # echoed back for correlation by the caller
    message: str    # human-readable summary (German)
@router.post("/seeds/crawl-status", response_model=CrawlStatusResponse)
async def update_crawl_status(update: CrawlStatusUpdate):
    """Update crawl status for a seed URL (called by edu-search-service).

    Records the latest crawl outcome on the seed and accumulates
    ``documents_crawled`` into ``total_documents``.

    Raises:
        HTTPException: 404 if no seed with the given URL exists.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Fix: fold the previous SELECT-then-UPDATE into one atomic UPDATE.
        # The read-modify-write of total_documents could lose increments when
        # two status reports for the same seed arrived concurrently; the
        # in-database addition cannot. RETURNING doubles as existence check.
        row = await conn.fetchrow("""
            UPDATE edu_search_seeds
            SET
                last_crawled_at = NOW(),
                last_crawl_status = $2,
                last_crawl_docs = $3,
                total_documents = COALESCE(total_documents, 0) + $3,
                updated_at = NOW()
            WHERE url = $1
            RETURNING id
        """, update.seed_url, update.status, update.documents_crawled)
        if row is None:
            raise HTTPException(
                status_code=404,
                detail=f"Seed nicht gefunden: {update.seed_url}"
            )
        # Lazy %-style args avoid formatting cost when INFO is disabled.
        logger.info(
            "Crawl status updated: %s - status=%s, docs=%d, duration=%.1fs",
            update.seed_url, update.status, update.documents_crawled,
            update.crawl_duration_seconds,
        )
        # NOTE(review): crawl_duration_seconds is logged but not persisted —
        # confirm whether it should be stored on the seed row.
        return CrawlStatusResponse(
            success=True,
            seed_url=update.seed_url,
            message=f"Status aktualisiert: {update.documents_crawled} Dokumente gecrawlt"
        )
class BulkCrawlStatusUpdate(BaseModel):
    """Bulk crawl status update: one entry per crawled seed URL."""
    updates: List[CrawlStatusUpdate]
class BulkCrawlStatusResponse(BaseModel):
    """Response for bulk crawl status update."""
    updated: int        # seeds successfully updated
    failed: int         # seeds not found or whose update raised
    errors: List[str]   # one message per failed entry
@router.post("/seeds/crawl-status/bulk", response_model=BulkCrawlStatusResponse)
async def bulk_update_crawl_status(request: BulkCrawlStatusUpdate):
    """Bulk update crawl status for multiple seeds.

    Each entry is processed independently; failures (unknown URL or database
    error) are collected instead of aborting the batch.
    """
    pool = await get_db_pool()
    updated = 0
    failed = 0
    errors = []
    async with pool.acquire() as conn:
        for update in request.updates:
            try:
                # Fix: one atomic UPDATE per entry instead of SELECT-then-
                # UPDATE — halves the round trips and makes the
                # total_documents increment safe under concurrent reports.
                row = await conn.fetchrow("""
                    UPDATE edu_search_seeds
                    SET
                        last_crawled_at = NOW(),
                        last_crawl_status = $2,
                        last_crawl_docs = $3,
                        total_documents = COALESCE(total_documents, 0) + $3,
                        updated_at = NOW()
                    WHERE url = $1
                    RETURNING id
                """, update.seed_url, update.status, update.documents_crawled)
                if row is None:
                    failed += 1
                    errors.append(f"Seed nicht gefunden: {update.seed_url}")
                    continue
                updated += 1
            except Exception as e:
                failed += 1
                errors.append(f"{update.seed_url}: {str(e)}")
    logger.info("Bulk crawl status update: %d updated, %d failed", updated, failed)
    return BulkCrawlStatusResponse(
        updated=updated,
        failed=failed,
        errors=errors
    )