"""
EduSearch Seeds Stats & Crawl Status Routes.

Statistics, export for crawler, and crawl status feedback endpoints.
"""

import logging
from typing import List
from datetime import datetime, timezone

from fastapi import APIRouter, HTTPException
import asyncpg

from .edu_search_models import (
    StatsResponse,
    CrawlStatusUpdate,
    CrawlStatusResponse,
    BulkCrawlStatusUpdate,
    BulkCrawlStatusResponse,
)
from .edu_search_crud import get_db_pool

logger = logging.getLogger(__name__)

router = APIRouter(tags=["edu-search"])


# The running total is incremented inside the UPDATE statement itself instead
# of being read, added to in Python and written back. Two concurrent crawl
# reports for the same seed can therefore no longer overwrite each other.
_APPLY_CRAWL_STATUS_SQL = """
    UPDATE edu_search_seeds
    SET last_crawled_at = NOW(),
        last_crawl_status = $2,
        last_crawl_docs = $3,
        total_documents = COALESCE(total_documents, 0) + $4,
        updated_at = NOW()
    WHERE id = $1
"""


async def _apply_crawl_status(conn, update: CrawlStatusUpdate) -> bool:
    """Record one crawl result on the seed whose URL matches ``update.seed_url``.

    Args:
        conn: Connection acquired from the pool returned by ``get_db_pool()``.
        update: Object exposing ``seed_url``, ``status`` and
            ``documents_crawled`` (the items of a bulk request are used the
            same way).

    Returns:
        ``True`` when a seed was found and updated, ``False`` when no seed
        with that URL exists. Database errors propagate to the caller.
    """
    seed_id = await conn.fetchval(
        "SELECT id FROM edu_search_seeds WHERE url = $1",
        update.seed_url,
    )
    if seed_id is None:
        return False

    # documents_crawled is passed twice: once as the per-crawl count ($3) and
    # once as the increment for the running total ($4).
    await conn.execute(
        _APPLY_CRAWL_STATUS_SQL,
        seed_id,
        update.status,
        update.documents_crawled,
        update.documents_crawled,
    )
    return True


@router.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get crawl statistics.

    Returns:
        A ``StatsResponse`` with the total and enabled seed counts, the sum of
        ``total_documents``, seed counts per category name and per state, and
        the most recent ``last_crawled_at`` value.
    """
    pool = await get_db_pool()

    async with pool.acquire() as conn:
        # Basic counts
        total = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds")
        enabled = await conn.fetchval(
            "SELECT COUNT(*) FROM edu_search_seeds WHERE enabled = TRUE"
        )
        total_docs = await conn.fetchval(
            "SELECT COALESCE(SUM(total_documents), 0) FROM edu_search_seeds"
        )

        # By category (LEFT JOIN keeps categories that have no seeds, count 0)
        cat_rows = await conn.fetch("""
            SELECT c.name, COUNT(s.id) AS count
            FROM edu_search_categories c
            LEFT JOIN edu_search_seeds s ON c.id = s.category_id
            GROUP BY c.name
        """)
        by_category = {row["name"]: row["count"] for row in cat_rows}

        # By state. Group by the COALESCE expression rather than the bare name
        # "state": PostgreSQL resolves an ambiguous GROUP BY name to the input
        # column, so NULL rows and literal 'federal' rows would come back as
        # two rows with the same label and one would overwrite the other in
        # the dict below.
        state_rows = await conn.fetch("""
            SELECT COALESCE(state, 'federal') AS state, COUNT(*) AS count
            FROM edu_search_seeds
            GROUP BY COALESCE(state, 'federal')
        """)
        by_state = {row["state"]: row["count"] for row in state_rows}

        # Last crawl time
        last_crawl = await conn.fetchval(
            "SELECT MAX(last_crawled_at) FROM edu_search_seeds"
        )

    return StatsResponse(
        total_seeds=total,
        enabled_seeds=enabled,
        total_documents=total_docs,
        seeds_by_category=by_category,
        seeds_by_state=by_state,
        last_crawl_time=last_crawl,
    )


# Export for external use (edu-search-service)
@router.get("/seeds/export/for-crawler")
async def export_seeds_for_crawler():
    """Export enabled seeds in format suitable for crawler.

    Returns:
        A dict with ``seeds`` (one entry per enabled seed, ordered by
        ``trust_boost`` descending), ``total`` (number of entries) and
        ``exported_at`` (naive UTC timestamp in ISO format).
    """
    pool = await get_db_pool()

    async with pool.acquire() as conn:
        rows = await conn.fetch("""
            SELECT
                s.url, s.trust_boost, s.source_type, s.scope, s.state,
                s.crawl_depth, c.name AS category
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE s.enabled = TRUE
            ORDER BY s.trust_boost DESC
        """)

    # datetime.utcnow() is deprecated since Python 3.12. Build the same naive
    # UTC timestamp from an aware one so the exported string format does not
    # change for consumers.
    exported_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return {
        "seeds": [
            {
                "url": row["url"],
                "trust": float(row["trust_boost"]),
                "source": row["source_type"],
                "scope": row["scope"],
                "state": row["state"],
                "depth": row["crawl_depth"],
                "category": row["category"],
            }
            for row in rows
        ],
        "total": len(rows),
        "exported_at": exported_at,
    }


# =============================================================================
# Crawl Status Feedback (from edu-search-service)
# =============================================================================


@router.post("/seeds/crawl-status", response_model=CrawlStatusResponse)
async def update_crawl_status(update: CrawlStatusUpdate):
    """Update crawl status for a seed URL (called by edu-search-service).

    Args:
        update: Crawl result for a single seed.

    Returns:
        A ``CrawlStatusResponse`` confirming the update.

    Raises:
        HTTPException: 404 when no seed with ``update.seed_url`` exists.
    """
    pool = await get_db_pool()

    async with pool.acquire() as conn:
        found = await _apply_crawl_status(conn, update)

    if not found:
        raise HTTPException(
            status_code=404,
            detail=f"Seed nicht gefunden: {update.seed_url}"
        )

    logger.info(
        f"Crawl status updated: {update.seed_url} - "
        f"status={update.status}, docs={update.documents_crawled}, "
        f"duration={update.crawl_duration_seconds:.1f}s"
    )

    return CrawlStatusResponse(
        success=True,
        seed_url=update.seed_url,
        message=f"Status aktualisiert: {update.documents_crawled} Dokumente gecrawlt"
    )


@router.post("/seeds/crawl-status/bulk", response_model=BulkCrawlStatusResponse)
async def bulk_update_crawl_status(request: BulkCrawlStatusUpdate):
    """Bulk update crawl status for multiple seeds.

    Each entry of ``request.updates`` is applied independently: an unknown
    seed URL or a failing update is counted and reported in ``errors`` and
    does not stop the remaining entries from being processed.

    Args:
        request: Container whose ``updates`` are applied one by one.

    Returns:
        A ``BulkCrawlStatusResponse`` with the number of updated and failed
        entries and one error message per failed entry.
    """
    pool = await get_db_pool()

    updated = 0
    failed = 0
    errors = []

    async with pool.acquire() as conn:
        for update in request.updates:
            try:
                found = await _apply_crawl_status(conn, update)
            except Exception as e:
                # Per-item best effort: keep the traceback in the server log
                # and report the failure to the caller, then continue.
                logger.exception(
                    "Bulk crawl status update failed for %s", update.seed_url
                )
                failed += 1
                errors.append(f"{update.seed_url}: {e}")
                continue

            if found:
                updated += 1
            else:
                failed += 1
                errors.append(f"Seed nicht gefunden: {update.seed_url}")

    logger.info(f"Bulk crawl status update: {updated} updated, {failed} failed")

    return BulkCrawlStatusResponse(
        updated=updated,
        failed=failed,
        errors=errors
    )