""" Zeugnis Crawler - Start/stop/status control functions. """ import asyncio from typing import Optional, Dict, Any from zeugnis_worker import ZeugnisCrawler, get_crawler_state _crawler_instance: Optional[ZeugnisCrawler] = None _crawler_task: Optional[asyncio.Task] = None async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool: """Start the crawler.""" global _crawler_instance, _crawler_task state = get_crawler_state() if state.is_running: return False state.is_running = True state.documents_crawled_today = 0 state.documents_indexed_today = 0 state.errors_today = 0 _crawler_instance = ZeugnisCrawler() await _crawler_instance.init() async def run_crawler(): try: from metrics_db import get_pool pool = await get_pool() if pool: async with pool.acquire() as conn: # Get sources to crawl if source_id: sources = await conn.fetch( "SELECT id, bundesland FROM zeugnis_sources WHERE id = $1", source_id ) elif bundesland: sources = await conn.fetch( "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1", bundesland ) else: sources = await conn.fetch( "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland" ) for source in sources: if not state.is_running: break await _crawler_instance.crawl_source(source["id"]) except Exception as e: print(f"Crawler error: {e}") finally: state.is_running = False if _crawler_instance: await _crawler_instance.close() _crawler_task = asyncio.create_task(run_crawler()) return True async def stop_crawler() -> bool: """Stop the crawler.""" global _crawler_task state = get_crawler_state() if not state.is_running: return False state.is_running = False if _crawler_task: _crawler_task.cancel() try: await _crawler_task except asyncio.CancelledError: pass return True def get_crawler_status() -> Dict[str, Any]: """Get current crawler status.""" state = get_crawler_state() return { "is_running": state.is_running, "current_source": state.current_source_id, "current_bundesland": state.current_bundesland, "queue_length": len(state.queue), "documents_crawled_today": state.documents_crawled_today, "documents_indexed_today": state.documents_indexed_today, "errors_today": state.errors_today, "last_activity": state.last_activity.isoformat() if state.last_activity else None, }