"""
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.

Crawls official government documents about school certificates from
all 16 German federal states. Only indexes documents where AI training
is legally permitted.
"""

import asyncio
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass, field
from urllib.parse import urlsplit

import httpx

from zeugnis_models import generate_id
from zeugnis_text import (
    extract_text_from_pdf,
    extract_text_from_html,
    chunk_text,
    compute_hash,
)
from zeugnis_storage import (
    upload_to_minio,
    generate_embeddings,
    index_in_qdrant,
)


# =============================================================================
# Configuration
# =============================================================================

MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds
REQUEST_TIMEOUT = 30  # seconds
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"


# =============================================================================
# Crawler State
# =============================================================================

@dataclass
class CrawlerState:
    """Global crawler state."""

    # Not modified by the code in this module.
    is_running: bool = False
    # Set by ZeugnisCrawler.crawl_source while a source is being processed,
    # reset to None when it finishes (successfully or not).
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # Not touched by the code in this module.
    queue: List[Dict] = field(default_factory=list)
    # Incremented by ZeugnisCrawler.crawl_seed_url. Nothing in this module
    # resets them, so the "today" window is presumably managed by a caller.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Naive local timestamp of the last crawl_seed_url call that finished
    # without raising.
    last_activity: Optional[datetime] = None


_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Get the global crawler state."""
    return _crawler_state


# =============================================================================
# Crawler Worker
# =============================================================================

class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Call ``init()`` before crawling and ``close()`` when done. Documents are
    always fetched and stored; they are only chunked, embedded and indexed
    when the source permits training.
    """

    def __init__(self):
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources.

        Creates the shared HTTP client and tries to obtain the database pool.
        A failure to get the pool is reported but not raised: the crawler
        then runs without database persistence (``db_pool`` stays ``None``).
        """
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # Initialize database connection
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources."""
        if self.http_client:
            await self.http_client.aclose()
            # Drop the reference so later fetches fail fast instead of
            # retrying against a closed client.
            self.http_client = None

    async def _execute(self, query: str, *args: Any) -> None:
        """Run one statement on a pooled connection held only for that call."""
        async with self.db_pool.acquire() as conn:
            await conn.execute(query, *args)

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch URL with retry logic.

        Args:
            url: Absolute URL to download.

        Returns:
            ``(content, content_type)`` on success, where ``content_type`` is
            the response's ``content-type`` header (``""`` when absent).
            ``(None, None)`` when the client is not initialized, the server
            answers 404, or all ``MAX_RETRIES`` attempts fail. This method
            does not raise for request failures.
        """
        if self.http_client is None:
            # Without this guard the AttributeError on None would be caught
            # below and retried, wasting every back-off sleep.
            print(f"HTTP client not initialized, cannot fetch {url}")
            return None, None

        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                # A missing document will not appear on retry; every other
                # HTTP error status falls through to the back-off below.
                if e.response.status_code == 404:
                    return None, None
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")

            # Linear back-off (5 s, 10 s, ...); no sleep after the last attempt.
            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(RETRY_DELAY * (attempt + 1))

        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Fetches the document, extracts its text, uploads the raw bytes to
        MinIO, records it in ``zeugnis_documents`` (when a database pool is
        available) and, only if ``training_allowed`` is true, chunks, embeds
        and indexes the text in Qdrant.

        Args:
            seed_url_id: Identifier of the seed URL row being crawled.
            url: URL to fetch.
            bundesland: Federal state the document belongs to; used for the
                upload and as index metadata.
            doc_type: Document type stored as index metadata.
            training_allowed: Whether the document may be indexed.

        Returns:
            Dict with the keys ``seed_url_id``, ``url``, ``success``,
            ``document_id``, ``indexed`` and ``error``. Exceptions are not
            propagated; they are reported through ``error``.
        """
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # Fetch content
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Determine file type. The path component is checked in addition
            # to the raw URL so that "….pdf?download=1" is still recognised.
            url_path = urlsplit(url).path
            is_pdf = (
                "pdf" in (content_type or "").lower()
                or url.lower().endswith(".pdf")
                or url_path.lower().endswith(".pdf")
            )

            # Extract text
            if is_pdf:
                text = extract_text_from_pdf(content)
                # Name the file after the last path segment only; the query
                # string and fragment are not part of the file name.
                filename = url_path.rsplit("/", 1)[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Compute hash for versioning
            content_hash = compute_hash(content)

            # Upload to MinIO
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            # Generate document ID
            doc_id = generate_id()

            # Store document in database
            if self.db_pool:
                await self._execute(
                    """
                    INSERT INTO zeugnis_documents
                    (id, seed_url_id, title, url, content_hash, minio_path,
                     training_allowed, file_size, content_type)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                    ON CONFLICT DO NOTHING
                    """,
                    doc_id, seed_url_id, filename, url, content_hash,
                    minio_path, training_allowed, len(content), content_type
                )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Only index if training is allowed
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Update database
                            if self.db_pool:
                                await self._execute(
                                    "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                    doc_id
                                )
            else:
                # The document was stored ("success" stays True); the message
                # only explains why it was not indexed.
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            # NOTE(review): if this fires after the document was stored,
            # "success" remains True while "error" is set - confirm callers
            # expect that combination.
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all seed URLs for a source.

        Loads the source and its seed URLs that are not yet ``completed``,
        crawls them one after another and records each outcome
        (``running`` -> ``completed`` / ``failed``) in ``zeugnis_seed_urls``.

        Database connections are held only for the duration of each
        statement. ``crawl_seed_url`` acquires its own connection from the
        same pool, so keeping one checked out across the whole crawl would
        tie up two connections per crawl and can block forever on a small or
        saturated pool.

        Args:
            source_id: Identifier of the row in ``zeugnis_sources``.

        Returns:
            Dict with ``source_id``, ``documents_found``,
            ``documents_indexed``, ``errors`` (list of messages),
            ``started_at`` and ``completed_at``. ``completed_at`` stays
            ``None`` only when no database pool is available.
        """
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            async with self.db_pool.acquire() as conn:
                # Get source info
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )

                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Get seed URLs
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

            # The connection is back in the pool from here on.
            for seed_url in seed_urls:
                # Update status to running
                await self._execute(
                    "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                    seed_url["id"]
                )

                # Crawl
                crawl_result = await self.crawl_seed_url(
                    seed_url["id"],
                    seed_url["url"],
                    bundesland,
                    seed_url["doc_type"],
                    training_allowed,
                )

                # Update status
                if crawl_result["success"]:
                    result["documents_found"] += 1
                    if crawl_result["indexed"]:
                        result["documents_indexed"] += 1
                    await self._execute(
                        "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                        seed_url["id"]
                    )
                else:
                    result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                    await self._execute(
                        "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                        seed_url["id"], crawl_result["error"]
                    )

                # Small delay between requests
                await asyncio.sleep(1)

        except Exception as e:
            result["errors"].append(str(e))

        finally:
            # Runs on every exit path, including the early "source not found"
            # return above.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result