""" Zeugnis Rights-Aware Crawler Crawls official government documents about school certificates (Zeugnisse) from all 16 German federal states. Only indexes documents where AI training is legally permitted. """ import asyncio import hashlib import os import re import uuid from datetime import datetime from typing import Optional, List, Dict, Any, Tuple from dataclasses import dataclass, field import httpx # Local imports from zeugnis_models import ( CrawlStatus, LicenseType, DocType, EventType, BUNDESLAENDER, TRAINING_PERMISSIONS, generate_id, get_training_allowed, get_bundesland_name, ) # ============================================================================= # Configuration # ============================================================================= QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333") MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000") MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key") MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key") MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag") EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local") ZEUGNIS_COLLECTION = "bp_zeugnis" CHUNK_SIZE = 1000 CHUNK_OVERLAP = 200 MAX_RETRIES = 3 RETRY_DELAY = 5 # seconds REQUEST_TIMEOUT = 30 # seconds USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)" # ============================================================================= # Crawler State # ============================================================================= @dataclass class CrawlerState: """Global crawler state.""" is_running: bool = False current_source_id: Optional[str] = None current_bundesland: Optional[str] = None queue: List[Dict] = field(default_factory=list) documents_crawled_today: int = 0 documents_indexed_today: int = 0 errors_today: int = 0 last_activity: Optional[datetime] = None _crawler_state = CrawlerState() # ============================================================================= # 
# Text Extraction
# =============================================================================


def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from PDF bytes.

    Best-effort: returns "" when PyPDF2 is unavailable or the PDF cannot be
    parsed, so callers treat "no text" uniformly.
    """
    try:
        from PyPDF2 import PdfReader
        import io

        reader = PdfReader(io.BytesIO(content))
        text_parts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)
        return "\n\n".join(text_parts)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""


def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Extract readable text from HTML bytes.

    Strips script/style plus common page chrome (nav/header/footer), then
    collapses whitespace to one non-empty text line per output line.
    Best-effort: returns "" on any failure.
    """
    try:
        from bs4 import BeautifulSoup

        html = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(html, "html.parser")
        # Remove script and style elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()
        # Get text
        text = soup.get_text(separator="\n", strip=True)
        # Clean up whitespace
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n".join(lines)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks of at most roughly chunk_size chars.

    Splits on progressively finer separators (paragraph, line, sentence,
    word) and falls back to a hard character split when no separator helps.
    Adjacent chunks share up to `overlap` characters of trailing context.

    FIX: the hard-split step is now clamped to >= 1, so calling with
    overlap >= chunk_size no longer raises ValueError from range().
    """
    if not text:
        return []

    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(segment: str, sep_index: int = 0) -> List[str]:
        # Small enough already: keep as a single chunk (skip pure whitespace).
        if len(segment) <= chunk_size:
            return [segment] if segment.strip() else []
        if sep_index >= len(separators):
            # Force split at chunk_size; clamp so range() never gets a
            # zero or negative step.
            step = max(1, chunk_size - overlap)
            result = []
            for i in range(0, len(segment), step):
                chunk = segment[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result
        sep = separators[sep_index]
        parts = segment.split(sep)
        result = []
        current = ""
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                # Re-join with the separator we split on so text is preserved.
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(
                        split_recursive(current, sep_index + 1)
                        if len(current) > chunk_size
                        else [current]
                    )
                current = part
        if current.strip():
            result.extend(
                split_recursive(current, sep_index + 1)
                if len(current) > chunk_size
                else [current]
            )
        return result

    chunks = split_recursive(text)

    # Prefix each chunk (except the first) with the tail of its predecessor
    # so neighbouring chunks share context.
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                # Add end of previous chunk
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped

    return chunks


def compute_hash(content: bytes) -> str:
    """Compute SHA-256 hex digest of raw document bytes (used for versioning)."""
    return hashlib.sha256(content).hexdigest()


# =============================================================================
# Embedding Generation
# =============================================================================

# Lazily-initialized sentence-transformers model (singleton).
_embedding_model = None


def get_embedding_model():
    """Get or lazily initialize the local embedding model.

    Returns None when EMBEDDING_BACKEND != "local" or the
    sentence-transformers package is not installed.
    """
    global _embedding_model
    if _embedding_model is None and EMBEDDING_BACKEND == "local":
        try:
            from sentence_transformers import SentenceTransformer
            _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
            print("Loaded local embedding model: all-MiniLM-L6-v2")
        except ImportError:
            print("Warning: sentence-transformers not installed")
    return _embedding_model
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of texts.

    Backend selected by EMBEDDING_BACKEND:
      - "local":  sentence-transformers (all-MiniLM-L6-v2)
      - "openai": text-embedding-3-small via the async OpenAI client

    Returns one vector per input text, or [] when input is empty or no
    backend is usable.
    """
    if not texts:
        return []
    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model:
            embeddings = model.encode(texts, show_progress_bar=False)
            return [emb.tolist() for emb in embeddings]
        return []
    elif EMBEDDING_BACKEND == "openai":
        import openai
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []
        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]
    return []


# =============================================================================
# MinIO Storage
# =============================================================================


async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a document to MinIO and return its object path.

    Objects live under landes-daten/<bundesland>/zeugnis/<year>/<filename>;
    the current year is used when `year` is not given. Returns None on any
    failure (best-effort, consistent with the other I/O helpers).

    FIX: the object path previously ended in a literal placeholder and the
    `filename` parameter was never used; the path now includes `filename`.
    """
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )
        # Ensure bucket exists.
        # NOTE(review): exists-then-create is racy if several workers start
        # at once — presumably acceptable for a single crawler; confirm.
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        import io
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )
        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None
# =============================================================================
# Qdrant Indexing
# =============================================================================


async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks in Qdrant.

    Creates the collection on first use (vector size taken from the first
    embedding, defaulting to 384). Returns the number of points upserted,
    or 0 on failure (best-effort, consistent with the other I/O helpers).
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Ensure collection exists
        collections = client.get_collections().collections
        if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
            vector_size = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=vector_size,
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # Create one point per (chunk, embedding) pair.
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point_id = str(uuid.uuid4())
            points.append(PointStruct(
                id=point_id,
                vector=embedding,
                payload={
                    "document_id": doc_id,
                    "chunk_index": i,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                }
            ))

        # Upsert
        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )
        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0


# =============================================================================
# Crawler Worker
# =============================================================================


class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Fetches seed URLs, archives originals to MinIO, records metadata in the
    database, and indexes chunks in Qdrant only when the source's license
    permits AI training.
    """

    def __init__(self):
        # Created lazily in init(); None until then.
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources (HTTP client and DB pool)."""
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )
        # Initialize database connection; the crawler still works without
        # a pool (documents are fetched but not persisted).
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources (the pool is shared and left open)."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch URL with retry and linear backoff.

        Returns (content, content_type), or (None, None) after MAX_RETRIES
        failures or immediately on a 404.

        FIX: non-404 HTTP status errors previously retried with no delay
        (the backoff only applied to the generic exception path); backoff
        now applies uniformly.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                if e.response.status_code == 404:
                    # A 404 will not improve on retry — give up immediately.
                    return None, None
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
            if attempt < MAX_RETRIES - 1:
                # Linear backoff: RETRY_DELAY, 2*RETRY_DELAY, ...
                await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Pipeline: fetch -> extract text -> archive to MinIO -> insert DB
        row -> (only if training_allowed) chunk, embed and index in Qdrant.
        Returns a result dict with success/indexed flags and any error.
        """
        global _crawler_state
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }
        try:
            # Fetch content
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Determine file type from header or extension.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")

            # Extract text
            if is_pdf:
                text = extract_text_from_pdf(content)
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Compute hash for versioning
            content_hash = compute_hash(content)

            # Upload original to MinIO (best-effort; minio_path may be None).
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            # Generate document ID
            doc_id = generate_id()

            # Store document metadata in the database.
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                        training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Only index if training is legally allowed for this source.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id, chunks, embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1
                            # Mark the document row as indexed.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()
        except Exception as e:
            result["error"] = str(e)
            _crawler_state.errors_today += 1
        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all pending seed URLs for a source.

        Updates each seed URL's status (running/completed/failed) as it
        goes and returns a summary dict with counts and errors.
        NOTE(review): this holds one pool connection for the whole crawl
        while crawl_seed_url acquires additional ones from the same pool —
        verify the pool size is > 1.
        """
        global _crawler_state
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result
        try:
            async with self.db_pool.acquire() as conn:
                # Get source info
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1", source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Get seed URLs not yet completed.
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    # Update status to running
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    # Crawl
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    # Record outcome per seed URL.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Small delay between requests (politeness).
                    await asyncio.sleep(1)
        except Exception as e:
            result["errors"].append(str(e))
        finally:
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None
        return result


# =============================================================================
# Crawler Control Functions
# =============================================================================

# Background-run singletons managed by start_crawler/stop_crawler.
_crawler_instance: Optional[ZeugnisCrawler] = None
_crawler_task: Optional[asyncio.Task] = None
= $1", source_id ) elif bundesland: sources = await conn.fetch( "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1", bundesland ) else: sources = await conn.fetch( "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland" ) for source in sources: if not _crawler_state.is_running: break await _crawler_instance.crawl_source(source["id"]) except Exception as e: print(f"Crawler error: {e}") finally: _crawler_state.is_running = False if _crawler_instance: await _crawler_instance.close() _crawler_task = asyncio.create_task(run_crawler()) return True async def stop_crawler() -> bool: """Stop the crawler.""" global _crawler_state, _crawler_task if not _crawler_state.is_running: return False _crawler_state.is_running = False if _crawler_task: _crawler_task.cancel() try: await _crawler_task except asyncio.CancelledError: pass return True def get_crawler_status() -> Dict[str, Any]: """Get current crawler status.""" global _crawler_state return { "is_running": _crawler_state.is_running, "current_source": _crawler_state.current_source_id, "current_bundesland": _crawler_state.current_bundesland, "queue_length": len(_crawler_state.queue), "documents_crawled_today": _crawler_state.documents_crawled_today, "documents_indexed_today": _crawler_state.documents_indexed_today, "errors_today": _crawler_state.errors_today, "last_activity": _crawler_state.last_activity.isoformat() if _crawler_state.last_activity else None, }