backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
314 lines
11 KiB
Python
314 lines
11 KiB
Python
"""
|
|
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
|
|
|
|
Crawls official government documents about school certificates from
|
|
all 16 German federal states. Only indexes documents where AI training
|
|
is legally permitted.
|
|
"""
|
|
|
|
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import httpx

from zeugnis_models import generate_id
from zeugnis_storage import (
    generate_embeddings,
    index_in_qdrant,
    upload_to_minio,
)
from zeugnis_text import (
    chunk_text,
    compute_hash,
    extract_text_from_html,
    extract_text_from_pdf,
)


# =============================================================================
# Configuration
# =============================================================================

MAX_RETRIES = 3  # fetch attempts per URL before giving up
RETRY_DELAY = 5  # seconds; multiplied by the attempt number (linear backoff)
REQUEST_TIMEOUT = 30  # seconds; applied to every HTTP request
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"


# =============================================================================
# Crawler State
# =============================================================================

@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable snapshot of crawler progress, shared module-wide via the
    ``_crawler_state`` singleton (see ``get_crawler_state``).
    """

    # True while a crawl is running. NOTE(review): not toggled anywhere in
    # this module -- presumably maintained by the API layer; confirm.
    is_running: bool = False
    # Source / Bundesland currently being crawled; None when idle.
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # Pending work items. NOTE(review): not consumed in this module.
    queue: List[Dict] = field(default_factory=list)
    # Daily counters, incremented by the worker. NOTE(review): never reset
    # here -- presumably reset by a scheduled job; confirm.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last successful crawl (naive local time).
    last_activity: Optional[datetime] = None


# Module-level singleton shared by the worker and any API/status handlers.
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Get the global crawler state (shared mutable singleton)."""
    return _crawler_state


# =============================================================================
# Crawler Worker
# =============================================================================

class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Workflow per seed URL: fetch the document, extract its text, upload the
    raw file to MinIO, persist metadata in Postgres, and -- only when the
    source legally permits AI training -- embed and index the text chunks
    in Qdrant.

    Call :meth:`init` before crawling and :meth:`close` when finished.
    """

    def __init__(self):
        # Resources are created in init() so construction needs no event loop.
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None  # asyncpg-style pool; None when DB is unavailable

    async def init(self):
        """Initialize crawler resources (HTTP client and database pool)."""
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # The database is optional: without a pool the crawler can still
        # fetch and index, it just skips metadata persistence.
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch ``url`` with retry logic.

        Returns:
            ``(content, content_type)`` on success; ``(None, None)`` on a
            404 (treated as permanent) or after all retries are exhausted.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                if e.response.status_code == 404:
                    # Permanent failure: retrying a missing document is pointless.
                    return None, None
                # BUGFIX: non-404 HTTP errors (e.g. 500/503) previously fell
                # through and retried immediately with no delay; apply the
                # same linear backoff as the generic error path below.
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: 5s, 10s, ...
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Args:
            seed_url_id: Database id of the seed-URL row.
            url: Document URL to fetch.
            bundesland: Federal state the document belongs to.
            doc_type: Document category; stored as Qdrant payload metadata.
            training_allowed: Whether the source permits AI training. When
                False the document is fetched and stored but never indexed.

        Returns:
            Dict with keys ``seed_url_id``, ``url``, ``success``,
            ``document_id``, ``indexed`` and ``error``. Never raises;
            failures are reported via ``error`` and counted in the global
            crawler state.
        """
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # --- Fetch ---
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # --- Extract text; PDF vs. HTML decided by header or extension ---
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")
            if is_pdf:
                text = extract_text_from_pdf(content)
                # Fall back to a synthetic name when the URL ends with "/".
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Hash of the raw bytes, used downstream for versioning/dedup.
            content_hash = compute_hash(content)

            # --- Store the raw file ---
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            doc_id = generate_id()

            # --- Persist metadata (skipped when no pool is available) ---
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # --- Index. Legal gate: only when training is permitted ---
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Mark the document as searchable.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            # Deliberate catch-all: one bad document must not stop a crawl run.
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all pending (non-completed) seed URLs for a source.

        Looks up the source, then crawls each of its seed URLs in sequence,
        updating the per-URL status rows ('running' -> 'completed'/'failed')
        as it goes. Sleeps 1 second between requests to stay polite.

        Returns:
            Summary dict: ``source_id``, ``documents_found``,
            ``documents_indexed``, ``errors`` (list of messages),
            ``started_at`` and ``completed_at``.
        """
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            # NOTE(review): a single connection is held for the entire run,
            # including crawl time and sleeps -- consider acquiring per URL
            # if pool pressure ever becomes an issue.
            async with self.db_pool.acquire() as conn:
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                # Publish progress for status endpoints.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Small delay between requests to be polite to servers.
                    await asyncio.sleep(1)

        except Exception as e:
            result["errors"].append(str(e))

        finally:
            # Always clear progress markers and stamp completion time,
            # even when the crawl aborted with an error.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result