[split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
313
klausur-service/backend/zeugnis_worker.py
Normal file
313
klausur-service/backend/zeugnis_worker.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""
|
||||
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
|
||||
|
||||
Crawls official government documents about school certificates from
|
||||
all 16 German federal states. Only indexes documents where AI training
|
||||
is legally permitted.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
from zeugnis_models import generate_id
|
||||
from zeugnis_text import (
|
||||
extract_text_from_pdf,
|
||||
extract_text_from_html,
|
||||
chunk_text,
|
||||
compute_hash,
|
||||
)
|
||||
from zeugnis_storage import (
|
||||
upload_to_minio,
|
||||
generate_embeddings,
|
||||
index_in_qdrant,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 5 # seconds
|
||||
REQUEST_TIMEOUT = 30 # seconds
|
||||
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler State
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable, process-wide status record. ZeugnisCrawler mutates the
    per-source fields and daily counters; get_crawler_state() exposes the
    singleton (e.g. for a status/monitoring endpoint).
    """
    # NOTE(review): never written anywhere in this file — presumably toggled
    # by the API layer that starts/stops crawls; confirm against callers.
    is_running: bool = False
    # Source currently being crawled; set/cleared by ZeugnisCrawler.crawl_source.
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # NOTE(review): not touched in this file — verify who enqueues/dequeues.
    queue: List[Dict] = field(default_factory=list)
    # Daily counters, incremented by ZeugnisCrawler.crawl_seed_url.
    # NOTE(review): nothing in this file resets them at midnight — confirm.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last completed crawl step (naive local datetime.now()).
    last_activity: Optional[datetime] = None
|
||||
|
||||
|
||||
_crawler_state = CrawlerState()
|
||||
|
||||
|
||||
def get_crawler_state() -> CrawlerState:
    """Return the module-level CrawlerState singleton shared by all crawls."""
    return _crawler_state
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Worker
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Fetches seed URLs, extracts text from PDF/HTML, archives the raw
    content in MinIO, records the document in the database, and — only
    when the source legally permits AI training — chunks, embeds and
    indexes the text in Qdrant. Progress is mirrored into the module-level
    CrawlerState singleton.
    """

    def __init__(self):
        # Shared HTTP client; created lazily in init(), closed in close().
        self.http_client: Optional[httpx.AsyncClient] = None
        # DB connection pool ($n placeholders below suggest asyncpg —
        # TODO confirm against metrics_db.get_pool). None until init() succeeds.
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources (HTTP client and DB pool).

        Must be awaited before fetch_url/crawl_* are used. A DB failure is
        logged and tolerated: crawl_seed_url degrades to fetch-and-store-only,
        crawl_source refuses to run.
        """
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # Initialize database connection
        try:
            # Imported here (not at module top) so the module stays importable
            # when metrics_db is unavailable.
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            # Best-effort by design: crawler still works without a DB pool.
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources (the HTTP client; the pool is not owned here)."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch URL with retry logic.

        Returns (body_bytes, content_type) on success, (None, None) after a
        404 or once all MAX_RETRIES attempts are exhausted. Requires init()
        to have been called (uses self.http_client).
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                # Raises httpx.HTTPStatusError for 4xx/5xx responses.
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                # 404 is permanent — give up immediately.
                if e.response.status_code == 404:
                    return None, None
                # NOTE(review): non-404 HTTP errors fall through and retry
                # *without* the back-off sleep below — confirm intended.
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear back-off: 5s, 10s, ... between attempts.
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Pipeline: fetch -> extract text (PDF or HTML) -> hash -> upload raw
        bytes to MinIO -> insert document row -> (only if training_allowed)
        chunk, embed and index in Qdrant.

        Returns a dict with keys: seed_url_id, url, success, document_id,
        indexed, error. Note that success=True with indexed=False and a
        non-None error occurs for sources where training is not allowed.
        Never raises: any exception is captured into result["error"].
        """
        # NOTE(review): `global` is unnecessary here (only attributes are
        # mutated, the name is never rebound) — harmless as written.
        global _crawler_state

        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # Fetch content
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Determine file type: trust the Content-Type header first,
            # fall back to the URL extension.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")

            # Extract text
            if is_pdf:
                text = extract_text_from_pdf(content)
                # Last path segment as title; synthesized name for e.g.
                # trailing-slash URLs where the segment is empty.
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Compute hash for versioning (dedup/change detection downstream)
            content_hash = compute_hash(content)

            # Upload the raw (unprocessed) bytes to MinIO for archival.
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            # Generate document ID
            doc_id = generate_id()

            # Store document in database (skipped silently if init() could
            # not obtain a pool — the MinIO copy still exists).
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Rights gate: only index content into the vector store when the
            # source's license permits AI training.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Mirror the indexed flag onto the document row.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                # Document is archived but deliberately NOT indexed; the
                # "error" field doubles as the human-readable reason here.
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            # Catch-all so one bad document never aborts a whole source run;
            # the caller (crawl_source) records the message per seed URL.
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all seed URLs for a source.

        Drives the per-seed-URL state machine in zeugnis_seed_urls:
        -> 'running' before the fetch, then 'completed' (with last_crawled)
        or 'failed' (with error_message). URLs already 'completed' are
        skipped. Requires the DB pool from init().

        Returns a summary dict: source_id, documents_found,
        documents_indexed, errors (list of strings), started_at,
        completed_at.
        """
        # NOTE(review): `global` is unnecessary (attribute mutation only).
        global _crawler_state

        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        # Unlike crawl_seed_url, this method hard-requires the database:
        # it reads sources/seed URLs and writes status transitions.
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            # NOTE(review): one pooled connection is held for the entire
            # source run, including per-URL crawl time and sleeps — fine for
            # a single-worker crawler, but worth confirming pool sizing.
            async with self.db_pool.acquire() as conn:
                # Get source info
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                # bundesland / training_allowed apply to every seed URL of
                # this source (rights are tracked per source, not per URL).
                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                # Publish "what is being crawled right now" for status views;
                # cleared again in the finally block below.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Get seed URLs (anything not yet successfully completed,
                # i.e. pending/failed/running are all retried).
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    # Update status to running
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    # Crawl
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    # Update status based on the per-URL outcome.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Politeness delay between requests to the same site.
                    await asyncio.sleep(1)

        except Exception as e:
            # A failure outside the per-URL handling (e.g. DB errors on the
            # status updates) aborts the remaining URLs for this source.
            result["errors"].append(str(e))

        finally:
            # Always stamp completion and clear the "currently crawling"
            # markers, even on abort, so status views don't show a stale run.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result
|
||||
Reference in New Issue
Block a user