Files
breakpilot-lehrer/klausur-service/backend/zeugnis_worker.py
Benjamin Admin b4613e26f3 [split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files):
- game/database.py (785 → 5), correction_api.py (683 → 4)
- classroom_engine/antizipation.py (676 → 5)
- llm_gateway schools/edu_search already done in prior batch

klausur-service (12 files):
- orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4)
- zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5)
- eh_templates.py (658 → 5), mail/api.py (651 → 5)
- qdrant_service.py (638 → 5), training_api.py (625 → 4)

website (6 pages):
- middleware (696 → 8), mail (733 → 6), consent (628 → 8)
- compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7)

studio-v2 (3 components):
- B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2)
- dashboard-experimental (739 → 2)

admin-lehrer (4 files):
- uebersetzungen (769 → 4), manager (670 → 2)
- ChunkBrowserQA (675 → 6), dsfa/page (674 → 5)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:24:01 +02:00

314 lines
11 KiB
Python

"""
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
Crawls official government documents about school certificates from
all 16 German federal states. Only indexes documents where AI training
is legally permitted.
"""
import asyncio
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass, field
import httpx
from zeugnis_models import generate_id
from zeugnis_text import (
extract_text_from_pdf,
extract_text_from_html,
chunk_text,
compute_hash,
)
from zeugnis_storage import (
upload_to_minio,
generate_embeddings,
index_in_qdrant,
)
# =============================================================================
# Configuration
# =============================================================================
# HTTP fetch behaviour for the crawler worker.
MAX_RETRIES = 3  # attempts per URL before fetch_url gives up
RETRY_DELAY = 5  # seconds; base delay, scaled linearly by attempt number
REQUEST_TIMEOUT = 30  # seconds; per-request httpx client timeout
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"  # sent on every request
# =============================================================================
# Crawler State
# =============================================================================
@dataclass
class CrawlerState:
    """Global crawler state.

    A single module-level instance (see ``get_crawler_state``) holds
    progress counters and the currently-active crawl target; the
    ``ZeugnisCrawler`` worker mutates its attributes in place.
    """
    is_running: bool = False  # not toggled in this module — presumably set by the API layer; verify against callers
    current_source_id: Optional[str] = None  # source currently being crawled, None when idle
    current_bundesland: Optional[str] = None  # federal state of the current source, None when idle
    queue: List[Dict] = field(default_factory=list)  # pending crawl jobs; not consumed in this module
    documents_crawled_today: int = 0  # documents fetched and stored successfully
    documents_indexed_today: int = 0  # subset of crawled docs also embedded into Qdrant
    errors_today: int = 0  # crawl_seed_url failures (exceptions)
    last_activity: Optional[datetime] = None  # timestamp of the last successful crawl step
# Module-level singleton; shared by the worker and by get_crawler_state().
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Get the global crawler state.

    Returns the module-level ``CrawlerState`` singleton that the
    ``ZeugnisCrawler`` worker mutates while crawling.
    """
    return _crawler_state
# =============================================================================
# Crawler Worker
# =============================================================================
class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Downloads official documents, archives the raw files in MinIO,
    records metadata in Postgres, and — only for sources whose
    ``training_allowed`` flag is set — chunks, embeds and indexes the
    text in Qdrant. Progress is mirrored into the module-level
    ``_crawler_state``.
    """

    def __init__(self):
        # Resources are created lazily in init() so constructing the
        # object performs no I/O.
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources (HTTP client and DB pool)."""
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )
        # Database access is best-effort: without a pool the crawler can
        # still fetch/upload, but nothing is recorded or indexed.
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch a URL with retry logic.

        Returns ``(content, content_type)`` on success. Returns
        ``(None, None)`` immediately on a 404 (it will not resolve by
        retrying) or once all ``MAX_RETRIES`` attempts are exhausted.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                if e.response.status_code == 404:
                    return None, None
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
            # Linear backoff before the next attempt; applied uniformly so
            # transient HTTP errors (e.g. 5xx) are not retried immediately.
            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Fetches the document, archives it in MinIO, records it in the
        ``zeugnis_documents`` table, and — only when *training_allowed* —
        indexes the extracted text in Qdrant.

        Returns a result dict with keys: ``seed_url_id``, ``url``,
        ``success``, ``document_id``, ``indexed``, ``error``.
        """
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }
        try:
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Treat the payload as PDF if either the server header or the
            # URL extension says so. content_type may be None per
            # fetch_url's signature, so guard before lowercasing.
            is_pdf = "pdf" in (content_type or "").lower() or url.lower().endswith(".pdf")
            if is_pdf:
                text = extract_text_from_pdf(content)
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"
            if not text:
                result["error"] = "No text extracted"
                return result

            # Content hash enables change detection across re-crawls.
            content_hash = compute_hash(content)

            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            doc_id = generate_id()
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                        training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )
            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Rights gate: only sources that explicitly permit AI training
            # are embedded and made searchable.
            if training_allowed:
                if await self._index_document(
                    doc_id, text, bundesland, doc_type, filename, url
                ):
                    result["indexed"] = True
                    _crawler_state.documents_indexed_today += 1
            else:
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"
            _crawler_state.last_activity = datetime.now()
        except Exception as e:
            result["error"] = str(e)
            _crawler_state.errors_today += 1
        return result

    async def _index_document(
        self,
        doc_id: str,
        text: str,
        bundesland: str,
        doc_type: str,
        title: str,
        url: str,
    ) -> bool:
        """Chunk, embed and index *text* in Qdrant; return True when indexed."""
        chunks = chunk_text(text)
        if not chunks:
            return False
        embeddings = await generate_embeddings(chunks)
        if not embeddings:
            return False
        indexed_count = await index_in_qdrant(
            doc_id,
            chunks,
            embeddings,
            {
                "bundesland": bundesland,
                "doc_type": doc_type,
                "title": title,
                "url": url,
                "training_allowed": True,
            }
        )
        if indexed_count <= 0:
            return False
        # Mirror the indexing state back into the metadata row.
        if self.db_pool:
            async with self.db_pool.acquire() as conn:
                await conn.execute(
                    "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                    doc_id
                )
        return True

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all pending seed URLs for a source.

        Looks up the source row (for bundesland and the training flag),
        then crawls every seed URL not yet marked ``completed``, updating
        each URL's status row as it goes.

        Returns a summary dict with keys: ``source_id``,
        ``documents_found``, ``documents_indexed``, ``errors``,
        ``started_at``, ``completed_at``.
        """
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result
        try:
            async with self.db_pool.acquire() as conn:
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result
                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Skip URLs already crawled successfully.
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )
                for seed_url in seed_urls:
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )
                    # Politeness delay between requests to the same host.
                    await asyncio.sleep(1)
        except Exception as e:
            result["errors"].append(str(e))
        finally:
            # Always clear the "currently crawling" markers, even on error.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None
        return result