[split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
313
klausur-service/backend/zeugnis_worker.py
Normal file
313
klausur-service/backend/zeugnis_worker.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""
|
||||
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
|
||||
|
||||
Crawls official government documents about school certificates from
|
||||
all 16 German federal states. Only indexes documents where AI training
|
||||
is legally permitted.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
from zeugnis_models import generate_id
|
||||
from zeugnis_text import (
|
||||
extract_text_from_pdf,
|
||||
extract_text_from_html,
|
||||
chunk_text,
|
||||
compute_hash,
|
||||
)
|
||||
from zeugnis_storage import (
|
||||
upload_to_minio,
|
||||
generate_embeddings,
|
||||
index_in_qdrant,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 5 # seconds
|
||||
REQUEST_TIMEOUT = 30 # seconds
|
||||
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler State
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable, process-wide status record. ZeugnisCrawler mutates the
    per-source fields and daily counters; get_crawler_state() exposes the
    singleton (e.g. for a status/monitoring endpoint).
    """
    # NOTE(review): never written anywhere in this file — presumably toggled
    # by the API layer that starts/stops crawls; confirm against callers.
    is_running: bool = False
    # Source currently being crawled; set/cleared by ZeugnisCrawler.crawl_source.
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # NOTE(review): not touched in this file — verify who enqueues/dequeues.
    queue: List[Dict] = field(default_factory=list)
    # Daily counters, incremented by ZeugnisCrawler.crawl_seed_url.
    # NOTE(review): nothing in this file resets them at midnight — confirm.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last completed crawl step (naive local datetime.now()).
    last_activity: Optional[datetime] = None
|
||||
|
||||
|
||||
_crawler_state = CrawlerState()
|
||||
|
||||
|
||||
def get_crawler_state() -> CrawlerState:
    """Return the module-level CrawlerState singleton shared by all crawls."""
    return _crawler_state
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Worker
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Fetches seed URLs, extracts text from PDF/HTML, archives the raw
    content in MinIO, records the document in the database, and — only
    when the source legally permits AI training — chunks, embeds and
    indexes the text in Qdrant. Progress is mirrored into the module-level
    CrawlerState singleton.
    """

    def __init__(self):
        # Shared HTTP client; created lazily in init(), closed in close().
        self.http_client: Optional[httpx.AsyncClient] = None
        # DB connection pool ($n placeholders below suggest asyncpg —
        # TODO confirm against metrics_db.get_pool). None until init() succeeds.
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources (HTTP client and DB pool).

        Must be awaited before fetch_url/crawl_* are used. A DB failure is
        logged and tolerated: crawl_seed_url degrades to fetch-and-store-only,
        crawl_source refuses to run.
        """
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # Initialize database connection
        try:
            # Imported here (not at module top) so the module stays importable
            # when metrics_db is unavailable.
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            # Best-effort by design: crawler still works without a DB pool.
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources (the HTTP client; the pool is not owned here)."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch URL with retry logic.

        Returns (body_bytes, content_type) on success, (None, None) after a
        404 or once all MAX_RETRIES attempts are exhausted. Requires init()
        to have been called (uses self.http_client).
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                # Raises httpx.HTTPStatusError for 4xx/5xx responses.
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                # 404 is permanent — give up immediately.
                if e.response.status_code == 404:
                    return None, None
                # NOTE(review): non-404 HTTP errors fall through and retry
                # *without* the back-off sleep below — confirm intended.
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear back-off: 5s, 10s, ... between attempts.
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Pipeline: fetch -> extract text (PDF or HTML) -> hash -> upload raw
        bytes to MinIO -> insert document row -> (only if training_allowed)
        chunk, embed and index in Qdrant.

        Returns a dict with keys: seed_url_id, url, success, document_id,
        indexed, error. Note that success=True with indexed=False and a
        non-None error occurs for sources where training is not allowed.
        Never raises: any exception is captured into result["error"].
        """
        # NOTE(review): `global` is unnecessary here (only attributes are
        # mutated, the name is never rebound) — harmless as written.
        global _crawler_state

        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # Fetch content
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Determine file type: trust the Content-Type header first,
            # fall back to the URL extension.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")

            # Extract text
            if is_pdf:
                text = extract_text_from_pdf(content)
                # Last path segment as title; synthesized name for e.g.
                # trailing-slash URLs where the segment is empty.
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Compute hash for versioning (dedup/change detection downstream)
            content_hash = compute_hash(content)

            # Upload the raw (unprocessed) bytes to MinIO for archival.
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            # Generate document ID
            doc_id = generate_id()

            # Store document in database (skipped silently if init() could
            # not obtain a pool — the MinIO copy still exists).
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Rights gate: only index content into the vector store when the
            # source's license permits AI training.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Mirror the indexed flag onto the document row.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                # Document is archived but deliberately NOT indexed; the
                # "error" field doubles as the human-readable reason here.
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            # Catch-all so one bad document never aborts a whole source run;
            # the caller (crawl_source) records the message per seed URL.
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all seed URLs for a source.

        Drives the per-seed-URL state machine in zeugnis_seed_urls:
        -> 'running' before the fetch, then 'completed' (with last_crawled)
        or 'failed' (with error_message). URLs already 'completed' are
        skipped. Requires the DB pool from init().

        Returns a summary dict: source_id, documents_found,
        documents_indexed, errors (list of strings), started_at,
        completed_at.
        """
        # NOTE(review): `global` is unnecessary (attribute mutation only).
        global _crawler_state

        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        # Unlike crawl_seed_url, this method hard-requires the database:
        # it reads sources/seed URLs and writes status transitions.
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            # NOTE(review): one pooled connection is held for the entire
            # source run, including per-URL crawl time and sleeps — fine for
            # a single-worker crawler, but worth confirming pool sizing.
            async with self.db_pool.acquire() as conn:
                # Get source info
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                # bundesland / training_allowed apply to every seed URL of
                # this source (rights are tracked per source, not per URL).
                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                # Publish "what is being crawled right now" for status views;
                # cleared again in the finally block below.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Get seed URLs (anything not yet successfully completed,
                # i.e. pending/failed/running are all retried).
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    # Update status to running
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    # Crawl
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    # Update status based on the per-URL outcome.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Politeness delay between requests to the same site.
                    await asyncio.sleep(1)

        except Exception as e:
            # A failure outside the per-URL handling (e.g. DB errors on the
            # status updates) aborts the remaining URLs for this source.
            result["errors"].append(str(e))

        finally:
            # Always stamp completion and clear the "currently crawling"
            # markers, even on abort, so status views don't show a stale run.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result
|
||||
Reference in New Issue
Block a user