backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
314 lines
11 KiB
Python
314 lines
11 KiB
Python
"""
|
|
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
|
|
|
|
Crawls official government documents about school certificates from
|
|
all 16 German federal states. Only indexes documents where AI training
|
|
is legally permitted.
|
|
"""
|
|
|
|
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import httpx

from zeugnis_models import generate_id
from zeugnis_storage import (
    generate_embeddings,
    index_in_qdrant,
    upload_to_minio,
)
from zeugnis_text import (
    chunk_text,
    compute_hash,
    extract_text_from_html,
    extract_text_from_pdf,
)


# =============================================================================
# Configuration
# =============================================================================

MAX_RETRIES = 3  # fetch attempts per URL before giving up
RETRY_DELAY = 5  # seconds; multiplied by the attempt number (linear backoff)
REQUEST_TIMEOUT = 30  # seconds; applied to every HTTP request
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"


# =============================================================================
# Crawler State
# =============================================================================

@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable snapshot of crawler progress, shared module-wide via the
    ``_crawler_state`` singleton (see ``get_crawler_state``).
    """

    # True while a crawl is running. NOTE(review): not toggled anywhere in
    # this module -- presumably maintained by the API layer; confirm.
    is_running: bool = False
    # Source / Bundesland currently being crawled; None when idle.
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # Pending work items. NOTE(review): not consumed in this module.
    queue: List[Dict] = field(default_factory=list)
    # Daily counters, incremented by the worker. NOTE(review): never reset
    # here -- presumably reset by a scheduled job; confirm.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last successful crawl (naive local time).
    last_activity: Optional[datetime] = None


# Module-level singleton shared by the worker and any API/status handlers.
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Get the global crawler state (shared mutable singleton)."""
    return _crawler_state


# =============================================================================
# Crawler Worker
# =============================================================================

class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Workflow per seed URL: fetch the document, extract its text, upload the
    raw file to MinIO, persist metadata in Postgres, and -- only when the
    source legally permits AI training -- embed and index the text chunks
    in Qdrant.

    Call :meth:`init` before crawling and :meth:`close` when finished.
    """

    def __init__(self):
        # Resources are created in init() so construction needs no event loop.
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None  # asyncpg-style pool; None when DB is unavailable

    async def init(self):
        """Initialize crawler resources (HTTP client and database pool)."""
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # The database is optional: without a pool the crawler can still
        # fetch and index, it just skips metadata persistence.
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch ``url`` with retry logic.

        Returns:
            ``(content, content_type)`` on success; ``(None, None)`` on a
            404 (treated as permanent) or after all retries are exhausted.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                if e.response.status_code == 404:
                    # Permanent failure: retrying a missing document is pointless.
                    return None, None
                # BUGFIX: non-404 HTTP errors (e.g. 500/503) previously fell
                # through and retried immediately with no delay; apply the
                # same linear backoff as the generic error path below.
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: 5s, 10s, ...
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Args:
            seed_url_id: Database id of the seed-URL row.
            url: Document URL to fetch.
            bundesland: Federal state the document belongs to.
            doc_type: Document category; stored as Qdrant payload metadata.
            training_allowed: Whether the source permits AI training. When
                False the document is fetched and stored but never indexed.

        Returns:
            Dict with keys ``seed_url_id``, ``url``, ``success``,
            ``document_id``, ``indexed`` and ``error``. Never raises;
            failures are reported via ``error`` and counted in the global
            crawler state.
        """
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # --- Fetch ---
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # --- Extract text; PDF vs. HTML decided by header or extension ---
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")
            if is_pdf:
                text = extract_text_from_pdf(content)
                # Fall back to a synthetic name when the URL ends with "/".
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Hash of the raw bytes, used downstream for versioning/dedup.
            content_hash = compute_hash(content)

            # --- Store the raw file ---
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            doc_id = generate_id()

            # --- Persist metadata (skipped when no pool is available) ---
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # --- Index. Legal gate: only when training is permitted ---
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Mark the document as searchable.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            # Deliberate catch-all: one bad document must not stop a crawl run.
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all pending (non-completed) seed URLs for a source.

        Looks up the source, then crawls each of its seed URLs in sequence,
        updating the per-URL status rows ('running' -> 'completed'/'failed')
        as it goes. Sleeps 1 second between requests to stay polite.

        Returns:
            Summary dict: ``source_id``, ``documents_found``,
            ``documents_indexed``, ``errors`` (list of messages),
            ``started_at`` and ``completed_at``.
        """
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            # NOTE(review): a single connection is held for the entire run,
            # including crawl time and sleeps -- consider acquiring per URL
            # if pool pressure ever becomes an issue.
            async with self.db_pool.acquire() as conn:
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                # Publish progress for status endpoints.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Small delay between requests to be polite to servers.
                    await asyncio.sleep(1)

        except Exception as e:
            result["errors"].append(str(e))

        finally:
            # Always clear progress markers and stamp completion time,
            # even when the crawl aborted with an error.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result