Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
676
klausur-service/backend/zeugnis_crawler.py
Normal file
676
klausur-service/backend/zeugnis_crawler.py
Normal file
@@ -0,0 +1,676 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler
|
||||
|
||||
Crawls official government documents about school certificates (Zeugnisse)
|
||||
from all 16 German federal states. Only indexes documents where AI training
|
||||
is legally permitted.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
# Local imports
|
||||
from zeugnis_models import (
|
||||
CrawlStatus, LicenseType, DocType, EventType,
|
||||
BUNDESLAENDER, TRAINING_PERMISSIONS,
|
||||
generate_id, get_training_allowed, get_bundesland_name,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
# Configuration
# =============================================================================

# External service endpoints and credentials, overridable via environment.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")  # vector store
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")  # object storage host:port
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")  # "local" or "openai"

# Crawling / indexing tunables.
ZEUGNIS_COLLECTION = "bp_zeugnis"  # Qdrant collection name
CHUNK_SIZE = 1000  # max characters per text chunk (before overlap is added)
CHUNK_OVERLAP = 200  # characters shared between consecutive chunks
MAX_RETRIES = 3  # HTTP fetch attempts per URL
RETRY_DELAY = 5  # seconds (base of the linear backoff in fetch_url)
REQUEST_TIMEOUT = 30  # seconds
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler State
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CrawlerState:
    """Global crawler state.

    A single module-level instance (``_crawler_state``) is shared by the
    crawler worker and the status/control functions in this module.
    """
    is_running: bool = False  # a crawl run is currently in progress
    current_source_id: Optional[str] = None  # source being crawled right now
    current_bundesland: Optional[str] = None  # federal state of that source
    queue: List[Dict] = field(default_factory=list)  # pending work items (not populated in this module)
    documents_crawled_today: int = 0  # documents fetched+stored this run
    documents_indexed_today: int = 0  # documents indexed in Qdrant this run
    errors_today: int = 0  # errors recorded this run
    last_activity: Optional[datetime] = None  # timestamp of the last crawl action


# Shared singleton used by the crawler worker and the control functions below.
_crawler_state = CrawlerState()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Text Extraction
|
||||
# =============================================================================
|
||||
|
||||
def extract_text_from_pdf(content: bytes) -> str:
    """Return the concatenated text of all pages in a PDF, or "" on failure.

    Page texts are joined with blank lines.  Any import or parsing error is
    logged and swallowed so callers always receive a string.
    """
    try:
        import io

        from PyPDF2 import PdfReader

        pages = PdfReader(io.BytesIO(content)).pages
        page_texts = [page.extract_text() for page in pages]
        return "\n\n".join(t for t in page_texts if t)
    except Exception as exc:
        print(f"PDF extraction failed: {exc}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
|
||||
"""Extract text from HTML bytes."""
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
html = content.decode(encoding, errors="replace")
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Remove script and style elements
|
||||
for element in soup(["script", "style", "nav", "header", "footer"]):
|
||||
element.decompose()
|
||||
|
||||
# Get text
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
|
||||
# Clean up whitespace
|
||||
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||
return "\n".join(lines)
|
||||
except Exception as e:
|
||||
print(f"HTML extraction failed: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    Splits preferentially at paragraph, line, sentence and word boundaries
    (in that order), falling back to a hard character split only when no
    separator keeps a piece within ``chunk_size``.  Afterwards, the last
    ``overlap`` characters of each chunk are prepended to the next chunk so
    context is shared across boundaries.

    Args:
        text: Input text; empty input yields an empty list.
        chunk_size: Maximum chunk length before the overlap is added.
        overlap: Characters shared between consecutive chunks.

    Returns:
        List of chunks.  Chunks after the first may exceed ``chunk_size``
        by up to ``overlap`` characters because of the prepended context.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size`` (the hard-split step would be non-positive,
            which previously either raised from ``range`` or silently
            returned an empty list, losing text).
    """
    if not text:
        return []
    # Guard added: with overlap >= chunk_size the force-split step below is
    # zero (ValueError from range) or negative (empty range -> data loss).
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        # Base case: the piece already fits.
        if len(text) <= chunk_size:
            return [text] if text.strip() else []

        if sep_index >= len(separators):
            # No separator left: force split at fixed character offsets.
            result = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result

        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""

        # Greedily repack parts up to chunk_size, recursing with the next
        # (finer) separator whenever a packed piece is still too large.
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part

        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])

        return result

    chunks = split_recursive(text)

    # Prepend the tail of the previous chunk to each chunk for context.
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Embedding Generation
|
||||
# =============================================================================
|
||||
|
||||
# Lazily initialised sentence-transformers model; None until first use.
_embedding_model = None


def get_embedding_model():
    """Return the shared local embedding model, loading it on first call.

    Only applies when EMBEDDING_BACKEND == "local".  Returns None when the
    backend is remote or sentence-transformers is not installed (a warning
    is printed in that case).
    """
    global _embedding_model
    # Already loaded, or not using the local backend: nothing to do.
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")
    return _embedding_model
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured backend.

    Returns one vector per input text.  Returns an empty list when the
    input is empty, the backend is unavailable or misconfigured, or the
    backend name is unknown.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model is None:
            return []
        vectors = model.encode(texts, show_progress_bar=False)
        return [vector.tolist() for vector in vectors]

    if EMBEDDING_BACKEND == "openai":
        import openai

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []

        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]

    # Unknown backend name.
    return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MinIO Storage
|
||||
# =============================================================================
|
||||
|
||||
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a document to MinIO under landes-daten/<bundesland>/zeugnis/<year>/.

    Args:
        content: Raw file bytes to store.
        bundesland: Federal-state code used in the object path.
        filename: Object name within the year folder.
        content_type: MIME type stored with the object.
        year: Year segment of the path; defaults to the current year.

    Returns:
        The object path on success, None on any failure (errors are logged,
        never raised).
    """
    try:
        from minio import Minio
        import io

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )

        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path.  BUG FIX: the filename parameter was previously ignored
        # and every object was stored under the literal name "(unknown)",
        # so each upload overwrote the previous one per state/year.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Qdrant Indexing
|
||||
# =============================================================================
|
||||
|
||||
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Upsert one point per (chunk, embedding) pair into the zeugnis collection.

    The collection is created lazily on first use (cosine distance, vector
    dimension taken from the first embedding, 384 as fallback).  Returns the
    number of points written; 0 on any error (errors are logged, not raised).
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        qdrant = QdrantClient(url=QDRANT_URL)

        # Create the collection lazily if it does not exist yet.
        existing = qdrant.get_collections().collections
        if all(col.name != ZEUGNIS_COLLECTION for col in existing):
            dim = len(embeddings[0]) if embeddings else 384
            qdrant.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # One point per chunk, each carrying a short preview plus provenance.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "document_id": doc_id,
                    "chunk_index": position,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                },
            )
            for position, (chunk, vector) in enumerate(zip(chunks, embeddings))
        ]

        if points:
            qdrant.upsert(collection_name=ZEUGNIS_COLLECTION, points=points)

        return len(points)
    except Exception as exc:
        print(f"Qdrant indexing failed: {exc}")
        return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Worker
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisCrawler:
    """Rights-aware crawler for zeugnis documents.

    Fetches official documents via httpx, extracts their text, archives the
    raw bytes in MinIO, records metadata in Postgres, and — only when the
    source permits AI training — chunks, embeds and indexes the text in
    Qdrant.
    """

    def __init__(self):
        # Created lazily in init(); None until then.
        self.http_client: Optional[httpx.AsyncClient] = None
        # Connection pool from metrics_db; None when the DB is unavailable.
        self.db_pool = None

    async def init(self):
        """Initialize crawler resources (HTTP client and database pool)."""
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": USER_AGENT},
        )

        # Initialize database connection; the crawler still fetches documents
        # without it, but nothing is recorded (see crawl_seed_url).
        try:
            from metrics_db import get_pool
            self.db_pool = await get_pool()
        except Exception as e:
            print(f"Failed to get database pool: {e}")

    async def close(self):
        """Close crawler resources."""
        if self.http_client:
            await self.http_client.aclose()

    async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Fetch a URL with retry logic.

        Returns:
            (body bytes, content-type header) on success; (None, None)
            immediately on HTTP 404 or after MAX_RETRIES failed attempts.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = await self.http_client.get(url)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                return response.content, content_type
            except httpx.HTTPStatusError as e:
                print(f"HTTP error {e.response.status_code} for {url}")
                # 404 is treated as permanent — do not retry.
                if e.response.status_code == 404:
                    return None, None
                # NOTE(review): non-404 HTTP errors retry without the backoff
                # delay applied to other exceptions below — confirm intended.
            except Exception as e:
                print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                # Linear backoff between attempts.
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
        return None, None

    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL: fetch, extract, archive, maybe index.

        Indexing into Qdrant happens only when *training_allowed* is True.

        Returns:
            Dict with keys seed_url_id/url/success/document_id/indexed/error.
            Errors are captured in the dict; nothing is raised.
        """
        global _crawler_state

        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }

        try:
            # Fetch content
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result

            # Determine file type from the header or the URL suffix.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")

            # Extract text
            if is_pdf:
                text = extract_text_from_pdf(content)
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"

            if not text:
                result["error"] = "No text extracted"
                return result

            # Compute hash for versioning
            content_hash = compute_hash(content)

            # Archive the raw bytes in MinIO (best effort; may return None).
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )

            # Generate document ID
            doc_id = generate_id()

            # Store document metadata in the database (skipped without a pool).
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )

            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1

            # Only index if training is allowed for this source.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1

                            # Mark the document as indexed in the database.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"

            _crawler_state.last_activity = datetime.now()

        except Exception as e:
            result["error"] = str(e)
            _crawler_state.errors_today += 1

        return result

    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all pending seed URLs for a source.

        Loads the source row, then crawls every seed URL whose status is not
        'completed', updating each row's status ('running' / 'completed' /
        'failed') as it goes.

        Returns:
            Summary dict with document counts, error strings, and start/end
            timestamps.  Errors are collected, never raised.
        """
        global _crawler_state

        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }

        if not self.db_pool:
            result["errors"].append("Database not available")
            return result

        try:
            async with self.db_pool.acquire() as conn:
                # Get source info
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result

                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]

                # Expose progress through the shared state for status queries.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland

                # Get seed URLs that have not been completed yet.
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )

                for seed_url in seed_urls:
                    # Update status to running
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )

                    # Crawl
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )

                    # Record the outcome on the seed-url row.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )

                    # Small delay between requests (politeness toward servers).
                    await asyncio.sleep(1)

        except Exception as e:
            result["errors"].append(str(e))

        finally:
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None

        return result
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Control Functions
|
||||
# =============================================================================
|
||||
|
||||
# Active crawler worker and its background task; both None while idle.
_crawler_instance: Optional[ZeugnisCrawler] = None
_crawler_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool:
    """Start the crawler as a background task.

    Crawls a single source (``source_id``), all sources of one federal state
    (``bundesland``), or every source when neither filter is given.

    Returns:
        False if a crawl is already running; True once the background task
        has been scheduled (it may still fail later — errors are printed).
    """
    global _crawler_state, _crawler_instance, _crawler_task

    # Refuse to start a second concurrent run.
    if _crawler_state.is_running:
        return False

    # Mark running and reset the per-run counters.
    _crawler_state.is_running = True
    _crawler_state.documents_crawled_today = 0
    _crawler_state.documents_indexed_today = 0
    _crawler_state.errors_today = 0

    _crawler_instance = ZeugnisCrawler()
    await _crawler_instance.init()

    async def run_crawler():
        # Background worker: resolve the source list, then crawl each source
        # sequentially until done or until is_running is cleared externally
        # (see stop_crawler).
        try:
            from metrics_db import get_pool
            pool = await get_pool()

            if pool:
                async with pool.acquire() as conn:
                    # Get sources to crawl
                    if source_id:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE id = $1",
                            source_id
                        )
                    elif bundesland:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1",
                            bundesland
                        )
                    else:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland"
                        )

                    for source in sources:
                        # Honour stop_crawler(): bail out between sources.
                        if not _crawler_state.is_running:
                            break
                        await _crawler_instance.crawl_source(source["id"])

        except Exception as e:
            print(f"Crawler error: {e}")

        finally:
            # Always mark the run finished and release HTTP resources.
            _crawler_state.is_running = False
            if _crawler_instance:
                await _crawler_instance.close()

    _crawler_task = asyncio.create_task(run_crawler())
    return True
|
||||
|
||||
|
||||
async def stop_crawler() -> bool:
    """Request crawler shutdown and cancel the background task.

    Returns False when no crawl is in progress; True once the running task
    has been cancelled and awaited.
    """
    global _crawler_state, _crawler_task

    # Nothing to do when idle.
    if not _crawler_state.is_running:
        return False

    # Signal cooperative shutdown first (run_crawler checks this flag),
    # then cancel the task outright.
    _crawler_state.is_running = False

    task = _crawler_task
    if task:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    return True
|
||||
|
||||
|
||||
def get_crawler_status() -> Dict[str, Any]:
    """Return a JSON-serialisable snapshot of the shared crawler state."""
    state = _crawler_state
    last = state.last_activity
    return {
        "is_running": state.is_running,
        "current_source": state.current_source_id,
        "current_bundesland": state.current_bundesland,
        "queue_length": len(state.queue),
        "documents_crawled_today": state.documents_crawled_today,
        "documents_indexed_today": state.documents_indexed_today,
        "errors_today": state.errors_today,
        "last_activity": last.isoformat() if last else None,
    }
|
||||
Reference in New Issue
Block a user