[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
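The seeds split is visible in the diff below: edu_search_seeds.py (710 LOC, split into 4 modules per the list above) is reduced to a thin aggregator while the stats and crawl-status routes move into a new edu_search_status.py. A minimal sketch of what the aggregator side of such a split looks like; the sub-module names are taken from the imports in the new file, but whether edu_search_crud exposes its own router is an assumption:

    from fastapi import APIRouter

    from .edu_search_crud import router as crud_router      # seed CRUD routes (assumed)
    from .edu_search_status import router as status_router  # stats + crawl status (this diff)

    router = APIRouter(tags=["edu-search"])
    router.include_router(crud_router)
    router.include_router(status_router)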
backend-lehrer/llm_gateway/routes/edu_search_status.py (new file, 198 lines)
@@ -0,0 +1,198 @@
"""
EduSearch Seeds Stats & Crawl Status Routes.

Statistics, export for crawler, and crawl status feedback endpoints.
"""

import logging
from datetime import datetime

from fastapi import APIRouter, HTTPException

from .edu_search_models import (
    StatsResponse,
    CrawlStatusUpdate,
    CrawlStatusResponse,
    BulkCrawlStatusUpdate,
    BulkCrawlStatusResponse,
)
from .edu_search_crud import get_db_pool

logger = logging.getLogger(__name__)

router = APIRouter(tags=["edu-search"])


@router.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get crawl statistics."""
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Basic counts
        total = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds")
        enabled = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds WHERE enabled = TRUE")
        total_docs = await conn.fetchval("SELECT COALESCE(SUM(total_documents), 0) FROM edu_search_seeds")

        # By category
        cat_rows = await conn.fetch("""
            SELECT c.name, COUNT(s.id) as count
            FROM edu_search_categories c
            LEFT JOIN edu_search_seeds s ON c.id = s.category_id
            GROUP BY c.name
        """)
        by_category = {row["name"]: row["count"] for row in cat_rows}

        # By state
        state_rows = await conn.fetch("""
            SELECT COALESCE(state, 'federal') as state, COUNT(*) as count
            FROM edu_search_seeds
            GROUP BY state
        """)
        by_state = {row["state"]: row["count"] for row in state_rows}

        # Last crawl time
        last_crawl = await conn.fetchval(
            "SELECT MAX(last_crawled_at) FROM edu_search_seeds"
        )

        return StatsResponse(
            total_seeds=total,
            enabled_seeds=enabled,
            total_documents=total_docs,
            seeds_by_category=by_category,
            seeds_by_state=by_state,
            last_crawl_time=last_crawl,
        )
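
# Usage sketch; the host and mount prefix are assumptions, since this router
# defines no prefix of its own:
#
#   curl http://localhost:8000/stats
#
# The response is the StatsResponse JSON: total_seeds, enabled_seeds,
# total_documents, seeds_by_category, seeds_by_state, last_crawl_time.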


# Export for external use (edu-search-service)
@router.get("/seeds/export/for-crawler")
async def export_seeds_for_crawler():
    """Export enabled seeds in a format suitable for the crawler."""
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch("""
            SELECT
                s.url, s.trust_boost, s.source_type, s.scope, s.state,
                s.crawl_depth, c.name as category
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE s.enabled = TRUE
            ORDER BY s.trust_boost DESC
        """)

        return {
            "seeds": [
                {
                    "url": row["url"],
                    "trust": float(row["trust_boost"]),
                    "source": row["source_type"],
                    "scope": row["scope"],
                    "state": row["state"],
                    "depth": row["crawl_depth"],
                    "category": row["category"],
                }
                for row in rows
            ],
            "total": len(rows),
            "exported_at": datetime.utcnow().isoformat(),
        }
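
# Consumer sketch for the export above. The crawler side is not part of this
# file; the httpx call, the service host, and the enqueue() helper are
# illustrative assumptions, but the payload fields match the dict returned above:
#
#   import httpx
#   export = httpx.get("http://backend-lehrer:8000/seeds/export/for-crawler").json()
#   for seed in export["seeds"]:  # already sorted by trust_boost DESC
#       enqueue(seed["url"], depth=seed["depth"], trust=seed["trust"])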


# =============================================================================
# Crawl Status Feedback (from edu-search-service)
# =============================================================================

@router.post("/seeds/crawl-status", response_model=CrawlStatusResponse)
async def update_crawl_status(update: CrawlStatusUpdate):
    """Update crawl status for a seed URL (called by edu-search-service)."""
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Find the seed by URL
        seed = await conn.fetchrow(
            "SELECT id, total_documents FROM edu_search_seeds WHERE url = $1",
            update.seed_url
        )

        if not seed:
            raise HTTPException(
                status_code=404,
                detail=f"Seed not found: {update.seed_url}"
            )

        # Update the seed with crawl status
        new_total = (seed["total_documents"] or 0) + update.documents_crawled

        await conn.execute("""
            UPDATE edu_search_seeds
            SET
                last_crawled_at = NOW(),
                last_crawl_status = $2,
                last_crawl_docs = $3,
                total_documents = $4,
                updated_at = NOW()
            WHERE id = $1
        """, seed["id"], update.status, update.documents_crawled, new_total)

    logger.info(
        f"Crawl status updated: {update.seed_url} - "
        f"status={update.status}, docs={update.documents_crawled}, "
        f"duration={update.crawl_duration_seconds:.1f}s"
    )

    return CrawlStatusResponse(
        success=True,
        seed_url=update.seed_url,
        message=f"Status updated: {update.documents_crawled} documents crawled"
    )
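
# Example callback from the crawler. The field names follow CrawlStatusUpdate
# as it is used above (the full Pydantic model lives in edu_search_models and
# may define more fields); host and prefix are assumptions:
#
#   curl -X POST http://localhost:8000/seeds/crawl-status \
#     -H "Content-Type: application/json" \
#     -d '{"seed_url": "https://example.org/lehrplan", "status": "success",
#          "documents_crawled": 42, "crawl_duration_seconds": 3.5}'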


@router.post("/seeds/crawl-status/bulk", response_model=BulkCrawlStatusResponse)
async def bulk_update_crawl_status(request: BulkCrawlStatusUpdate):
    """Bulk update crawl status for multiple seeds."""
    pool = await get_db_pool()
    updated = 0
    failed = 0
    errors = []

    async with pool.acquire() as conn:
        for update in request.updates:
            try:
                seed = await conn.fetchrow(
                    "SELECT id, total_documents FROM edu_search_seeds WHERE url = $1",
                    update.seed_url
                )

                if not seed:
                    failed += 1
                    errors.append(f"Seed not found: {update.seed_url}")
                    continue

                new_total = (seed["total_documents"] or 0) + update.documents_crawled

                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET
                        last_crawled_at = NOW(),
                        last_crawl_status = $2,
                        last_crawl_docs = $3,
                        total_documents = $4,
                        updated_at = NOW()
                    WHERE id = $1
                """, seed["id"], update.status, update.documents_crawled, new_total)

                updated += 1

            except Exception as e:
                failed += 1
                errors.append(f"{update.seed_url}: {str(e)}")

    logger.info(f"Bulk crawl status update: {updated} updated, {failed} failed")

    return BulkCrawlStatusResponse(
        updated=updated,
        failed=failed,
        errors=errors
    )
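
A minimal end-to-end sketch of the bulk feedback call. The "updates" key mirrors request.updates above and each entry mirrors the single-seed endpoint's fields; the host, port, and absence of a mount prefix are assumptions:

    import httpx

    payload = {
        "updates": [
            {"seed_url": "https://example.org/seed-a", "status": "success",
             "documents_crawled": 120, "crawl_duration_seconds": 41.2},
            {"seed_url": "https://example.org/seed-b", "status": "error",
             "documents_crawled": 0, "crawl_duration_seconds": 2.1},
        ]
    }
    resp = httpx.post("http://localhost:8000/seeds/crawl-status/bulk", json=payload)
    print(resp.json())  # BulkCrawlStatusResponse: updated/failed counts plus error strings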