backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
199 lines
6.5 KiB
Python
199 lines
6.5 KiB
Python
"""
|
|
EduSearch Seeds Stats & Crawl Status Routes.
|
|
|
|
Statistics, export for crawler, and crawl status feedback endpoints.
|
|
"""
|
|
|
|
import logging
from datetime import datetime, timezone
from typing import List

import asyncpg
from fastapi import APIRouter, HTTPException

from .edu_search_models import (
    StatsResponse,
    CrawlStatusUpdate,
    CrawlStatusResponse,
    BulkCrawlStatusUpdate,
    BulkCrawlStatusResponse,
)
from .edu_search_crud import get_db_pool
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(tags=["edu-search"])
|
|
|
|
|
|
@router.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Aggregate crawl statistics across all seeds.

    Returns overall seed/document totals, per-category and per-state
    seed counts, and the timestamp of the most recent crawl.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Overall counters.
        seed_count = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds")
        enabled_count = await conn.fetchval("SELECT COUNT(*) FROM edu_search_seeds WHERE enabled = TRUE")
        document_count = await conn.fetchval("SELECT COALESCE(SUM(total_documents), 0) FROM edu_search_seeds")

        # Seeds per category; LEFT JOIN keeps empty categories (count 0).
        category_counts = {
            record["name"]: record["count"]
            for record in await conn.fetch("""
                SELECT c.name, COUNT(s.id) as count
                FROM edu_search_categories c
                LEFT JOIN edu_search_seeds s ON c.id = s.category_id
                GROUP BY c.name
            """)
        }

        # Seeds per state; NULL state is reported as 'federal'.
        state_counts = {
            record["state"]: record["count"]
            for record in await conn.fetch("""
                SELECT COALESCE(state, 'federal') as state, COUNT(*) as count
                FROM edu_search_seeds
                GROUP BY state
            """)
        }

        # Most recent crawl timestamp across all seeds (None if never crawled).
        latest_crawl = await conn.fetchval(
            "SELECT MAX(last_crawled_at) FROM edu_search_seeds"
        )

        return StatsResponse(
            total_seeds=seed_count,
            enabled_seeds=enabled_count,
            total_documents=document_count,
            seeds_by_category=category_counts,
            seeds_by_state=state_counts,
            last_crawl_time=latest_crawl,
        )
|
|
|
|
|
|
# Export for external use (edu-search-service)
@router.get("/seeds/export/for-crawler")
async def export_seeds_for_crawler():
    """Export enabled seeds in a format suitable for the crawler.

    Returns a JSON payload with one entry per enabled seed (ordered by
    descending trust boost), the total count, and an export timestamp.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch("""
            SELECT
                s.url, s.trust_boost, s.source_type, s.scope, s.state,
                s.crawl_depth, c.name as category
            FROM edu_search_seeds s
            LEFT JOIN edu_search_categories c ON s.category_id = c.id
            WHERE s.enabled = TRUE
            ORDER BY s.trust_boost DESC
        """)

        return {
            "seeds": [
                {
                    "url": row["url"],
                    # asyncpg may return NUMERIC columns as Decimal; coerce to
                    # a JSON-native float for the crawler.
                    "trust": float(row["trust_boost"]),
                    "source": row["source_type"],
                    "scope": row["scope"],
                    "state": row["state"],
                    "depth": row["crawl_depth"],
                    "category": row["category"],
                }
                for row in rows
            ],
            "total": len(rows),
            # Fix: datetime.utcnow() is deprecated (Python 3.12+) and naive;
            # emit an explicit timezone-aware UTC timestamp instead.
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
|
|
|
|
|
|
# =============================================================================
|
|
# Crawl Status Feedback (from edu-search-service)
|
|
# =============================================================================
|
|
|
|
@router.post("/seeds/crawl-status", response_model=CrawlStatusResponse)
async def update_crawl_status(update: CrawlStatusUpdate):
    """Update crawl status for a seed URL (called by edu-search-service).

    Looks up the seed by URL, records the latest crawl outcome, and adds
    the newly crawled document count to the seed's running total.

    Raises:
        HTTPException: 404 if no seed with the given URL exists.
    """
    pool = await get_db_pool()
    async with pool.acquire() as conn:
        # Resolve the seed row for the reported URL.
        seed_row = await conn.fetchrow(
            "SELECT id, total_documents FROM edu_search_seeds WHERE url = $1",
            update.seed_url,
        )

        if seed_row is None:
            raise HTTPException(
                status_code=404,
                detail=f"Seed nicht gefunden: {update.seed_url}",
            )

        # Accumulate the document total (total_documents may be NULL in the DB).
        document_total = (seed_row["total_documents"] or 0) + update.documents_crawled

        await conn.execute("""
            UPDATE edu_search_seeds
            SET
                last_crawled_at = NOW(),
                last_crawl_status = $2,
                last_crawl_docs = $3,
                total_documents = $4,
                updated_at = NOW()
            WHERE id = $1
        """, seed_row["id"], update.status, update.documents_crawled, document_total)

        # NOTE(review): assumes crawl_duration_seconds is always a number
        # (the :.1f format would raise on None) — confirm against the model.
        logger.info(
            f"Crawl status updated: {update.seed_url} - "
            f"status={update.status}, docs={update.documents_crawled}, "
            f"duration={update.crawl_duration_seconds:.1f}s"
        )

        return CrawlStatusResponse(
            success=True,
            seed_url=update.seed_url,
            message=f"Status aktualisiert: {update.documents_crawled} Dokumente gecrawlt",
        )
|
|
|
|
|
|
@router.post("/seeds/crawl-status/bulk", response_model=BulkCrawlStatusResponse)
async def bulk_update_crawl_status(request: BulkCrawlStatusUpdate):
    """Bulk update crawl status for multiple seeds.

    Each update is processed independently: unknown URLs and per-row
    failures are collected into the response instead of aborting the
    batch.
    """
    pool = await get_db_pool()
    success_count = 0
    failure_count = 0
    error_messages = []

    async with pool.acquire() as conn:
        for item in request.updates:
            try:
                seed_row = await conn.fetchrow(
                    "SELECT id, total_documents FROM edu_search_seeds WHERE url = $1",
                    item.seed_url,
                )

                if seed_row is None:
                    failure_count += 1
                    error_messages.append(f"Seed nicht gefunden: {item.seed_url}")
                    continue

                # Running document total (total_documents may be NULL in the DB).
                running_total = (seed_row["total_documents"] or 0) + item.documents_crawled

                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET
                        last_crawled_at = NOW(),
                        last_crawl_status = $2,
                        last_crawl_docs = $3,
                        total_documents = $4,
                        updated_at = NOW()
                    WHERE id = $1
                """, seed_row["id"], item.status, item.documents_crawled, running_total)

                success_count += 1

            except Exception as exc:
                # Best-effort batch: record the failure and keep going.
                failure_count += 1
                error_messages.append(f"{item.seed_url}: {str(exc)}")

    logger.info(f"Bulk crawl status update: {success_count} updated, {failure_count} failed")

    return BulkCrawlStatusResponse(
        updated=success_count,
        failed=failure_count,
        errors=error_messages,
    )
|