Restructure: Move 52 files into 7 domain packages
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
New packages: korrektur/, zeugnis/, admin/, compliance/, worksheet/, training/, metrics/. Added 52 backward-compatible shim modules, converted imports to relative form; RAG code untouched. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
6
klausur-service/backend/zeugnis/__init__.py
Normal file
6
klausur-service/backend/zeugnis/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
zeugnis package — certificate crawler, models, storage.
|
||||
|
||||
Backward-compatible re-exports: consumers can still use
|
||||
``from zeugnis_api import ...`` etc. via the shim files in backend/.
|
||||
"""
|
||||
19
klausur-service/backend/zeugnis/api.py
Normal file
19
klausur-service/backend/zeugnis/api.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
zeugnis_api_sources — sources, seed URLs, initialization
|
||||
zeugnis_api_docs — documents, crawler, statistics, audit
|
||||
|
||||
FastAPI router for managing zeugnis sources, documents, and crawler operations.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_sources import router as _sources_router # noqa: F401
|
||||
from .api_docs import router as _docs_router # noqa: F401
|
||||
|
||||
# Composite router (used by main.py)
|
||||
router = APIRouter()
|
||||
router.include_router(_sources_router)
|
||||
router.include_router(_docs_router)
|
||||
321
klausur-service/backend/zeugnis/api_docs.py
Normal file
321
klausur-service/backend/zeugnis/api_docs.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""
|
||||
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
|
||||
|
||||
Extracted from zeugnis_api.py for modularity.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, List
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||
|
||||
from .models import (
|
||||
CrawlRequest, EventType,
|
||||
BUNDESLAENDER,
|
||||
generate_id, get_training_allowed, get_license_for_bundesland,
|
||||
)
|
||||
from .crawler import (
|
||||
start_crawler, stop_crawler, get_crawler_status,
|
||||
)
|
||||
from metrics_db import (
|
||||
get_zeugnis_documents, get_zeugnis_stats,
|
||||
log_zeugnis_event, get_pool,
|
||||
)
|
||||
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Documents Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/documents", response_model=List[dict])
|
||||
async def list_documents(
|
||||
bundesland: Optional[str] = None,
|
||||
limit: int = Query(100, le=500),
|
||||
offset: int = 0,
|
||||
):
|
||||
"""Get all zeugnis documents with optional filtering."""
|
||||
documents = await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
|
||||
return documents
|
||||
|
||||
|
||||
@router.get("/documents/{document_id}", response_model=dict)
|
||||
async def get_document(document_id: str):
|
||||
"""Get details for a specific document."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
doc = await conn.fetchrow(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
WHERE d.id = $1
|
||||
""",
|
||||
document_id
|
||||
)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Log view event
|
||||
await log_zeugnis_event(document_id, EventType.VIEWED.value)
|
||||
|
||||
return dict(doc)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/documents/{document_id}/versions", response_model=List[dict])
|
||||
async def get_document_versions(document_id: str):
|
||||
"""Get version history for a document."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT * FROM zeugnis_document_versions
|
||||
WHERE document_id = $1
|
||||
ORDER BY version DESC
|
||||
""",
|
||||
document_id
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Control Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/crawler/status", response_model=dict)
|
||||
async def crawler_status():
|
||||
"""Get current crawler status."""
|
||||
return get_crawler_status()
|
||||
|
||||
|
||||
@router.post("/crawler/start", response_model=dict)
|
||||
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
|
||||
"""Start the crawler."""
|
||||
success = await start_crawler(
|
||||
bundesland=request.bundesland,
|
||||
source_id=request.source_id,
|
||||
)
|
||||
if not success:
|
||||
raise HTTPException(status_code=409, detail="Crawler already running")
|
||||
return {"success": True, "message": "Crawler started"}
|
||||
|
||||
|
||||
@router.post("/crawler/stop", response_model=dict)
|
||||
async def stop_crawl():
|
||||
"""Stop the crawler."""
|
||||
success = await stop_crawler()
|
||||
if not success:
|
||||
raise HTTPException(status_code=409, detail="Crawler not running")
|
||||
return {"success": True, "message": "Crawler stopped"}
|
||||
|
||||
|
||||
@router.get("/crawler/queue", response_model=List[dict])
|
||||
async def get_queue():
|
||||
"""Get the crawler queue."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT q.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_crawler_queue q
|
||||
JOIN zeugnis_sources s ON q.source_id = s.id
|
||||
ORDER BY q.priority DESC, q.created_at
|
||||
"""
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/crawler/queue", response_model=dict)
|
||||
async def add_to_queue(request: CrawlRequest):
|
||||
"""Add a source to the crawler queue."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
queue_id = generate_id()
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
# Get source ID if bundesland provided
|
||||
source_id = request.source_id
|
||||
if not source_id and request.bundesland:
|
||||
source = await conn.fetchrow(
|
||||
"SELECT id FROM zeugnis_sources WHERE bundesland = $1",
|
||||
request.bundesland
|
||||
)
|
||||
if source:
|
||||
source_id = source["id"]
|
||||
|
||||
if not source_id:
|
||||
raise HTTPException(status_code=400, detail="Source not found")
|
||||
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
|
||||
VALUES ($1, $2, $3, 'pending')
|
||||
""",
|
||||
queue_id, source_id, request.priority
|
||||
)
|
||||
return {"id": queue_id, "success": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Statistics Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/stats", response_model=dict)
|
||||
async def get_stats():
|
||||
"""Get zeugnis crawler statistics."""
|
||||
stats = await get_zeugnis_stats()
|
||||
return stats
|
||||
|
||||
|
||||
@router.get("/stats/bundesland", response_model=List[dict])
|
||||
async def get_bundesland_stats():
|
||||
"""Get statistics per Bundesland."""
|
||||
pool = await get_pool()
|
||||
|
||||
# Build stats from BUNDESLAENDER with DB data if available
|
||||
stats = []
|
||||
for code, info in BUNDESLAENDER.items():
|
||||
stat = {
|
||||
"bundesland": code,
|
||||
"name": info["name"],
|
||||
"training_allowed": get_training_allowed(code),
|
||||
"document_count": 0,
|
||||
"indexed_count": 0,
|
||||
"last_crawled": None,
|
||||
}
|
||||
|
||||
if pool:
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
row = await conn.fetchrow(
|
||||
"""
|
||||
SELECT
|
||||
COUNT(d.id) as doc_count,
|
||||
COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
|
||||
MAX(u.last_crawled) as last_crawled
|
||||
FROM zeugnis_sources s
|
||||
LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
|
||||
LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
|
||||
WHERE s.bundesland = $1
|
||||
GROUP BY s.id
|
||||
""",
|
||||
code
|
||||
)
|
||||
if row:
|
||||
stat["document_count"] = row["doc_count"] or 0
|
||||
stat["indexed_count"] = row["indexed_count"] or 0
|
||||
stat["last_crawled"] = row["last_crawled"].isoformat() if row["last_crawled"] else None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
stats.append(stat)
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Audit Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/audit/events", response_model=List[dict])
|
||||
async def get_audit_events(
|
||||
document_id: Optional[str] = None,
|
||||
event_type: Optional[str] = None,
|
||||
limit: int = Query(100, le=1000),
|
||||
days: int = Query(30, le=365),
|
||||
):
|
||||
"""Get audit events with optional filtering."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
async with pool.acquire() as conn:
|
||||
query = """
|
||||
SELECT * FROM zeugnis_usage_events
|
||||
WHERE created_at >= $1
|
||||
"""
|
||||
params = [since]
|
||||
|
||||
if document_id:
|
||||
query += " AND document_id = $2"
|
||||
params.append(document_id)
|
||||
if event_type:
|
||||
query += f" AND event_type = ${len(params) + 1}"
|
||||
params.append(event_type)
|
||||
|
||||
query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
|
||||
params.append(limit)
|
||||
|
||||
rows = await conn.fetch(query, *params)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/audit/export", response_model=dict)
|
||||
async def export_audit(
|
||||
days: int = Query(30, le=365),
|
||||
requested_by: str = Query(..., description="User requesting the export"),
|
||||
):
|
||||
"""Export audit data for GDPR compliance."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT * FROM zeugnis_usage_events
|
||||
WHERE created_at >= $1
|
||||
ORDER BY created_at DESC
|
||||
""",
|
||||
since
|
||||
)
|
||||
|
||||
doc_count = await conn.fetchval(
|
||||
"SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
|
||||
since
|
||||
)
|
||||
|
||||
return {
|
||||
"export_date": datetime.now().isoformat(),
|
||||
"requested_by": requested_by,
|
||||
"events": [dict(r) for r in rows],
|
||||
"document_count": doc_count or 0,
|
||||
"date_range_start": since.isoformat(),
|
||||
"date_range_end": datetime.now().isoformat(),
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
232
klausur-service/backend/zeugnis/api_sources.py
Normal file
232
klausur-service/backend/zeugnis/api_sources.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Zeugnis API Sources — source and seed URL management endpoints.
|
||||
|
||||
Extracted from zeugnis_api.py for modularity.
|
||||
"""
|
||||
|
||||
from typing import Optional, List
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .models import (
|
||||
ZeugnisSourceCreate, ZeugnisSourceVerify,
|
||||
SeedUrlCreate,
|
||||
LicenseType, DocType,
|
||||
BUNDESLAENDER,
|
||||
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
|
||||
)
|
||||
from metrics_db import (
|
||||
get_zeugnis_sources, upsert_zeugnis_source, get_pool,
|
||||
)
|
||||
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sources Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/sources", response_model=List[dict])
|
||||
async def list_sources():
|
||||
"""Get all zeugnis sources (Bundeslaender)."""
|
||||
sources = await get_zeugnis_sources()
|
||||
if not sources:
|
||||
# Return default sources if none exist
|
||||
return [
|
||||
{
|
||||
"id": None,
|
||||
"bundesland": code,
|
||||
"name": info["name"],
|
||||
"base_url": None,
|
||||
"license_type": str(get_license_for_bundesland(code).value),
|
||||
"training_allowed": get_training_allowed(code),
|
||||
"verified_by": None,
|
||||
"verified_at": None,
|
||||
"created_at": None,
|
||||
"updated_at": None,
|
||||
}
|
||||
for code, info in BUNDESLAENDER.items()
|
||||
]
|
||||
return sources
|
||||
|
||||
|
||||
@router.post("/sources", response_model=dict)
|
||||
async def create_source(source: ZeugnisSourceCreate):
|
||||
"""Create or update a zeugnis source."""
|
||||
source_id = generate_id()
|
||||
success = await upsert_zeugnis_source(
|
||||
id=source_id,
|
||||
bundesland=source.bundesland,
|
||||
name=source.name,
|
||||
license_type=source.license_type.value,
|
||||
training_allowed=source.training_allowed,
|
||||
base_url=source.base_url,
|
||||
)
|
||||
if not success:
|
||||
raise HTTPException(status_code=500, detail="Failed to create source")
|
||||
return {"id": source_id, "success": True}
|
||||
|
||||
|
||||
@router.put("/sources/{source_id}/verify", response_model=dict)
|
||||
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
|
||||
"""Verify a source's license status."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
UPDATE zeugnis_sources
|
||||
SET license_type = $2,
|
||||
training_allowed = $3,
|
||||
verified_by = $4,
|
||||
verified_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
""",
|
||||
source_id, verification.license_type.value,
|
||||
verification.training_allowed, verification.verified_by
|
||||
)
|
||||
return {"success": True, "source_id": source_id}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/sources/{bundesland}", response_model=dict)
|
||||
async def get_source_by_bundesland(bundesland: str):
|
||||
"""Get source details for a specific Bundesland."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
# Return default info
|
||||
if bundesland not in BUNDESLAENDER:
|
||||
raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
|
||||
return {
|
||||
"bundesland": bundesland,
|
||||
"name": get_bundesland_name(bundesland),
|
||||
"training_allowed": get_training_allowed(bundesland),
|
||||
"license_type": get_license_for_bundesland(bundesland).value,
|
||||
"document_count": 0,
|
||||
}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
source = await conn.fetchrow(
|
||||
"SELECT * FROM zeugnis_sources WHERE bundesland = $1",
|
||||
bundesland
|
||||
)
|
||||
if source:
|
||||
doc_count = await conn.fetchval(
|
||||
"""
|
||||
SELECT COUNT(*) FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
WHERE u.source_id = $1
|
||||
""",
|
||||
source["id"]
|
||||
)
|
||||
return {**dict(source), "document_count": doc_count or 0}
|
||||
|
||||
# Return default
|
||||
return {
|
||||
"bundesland": bundesland,
|
||||
"name": get_bundesland_name(bundesland),
|
||||
"training_allowed": get_training_allowed(bundesland),
|
||||
"license_type": get_license_for_bundesland(bundesland).value,
|
||||
"document_count": 0,
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Seed URLs Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/sources/{source_id}/urls", response_model=List[dict])
|
||||
async def list_seed_urls(source_id: str):
|
||||
"""Get all seed URLs for a source."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at",
|
||||
source_id
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/sources/{source_id}/urls", response_model=dict)
|
||||
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
|
||||
"""Add a new seed URL to a source."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
url_id = generate_id()
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
|
||||
VALUES ($1, $2, $3, $4, 'pending')
|
||||
""",
|
||||
url_id, source_id, seed_url.url, seed_url.doc_type.value
|
||||
)
|
||||
return {"id": url_id, "success": True}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/urls/{url_id}", response_model=dict)
|
||||
async def delete_seed_url(url_id: str):
|
||||
"""Delete a seed URL."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"DELETE FROM zeugnis_seed_urls WHERE id = $1",
|
||||
url_id
|
||||
)
|
||||
return {"success": True}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Initialization Endpoint
|
||||
# =============================================================================
|
||||
|
||||
@router.post("/init", response_model=dict)
|
||||
async def initialize_sources():
|
||||
"""Initialize default sources from BUNDESLAENDER."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
created = 0
|
||||
try:
|
||||
for code, info in BUNDESLAENDER.items():
|
||||
source_id = generate_id()
|
||||
success = await upsert_zeugnis_source(
|
||||
id=source_id,
|
||||
bundesland=code,
|
||||
name=info["name"],
|
||||
license_type=get_license_for_bundesland(code).value,
|
||||
training_allowed=get_training_allowed(code),
|
||||
)
|
||||
if success:
|
||||
created += 1
|
||||
|
||||
return {"success": True, "sources_created": created}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
105
klausur-service/backend/zeugnis/control.py
Normal file
105
klausur-service/backend/zeugnis/control.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Zeugnis Crawler - Start/stop/status control functions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from .worker import ZeugnisCrawler, get_crawler_state
|
||||
|
||||
|
||||
_crawler_instance: Optional[ZeugnisCrawler] = None
|
||||
_crawler_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool:
    """Start the crawler as a background asyncio task.

    Args:
        bundesland: restrict the crawl to one Bundesland code.
        source_id: restrict the crawl to a single source (takes precedence).

    Returns:
        False when a crawl is already running; True once the task is launched.
    """
    global _crawler_instance, _crawler_task

    state = get_crawler_state()

    if state.is_running:
        return False

    state.is_running = True
    state.documents_crawled_today = 0
    state.documents_indexed_today = 0
    state.errors_today = 0

    _crawler_instance = ZeugnisCrawler()
    try:
        await _crawler_instance.init()
    except Exception:
        # Bug fix: previously a failing init() left is_running == True forever,
        # permanently blocking all later start attempts. Reset and propagate.
        state.is_running = False
        raise

    async def run_crawler():
        try:
            from metrics_db import get_pool
            pool = await get_pool()

            if pool:
                async with pool.acquire() as conn:
                    # Resolve which sources to crawl: explicit id wins over
                    # bundesland filter, which wins over "all sources".
                    if source_id:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE id = $1",
                            source_id
                        )
                    elif bundesland:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1",
                            bundesland
                        )
                    else:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland"
                        )

                    for source in sources:
                        # Cooperative stop: stop_crawler() clears this flag.
                        if not state.is_running:
                            break
                        await _crawler_instance.crawl_source(source["id"])

        except Exception as e:
            print(f"Crawler error: {e}")

        finally:
            state.is_running = False
            if _crawler_instance:
                await _crawler_instance.close()

    _crawler_task = asyncio.create_task(run_crawler())
    return True
|
||||
|
||||
|
||||
async def stop_crawler() -> bool:
    """Stop the crawler.

    Flips the shared running flag first so the crawl loop exits at its next
    check, then cancels and awaits the background task.

    Returns:
        False when no crawl is in progress, True after the task is stopped.
    """
    global _crawler_task

    state = get_crawler_state()

    if not state.is_running:
        return False

    # Cooperative stop: the crawl loop checks this flag between sources.
    state.is_running = False

    if _crawler_task:
        _crawler_task.cancel()
        try:
            await _crawler_task
        except asyncio.CancelledError:
            pass
        # Fix: drop the reference so a completed task is not kept alive and
        # cannot be cancelled a second time on a later stop.
        _crawler_task = None

    return True
|
||||
|
||||
|
||||
def get_crawler_status() -> Dict[str, Any]:
    """Snapshot the shared crawler state as a JSON-serializable dict."""
    state = get_crawler_state()
    last = state.last_activity
    return {
        "is_running": state.is_running,
        "current_source": state.current_source_id,
        "current_bundesland": state.current_bundesland,
        "queue_length": len(state.queue),
        "documents_crawled_today": state.documents_crawled_today,
        "documents_indexed_today": state.documents_indexed_today,
        "errors_today": state.errors_today,
        "last_activity": last.isoformat() if last else None,
    }
|
||||
26
klausur-service/backend/zeugnis/crawler.py
Normal file
26
klausur-service/backend/zeugnis/crawler.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler
|
||||
|
||||
Barrel re-export: all public symbols for backward compatibility.
|
||||
"""
|
||||
|
||||
from .text import ( # noqa: F401
|
||||
extract_text_from_pdf,
|
||||
extract_text_from_html,
|
||||
chunk_text,
|
||||
compute_hash,
|
||||
)
|
||||
from .storage import ( # noqa: F401
|
||||
generate_embeddings,
|
||||
upload_to_minio,
|
||||
index_in_qdrant,
|
||||
)
|
||||
from .worker import ( # noqa: F401
|
||||
CrawlerState,
|
||||
ZeugnisCrawler,
|
||||
)
|
||||
from .control import ( # noqa: F401
|
||||
start_crawler,
|
||||
stop_crawler,
|
||||
get_crawler_status,
|
||||
)
|
||||
340
klausur-service/backend/zeugnis/models.py
Normal file
340
klausur-service/backend/zeugnis/models.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler - Data Models
|
||||
|
||||
Pydantic models for API requests/responses and internal data structures.
|
||||
Database schema is defined in metrics_db.py.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
import uuid
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
class LicenseType(str, Enum):
    """License classification for training permission.

    Values ending in "NO TRAINING" comments must never feed model training;
    see TRAINING_PERMISSIONS for the per-Bundesland resolution.
    """
    PUBLIC_DOMAIN = "public_domain"       # official works ("Amtliche Werke", §5 UrhG)
    CC_BY = "cc_by"                       # Creative Commons Attribution
    CC_BY_SA = "cc_by_sa"                 # CC Attribution-ShareAlike
    CC_BY_NC = "cc_by_nc"                 # CC NonCommercial - NO TRAINING
    CC_BY_NC_SA = "cc_by_nc_sa"           # CC NC-SA - NO TRAINING
    GOV_STATUTE_FREE_USE = "gov_statute"  # government statutes (public domain, "gemeinfrei")
    ALL_RIGHTS_RESERVED = "all_rights"    # standard copyright - NO TRAINING
    UNKNOWN_REQUIRES_REVIEW = "unknown"   # needs manual review before any use


class CrawlStatus(str, Enum):
    """Status of a crawl job or seed URL."""
    PENDING = "pending"       # queued, not started yet
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"


class DocType(str, Enum):
    """Type of zeugnis document."""
    VERORDNUNG = "verordnung"       # official regulation
    HANDREICHUNG = "handreichung"   # implementation guide
    FORMULAR = "formular"           # form template
    ERLASS = "erlass"               # decree
    SCHULORDNUNG = "schulordnung"   # school regulations
    SONSTIGES = "sonstiges"         # other


class EventType(str, Enum):
    """Audit event types recorded in zeugnis_usage_events."""
    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Bundesland Definitions
|
||||
# =============================================================================
|
||||
|
||||
# One row per German federal state: (code, display name, training allowed).
# Training permission is derived from a license analysis of each state's
# certificate regulations ("Amtliches Werk" => allowed; "Keine Lizenz" or
# "Eingeschränkt" => disallowed for safety).
# NOTE(review): derived from a Word-document analysis — re-verify before
# enabling training on any state's documents.
_STATES = (
    ("bw", "Baden-Württemberg", True),
    ("by", "Bayern", True),
    ("be", "Berlin", False),
    ("bb", "Brandenburg", False),
    ("hb", "Bremen", False),
    ("hh", "Hamburg", False),
    ("he", "Hessen", True),
    ("mv", "Mecklenburg-Vorpommern", False),
    ("ni", "Niedersachsen", True),
    ("nw", "Nordrhein-Westfalen", True),
    ("rp", "Rheinland-Pfalz", True),
    ("sl", "Saarland", False),
    ("sn", "Sachsen", True),
    ("st", "Sachsen-Anhalt", False),
    ("sh", "Schleswig-Holstein", True),
    ("th", "Thüringen", True),
)

# Code -> {"name": ..., "short": ...}; the short form is the uppercased code.
BUNDESLAENDER = {
    code: {"name": name, "short": code.upper()}
    for code, name, _allowed in _STATES
}

# Code -> whether AI training on that state's documents is permitted.
TRAINING_PERMISSIONS = {code: allowed for code, _name, allowed in _STATES}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Sources
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisSourceBase(BaseModel):
    """Base model for zeugnis source (one per Bundesland authority)."""
    bundesland: str = Field(..., description="Bundesland code (e.g., 'ni', 'by')")
    name: str = Field(..., description="Full name of the source")
    base_url: Optional[str] = Field(None, description="Base URL for the source")
    license_type: LicenseType = Field(..., description="License classification")
    training_allowed: bool = Field(False, description="Whether AI training is permitted")


class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Model for creating a new source (same shape as the base model)."""
    pass


class ZeugnisSource(ZeugnisSourceBase):
    """Full source model with all fields, as read back from the database."""
    id: str
    # Manual license-verification trail; both stay None until verified.
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction from ORM/record objects (pydantic v2 setting).
        from_attributes = True


class ZeugnisSourceVerify(BaseModel):
    """Model for verifying a source's license."""
    verified_by: str = Field(..., description="User ID who verified")
    license_type: LicenseType
    training_allowed: bool
    # NOTE(review): notes is accepted but the verify endpoint's UPDATE does
    # not appear to persist it — confirm a storage column exists.
    notes: Optional[str] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Seed URLs
|
||||
# =============================================================================
|
||||
|
||||
class SeedUrlBase(BaseModel):
    """Base model for a seed URL — an entry point the crawler starts from."""
    url: str = Field(..., description="URL to crawl")
    doc_type: DocType = Field(DocType.VERORDNUNG, description="Type of document")


class SeedUrlCreate(SeedUrlBase):
    """Model for creating a new seed URL."""
    # NOTE(review): the add_seed_url endpoint takes source_id from the URL
    # path; this body field looks redundant — confirm which one is canonical.
    source_id: str


class SeedUrl(SeedUrlBase):
    """Full seed URL model as stored in zeugnis_seed_urls."""
    id: str
    source_id: str
    status: CrawlStatus = CrawlStatus.PENDING
    # Set after the first successful crawl of this URL.
    last_crawled: Optional[datetime] = None
    # Last crawl failure message, if any.
    error_message: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow construction from ORM/record objects (pydantic v2 setting).
        from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Documents
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisDocumentBase(BaseModel):
|
||||
"""Base model for zeugnis document."""
|
||||
title: Optional[str] = None
|
||||
url: str
|
||||
content_type: Optional[str] = None
|
||||
file_size: Optional[int] = None
|
||||
|
||||
|
||||
class ZeugnisDocument(ZeugnisDocumentBase):
    """Full document model with storage, indexing, and licensing state."""
    id: str
    seed_url_id: str  # the SeedUrl this document was crawled from
    content_hash: Optional[str] = None  # SHA-256 of the raw content (change detection)
    minio_path: Optional[str] = None  # object name in MinIO, if uploaded
    training_allowed: bool = False  # default False: safe until a license permits training
    indexed_in_qdrant: bool = False
    bundesland: Optional[str] = None  # denormalized from the source for convenience
    source_name: Optional[str] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction directly from ORM/DB row objects.
        from_attributes = True
|
||||
|
||||
|
||||
class ZeugnisDocumentVersion(BaseModel):
    """Document version for history tracking.

    One row per observed content change of a ZeugnisDocument.
    """
    id: str
    document_id: str  # the versioned ZeugnisDocument
    version: int  # monotonically increasing version number — TODO confirm starting value
    content_hash: str  # hash of this version's content
    minio_path: Optional[str] = None  # stored copy of this version, if kept
    change_summary: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow construction directly from ORM/DB row objects.
        from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Crawler
|
||||
# =============================================================================
|
||||
|
||||
class CrawlerStatus(BaseModel):
    """Current status of the crawler (API view of the worker's state)."""
    is_running: bool = False
    current_source: Optional[str] = None  # source currently being crawled
    current_bundesland: Optional[str] = None
    queue_length: int = 0  # pending items in the crawl queue
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    last_activity: Optional[datetime] = None
    errors_today: int = 0
|
||||
|
||||
|
||||
class CrawlQueueItem(BaseModel):
    """Item in the crawl queue, with progress counters for the run."""
    id: str
    source_id: str
    bundesland: str
    source_name: str
    priority: int = 5  # 1=lowest, 10=highest (matches CrawlRequest.priority)
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0
    created_at: datetime
|
||||
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request to start a crawl.

    Both filters are optional; omitting them presumably means "crawl
    everything" — confirm against the API handler.
    """
    bundesland: Optional[str] = Field(None, description="Specific Bundesland to crawl")
    source_id: Optional[str] = Field(None, description="Specific source ID to crawl")
    priority: int = Field(5, ge=1, le=10, description="Priority (1=lowest, 10=highest)")
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
    """Result of a crawl operation for a single source."""
    source_id: str
    bundesland: str
    documents_found: int
    documents_indexed: int
    documents_skipped: int  # e.g. unchanged or not permitted for indexing
    errors: List[str]  # human-readable error messages collected during the run
    duration_seconds: float
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Statistics
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisStats(BaseModel):
    """Statistics for the zeugnis crawler (global aggregate view)."""
    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0  # documents indexed in Qdrant
    training_allowed_documents: int = 0
    active_crawls: int = 0
    # Per-Bundesland breakdown. The mutable [] default is safe here because
    # pydantic deep-copies field defaults per instance.
    per_bundesland: List[Dict[str, Any]] = []
|
||||
|
||||
|
||||
class BundeslandStats(BaseModel):
    """Statistics per Bundesland."""
    bundesland: str  # short code, e.g. "bw"
    name: str  # full name, e.g. "Baden-Württemberg"
    training_allowed: bool
    document_count: int
    indexed_count: int
    last_crawled: Optional[datetime] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Audit
|
||||
# =============================================================================
|
||||
|
||||
class UsageEvent(BaseModel):
    """Usage event for the audit trail (who did what with which document)."""
    id: str
    document_id: str
    event_type: EventType
    user_id: Optional[str] = None  # None presumably means a system-initiated event — confirm
    details: Optional[Dict[str, Any]] = None  # event-specific extra data
    created_at: datetime

    class Config:
        # Allow construction directly from ORM/DB row objects.
        from_attributes = True
|
||||
|
||||
|
||||
class AuditExport(BaseModel):
    """GDPR-compliant audit export: a dated bundle of usage events."""
    export_date: datetime  # when the export was produced
    requested_by: str  # user who requested the export
    events: List[UsageEvent]
    document_count: int  # number of distinct documents covered
    date_range_start: datetime
    date_range_end: datetime
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def generate_id() -> str:
    """Return a fresh random (version 4) UUID as its canonical string form."""
    return f"{uuid.uuid4()}"
|
||||
|
||||
|
||||
def get_training_allowed(bundesland: str) -> bool:
    """Return True when AI training is permitted for *bundesland*.

    The lookup is case-insensitive; unknown codes default to False
    (the safe choice for licensing).
    """
    key = bundesland.lower()
    return TRAINING_PERMISSIONS.get(key, False)
|
||||
|
||||
|
||||
def get_bundesland_name(code: str) -> str:
    """Return the full Bundesland name for *code*, falling back to the code.

    Case-insensitive; unknown codes are returned unchanged.
    """
    entry = BUNDESLAENDER.get(code.lower())
    if entry is None:
        return code
    return entry.get("name", code)
|
||||
|
||||
|
||||
def get_license_for_bundesland(bundesland: str) -> LicenseType:
    """Return the license type to assume for documents from *bundesland*.

    Delegates the permission lookup to :func:`get_training_allowed` so the
    two helpers cannot drift apart:

      - training allowed  -> GOV_STATUTE_FREE_USE (amtliches Werk, §5 UrhG)
      - otherwise         -> UNKNOWN_REQUIRES_REVIEW (manual review needed)
    """
    if get_training_allowed(bundesland):
        return LicenseType.GOV_STATUTE_FREE_USE
    return LicenseType.UNKNOWN_REQUIRES_REVIEW
|
||||
415
klausur-service/backend/zeugnis/seed_data.py
Normal file
415
klausur-service/backend/zeugnis/seed_data.py
Normal file
@@ -0,0 +1,415 @@
|
||||
"""
|
||||
Zeugnis Seed Data - Initial URLs from Word Document
|
||||
|
||||
Contains seed URLs for all 16 German federal states (Bundesländer)
|
||||
based on the "Bundesland URL Zeugnisse.docx" document.
|
||||
|
||||
Training permissions:
|
||||
- Ja: Amtliches Werk (§5 UrhG) - training allowed
|
||||
- Nein: Keine Lizenz angegeben - training NOT allowed
|
||||
- Eingeschränkt: Treated as NOT allowed for safety
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Any
|
||||
|
||||
# Seed data structure: bundesland -> list of seed URLs.
# Each entry carries the state's display name, its assumed license
# ("gov_statute" = amtliches Werk, §5 UrhG), whether AI training is
# allowed, the legal-portal base URL, and the concrete seed documents
# (url, doc_type, title).
SEED_DATA: Dict[str, Dict[str, Any]] = {
    "bw": {
        "name": "Baden-Württemberg",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.landesrecht-bw.de",
        "urls": [
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cru/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGBWpP5&doc.part=X&doc.price=0.0&doc.hl=1",
                "doc_type": "verordnung",
                "title": "Schulgesetz BW - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cs9/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-NotenBildVBW2016rahmen&doc.part=X&doc.price=0.0",
                "doc_type": "verordnung",
                "title": "Notenbildungsverordnung"
            }
        ]
    },
    "by": {
        "name": "Bayern",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-bayern.de",
        "urls": [
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BaySchO2016",
                "doc_type": "schulordnung",
                "title": "Bayerische Schulordnung"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayGSO",
                "doc_type": "schulordnung",
                "title": "Grundschulordnung Bayern"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayVSO",
                "doc_type": "schulordnung",
                "title": "Volksschulordnung Bayern"
            }
        ]
    },
    "be": {
        "name": "Berlin",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://gesetze.berlin.de",
        "urls": [
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SchulGBEpP58",
                "doc_type": "verordnung",
                "title": "Berliner Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SekIVBE2010rahmen",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I-Verordnung"
            }
        ]
    },
    "bb": {
        "name": "Brandenburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://bravors.brandenburg.de",
        "urls": [
            {
                "url": "https://bravors.brandenburg.de/verordnungen/vvzeugnis",
                "doc_type": "verordnung",
                "title": "Verwaltungsvorschriften Zeugnisse"
            },
            {
                "url": "https://bravors.brandenburg.de/verordnungen/gostv",
                "doc_type": "verordnung",
                "title": "GOST-Verordnung Brandenburg"
            }
        ]
    },
    "hb": {
        "name": "Bremen",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.transparenz.bremen.de",
        "urls": [
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/bremisches-schulgesetz-bremschg-vom-28-juni-2005-121009",
                "doc_type": "verordnung",
                "title": "Bremisches Schulgesetz"
            },
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/verordnung-ueber-die-sekundarstufe-i-der-oberschule-vom-20-juni-2017-130380",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I Verordnung Bremen"
            }
        ]
    },
    "hh": {
        "name": "Hamburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://www.landesrecht-hamburg.de",
        "urls": [
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-SchulGHA2009pP44",
                "doc_type": "verordnung",
                "title": "Hamburgisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-AusglLeistVHA2011rahmen",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung"
            }
        ]
    },
    "he": {
        "name": "Hessen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.rv.hessenrecht.hessen.de",
        "urls": [
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-SchulGHE2017pP73",
                "doc_type": "verordnung",
                "title": "Hessisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-VOBGM11HE2011rahmen",
                "doc_type": "verordnung",
                "title": "Verordnung zur Gestaltung des Schulverhältnisses"
            }
        ]
    },
    "mv": {
        "name": "Mecklenburg-Vorpommern",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht-mv.de",
        "urls": [
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-SchulGMV2010pP63",
                "doc_type": "verordnung",
                "title": "Schulgesetz MV - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-ZeugnVMVrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung MV"
            }
        ]
    },
    "ni": {
        "name": "Niedersachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.nds-voris.de",
        "urls": [
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGNDpP59",
                "doc_type": "verordnung",
                "title": "Niedersächsisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ErgZeugnErlNDrahmen",
                "doc_type": "erlass",
                "title": "Ergänzende Bestimmungen für Zeugnisse"
            },
            {
                "url": "https://www.mk.niedersachsen.de/startseite/schule/unsere_schulen/allgemein_bildende_schulen/zeugnisse_versetzungen/zeugnisse-und-versetzungen-6351.html",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NI"
            }
        ]
    },
    "nw": {
        "name": "Nordrhein-Westfalen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://recht.nrw.de",
        "urls": [
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000521",
                "doc_type": "verordnung",
                "title": "Schulgesetz NRW"
            },
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000525",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung Sek I"
            },
            {
                "url": "https://www.schulministerium.nrw/zeugnisse",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NRW"
            }
        ]
    },
    "rp": {
        "name": "Rheinland-Pfalz",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.rlp.de",
        "urls": [
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-SchulGRPpP61",
                "doc_type": "verordnung",
                "title": "Schulgesetz RP - Zeugnisse"
            },
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-ZeugnVRPrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung RP"
            }
        ]
    },
    "sl": {
        "name": "Saarland",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://recht.saarland.de",
        "urls": [
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-SchulOGSLrahmen",
                "doc_type": "schulordnung",
                "title": "Schulordnungsgesetz Saarland"
            },
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-ZeugnVSL2014rahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung Saarland"
            }
        ]
    },
    "sn": {
        "name": "Sachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.revosax.sachsen.de",
        "urls": [
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/4192-Schulgesetz-fuer-den-Freistaat-Sachsen",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen"
            },
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/13500-Schulordnung-Gymnasien-Abiturpruefung",
                "doc_type": "schulordnung",
                "title": "Schulordnung Gymnasien Sachsen"
            }
        ]
    },
    "st": {
        "name": "Sachsen-Anhalt",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht.sachsen-anhalt.de",
        "urls": [
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-SchulGSTpP27",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen-Anhalt"
            },
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-VersetzVST2017rahmen",
                "doc_type": "verordnung",
                "title": "Versetzungsverordnung ST"
            }
        ]
    },
    "sh": {
        "name": "Schleswig-Holstein",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-rechtsprechung.sh.juris.de",
        "urls": [
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGSHpP22",
                "doc_type": "verordnung",
                "title": "Schulgesetz SH - Zeugnisse"
            },
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ZeugnVSHrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung SH"
            }
        ]
    },
    "th": {
        "name": "Thüringen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.thueringen.de",
        "urls": [
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulGTHpP58",
                "doc_type": "verordnung",
                "title": "Thüringer Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulOTH2018rahmen",
                "doc_type": "schulordnung",
                "title": "Thüringer Schulordnung"
            }
        ]
    }
}
|
||||
|
||||
|
||||
async def populate_seed_data():
    """Populate the database with the built-in SEED_DATA.

    For every Bundesland: upsert its zeugnis source, then insert its seed
    URLs with status 'pending' (duplicates skipped via ON CONFLICT).

    Returns:
        True on success, False when the DB is unavailable or any error
        occurs. Errors are printed rather than raised — seeding is
        best-effort.
    """
    from metrics_db import get_pool, upsert_zeugnis_source
    # Relative import: generate_id lives in this package (zeugnis/models.py),
    # matching the relative imports used by zeugnis/worker.py.
    from .models import generate_id

    pool = await get_pool()
    if not pool:
        print("Database not available")
        return False

    try:
        async with pool.acquire() as conn:
            for bundesland, data in SEED_DATA.items():
                # Create or update source
                source_id = generate_id()
                await upsert_zeugnis_source(
                    id=source_id,
                    bundesland=bundesland,
                    name=data["name"],
                    license_type=data["license"],
                    training_allowed=data["training_allowed"],
                    base_url=data.get("base_url"),
                )

                # The upsert may have kept an existing row; re-read the
                # actual source ID so seed URLs attach to the right row.
                existing = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    bundesland
                )
                if existing:
                    source_id = existing["id"]

                # Add seed URLs
                for url_data in data.get("urls", []):
                    url_id = generate_id()
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                        VALUES ($1, $2, $3, $4, 'pending')
                        ON CONFLICT DO NOTHING
                        """,
                        url_id, source_id, url_data["url"], url_data["doc_type"]
                    )

                print(f"Populated {bundesland}: {len(data.get('urls', []))} URLs")

        print("Seed data population complete!")
        return True

    except Exception as e:
        print(f"Failed to populate seed data: {e}")
        return False
|
||||
|
||||
|
||||
def get_training_summary() -> Dict[str, Any]:
    """Summarize training permissions across SEED_DATA.

    Returns:
        Dict with "training_allowed" / "training_not_allowed" (sorted
        lists of "Name (code)" strings) and "total_allowed" /
        "total_not_allowed" (int counts) — mixed value types, hence
        the ``Any`` in the annotation.
    """
    allowed = []
    not_allowed = []

    for bundesland, data in SEED_DATA.items():
        name = data["name"]
        if data["training_allowed"]:
            allowed.append(f"{name} ({bundesland})")
        else:
            not_allowed.append(f"{name} ({bundesland})")

    return {
        "training_allowed": sorted(allowed),
        "training_not_allowed": sorted(not_allowed),
        "total_allowed": len(allowed),
        "total_not_allowed": len(not_allowed),
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI summary: print which Bundesländer allow AI training and how to
    # populate the database. populate_seed_data() itself is async and is
    # deliberately not run here — see the printed instructions.
    # (The previously present `import asyncio` was unused and removed.)
    print("=" * 60)
    print("Zeugnis Seed Data Summary")
    print("=" * 60)

    summary = get_training_summary()
    print(f"\nTraining ALLOWED ({summary['total_allowed']} Bundesländer):")
    for bl in summary["training_allowed"]:
        print(f"  ✓ {bl}")

    print(f"\nTraining NOT ALLOWED ({summary['total_not_allowed']} Bundesländer):")
    for bl in summary["training_not_allowed"]:
        print(f"  ✗ {bl}")

    print("\n" + "=" * 60)
    print("To populate database, run:")
    print("  python -c 'import asyncio; from zeugnis_seed_data import populate_seed_data; asyncio.run(populate_seed_data())'")
|
||||
180
klausur-service/backend/zeugnis/storage.py
Normal file
180
klausur-service/backend/zeugnis/storage.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
# All connection settings come from the environment, with local-dev
# defaults. The MINIO_* credentials default to obvious test values.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")  # vector DB endpoint
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# "local" -> sentence-transformers on this host; "openai" -> OpenAI API
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

ZEUGNIS_COLLECTION = "bp_zeugnis"  # Qdrant collection name for zeugnis chunks
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Embedding Generation
|
||||
# =============================================================================
|
||||
|
||||
# Module-level cache so the model is loaded at most once per process.
_embedding_model = None


def get_embedding_model():
    """Lazily load and cache the local sentence-transformers model.

    Returns the cached SentenceTransformer instance, or None when the
    configured backend is not "local" or sentence-transformers is not
    installed (a warning is printed in that case).
    """
    global _embedding_model
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")
    return _embedding_model
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate one embedding vector per input text.

    The backend is selected by EMBEDDING_BACKEND:
      - "local":  sentence-transformers (see get_embedding_model)
      - "openai": OpenAI ``text-embedding-3-small`` via the async client

    Returns:
        A list of float vectors, or [] for empty input or when the
        backend is unavailable / misconfigured (problems are printed,
        never raised, so callers degrade gracefully).
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model:
            embeddings = model.encode(texts, show_progress_bar=False)
            return [emb.tolist() for emb in embeddings]
        return []

    elif EMBEDDING_BACKEND == "openai":
        # Guard the optional dependency like the local backend does:
        # a missing package should degrade to "no embeddings", not crash.
        try:
            import openai
        except ImportError:
            print("Warning: openai package not installed")
            return []

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []

        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]

    return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MinIO Storage
|
||||
# =============================================================================
|
||||
|
||||
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a crawled document to MinIO.

    The object is stored under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>``; *year*
    defaults to the current year.

    Returns:
        The object name on success, or None on any failure (missing
        minio package, connection/credential errors, ...).
    """
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )

        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path. Include the supplied filename so distinct documents
        # for the same Bundesland/year do not overwrite each other.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Qdrant Indexing
|
||||
# =============================================================================
|
||||
|
||||
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks (with their embeddings) into Qdrant.

    Creates the zeugnis collection on first use (cosine distance; vector
    size taken from the first embedding, defaulting to 384). Each chunk
    becomes one point carrying document, Bundesland, and licensing payload.

    Returns:
        The number of points written, or 0 on any failure (errors are
        printed, never raised).
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Create the collection the first time it is found missing.
        existing_names = {c.name for c in client.get_collections().collections}
        if ZEUGNIS_COLLECTION not in existing_names:
            dim = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec,
                payload={
                    "document_id": doc_id,
                    "chunk_index": idx,
                    "chunk_text": text[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                },
            )
            for idx, (text, vec) in enumerate(zip(chunks, embeddings))
        ]

        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )

        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0
|
||||
110
klausur-service/backend/zeugnis/text.py
Normal file
110
klausur-service/backend/zeugnis/text.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from typing import List
|
||||
|
||||
CHUNK_SIZE = 1000  # target maximum characters per chunk
CHUNK_OVERLAP = 200  # characters of the previous chunk prepended to the next
|
||||
|
||||
|
||||
def extract_text_from_pdf(content: bytes) -> str:
    """Return the plain text of a PDF, pages joined by blank lines.

    Returns "" when PyPDF2 is unavailable or the bytes cannot be parsed
    (the error is printed).
    """
    try:
        import io
        from PyPDF2 import PdfReader

        pages = PdfReader(io.BytesIO(content)).pages
        # Keep only pages that yielded text.
        extracted = [t for t in (page.extract_text() for page in pages) if t]
        return "\n\n".join(extracted)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
|
||||
"""Extract text from HTML bytes."""
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
html = content.decode(encoding, errors="replace")
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Remove script and style elements
|
||||
for element in soup(["script", "style", "nav", "header", "footer"]):
|
||||
element.decompose()
|
||||
|
||||
# Get text
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
|
||||
# Clean up whitespace
|
||||
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||
return "\n".join(lines)
|
||||
except Exception as e:
|
||||
print(f"HTML extraction failed: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    Splits hierarchically on paragraph breaks, line breaks, sentence ends,
    then spaces; pieces still longer than *chunk_size* after all separators
    are hard-split. Afterwards, the last *overlap* characters of each chunk
    are prepended to the following chunk.

    NOTE(review): after the overlap pass, chunks may exceed *chunk_size*
    by up to *overlap* characters — confirm downstream consumers tolerate
    this. Also note the separator character(s) are dropped at split points
    when a new chunk starts.
    """
    if not text:
        return []

    chunks = []
    # Tried in order, from coarsest (paragraph) to finest (word) boundary.
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        # Small enough already — keep as one chunk (drop pure whitespace).
        if len(text) <= chunk_size:
            return [text] if text.strip() else []

        if sep_index >= len(separators):
            # Force split at chunk_size
            result = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result

        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""

        # Greedily pack parts into `current` until it would exceed chunk_size,
        # then flush (recursing with a finer separator if still too long).
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part

        # Flush the trailing accumulator.
        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])

        return result

    chunks = split_recursive(text)

    # Add overlap
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                # Add end of previous chunk
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
|
||||
313
klausur-service/backend/zeugnis/worker.py
Normal file
313
klausur-service/backend/zeugnis/worker.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""
|
||||
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
|
||||
|
||||
Crawls official government documents about school certificates from
|
||||
all 16 German federal states. Only indexes documents where AI training
|
||||
is legally permitted.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
from .models import generate_id
|
||||
from .text import (
|
||||
extract_text_from_pdf,
|
||||
extract_text_from_html,
|
||||
chunk_text,
|
||||
compute_hash,
|
||||
)
|
||||
from .storage import (
|
||||
upload_to_minio,
|
||||
generate_embeddings,
|
||||
index_in_qdrant,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
MAX_RETRIES = 3  # fetch attempts per URL before giving up
RETRY_DELAY = 5  # seconds (base delay; scaled by attempt number in fetch_url)
REQUEST_TIMEOUT = 30  # seconds per HTTP request
# Identify the crawler honestly to the government sites being crawled.
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler State
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class CrawlerState:
    """Global crawler state.

    NOTE(review): the *_today counters are never reset in this module —
    confirm a daily reset happens elsewhere.
    """
    is_running: bool = False
    current_source_id: Optional[str] = None  # source currently being crawled
    current_bundesland: Optional[str] = None
    queue: List[Dict] = field(default_factory=list)  # pending crawl jobs
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    last_activity: Optional[datetime] = None  # timestamp of the last crawl action
|
||||
|
||||
|
||||
# Single module-level state instance shared between the worker and its callers.
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Get the global crawler state."""
    return _crawler_state
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Worker
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisCrawler:
|
||||
"""Rights-aware crawler for zeugnis documents."""
|
||||
|
||||
    def __init__(self):
        # Resources are created lazily in init() so constructing the object
        # has no network or database side effects.
        self.http_client: Optional[httpx.AsyncClient] = None
        self.db_pool = None  # DB pool from metrics_db.get_pool(); set in init()
||||
|
||||
async def init(self):
|
||||
"""Initialize crawler resources."""
|
||||
self.http_client = httpx.AsyncClient(
|
||||
timeout=REQUEST_TIMEOUT,
|
||||
follow_redirects=True,
|
||||
headers={"User-Agent": USER_AGENT},
|
||||
)
|
||||
|
||||
# Initialize database connection
|
||||
try:
|
||||
from metrics_db import get_pool
|
||||
self.db_pool = await get_pool()
|
||||
except Exception as e:
|
||||
print(f"Failed to get database pool: {e}")
|
||||
|
||||
async def close(self):
|
||||
"""Close crawler resources."""
|
||||
if self.http_client:
|
||||
await self.http_client.aclose()
|
||||
|
||||
async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
|
||||
"""Fetch URL with retry logic."""
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
response = await self.http_client.get(url)
|
||||
response.raise_for_status()
|
||||
content_type = response.headers.get("content-type", "")
|
||||
return response.content, content_type
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"HTTP error {e.response.status_code} for {url}")
|
||||
if e.response.status_code == 404:
|
||||
return None, None
|
||||
except Exception as e:
|
||||
print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
|
||||
if attempt < MAX_RETRIES - 1:
|
||||
await asyncio.sleep(RETRY_DELAY * (attempt + 1))
|
||||
return None, None
|
||||
|
||||
async def crawl_seed_url(
|
||||
self,
|
||||
seed_url_id: str,
|
||||
url: str,
|
||||
bundesland: str,
|
||||
doc_type: str,
|
||||
training_allowed: bool,
|
||||
) -> Dict[str, Any]:
|
||||
"""Crawl a single seed URL."""
|
||||
global _crawler_state
|
||||
|
||||
result = {
|
||||
"seed_url_id": seed_url_id,
|
||||
"url": url,
|
||||
"success": False,
|
||||
"document_id": None,
|
||||
"indexed": False,
|
||||
"error": None,
|
||||
}
|
||||
|
||||
try:
|
||||
# Fetch content
|
||||
content, content_type = await self.fetch_url(url)
|
||||
if not content:
|
||||
result["error"] = "Failed to fetch URL"
|
||||
return result
|
||||
|
||||
# Determine file type
|
||||
is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")
|
||||
|
||||
# Extract text
|
||||
if is_pdf:
|
||||
text = extract_text_from_pdf(content)
|
||||
filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
|
||||
else:
|
||||
text = extract_text_from_html(content)
|
||||
filename = f"document_{seed_url_id}.html"
|
||||
|
||||
if not text:
|
||||
result["error"] = "No text extracted"
|
||||
return result
|
||||
|
||||
# Compute hash for versioning
|
||||
content_hash = compute_hash(content)
|
||||
|
||||
# Upload to MinIO
|
||||
minio_path = await upload_to_minio(
|
||||
content,
|
||||
bundesland,
|
||||
filename,
|
||||
content_type=content_type or "application/octet-stream",
|
||||
)
|
||||
|
||||
# Generate document ID
|
||||
doc_id = generate_id()
|
||||
|
||||
# Store document in database
|
||||
if self.db_pool:
|
||||
async with self.db_pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_documents
|
||||
(id, seed_url_id, title, url, content_hash, minio_path,
|
||||
training_allowed, file_size, content_type)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
ON CONFLICT DO NOTHING
|
||||
""",
|
||||
doc_id, seed_url_id, filename, url, content_hash,
|
||||
minio_path, training_allowed, len(content), content_type
|
||||
)
|
||||
|
||||
result["document_id"] = doc_id
|
||||
result["success"] = True
|
||||
_crawler_state.documents_crawled_today += 1
|
||||
|
||||
# Only index if training is allowed
|
||||
if training_allowed:
|
||||
chunks = chunk_text(text)
|
||||
if chunks:
|
||||
embeddings = await generate_embeddings(chunks)
|
||||
if embeddings:
|
||||
indexed_count = await index_in_qdrant(
|
||||
doc_id,
|
||||
chunks,
|
||||
embeddings,
|
||||
{
|
||||
"bundesland": bundesland,
|
||||
"doc_type": doc_type,
|
||||
"title": filename,
|
||||
"url": url,
|
||||
"training_allowed": True,
|
||||
}
|
||||
)
|
||||
if indexed_count > 0:
|
||||
result["indexed"] = True
|
||||
_crawler_state.documents_indexed_today += 1
|
||||
|
||||
# Update database
|
||||
if self.db_pool:
|
||||
async with self.db_pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
|
||||
doc_id
|
||||
)
|
||||
else:
|
||||
result["indexed"] = False
|
||||
result["error"] = "Training not allowed for this source"
|
||||
|
||||
_crawler_state.last_activity = datetime.now()
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = str(e)
|
||||
_crawler_state.errors_today += 1
|
||||
|
||||
return result
|
||||
|
||||
async def crawl_source(self, source_id: str) -> Dict[str, Any]:
|
||||
"""Crawl all seed URLs for a source."""
|
||||
global _crawler_state
|
||||
|
||||
result = {
|
||||
"source_id": source_id,
|
||||
"documents_found": 0,
|
||||
"documents_indexed": 0,
|
||||
"errors": [],
|
||||
"started_at": datetime.now(),
|
||||
"completed_at": None,
|
||||
}
|
||||
|
||||
if not self.db_pool:
|
||||
result["errors"].append("Database not available")
|
||||
return result
|
||||
|
||||
try:
|
||||
async with self.db_pool.acquire() as conn:
|
||||
# Get source info
|
||||
source = await conn.fetchrow(
|
||||
"SELECT * FROM zeugnis_sources WHERE id = $1",
|
||||
source_id
|
||||
)
|
||||
if not source:
|
||||
result["errors"].append(f"Source not found: {source_id}")
|
||||
return result
|
||||
|
||||
bundesland = source["bundesland"]
|
||||
training_allowed = source["training_allowed"]
|
||||
|
||||
_crawler_state.current_source_id = source_id
|
||||
_crawler_state.current_bundesland = bundesland
|
||||
|
||||
# Get seed URLs
|
||||
seed_urls = await conn.fetch(
|
||||
"SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
|
||||
source_id
|
||||
)
|
||||
|
||||
for seed_url in seed_urls:
|
||||
# Update status to running
|
||||
await conn.execute(
|
||||
"UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
|
||||
seed_url["id"]
|
||||
)
|
||||
|
||||
# Crawl
|
||||
crawl_result = await self.crawl_seed_url(
|
||||
seed_url["id"],
|
||||
seed_url["url"],
|
||||
bundesland,
|
||||
seed_url["doc_type"],
|
||||
training_allowed,
|
||||
)
|
||||
|
||||
# Update status
|
||||
if crawl_result["success"]:
|
||||
result["documents_found"] += 1
|
||||
if crawl_result["indexed"]:
|
||||
result["documents_indexed"] += 1
|
||||
await conn.execute(
|
||||
"UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
|
||||
seed_url["id"]
|
||||
)
|
||||
else:
|
||||
result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
|
||||
await conn.execute(
|
||||
"UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
|
||||
seed_url["id"], crawl_result["error"]
|
||||
)
|
||||
|
||||
# Small delay between requests
|
||||
await asyncio.sleep(1)
|
||||
|
||||
except Exception as e:
|
||||
result["errors"].append(str(e))
|
||||
|
||||
finally:
|
||||
result["completed_at"] = datetime.now()
|
||||
_crawler_state.current_source_id = None
|
||||
_crawler_state.current_bundesland = None
|
||||
|
||||
return result
|
||||
Reference in New Issue
Block a user