Restructure: Move 52 files into 7 domain packages
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/ 52 shims, relative imports, RAG untouched. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
admin package — admin APIs for NiBiS, RAG, templates.
|
||||
|
||||
Backward-compatible re-exports: consumers can still use
|
||||
``from admin_api import ...`` etc. via the shim files in backend/.
|
||||
"""
|
||||
@@ -0,0 +1,33 @@
|
||||
"""
|
||||
Admin API for NiBiS Data Management (barrel re-export)
|
||||
|
||||
This module was split into:
|
||||
- admin_nibis.py (NiBiS ingestion, search, stats)
|
||||
- admin_rag.py (RAG upload, metrics, storage)
|
||||
- admin_templates.py (Legal templates ingestion, search)
|
||||
|
||||
The `router` object is assembled here by including all sub-routers.
|
||||
Importers that did `from admin_api import router` continue to work.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .nibis import router as _nibis_router
|
||||
from .rag import router as _rag_router
|
||||
from .templates import router as _templates_router
|
||||
|
||||
# Re-export internal state for test importers
|
||||
from .nibis import ( # noqa: F401
|
||||
_ingestion_status,
|
||||
NiBiSSearchRequest,
|
||||
search_nibis,
|
||||
)
|
||||
from .rag import _upload_history # noqa: F401
|
||||
from .templates import _templates_ingestion_status # noqa: F401
|
||||
|
||||
# Assemble the combined router.
|
||||
# All sub-routers use prefix="/api/v1/admin", so include without extra prefix.
|
||||
router = APIRouter()
|
||||
router.include_router(_nibis_router)
|
||||
router.include_router(_rag_router)
|
||||
router.include_router(_templates_router)
|
||||
@@ -0,0 +1,316 @@
|
||||
"""
|
||||
Admin API - NiBiS Ingestion & Search
|
||||
|
||||
Endpoints for NiBiS data discovery, ingestion, search, and statistics.
|
||||
Extracted from admin_api.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
|
||||
from nibis_ingestion import (
|
||||
run_ingestion,
|
||||
discover_documents,
|
||||
extract_zip_files,
|
||||
DOCS_BASE_PATH,
|
||||
)
|
||||
from qdrant_service import QdrantService, search_nibis_eh, get_qdrant_client
|
||||
from eh_pipeline import generate_single_embedding
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
|
||||
|
||||
# Store for background task status
|
||||
_ingestion_status: Dict = {
|
||||
"running": False,
|
||||
"last_run": None,
|
||||
"last_result": None,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Models
|
||||
# =============================================================================
|
||||
|
||||
class IngestionRequest(BaseModel):
|
||||
ewh_only: bool = True
|
||||
year_filter: Optional[int] = None
|
||||
subject_filter: Optional[str] = None
|
||||
|
||||
|
||||
class IngestionStatus(BaseModel):
|
||||
running: bool
|
||||
last_run: Optional[str]
|
||||
documents_indexed: Optional[int]
|
||||
chunks_created: Optional[int]
|
||||
errors: Optional[List[str]]
|
||||
|
||||
|
||||
class NiBiSSearchRequest(BaseModel):
|
||||
query: str
|
||||
year: Optional[int] = None
|
||||
subject: Optional[str] = None
|
||||
niveau: Optional[str] = None
|
||||
limit: int = 5
|
||||
|
||||
|
||||
class NiBiSSearchResult(BaseModel):
|
||||
id: str
|
||||
score: float
|
||||
text: str
|
||||
year: Optional[int]
|
||||
subject: Optional[str]
|
||||
niveau: Optional[str]
|
||||
task_number: Optional[int]
|
||||
|
||||
|
||||
class DataSourceStats(BaseModel):
|
||||
source_dir: str
|
||||
year: int
|
||||
document_count: int
|
||||
subjects: List[str]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/nibis/status", response_model=IngestionStatus)
|
||||
async def get_ingestion_status():
|
||||
"""Get status of NiBiS ingestion pipeline."""
|
||||
last_result = _ingestion_status.get("last_result") or {}
|
||||
return IngestionStatus(
|
||||
running=_ingestion_status["running"],
|
||||
last_run=_ingestion_status.get("last_run"),
|
||||
documents_indexed=last_result.get("documents_indexed"),
|
||||
chunks_created=last_result.get("chunks_created"),
|
||||
errors=(last_result.get("errors") or [])[:10],
|
||||
)
|
||||
|
||||
|
||||
@router.post("/nibis/extract-zips")
|
||||
async def extract_zip_files_endpoint():
|
||||
"""Extract all ZIP files in za-download directories."""
|
||||
try:
|
||||
extracted = extract_zip_files(DOCS_BASE_PATH)
|
||||
return {
|
||||
"status": "success",
|
||||
"extracted_count": len(extracted),
|
||||
"directories": [str(d) for d in extracted],
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/discover")
|
||||
async def discover_nibis_documents(
|
||||
ewh_only: bool = Query(True, description="Only return Erwartungshorizonte"),
|
||||
year: Optional[int] = Query(None, description="Filter by year"),
|
||||
subject: Optional[str] = Query(None, description="Filter by subject"),
|
||||
):
|
||||
"""
|
||||
Discover available NiBiS documents without indexing.
|
||||
Useful for previewing what will be indexed.
|
||||
"""
|
||||
try:
|
||||
documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)
|
||||
|
||||
# Apply filters
|
||||
if year:
|
||||
documents = [d for d in documents if d.year == year]
|
||||
if subject:
|
||||
documents = [d for d in documents if subject.lower() in d.subject.lower()]
|
||||
|
||||
# Group by year and subject
|
||||
by_year: Dict[int, int] = {}
|
||||
by_subject: Dict[str, int] = {}
|
||||
for doc in documents:
|
||||
by_year[doc.year] = by_year.get(doc.year, 0) + 1
|
||||
by_subject[doc.subject] = by_subject.get(doc.subject, 0) + 1
|
||||
|
||||
return {
|
||||
"total_documents": len(documents),
|
||||
"by_year": dict(sorted(by_year.items())),
|
||||
"by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
|
||||
"sample_documents": [
|
||||
{
|
||||
"id": d.id,
|
||||
"filename": d.raw_filename,
|
||||
"year": d.year,
|
||||
"subject": d.subject,
|
||||
"niveau": d.niveau,
|
||||
"doc_type": d.doc_type,
|
||||
}
|
||||
for d in documents[:20]
|
||||
],
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/nibis/ingest")
|
||||
async def start_ingestion(
|
||||
request: IngestionRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
):
|
||||
"""
|
||||
Start NiBiS data ingestion in background.
|
||||
"""
|
||||
if _ingestion_status["running"]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Ingestion already running. Check /nibis/status for progress."
|
||||
)
|
||||
|
||||
async def run_ingestion_task():
|
||||
global _ingestion_status
|
||||
_ingestion_status["running"] = True
|
||||
_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
result = await run_ingestion(
|
||||
ewh_only=request.ewh_only,
|
||||
dry_run=False,
|
||||
year_filter=request.year_filter,
|
||||
subject_filter=request.subject_filter,
|
||||
)
|
||||
_ingestion_status["last_result"] = result
|
||||
except Exception as e:
|
||||
_ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
|
||||
finally:
|
||||
_ingestion_status["running"] = False
|
||||
|
||||
background_tasks.add_task(run_ingestion_task)
|
||||
|
||||
return {
|
||||
"status": "started",
|
||||
"message": "Ingestion started in background. Check /nibis/status for progress.",
|
||||
"filters": {
|
||||
"ewh_only": request.ewh_only,
|
||||
"year": request.year_filter,
|
||||
"subject": request.subject_filter,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@router.post("/nibis/search", response_model=List[NiBiSSearchResult])
|
||||
async def search_nibis(request: NiBiSSearchRequest):
|
||||
"""
|
||||
Semantic search in NiBiS Erwartungshorizonte.
|
||||
"""
|
||||
try:
|
||||
query_embedding = await generate_single_embedding(request.query)
|
||||
|
||||
if not query_embedding:
|
||||
raise HTTPException(status_code=500, detail="Failed to generate embedding")
|
||||
|
||||
results = await search_nibis_eh(
|
||||
query_embedding=query_embedding,
|
||||
year=request.year,
|
||||
subject=request.subject,
|
||||
niveau=request.niveau,
|
||||
limit=request.limit,
|
||||
)
|
||||
|
||||
return [
|
||||
NiBiSSearchResult(
|
||||
id=r["id"],
|
||||
score=r["score"],
|
||||
text=r.get("text", "")[:500],
|
||||
year=r.get("year"),
|
||||
subject=r.get("subject"),
|
||||
niveau=r.get("niveau"),
|
||||
task_number=r.get("task_number"),
|
||||
)
|
||||
for r in results
|
||||
]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/collections")
|
||||
async def get_collections_info():
|
||||
"""Get information about all Qdrant collections."""
|
||||
try:
|
||||
client = get_qdrant_client()
|
||||
collections = client.get_collections().collections
|
||||
|
||||
result = []
|
||||
for c in collections:
|
||||
try:
|
||||
info = client.get_collection(c.name)
|
||||
result.append({
|
||||
"name": c.name,
|
||||
"vectors_count": info.vectors_count,
|
||||
"points_count": info.points_count,
|
||||
"status": info.status.value,
|
||||
})
|
||||
except Exception as e:
|
||||
result.append({
|
||||
"name": c.name,
|
||||
"error": str(e),
|
||||
})
|
||||
|
||||
return {"collections": result}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/stats")
|
||||
async def get_nibis_stats():
|
||||
"""Get detailed statistics about indexed NiBiS data."""
|
||||
try:
|
||||
qdrant = QdrantService()
|
||||
stats = await qdrant.get_stats("bp_nibis_eh")
|
||||
|
||||
if "error" in stats:
|
||||
return {
|
||||
"indexed": False,
|
||||
"message": "NiBiS collection not yet created. Run ingestion first.",
|
||||
}
|
||||
|
||||
client = get_qdrant_client()
|
||||
scroll_result = client.scroll(
|
||||
collection_name="bp_nibis_eh",
|
||||
limit=1000,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
)
|
||||
|
||||
years = set()
|
||||
subjects = set()
|
||||
niveaus = set()
|
||||
|
||||
for point in scroll_result[0]:
|
||||
if point.payload:
|
||||
if "year" in point.payload:
|
||||
years.add(point.payload["year"])
|
||||
if "subject" in point.payload:
|
||||
subjects.add(point.payload["subject"])
|
||||
if "niveau" in point.payload:
|
||||
niveaus.add(point.payload["niveau"])
|
||||
|
||||
return {
|
||||
"indexed": True,
|
||||
"total_chunks": stats.get("points_count", 0),
|
||||
"years": sorted(list(years)),
|
||||
"subjects": sorted(list(subjects)),
|
||||
"niveaus": sorted(list(niveaus)),
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"indexed": False,
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/nibis/collection")
|
||||
async def delete_nibis_collection():
|
||||
"""Delete the entire NiBiS collection. WARNING: removes all indexed data!"""
|
||||
try:
|
||||
client = get_qdrant_client()
|
||||
client.delete_collection("bp_nibis_eh")
|
||||
return {"status": "deleted", "collection": "bp_nibis_eh"}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Admin API - RAG Upload & Metrics
|
||||
|
||||
Endpoints for uploading documents, tracking uploads, RAG metrics,
|
||||
search feedback, storage stats, and service initialization.
|
||||
Extracted from admin_api.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query, UploadFile, File, Form
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
from nibis_ingestion import run_ingestion, DOCS_BASE_PATH
|
||||
|
||||
# Import ingestion status from nibis module for auto-ingest
|
||||
from .nibis import _ingestion_status
|
||||
|
||||
# Optional: MinIO and PostgreSQL integrations
|
||||
try:
|
||||
from minio_storage import upload_rag_document, get_storage_stats, init_minio_bucket
|
||||
MINIO_AVAILABLE = True
|
||||
except ImportError:
|
||||
MINIO_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from metrics_db import (
|
||||
init_metrics_tables, store_feedback, log_search, log_upload,
|
||||
calculate_metrics, get_recent_feedback, get_upload_history
|
||||
)
|
||||
METRICS_DB_AVAILABLE = True
|
||||
except ImportError:
|
||||
METRICS_DB_AVAILABLE = False
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
|
||||
|
||||
# Upload directory configuration
|
||||
RAG_UPLOAD_BASE = Path(os.getenv("RAG_UPLOAD_BASE", str(DOCS_BASE_PATH)))
|
||||
|
||||
# Store for upload tracking
|
||||
_upload_history: List[Dict] = []
|
||||
|
||||
|
||||
class UploadResult(BaseModel):
|
||||
status: str
|
||||
files_received: int
|
||||
pdfs_extracted: int
|
||||
target_directory: str
|
||||
errors: List[str]
|
||||
|
||||
|
||||
@router.post("/rag/upload", response_model=UploadResult)
|
||||
async def upload_rag_documents(
|
||||
background_tasks: BackgroundTasks,
|
||||
file: UploadFile = File(...),
|
||||
collection: str = Form(default="bp_nibis_eh"),
|
||||
year: Optional[int] = Form(default=None),
|
||||
auto_ingest: bool = Form(default=False),
|
||||
):
|
||||
"""
|
||||
Upload documents for RAG indexing.
|
||||
|
||||
Supports:
|
||||
- ZIP archives (automatically extracted)
|
||||
- Individual PDF files
|
||||
"""
|
||||
errors = []
|
||||
pdfs_extracted = 0
|
||||
|
||||
# Determine target year
|
||||
target_year = year or datetime.now().year
|
||||
|
||||
# Target directory: za-download/YYYY/
|
||||
target_dir = RAG_UPLOAD_BASE / "za-download" / str(target_year)
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
filename = file.filename or "upload"
|
||||
|
||||
if filename.lower().endswith(".zip"):
|
||||
# Handle ZIP file
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
|
||||
content = await file.read()
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(tmp_path, 'r') as zf:
|
||||
for member in zf.namelist():
|
||||
if member.lower().endswith(".pdf") and not member.startswith("__MACOSX"):
|
||||
pdf_name = Path(member).name
|
||||
if pdf_name:
|
||||
target_path = target_dir / pdf_name
|
||||
with zf.open(member) as src:
|
||||
with open(target_path, 'wb') as dst:
|
||||
dst.write(src.read())
|
||||
pdfs_extracted += 1
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
|
||||
elif filename.lower().endswith(".pdf"):
|
||||
target_path = target_dir / filename
|
||||
content = await file.read()
|
||||
with open(target_path, 'wb') as f:
|
||||
f.write(content)
|
||||
pdfs_extracted = 1
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported file type: {filename}. Only .zip and .pdf are allowed."
|
||||
)
|
||||
|
||||
# Track upload in memory
|
||||
upload_record = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"filename": filename,
|
||||
"collection": collection,
|
||||
"year": target_year,
|
||||
"pdfs_extracted": pdfs_extracted,
|
||||
"target_directory": str(target_dir),
|
||||
}
|
||||
_upload_history.append(upload_record)
|
||||
|
||||
# Keep only last 100 uploads in memory
|
||||
if len(_upload_history) > 100:
|
||||
_upload_history.pop(0)
|
||||
|
||||
# Store in PostgreSQL if available
|
||||
if METRICS_DB_AVAILABLE:
|
||||
await log_upload(
|
||||
filename=filename,
|
||||
collection_name=collection,
|
||||
year=target_year,
|
||||
pdfs_extracted=pdfs_extracted,
|
||||
minio_path=str(target_dir),
|
||||
)
|
||||
|
||||
# Auto-ingest if requested
|
||||
if auto_ingest and not _ingestion_status["running"]:
|
||||
async def run_auto_ingest():
|
||||
global _ingestion_status
|
||||
_ingestion_status["running"] = True
|
||||
_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
result = await run_ingestion(
|
||||
ewh_only=True,
|
||||
dry_run=False,
|
||||
year_filter=target_year,
|
||||
)
|
||||
_ingestion_status["last_result"] = result
|
||||
except Exception as e:
|
||||
_ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
|
||||
finally:
|
||||
_ingestion_status["running"] = False
|
||||
|
||||
background_tasks.add_task(run_auto_ingest)
|
||||
|
||||
return UploadResult(
|
||||
status="success",
|
||||
files_received=1,
|
||||
pdfs_extracted=pdfs_extracted,
|
||||
target_directory=str(target_dir),
|
||||
errors=errors,
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
errors.append(str(e))
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/rag/upload/history")
|
||||
async def get_upload_history_endpoint(limit: int = Query(default=20, le=100)):
|
||||
"""Get recent upload history."""
|
||||
return {
|
||||
"uploads": _upload_history[-limit:][::-1],
|
||||
"total": len(_upload_history),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/rag/metrics")
|
||||
async def get_rag_metrics(
|
||||
collection: Optional[str] = Query(default=None),
|
||||
days: int = Query(default=7, le=90),
|
||||
):
|
||||
"""Get RAG quality metrics."""
|
||||
if METRICS_DB_AVAILABLE:
|
||||
metrics = await calculate_metrics(collection_name=collection, days=days)
|
||||
if metrics.get("connected"):
|
||||
return metrics
|
||||
|
||||
# Fallback: Return placeholder metrics
|
||||
return {
|
||||
"precision_at_5": 0.78,
|
||||
"recall_at_10": 0.85,
|
||||
"mrr": 0.72,
|
||||
"avg_latency_ms": 52,
|
||||
"total_ratings": len(_upload_history),
|
||||
"error_rate": 0.3,
|
||||
"score_distribution": {
|
||||
"0.9+": 23,
|
||||
"0.7-0.9": 41,
|
||||
"0.5-0.7": 28,
|
||||
"<0.5": 8,
|
||||
},
|
||||
"note": "Placeholder metrics - PostgreSQL not connected",
|
||||
"connected": False,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rag/search/feedback")
|
||||
async def submit_search_feedback(
|
||||
result_id: str = Form(...),
|
||||
rating: int = Form(..., ge=1, le=5),
|
||||
notes: Optional[str] = Form(default=None),
|
||||
query: Optional[str] = Form(default=None),
|
||||
collection: Optional[str] = Form(default=None),
|
||||
score: Optional[float] = Form(default=None),
|
||||
):
|
||||
"""Submit feedback for a search result."""
|
||||
feedback_record = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"result_id": result_id,
|
||||
"rating": rating,
|
||||
"notes": notes,
|
||||
}
|
||||
|
||||
stored = False
|
||||
if METRICS_DB_AVAILABLE:
|
||||
stored = await store_feedback(
|
||||
result_id=result_id,
|
||||
rating=rating,
|
||||
query_text=query,
|
||||
collection_name=collection,
|
||||
score=score,
|
||||
notes=notes,
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "stored" if stored else "received",
|
||||
"feedback": feedback_record,
|
||||
"persisted": stored,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/rag/storage/stats")
|
||||
async def get_storage_statistics():
|
||||
"""Get MinIO storage statistics."""
|
||||
if MINIO_AVAILABLE:
|
||||
stats = await get_storage_stats()
|
||||
return stats
|
||||
return {
|
||||
"error": "MinIO not available",
|
||||
"connected": False,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rag/init")
|
||||
async def initialize_rag_services():
|
||||
"""Initialize RAG services (MinIO bucket, PostgreSQL tables)."""
|
||||
results = {
|
||||
"minio": False,
|
||||
"postgres": False,
|
||||
}
|
||||
|
||||
if MINIO_AVAILABLE:
|
||||
results["minio"] = await init_minio_bucket()
|
||||
|
||||
if METRICS_DB_AVAILABLE:
|
||||
results["postgres"] = await init_metrics_tables()
|
||||
|
||||
return {
|
||||
"status": "initialized",
|
||||
"services": results,
|
||||
}
|
||||
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Admin API - Legal Templates
|
||||
|
||||
Endpoints for legal template ingestion, search, source management,
|
||||
license info, and collection management.
|
||||
Extracted from admin_api.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
|
||||
from eh_pipeline import generate_single_embedding
|
||||
|
||||
# Import legal templates modules
|
||||
try:
|
||||
from legal_templates_ingestion import (
|
||||
LegalTemplatesIngestion,
|
||||
LEGAL_TEMPLATES_COLLECTION,
|
||||
)
|
||||
from template_sources import (
|
||||
TEMPLATE_SOURCES,
|
||||
TEMPLATE_TYPES,
|
||||
JURISDICTIONS,
|
||||
LicenseType,
|
||||
get_enabled_sources,
|
||||
get_sources_by_priority,
|
||||
)
|
||||
from qdrant_service import (
|
||||
search_legal_templates,
|
||||
get_legal_templates_stats,
|
||||
init_legal_templates_collection,
|
||||
)
|
||||
LEGAL_TEMPLATES_AVAILABLE = True
|
||||
except ImportError as e:
|
||||
print(f"Legal templates module not available: {e}")
|
||||
LEGAL_TEMPLATES_AVAILABLE = False
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
|
||||
|
||||
# Store for templates ingestion status
|
||||
_templates_ingestion_status: Dict = {
|
||||
"running": False,
|
||||
"last_run": None,
|
||||
"current_source": None,
|
||||
"results": {},
|
||||
}
|
||||
|
||||
|
||||
class TemplatesSearchRequest(BaseModel):
|
||||
query: str
|
||||
template_type: Optional[str] = None
|
||||
license_types: Optional[List[str]] = None
|
||||
language: Optional[str] = None
|
||||
jurisdiction: Optional[str] = None
|
||||
attribution_required: Optional[bool] = None
|
||||
limit: int = 10
|
||||
|
||||
|
||||
class TemplatesSearchResult(BaseModel):
|
||||
id: str
|
||||
score: float
|
||||
text: str
|
||||
document_title: Optional[str]
|
||||
template_type: Optional[str]
|
||||
clause_category: Optional[str]
|
||||
language: Optional[str]
|
||||
jurisdiction: Optional[str]
|
||||
license_id: Optional[str]
|
||||
license_name: Optional[str]
|
||||
attribution_required: Optional[bool]
|
||||
attribution_text: Optional[str]
|
||||
source_name: Optional[str]
|
||||
source_url: Optional[str]
|
||||
placeholders: Optional[List[str]]
|
||||
is_complete_document: Optional[bool]
|
||||
requires_customization: Optional[bool]
|
||||
|
||||
|
||||
class SourceIngestRequest(BaseModel):
|
||||
source_name: str
|
||||
|
||||
|
||||
@router.get("/templates/status")
|
||||
async def get_templates_status():
|
||||
"""Get status of legal templates collection and ingestion."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
return {
|
||||
"available": False,
|
||||
"error": "Legal templates module not available",
|
||||
}
|
||||
|
||||
try:
|
||||
stats = await get_legal_templates_stats()
|
||||
|
||||
return {
|
||||
"available": True,
|
||||
"collection": LEGAL_TEMPLATES_COLLECTION,
|
||||
"ingestion": {
|
||||
"running": _templates_ingestion_status["running"],
|
||||
"last_run": _templates_ingestion_status.get("last_run"),
|
||||
"current_source": _templates_ingestion_status.get("current_source"),
|
||||
"results": _templates_ingestion_status.get("results", {}),
|
||||
},
|
||||
"stats": stats,
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"available": True,
|
||||
"error": str(e),
|
||||
"ingestion": _templates_ingestion_status,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/templates/sources")
|
||||
async def get_templates_sources():
|
||||
"""Get list of all template sources with their configuration."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
sources = []
|
||||
for source in TEMPLATE_SOURCES:
|
||||
sources.append({
|
||||
"name": source.name,
|
||||
"description": source.description,
|
||||
"license_type": source.license_type.value,
|
||||
"license_name": source.license_info.name,
|
||||
"template_types": source.template_types,
|
||||
"languages": source.languages,
|
||||
"jurisdiction": source.jurisdiction,
|
||||
"repo_url": source.repo_url,
|
||||
"web_url": source.web_url,
|
||||
"priority": source.priority,
|
||||
"enabled": source.enabled,
|
||||
"attribution_required": source.license_info.attribution_required,
|
||||
})
|
||||
|
||||
return {
|
||||
"sources": sources,
|
||||
"total": len(sources),
|
||||
"enabled": len([s for s in TEMPLATE_SOURCES if s.enabled]),
|
||||
"template_types": TEMPLATE_TYPES,
|
||||
"jurisdictions": JURISDICTIONS,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/templates/licenses")
|
||||
async def get_templates_licenses():
|
||||
"""Get license statistics for indexed templates."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
try:
|
||||
stats = await get_legal_templates_stats()
|
||||
return {
|
||||
"licenses": stats.get("licenses", {}),
|
||||
"total_chunks": stats.get("points_count", 0),
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/templates/ingest")
|
||||
async def start_templates_ingestion(
|
||||
background_tasks: BackgroundTasks,
|
||||
max_priority: int = Query(default=3, ge=1, le=5, description="Maximum priority level (1=highest)"),
|
||||
):
|
||||
"""
|
||||
Start legal templates ingestion in background.
|
||||
Ingests all enabled sources up to the specified priority level.
|
||||
"""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
if _templates_ingestion_status["running"]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Templates ingestion already running. Check /templates/status for progress."
|
||||
)
|
||||
|
||||
async def run_templates_ingestion():
|
||||
global _templates_ingestion_status
|
||||
_templates_ingestion_status["running"] = True
|
||||
_templates_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
_templates_ingestion_status["results"] = {}
|
||||
|
||||
try:
|
||||
ingestion = LegalTemplatesIngestion()
|
||||
sources = get_sources_by_priority(max_priority)
|
||||
|
||||
for source in sources:
|
||||
_templates_ingestion_status["current_source"] = source.name
|
||||
|
||||
try:
|
||||
status = await ingestion.ingest_source(source)
|
||||
_templates_ingestion_status["results"][source.name] = {
|
||||
"status": status.status,
|
||||
"documents_found": status.documents_found,
|
||||
"chunks_indexed": status.chunks_indexed,
|
||||
"errors": status.errors[:5] if status.errors else [],
|
||||
}
|
||||
except Exception as e:
|
||||
_templates_ingestion_status["results"][source.name] = {
|
||||
"status": "failed",
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
await ingestion.close()
|
||||
|
||||
except Exception as e:
|
||||
_templates_ingestion_status["results"]["_global_error"] = str(e)
|
||||
finally:
|
||||
_templates_ingestion_status["running"] = False
|
||||
_templates_ingestion_status["current_source"] = None
|
||||
|
||||
background_tasks.add_task(run_templates_ingestion)
|
||||
|
||||
sources = get_sources_by_priority(max_priority)
|
||||
return {
|
||||
"status": "started",
|
||||
"message": f"Ingesting {len(sources)} sources up to priority {max_priority}",
|
||||
"sources": [s.name for s in sources],
|
||||
}
|
||||
|
||||
|
||||
@router.post("/templates/ingest-source")
|
||||
async def ingest_single_source(
|
||||
request: SourceIngestRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
):
|
||||
"""Ingest a single template source by name."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
source = next((s for s in TEMPLATE_SOURCES if s.name == request.source_name), None)
|
||||
if not source:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"Source not found: {request.source_name}. Use /templates/sources to list available sources."
|
||||
)
|
||||
|
||||
if not source.enabled:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Source is disabled: {request.source_name}"
|
||||
)
|
||||
|
||||
if _templates_ingestion_status["running"]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Templates ingestion already running."
|
||||
)
|
||||
|
||||
async def run_single_ingestion():
|
||||
global _templates_ingestion_status
|
||||
_templates_ingestion_status["running"] = True
|
||||
_templates_ingestion_status["current_source"] = source.name
|
||||
_templates_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
ingestion = LegalTemplatesIngestion()
|
||||
status = await ingestion.ingest_source(source)
|
||||
_templates_ingestion_status["results"][source.name] = {
|
||||
"status": status.status,
|
||||
"documents_found": status.documents_found,
|
||||
"chunks_indexed": status.chunks_indexed,
|
||||
"errors": status.errors[:5] if status.errors else [],
|
||||
}
|
||||
await ingestion.close()
|
||||
|
||||
except Exception as e:
|
||||
_templates_ingestion_status["results"][source.name] = {
|
||||
"status": "failed",
|
||||
"error": str(e),
|
||||
}
|
||||
finally:
|
||||
_templates_ingestion_status["running"] = False
|
||||
_templates_ingestion_status["current_source"] = None
|
||||
|
||||
background_tasks.add_task(run_single_ingestion)
|
||||
|
||||
return {
|
||||
"status": "started",
|
||||
"source": source.name,
|
||||
"license": source.license_type.value,
|
||||
"template_types": source.template_types,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/templates/search", response_model=List[TemplatesSearchResult])
|
||||
async def search_templates(request: TemplatesSearchRequest):
|
||||
"""Semantic search in legal templates collection."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
try:
|
||||
query_embedding = await generate_single_embedding(request.query)
|
||||
|
||||
if not query_embedding:
|
||||
raise HTTPException(status_code=500, detail="Failed to generate embedding")
|
||||
|
||||
results = await search_legal_templates(
|
||||
query_embedding=query_embedding,
|
||||
template_type=request.template_type,
|
||||
license_types=request.license_types,
|
||||
language=request.language,
|
||||
jurisdiction=request.jurisdiction,
|
||||
attribution_required=request.attribution_required,
|
||||
limit=request.limit,
|
||||
)
|
||||
|
||||
return [
|
||||
TemplatesSearchResult(
|
||||
id=r["id"],
|
||||
score=r["score"],
|
||||
text=r.get("text", "")[:1000],
|
||||
document_title=r.get("document_title"),
|
||||
template_type=r.get("template_type"),
|
||||
clause_category=r.get("clause_category"),
|
||||
language=r.get("language"),
|
||||
jurisdiction=r.get("jurisdiction"),
|
||||
license_id=r.get("license_id"),
|
||||
license_name=r.get("license_name"),
|
||||
attribution_required=r.get("attribution_required"),
|
||||
attribution_text=r.get("attribution_text"),
|
||||
source_name=r.get("source_name"),
|
||||
source_url=r.get("source_url"),
|
||||
placeholders=r.get("placeholders"),
|
||||
is_complete_document=r.get("is_complete_document"),
|
||||
requires_customization=r.get("requires_customization"),
|
||||
)
|
||||
for r in results
|
||||
]
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/templates/reset")
|
||||
async def reset_templates_collection():
|
||||
"""Delete and recreate the legal templates collection."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
if _templates_ingestion_status["running"]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Cannot reset while ingestion is running"
|
||||
)
|
||||
|
||||
try:
|
||||
ingestion = LegalTemplatesIngestion()
|
||||
ingestion.reset_collection()
|
||||
await ingestion.close()
|
||||
|
||||
_templates_ingestion_status["results"] = {}
|
||||
|
||||
return {
|
||||
"status": "reset",
|
||||
"collection": LEGAL_TEMPLATES_COLLECTION,
|
||||
"message": "Collection deleted and recreated. Run ingestion to populate.",
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/templates/source/{source_name}")
|
||||
async def delete_templates_source(source_name: str):
|
||||
"""Delete all templates from a specific source."""
|
||||
if not LEGAL_TEMPLATES_AVAILABLE:
|
||||
raise HTTPException(status_code=503, detail="Legal templates module not available")
|
||||
|
||||
try:
|
||||
from qdrant_service import delete_legal_templates_by_source
|
||||
|
||||
count = await delete_legal_templates_by_source(source_name)
|
||||
|
||||
if source_name in _templates_ingestion_status.get("results", {}):
|
||||
del _templates_ingestion_status["results"][source_name]
|
||||
|
||||
return {
|
||||
"status": "deleted",
|
||||
"source": source_name,
|
||||
"chunks_deleted": count,
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -1,33 +1,4 @@
|
||||
"""
|
||||
Admin API for NiBiS Data Management (barrel re-export)
|
||||
|
||||
This module was split into:
|
||||
- admin_nibis.py (NiBiS ingestion, search, stats)
|
||||
- admin_rag.py (RAG upload, metrics, storage)
|
||||
- admin_templates.py (Legal templates ingestion, search)
|
||||
|
||||
The `router` object is assembled here by including all sub-routers.
|
||||
Importers that did `from admin_api import router` continue to work.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from admin_nibis import router as _nibis_router
|
||||
from admin_rag import router as _rag_router
|
||||
from admin_templates import router as _templates_router
|
||||
|
||||
# Re-export internal state for test importers
|
||||
from admin_nibis import ( # noqa: F401
|
||||
_ingestion_status,
|
||||
NiBiSSearchRequest,
|
||||
search_nibis,
|
||||
)
|
||||
from admin_rag import _upload_history # noqa: F401
|
||||
from admin_templates import _templates_ingestion_status # noqa: F401
|
||||
|
||||
# Assemble the combined router.
|
||||
# All sub-routers use prefix="/api/v1/admin", so include without extra prefix.
|
||||
router = APIRouter()
|
||||
router.include_router(_nibis_router)
|
||||
router.include_router(_rag_router)
|
||||
router.include_router(_templates_router)
|
||||
# Backward-compat shim -- module moved to admin/api.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("admin.api")
|
||||
|
||||
@@ -1,316 +1,4 @@
|
||||
"""
|
||||
Admin API - NiBiS Ingestion & Search
|
||||
|
||||
Endpoints for NiBiS data discovery, ingestion, search, and statistics.
|
||||
Extracted from admin_api.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
|
||||
from nibis_ingestion import (
|
||||
run_ingestion,
|
||||
discover_documents,
|
||||
extract_zip_files,
|
||||
DOCS_BASE_PATH,
|
||||
)
|
||||
from qdrant_service import QdrantService, search_nibis_eh, get_qdrant_client
|
||||
from eh_pipeline import generate_single_embedding
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
|
||||
|
||||
# Store for background task status
|
||||
_ingestion_status: Dict = {
|
||||
"running": False,
|
||||
"last_run": None,
|
||||
"last_result": None,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Models
|
||||
# =============================================================================
|
||||
|
||||
class IngestionRequest(BaseModel):
|
||||
ewh_only: bool = True
|
||||
year_filter: Optional[int] = None
|
||||
subject_filter: Optional[str] = None
|
||||
|
||||
|
||||
class IngestionStatus(BaseModel):
|
||||
running: bool
|
||||
last_run: Optional[str]
|
||||
documents_indexed: Optional[int]
|
||||
chunks_created: Optional[int]
|
||||
errors: Optional[List[str]]
|
||||
|
||||
|
||||
class NiBiSSearchRequest(BaseModel):
|
||||
query: str
|
||||
year: Optional[int] = None
|
||||
subject: Optional[str] = None
|
||||
niveau: Optional[str] = None
|
||||
limit: int = 5
|
||||
|
||||
|
||||
class NiBiSSearchResult(BaseModel):
|
||||
id: str
|
||||
score: float
|
||||
text: str
|
||||
year: Optional[int]
|
||||
subject: Optional[str]
|
||||
niveau: Optional[str]
|
||||
task_number: Optional[int]
|
||||
|
||||
|
||||
class DataSourceStats(BaseModel):
|
||||
source_dir: str
|
||||
year: int
|
||||
document_count: int
|
||||
subjects: List[str]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/nibis/status", response_model=IngestionStatus)
|
||||
async def get_ingestion_status():
|
||||
"""Get status of NiBiS ingestion pipeline."""
|
||||
last_result = _ingestion_status.get("last_result") or {}
|
||||
return IngestionStatus(
|
||||
running=_ingestion_status["running"],
|
||||
last_run=_ingestion_status.get("last_run"),
|
||||
documents_indexed=last_result.get("documents_indexed"),
|
||||
chunks_created=last_result.get("chunks_created"),
|
||||
errors=(last_result.get("errors") or [])[:10],
|
||||
)
|
||||
|
||||
|
||||
@router.post("/nibis/extract-zips")
|
||||
async def extract_zip_files_endpoint():
|
||||
"""Extract all ZIP files in za-download directories."""
|
||||
try:
|
||||
extracted = extract_zip_files(DOCS_BASE_PATH)
|
||||
return {
|
||||
"status": "success",
|
||||
"extracted_count": len(extracted),
|
||||
"directories": [str(d) for d in extracted],
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/discover")
|
||||
async def discover_nibis_documents(
|
||||
ewh_only: bool = Query(True, description="Only return Erwartungshorizonte"),
|
||||
year: Optional[int] = Query(None, description="Filter by year"),
|
||||
subject: Optional[str] = Query(None, description="Filter by subject"),
|
||||
):
|
||||
"""
|
||||
Discover available NiBiS documents without indexing.
|
||||
Useful for previewing what will be indexed.
|
||||
"""
|
||||
try:
|
||||
documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)
|
||||
|
||||
# Apply filters
|
||||
if year:
|
||||
documents = [d for d in documents if d.year == year]
|
||||
if subject:
|
||||
documents = [d for d in documents if subject.lower() in d.subject.lower()]
|
||||
|
||||
# Group by year and subject
|
||||
by_year: Dict[int, int] = {}
|
||||
by_subject: Dict[str, int] = {}
|
||||
for doc in documents:
|
||||
by_year[doc.year] = by_year.get(doc.year, 0) + 1
|
||||
by_subject[doc.subject] = by_subject.get(doc.subject, 0) + 1
|
||||
|
||||
return {
|
||||
"total_documents": len(documents),
|
||||
"by_year": dict(sorted(by_year.items())),
|
||||
"by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
|
||||
"sample_documents": [
|
||||
{
|
||||
"id": d.id,
|
||||
"filename": d.raw_filename,
|
||||
"year": d.year,
|
||||
"subject": d.subject,
|
||||
"niveau": d.niveau,
|
||||
"doc_type": d.doc_type,
|
||||
}
|
||||
for d in documents[:20]
|
||||
],
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/nibis/ingest")
|
||||
async def start_ingestion(
|
||||
request: IngestionRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
):
|
||||
"""
|
||||
Start NiBiS data ingestion in background.
|
||||
"""
|
||||
if _ingestion_status["running"]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Ingestion already running. Check /nibis/status for progress."
|
||||
)
|
||||
|
||||
async def run_ingestion_task():
|
||||
global _ingestion_status
|
||||
_ingestion_status["running"] = True
|
||||
_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
result = await run_ingestion(
|
||||
ewh_only=request.ewh_only,
|
||||
dry_run=False,
|
||||
year_filter=request.year_filter,
|
||||
subject_filter=request.subject_filter,
|
||||
)
|
||||
_ingestion_status["last_result"] = result
|
||||
except Exception as e:
|
||||
_ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
|
||||
finally:
|
||||
_ingestion_status["running"] = False
|
||||
|
||||
background_tasks.add_task(run_ingestion_task)
|
||||
|
||||
return {
|
||||
"status": "started",
|
||||
"message": "Ingestion started in background. Check /nibis/status for progress.",
|
||||
"filters": {
|
||||
"ewh_only": request.ewh_only,
|
||||
"year": request.year_filter,
|
||||
"subject": request.subject_filter,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@router.post("/nibis/search", response_model=List[NiBiSSearchResult])
|
||||
async def search_nibis(request: NiBiSSearchRequest):
|
||||
"""
|
||||
Semantic search in NiBiS Erwartungshorizonte.
|
||||
"""
|
||||
try:
|
||||
query_embedding = await generate_single_embedding(request.query)
|
||||
|
||||
if not query_embedding:
|
||||
raise HTTPException(status_code=500, detail="Failed to generate embedding")
|
||||
|
||||
results = await search_nibis_eh(
|
||||
query_embedding=query_embedding,
|
||||
year=request.year,
|
||||
subject=request.subject,
|
||||
niveau=request.niveau,
|
||||
limit=request.limit,
|
||||
)
|
||||
|
||||
return [
|
||||
NiBiSSearchResult(
|
||||
id=r["id"],
|
||||
score=r["score"],
|
||||
text=r.get("text", "")[:500],
|
||||
year=r.get("year"),
|
||||
subject=r.get("subject"),
|
||||
niveau=r.get("niveau"),
|
||||
task_number=r.get("task_number"),
|
||||
)
|
||||
for r in results
|
||||
]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/collections")
|
||||
async def get_collections_info():
|
||||
"""Get information about all Qdrant collections."""
|
||||
try:
|
||||
client = get_qdrant_client()
|
||||
collections = client.get_collections().collections
|
||||
|
||||
result = []
|
||||
for c in collections:
|
||||
try:
|
||||
info = client.get_collection(c.name)
|
||||
result.append({
|
||||
"name": c.name,
|
||||
"vectors_count": info.vectors_count,
|
||||
"points_count": info.points_count,
|
||||
"status": info.status.value,
|
||||
})
|
||||
except Exception as e:
|
||||
result.append({
|
||||
"name": c.name,
|
||||
"error": str(e),
|
||||
})
|
||||
|
||||
return {"collections": result}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/nibis/stats")
|
||||
async def get_nibis_stats():
|
||||
"""Get detailed statistics about indexed NiBiS data."""
|
||||
try:
|
||||
qdrant = QdrantService()
|
||||
stats = await qdrant.get_stats("bp_nibis_eh")
|
||||
|
||||
if "error" in stats:
|
||||
return {
|
||||
"indexed": False,
|
||||
"message": "NiBiS collection not yet created. Run ingestion first.",
|
||||
}
|
||||
|
||||
client = get_qdrant_client()
|
||||
scroll_result = client.scroll(
|
||||
collection_name="bp_nibis_eh",
|
||||
limit=1000,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
)
|
||||
|
||||
years = set()
|
||||
subjects = set()
|
||||
niveaus = set()
|
||||
|
||||
for point in scroll_result[0]:
|
||||
if point.payload:
|
||||
if "year" in point.payload:
|
||||
years.add(point.payload["year"])
|
||||
if "subject" in point.payload:
|
||||
subjects.add(point.payload["subject"])
|
||||
if "niveau" in point.payload:
|
||||
niveaus.add(point.payload["niveau"])
|
||||
|
||||
return {
|
||||
"indexed": True,
|
||||
"total_chunks": stats.get("points_count", 0),
|
||||
"years": sorted(list(years)),
|
||||
"subjects": sorted(list(subjects)),
|
||||
"niveaus": sorted(list(niveaus)),
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"indexed": False,
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/nibis/collection")
|
||||
async def delete_nibis_collection():
|
||||
"""Delete the entire NiBiS collection. WARNING: removes all indexed data!"""
|
||||
try:
|
||||
client = get_qdrant_client()
|
||||
client.delete_collection("bp_nibis_eh")
|
||||
return {"status": "deleted", "collection": "bp_nibis_eh"}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
# Backward-compat shim -- module moved to admin/nibis.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("admin.nibis")
|
||||
|
||||
@@ -1,281 +1,4 @@
|
||||
"""
|
||||
Admin API - RAG Upload & Metrics
|
||||
|
||||
Endpoints for uploading documents, tracking uploads, RAG metrics,
|
||||
search feedback, storage stats, and service initialization.
|
||||
Extracted from admin_api.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query, UploadFile, File, Form
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
from nibis_ingestion import run_ingestion, DOCS_BASE_PATH
|
||||
|
||||
# Import ingestion status from nibis module for auto-ingest
|
||||
from admin_nibis import _ingestion_status
|
||||
|
||||
# Optional: MinIO and PostgreSQL integrations
|
||||
try:
|
||||
from minio_storage import upload_rag_document, get_storage_stats, init_minio_bucket
|
||||
MINIO_AVAILABLE = True
|
||||
except ImportError:
|
||||
MINIO_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from metrics_db import (
|
||||
init_metrics_tables, store_feedback, log_search, log_upload,
|
||||
calculate_metrics, get_recent_feedback, get_upload_history
|
||||
)
|
||||
METRICS_DB_AVAILABLE = True
|
||||
except ImportError:
|
||||
METRICS_DB_AVAILABLE = False
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
|
||||
|
||||
# Upload directory configuration
|
||||
RAG_UPLOAD_BASE = Path(os.getenv("RAG_UPLOAD_BASE", str(DOCS_BASE_PATH)))
|
||||
|
||||
# Store for upload tracking
|
||||
_upload_history: List[Dict] = []
|
||||
|
||||
|
||||
class UploadResult(BaseModel):
|
||||
status: str
|
||||
files_received: int
|
||||
pdfs_extracted: int
|
||||
target_directory: str
|
||||
errors: List[str]
|
||||
|
||||
|
||||
@router.post("/rag/upload", response_model=UploadResult)
|
||||
async def upload_rag_documents(
|
||||
background_tasks: BackgroundTasks,
|
||||
file: UploadFile = File(...),
|
||||
collection: str = Form(default="bp_nibis_eh"),
|
||||
year: Optional[int] = Form(default=None),
|
||||
auto_ingest: bool = Form(default=False),
|
||||
):
|
||||
"""
|
||||
Upload documents for RAG indexing.
|
||||
|
||||
Supports:
|
||||
- ZIP archives (automatically extracted)
|
||||
- Individual PDF files
|
||||
"""
|
||||
errors = []
|
||||
pdfs_extracted = 0
|
||||
|
||||
# Determine target year
|
||||
target_year = year or datetime.now().year
|
||||
|
||||
# Target directory: za-download/YYYY/
|
||||
target_dir = RAG_UPLOAD_BASE / "za-download" / str(target_year)
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
filename = file.filename or "upload"
|
||||
|
||||
if filename.lower().endswith(".zip"):
|
||||
# Handle ZIP file
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
|
||||
content = await file.read()
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(tmp_path, 'r') as zf:
|
||||
for member in zf.namelist():
|
||||
if member.lower().endswith(".pdf") and not member.startswith("__MACOSX"):
|
||||
pdf_name = Path(member).name
|
||||
if pdf_name:
|
||||
target_path = target_dir / pdf_name
|
||||
with zf.open(member) as src:
|
||||
with open(target_path, 'wb') as dst:
|
||||
dst.write(src.read())
|
||||
pdfs_extracted += 1
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
|
||||
elif filename.lower().endswith(".pdf"):
|
||||
target_path = target_dir / filename
|
||||
content = await file.read()
|
||||
with open(target_path, 'wb') as f:
|
||||
f.write(content)
|
||||
pdfs_extracted = 1
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported file type: {filename}. Only .zip and .pdf are allowed."
|
||||
)
|
||||
|
||||
# Track upload in memory
|
||||
upload_record = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"filename": filename,
|
||||
"collection": collection,
|
||||
"year": target_year,
|
||||
"pdfs_extracted": pdfs_extracted,
|
||||
"target_directory": str(target_dir),
|
||||
}
|
||||
_upload_history.append(upload_record)
|
||||
|
||||
# Keep only last 100 uploads in memory
|
||||
if len(_upload_history) > 100:
|
||||
_upload_history.pop(0)
|
||||
|
||||
# Store in PostgreSQL if available
|
||||
if METRICS_DB_AVAILABLE:
|
||||
await log_upload(
|
||||
filename=filename,
|
||||
collection_name=collection,
|
||||
year=target_year,
|
||||
pdfs_extracted=pdfs_extracted,
|
||||
minio_path=str(target_dir),
|
||||
)
|
||||
|
||||
# Auto-ingest if requested
|
||||
if auto_ingest and not _ingestion_status["running"]:
|
||||
async def run_auto_ingest():
|
||||
global _ingestion_status
|
||||
_ingestion_status["running"] = True
|
||||
_ingestion_status["last_run"] = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
result = await run_ingestion(
|
||||
ewh_only=True,
|
||||
dry_run=False,
|
||||
year_filter=target_year,
|
||||
)
|
||||
_ingestion_status["last_result"] = result
|
||||
except Exception as e:
|
||||
_ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
|
||||
finally:
|
||||
_ingestion_status["running"] = False
|
||||
|
||||
background_tasks.add_task(run_auto_ingest)
|
||||
|
||||
return UploadResult(
|
||||
status="success",
|
||||
files_received=1,
|
||||
pdfs_extracted=pdfs_extracted,
|
||||
target_directory=str(target_dir),
|
||||
errors=errors,
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
errors.append(str(e))
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/rag/upload/history")
|
||||
async def get_upload_history_endpoint(limit: int = Query(default=20, le=100)):
|
||||
"""Get recent upload history."""
|
||||
return {
|
||||
"uploads": _upload_history[-limit:][::-1],
|
||||
"total": len(_upload_history),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/rag/metrics")
|
||||
async def get_rag_metrics(
|
||||
collection: Optional[str] = Query(default=None),
|
||||
days: int = Query(default=7, le=90),
|
||||
):
|
||||
"""Get RAG quality metrics."""
|
||||
if METRICS_DB_AVAILABLE:
|
||||
metrics = await calculate_metrics(collection_name=collection, days=days)
|
||||
if metrics.get("connected"):
|
||||
return metrics
|
||||
|
||||
# Fallback: Return placeholder metrics
|
||||
return {
|
||||
"precision_at_5": 0.78,
|
||||
"recall_at_10": 0.85,
|
||||
"mrr": 0.72,
|
||||
"avg_latency_ms": 52,
|
||||
"total_ratings": len(_upload_history),
|
||||
"error_rate": 0.3,
|
||||
"score_distribution": {
|
||||
"0.9+": 23,
|
||||
"0.7-0.9": 41,
|
||||
"0.5-0.7": 28,
|
||||
"<0.5": 8,
|
||||
},
|
||||
"note": "Placeholder metrics - PostgreSQL not connected",
|
||||
"connected": False,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rag/search/feedback")
|
||||
async def submit_search_feedback(
|
||||
result_id: str = Form(...),
|
||||
rating: int = Form(..., ge=1, le=5),
|
||||
notes: Optional[str] = Form(default=None),
|
||||
query: Optional[str] = Form(default=None),
|
||||
collection: Optional[str] = Form(default=None),
|
||||
score: Optional[float] = Form(default=None),
|
||||
):
|
||||
"""Submit feedback for a search result."""
|
||||
feedback_record = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"result_id": result_id,
|
||||
"rating": rating,
|
||||
"notes": notes,
|
||||
}
|
||||
|
||||
stored = False
|
||||
if METRICS_DB_AVAILABLE:
|
||||
stored = await store_feedback(
|
||||
result_id=result_id,
|
||||
rating=rating,
|
||||
query_text=query,
|
||||
collection_name=collection,
|
||||
score=score,
|
||||
notes=notes,
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "stored" if stored else "received",
|
||||
"feedback": feedback_record,
|
||||
"persisted": stored,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/rag/storage/stats")
|
||||
async def get_storage_statistics():
|
||||
"""Get MinIO storage statistics."""
|
||||
if MINIO_AVAILABLE:
|
||||
stats = await get_storage_stats()
|
||||
return stats
|
||||
return {
|
||||
"error": "MinIO not available",
|
||||
"connected": False,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rag/init")
|
||||
async def initialize_rag_services():
|
||||
"""Initialize RAG services (MinIO bucket, PostgreSQL tables)."""
|
||||
results = {
|
||||
"minio": False,
|
||||
"postgres": False,
|
||||
}
|
||||
|
||||
if MINIO_AVAILABLE:
|
||||
results["minio"] = await init_minio_bucket()
|
||||
|
||||
if METRICS_DB_AVAILABLE:
|
||||
results["postgres"] = await init_metrics_tables()
|
||||
|
||||
return {
|
||||
"status": "initialized",
|
||||
"services": results,
|
||||
}
|
||||
# Backward-compat shim -- module moved to admin/rag.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("admin.rag")
|
||||
|
||||
@@ -1,389 +1,4 @@
"""
Admin API - Legal Templates

Endpoints for legal template ingestion, search, source management,
license info, and collection management.
Extracted from admin_api.py for file-size compliance.
"""

from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from pydantic import BaseModel
from typing import Optional, List, Dict
from datetime import datetime

from eh_pipeline import generate_single_embedding

# Import legal templates modules
try:
    from legal_templates_ingestion import (
        LegalTemplatesIngestion,
        LEGAL_TEMPLATES_COLLECTION,
    )
    from template_sources import (
        TEMPLATE_SOURCES,
        TEMPLATE_TYPES,
        JURISDICTIONS,
        LicenseType,
        get_enabled_sources,
        get_sources_by_priority,
    )
    from qdrant_service import (
        search_legal_templates,
        get_legal_templates_stats,
        init_legal_templates_collection,
    )
    LEGAL_TEMPLATES_AVAILABLE = True
except ImportError as e:
    print(f"Legal templates module not available: {e}")
    LEGAL_TEMPLATES_AVAILABLE = False

router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])

# Store for templates ingestion status
_templates_ingestion_status: Dict = {
    "running": False,
    "last_run": None,
    "current_source": None,
    "results": {},
}


class TemplatesSearchRequest(BaseModel):
    query: str
    template_type: Optional[str] = None
    license_types: Optional[List[str]] = None
    language: Optional[str] = None
    jurisdiction: Optional[str] = None
    attribution_required: Optional[bool] = None
    limit: int = 10


class TemplatesSearchResult(BaseModel):
    id: str
    score: float
    text: str
    document_title: Optional[str]
    template_type: Optional[str]
    clause_category: Optional[str]
    language: Optional[str]
    jurisdiction: Optional[str]
    license_id: Optional[str]
    license_name: Optional[str]
    attribution_required: Optional[bool]
    attribution_text: Optional[str]
    source_name: Optional[str]
    source_url: Optional[str]
    placeholders: Optional[List[str]]
    is_complete_document: Optional[bool]
    requires_customization: Optional[bool]


class SourceIngestRequest(BaseModel):
    source_name: str


@router.get("/templates/status")
async def get_templates_status():
    """Get status of legal templates collection and ingestion."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        return {
            "available": False,
            "error": "Legal templates module not available",
        }

    try:
        stats = await get_legal_templates_stats()

        return {
            "available": True,
            "collection": LEGAL_TEMPLATES_COLLECTION,
            "ingestion": {
                "running": _templates_ingestion_status["running"],
                "last_run": _templates_ingestion_status.get("last_run"),
                "current_source": _templates_ingestion_status.get("current_source"),
                "results": _templates_ingestion_status.get("results", {}),
            },
            "stats": stats,
        }
    except Exception as e:
        return {
            "available": True,
            "error": str(e),
            "ingestion": _templates_ingestion_status,
        }


@router.get("/templates/sources")
async def get_templates_sources():
    """Get list of all template sources with their configuration."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    sources = []
    for source in TEMPLATE_SOURCES:
        sources.append({
            "name": source.name,
            "description": source.description,
            "license_type": source.license_type.value,
            "license_name": source.license_info.name,
            "template_types": source.template_types,
            "languages": source.languages,
            "jurisdiction": source.jurisdiction,
            "repo_url": source.repo_url,
            "web_url": source.web_url,
            "priority": source.priority,
            "enabled": source.enabled,
            "attribution_required": source.license_info.attribution_required,
        })

    return {
        "sources": sources,
        "total": len(sources),
        "enabled": len([s for s in TEMPLATE_SOURCES if s.enabled]),
        "template_types": TEMPLATE_TYPES,
        "jurisdictions": JURISDICTIONS,
    }


@router.get("/templates/licenses")
async def get_templates_licenses():
    """Get license statistics for indexed templates."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    try:
        stats = await get_legal_templates_stats()
        return {
            "licenses": stats.get("licenses", {}),
            "total_chunks": stats.get("points_count", 0),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/templates/ingest")
async def start_templates_ingestion(
    background_tasks: BackgroundTasks,
    max_priority: int = Query(default=3, ge=1, le=5, description="Maximum priority level (1=highest)"),
):
    """
    Start legal templates ingestion in background.
    Ingests all enabled sources up to the specified priority level.
    """
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Templates ingestion already running. Check /templates/status for progress."
        )

    async def run_templates_ingestion():
        global _templates_ingestion_status
        _templates_ingestion_status["running"] = True
        _templates_ingestion_status["last_run"] = datetime.now().isoformat()
        _templates_ingestion_status["results"] = {}

        try:
            ingestion = LegalTemplatesIngestion()
            sources = get_sources_by_priority(max_priority)

            for source in sources:
                _templates_ingestion_status["current_source"] = source.name

                try:
                    status = await ingestion.ingest_source(source)
                    _templates_ingestion_status["results"][source.name] = {
                        "status": status.status,
                        "documents_found": status.documents_found,
                        "chunks_indexed": status.chunks_indexed,
                        "errors": status.errors[:5] if status.errors else [],
                    }
                except Exception as e:
                    _templates_ingestion_status["results"][source.name] = {
                        "status": "failed",
                        "error": str(e),
                    }

            await ingestion.close()

        except Exception as e:
            _templates_ingestion_status["results"]["_global_error"] = str(e)
        finally:
            _templates_ingestion_status["running"] = False
            _templates_ingestion_status["current_source"] = None

    background_tasks.add_task(run_templates_ingestion)

    sources = get_sources_by_priority(max_priority)
    return {
        "status": "started",
        "message": f"Ingesting {len(sources)} sources up to priority {max_priority}",
        "sources": [s.name for s in sources],
    }


@router.post("/templates/ingest-source")
async def ingest_single_source(
    request: SourceIngestRequest,
    background_tasks: BackgroundTasks,
):
    """Ingest a single template source by name."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    source = next((s for s in TEMPLATE_SOURCES if s.name == request.source_name), None)
    if not source:
        raise HTTPException(
            status_code=404,
            detail=f"Source not found: {request.source_name}. Use /templates/sources to list available sources."
        )

    if not source.enabled:
        raise HTTPException(
            status_code=400,
            detail=f"Source is disabled: {request.source_name}"
        )

    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Templates ingestion already running."
        )

    async def run_single_ingestion():
        global _templates_ingestion_status
        _templates_ingestion_status["running"] = True
        _templates_ingestion_status["current_source"] = source.name
        _templates_ingestion_status["last_run"] = datetime.now().isoformat()

        try:
            ingestion = LegalTemplatesIngestion()
            status = await ingestion.ingest_source(source)
            _templates_ingestion_status["results"][source.name] = {
                "status": status.status,
                "documents_found": status.documents_found,
                "chunks_indexed": status.chunks_indexed,
                "errors": status.errors[:5] if status.errors else [],
            }
            await ingestion.close()

        except Exception as e:
            _templates_ingestion_status["results"][source.name] = {
                "status": "failed",
                "error": str(e),
            }
        finally:
            _templates_ingestion_status["running"] = False
            _templates_ingestion_status["current_source"] = None

    background_tasks.add_task(run_single_ingestion)

    return {
        "status": "started",
        "source": source.name,
        "license": source.license_type.value,
        "template_types": source.template_types,
    }


@router.post("/templates/search", response_model=List[TemplatesSearchResult])
async def search_templates(request: TemplatesSearchRequest):
    """Semantic search in legal templates collection."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    try:
        query_embedding = await generate_single_embedding(request.query)

        if not query_embedding:
            raise HTTPException(status_code=500, detail="Failed to generate embedding")

        results = await search_legal_templates(
            query_embedding=query_embedding,
            template_type=request.template_type,
            license_types=request.license_types,
            language=request.language,
            jurisdiction=request.jurisdiction,
            attribution_required=request.attribution_required,
            limit=request.limit,
        )

        return [
            TemplatesSearchResult(
                id=r["id"],
                score=r["score"],
                text=r.get("text", "")[:1000],
                document_title=r.get("document_title"),
                template_type=r.get("template_type"),
                clause_category=r.get("clause_category"),
                language=r.get("language"),
                jurisdiction=r.get("jurisdiction"),
                license_id=r.get("license_id"),
                license_name=r.get("license_name"),
                attribution_required=r.get("attribution_required"),
                attribution_text=r.get("attribution_text"),
                source_name=r.get("source_name"),
                source_url=r.get("source_url"),
                placeholders=r.get("placeholders"),
                is_complete_document=r.get("is_complete_document"),
                requires_customization=r.get("requires_customization"),
            )
            for r in results
        ]
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.delete("/templates/reset")
async def reset_templates_collection():
    """Delete and recreate the legal templates collection."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Cannot reset while ingestion is running"
        )

    try:
        ingestion = LegalTemplatesIngestion()
        ingestion.reset_collection()
        await ingestion.close()

        _templates_ingestion_status["results"] = {}

        return {
            "status": "reset",
            "collection": LEGAL_TEMPLATES_COLLECTION,
            "message": "Collection deleted and recreated. Run ingestion to populate.",
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.delete("/templates/source/{source_name}")
async def delete_templates_source(source_name: str):
    """Delete all templates from a specific source."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")

    try:
        from qdrant_service import delete_legal_templates_by_source

        count = await delete_legal_templates_by_source(source_name)

        if source_name in _templates_ingestion_status.get("results", {}):
            del _templates_ingestion_status["results"][source_name]

        return {
            "status": "deleted",
            "source": source_name,
            "chunks_deleted": count,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Backward-compat shim -- module moved to admin/templates.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("admin.templates")
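A hedged usage sketch for the /templates/search endpoint above; host, port, and the example filter values are assumptions, while the field names come from TemplatesSearchRequest:

import httpx

# assumes the service is reachable locally and auth is handled elsewhere
resp = httpx.post(
    "http://localhost:8000/api/v1/admin/templates/search",
    json={
        "query": "Auftragsverarbeitungsvertrag Datenschutz",
        "language": "de",       # optional filters from TemplatesSearchRequest
        "limit": 5,
    },
    timeout=30.0,
)
resp.raise_for_status()
for hit in resp.json():
    print(hit["score"], hit.get("document_title"), hit.get("license_name"))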
@@ -0,0 +1,6 @@
"""
compliance package — compliance pipeline, RBAC/ABAC policy engine.

Backward-compatible re-exports: consumers can still use
``from compliance_models import ...`` etc. via the shim files in backend/.
"""
@@ -0,0 +1,200 @@
"""
Compliance Extraction & Generation.

Functions for extracting checkpoints from legal text chunks,
generating controls, and creating remediation measures.
"""

import re
import hashlib
import logging
from typing import Dict, List, Optional

from .models import Checkpoint, Control, Measure

logger = logging.getLogger(__name__)


def extract_checkpoints_from_chunk(chunk_text: str, payload: Dict) -> List[Checkpoint]:
    """
    Extract checkpoints/requirements from a chunk of text.

    Uses pattern matching to find requirement-like statements.
    """
    checkpoints = []
    regulation_code = payload.get("regulation_code", "UNKNOWN")
    regulation_name = payload.get("regulation_name", "Unknown")
    source_url = payload.get("source_url", "")
    chunk_id = hashlib.md5(chunk_text[:100].encode()).hexdigest()[:8]

    # Patterns for different requirement types
    patterns = [
        # BSI-TR patterns
        (r'([OT]\.[A-Za-z_]+\d*)[:\s]+(.+?)(?=\n[OT]\.|$)', 'bsi_requirement'),
        # Article patterns (GDPR, AI Act, etc.)
        (r'(?:Artikel|Art\.?)\s+(\d+)(?:\s+Abs(?:atz)?\.?\s*(\d+))?\s*[-\u2013:]\s*(.+?)(?=\n|$)', 'article'),
        # Numbered requirements
        (r'\((\d+)\)\s+(.+?)(?=\n\(\d+\)|$)', 'numbered'),
        # "Der Verantwortliche muss" patterns
        (r'(?:Der Verantwortliche|Die Aufsichtsbeh\u00f6rde|Der Auftragsverarbeiter)\s+(muss|hat|soll)\s+(.+?)(?=\.\s|$)', 'obligation'),
        # "Es ist erforderlich" patterns
        (r'(?:Es ist erforderlich|Es muss gew\u00e4hrleistet|Es sind geeignete)\s+(.+?)(?=\.\s|$)', 'requirement'),
    ]

    for pattern, pattern_type in patterns:
        matches = re.finditer(pattern, chunk_text, re.MULTILINE | re.DOTALL)
        for match in matches:
            if pattern_type == 'bsi_requirement':
                req_id = match.group(1)
                description = match.group(2).strip()
                title = req_id
            elif pattern_type == 'article':
                article_num = match.group(1)
                paragraph = match.group(2) or ""
                title_text = match.group(3).strip()
                req_id = f"{regulation_code}-Art{article_num}"
                if paragraph:
                    req_id += f"-{paragraph}"
                title = f"Art. {article_num}" + (f" Abs. {paragraph}" if paragraph else "")
                description = title_text
            elif pattern_type == 'numbered':
                num = match.group(1)
                description = match.group(2).strip()
                req_id = f"{regulation_code}-{num}"
                title = f"Anforderung {num}"
            else:
                # Generic requirement
                description = match.group(0).strip()
                req_id = f"{regulation_code}-{chunk_id}-{len(checkpoints)}"
                title = description[:50] + "..." if len(description) > 50 else description

            # Skip very short matches
            if len(description) < 20:
                continue

            checkpoint = Checkpoint(
                id=req_id,
                regulation_code=regulation_code,
                regulation_name=regulation_name,
                article=title if 'Art' in title else None,
                title=title,
                description=description[:500],
                original_text=description,
                chunk_id=chunk_id,
                source_url=source_url
            )
            checkpoints.append(checkpoint)

    return checkpoints


def generate_control_for_checkpoints(
    checkpoints: List[Checkpoint],
    domain_counts: Dict[str, int],
) -> Optional[Control]:
    """
    Generate a control that covers the given checkpoints.

    This is a simplified version - in production this would use the AI assistant.
    """
    if not checkpoints:
        return None

    # Group by regulation
    regulation = checkpoints[0].regulation_code

    # Determine domain based on content
    all_text = " ".join([cp.description for cp in checkpoints]).lower()

    domain = "gov"  # Default
    if any(kw in all_text for kw in ["verschl\u00fcssel", "krypto", "encrypt", "hash"]):
        domain = "crypto"
    elif any(kw in all_text for kw in ["zugang", "access", "authentif", "login", "benutzer"]):
        domain = "iam"
    elif any(kw in all_text for kw in ["datenschutz", "personenbezogen", "privacy", "einwilligung"]):
        domain = "priv"
    elif any(kw in all_text for kw in ["entwicklung", "test", "code", "software"]):
        domain = "sdlc"
    elif any(kw in all_text for kw in ["\u00fcberwach", "monitor", "log", "audit"]):
        domain = "aud"
    elif any(kw in all_text for kw in ["ki", "k\u00fcnstlich", "ai", "machine learning", "model"]):
        domain = "ai"
    elif any(kw in all_text for kw in ["betrieb", "operation", "verf\u00fcgbar", "backup"]):
        domain = "ops"
    elif any(kw in all_text for kw in ["cyber", "resilience", "sbom", "vulnerab"]):
        domain = "cra"

    # Generate control ID
    domain_count = domain_counts.get(domain, 0) + 1
    control_id = f"{domain.upper()}-{domain_count:03d}"

    # Create title from first checkpoint
    title = checkpoints[0].title
    if len(title) > 100:
        title = title[:97] + "..."

    # Create description
    description = f"Control f\u00fcr {regulation}: " + checkpoints[0].description[:200]

    # Pass criteria
    pass_criteria = f"Alle {len(checkpoints)} zugeh\u00f6rigen Anforderungen sind erf\u00fcllt und dokumentiert."

    # Implementation guidance
    guidance = f"Implementiere Ma\u00dfnahmen zur Erf\u00fcllung der Anforderungen aus {regulation}. "
    guidance += f"Dokumentiere die Umsetzung und f\u00fchre regelm\u00e4\u00dfige Reviews durch."

    # Determine if automated
    is_automated = any(kw in all_text for kw in ["automat", "tool", "scan", "test"])

    control = Control(
        id=control_id,
        domain=domain,
        title=title,
        description=description,
        checkpoints=[cp.id for cp in checkpoints],
        pass_criteria=pass_criteria,
        implementation_guidance=guidance,
        is_automated=is_automated,
        automation_tool="CI/CD Pipeline" if is_automated else None,
        priority="high" if "muss" in all_text or "erforderlich" in all_text else "medium"
    )

    return control


def generate_measure_for_control(control: Control) -> Measure:
    """Generate a remediation measure for a control."""
    measure_id = f"M-{control.id}"

    # Determine deadline based on priority
    deadline_days = {
        "critical": 30,
        "high": 60,
        "medium": 90,
        "low": 180
    }.get(control.priority, 90)

    # Determine responsible team
    responsible = {
        "priv": "Datenschutzbeauftragter",
        "iam": "IT-Security Team",
        "sdlc": "Entwicklungsteam",
        "crypto": "IT-Security Team",
        "ops": "Operations Team",
        "aud": "Compliance Team",
        "ai": "AI/ML Team",
        "cra": "IT-Security Team",
        "gov": "Management"
    }.get(control.domain, "Compliance Team")

    measure = Measure(
        id=measure_id,
        control_id=control.id,
        title=f"Umsetzung: {control.title[:50]}",
        description=f"Implementierung und Dokumentation von {control.id}: {control.description[:100]}",
        responsible=responsible,
        deadline_days=deadline_days,
        status="pending"
    )

    return measure
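A small usage sketch of the extraction helpers above; the package path compliance.extraction and the sample chunk are assumptions, while the payload keys mirror those read by extract_checkpoints_from_chunk:

from compliance.extraction import (  # assumed package path
    extract_checkpoints_from_chunk,
    generate_control_for_checkpoints,
    generate_measure_for_control,
)

# invented sample chunk in the shape the extractor expects
payload = {"regulation_code": "DSGVO", "regulation_name": "GDPR", "source_url": "https://example.org"}
text = "Artikel 32 - Der Verantwortliche muss geeignete technische Massnahmen treffen."

checkpoints = extract_checkpoints_from_chunk(text, payload)
control = generate_control_for_checkpoints(checkpoints, domain_counts={})
if control:
    measure = generate_measure_for_control(control)
    print(control.id, control.domain, measure.responsible)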
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Full Compliance Pipeline for Legal Corpus — Barrel Re-export.

Split into submodules:
- compliance_models.py — Dataclasses (Checkpoint, Control, Measure)
- compliance_extraction.py — Pattern extraction & control/measure generation
- compliance_pipeline.py — Pipeline phases & orchestrator

Run on Mac Mini:
    nohup python full_compliance_pipeline.py > /tmp/compliance_pipeline.log 2>&1 &
"""

import asyncio
import logging
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/compliance_pipeline.log')
    ]
)

# Re-export all public symbols
from .models import Checkpoint, Control, Measure
from .extraction import (
    extract_checkpoints_from_chunk,
    generate_control_for_checkpoints,
    generate_measure_for_control,
)
from .pipeline import CompliancePipeline

__all__ = [
    "Checkpoint",
    "Control",
    "Measure",
    "extract_checkpoints_from_chunk",
    "generate_control_for_checkpoints",
    "generate_measure_for_control",
    "CompliancePipeline",
]


async def main():
    import argparse
    parser = argparse.ArgumentParser(description="Run the compliance pipeline")
    parser.add_argument("--force-reindex", action="store_true",
                        help="Force re-ingestion of all documents")
    parser.add_argument("--skip-ingestion", action="store_true",
                        help="Skip ingestion phase, use existing chunks")
    args = parser.parse_args()

    pipeline = CompliancePipeline()
    await pipeline.run_full_pipeline(
        force_reindex=args.force_reindex,
        skip_ingestion=args.skip_ingestion
    )


if __name__ == "__main__":
    asyncio.run(main())
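The CLI flags handled in main() above map directly onto run_full_pipeline(); a programmatic equivalent, assuming the package path compliance.pipeline and execution inside the klausur-service container:

import asyncio
from compliance.pipeline import CompliancePipeline  # assumed package path; the backend shim keeps the old name working

async def run():
    pipeline = CompliancePipeline()
    # reuse existing chunks, mirroring the --skip-ingestion flag
    await pipeline.run_full_pipeline(skip_ingestion=True)

asyncio.run(run())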
@@ -0,0 +1,49 @@
"""
Compliance Pipeline Data Models.

Dataclasses for checkpoints, controls, and measures.
"""

from typing import Optional, List
from dataclasses import dataclass


@dataclass
class Checkpoint:
    """A requirement/checkpoint extracted from legal text."""
    id: str
    regulation_code: str
    regulation_name: str
    article: Optional[str]
    title: str
    description: str
    original_text: str
    chunk_id: str
    source_url: str


@dataclass
class Control:
    """A control derived from checkpoints."""
    id: str
    domain: str
    title: str
    description: str
    checkpoints: List[str]  # List of checkpoint IDs
    pass_criteria: str
    implementation_guidance: str
    is_automated: bool
    automation_tool: Optional[str]
    priority: str


@dataclass
class Measure:
    """A remediation measure for a control."""
    id: str
    control_id: str
    title: str
    description: str
    responsible: str
    deadline_days: int
    status: str
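A short illustration of the dataclasses above; all field values are invented and the package path compliance.models is an assumption:

from dataclasses import asdict
from compliance.models import Checkpoint  # assumed package path

cp = Checkpoint(
    id="DSGVO-Art32-1",
    regulation_code="DSGVO",
    regulation_name="GDPR",
    article="Art. 32 Abs. 1",
    title="Art. 32 Abs. 1",
    description="Security of processing",
    original_text="Der Verantwortliche trifft geeignete technische Massnahmen ...",
    chunk_id="ab12cd34",
    source_url="https://example.org",
)
print(asdict(cp)["regulation_code"])  # plain dataclasses serialize cleanly to JSON-ready dicts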
@@ -0,0 +1,441 @@
"""
Compliance Pipeline Execution.

Pipeline phases (ingestion, extraction, control generation, measures)
and orchestration logic.
"""

import asyncio
import json
import logging
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Any
from dataclasses import asdict

from .models import Checkpoint, Control, Measure
from .extraction import (
    extract_checkpoints_from_chunk,
    generate_control_for_checkpoints,
    generate_measure_for_control,
)

logger = logging.getLogger(__name__)

# Import checkpoint manager
try:
    from pipeline_checkpoints import CheckpointManager, EXPECTED_VALUES, ValidationStatus
except ImportError:
    logger.warning("Checkpoint manager not available, running without checkpoints")
    CheckpointManager = None
    EXPECTED_VALUES = {}
    ValidationStatus = None

# Set environment variables for Docker network
if not os.getenv("QDRANT_URL") and not os.getenv("QDRANT_HOST"):
    os.environ["QDRANT_HOST"] = "qdrant"
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")

# Try to import from klausur-service
try:
    from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
    from qdrant_client import QdrantClient
    from qdrant_client.models import Filter, FieldCondition, MatchValue
except ImportError:
    logger.error("Could not import required modules. Make sure you're in the klausur-service container.")
    sys.exit(1)


class CompliancePipeline:
    """Handles the full compliance pipeline."""

    def __init__(self):
        # Support both QDRANT_URL and QDRANT_HOST/PORT
        qdrant_url = os.getenv("QDRANT_URL", "")
        if qdrant_url:
            from urllib.parse import urlparse
            parsed = urlparse(qdrant_url)
            qdrant_host = parsed.hostname or "qdrant"
            qdrant_port = parsed.port or 6333
        else:
            qdrant_host = os.getenv("QDRANT_HOST", "qdrant")
            qdrant_port = 6333
        self.qdrant = QdrantClient(host=qdrant_host, port=qdrant_port)
        self.checkpoints: List[Checkpoint] = []
        self.controls: List[Control] = []
        self.measures: List[Measure] = []
        self.stats = {
            "chunks_processed": 0,
            "checkpoints_extracted": 0,
            "controls_created": 0,
            "measures_defined": 0,
            "by_regulation": {},
            "by_domain": {},
        }
        # Initialize checkpoint manager
        self.checkpoint_mgr = CheckpointManager() if CheckpointManager else None

    async def run_ingestion_phase(self, force_reindex: bool = False) -> int:
        """Phase 1: Ingest documents (incremental - only missing ones)."""
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 1: DOCUMENT INGESTION (INCREMENTAL)")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("ingestion", "Document Ingestion")

        ingestion = LegalCorpusIngestion()

        try:
            # Check existing chunks per regulation
            existing_chunks = {}
            try:
                for regulation in REGULATIONS:
                    count_result = self.qdrant.count(
                        collection_name=LEGAL_CORPUS_COLLECTION,
                        count_filter=Filter(
                            must=[FieldCondition(key="regulation_code", match=MatchValue(value=regulation.code))]
                        )
                    )
                    existing_chunks[regulation.code] = count_result.count
                    logger.info(f" {regulation.code}: {count_result.count} existing chunks")
            except Exception as e:
                logger.warning(f"Could not check existing chunks: {e}")

            # Determine which regulations need ingestion
            regulations_to_ingest = []
            for regulation in REGULATIONS:
                existing = existing_chunks.get(regulation.code, 0)
                if force_reindex or existing == 0:
                    regulations_to_ingest.append(regulation)
                    logger.info(f" -> Will ingest: {regulation.code} (existing: {existing}, force: {force_reindex})")
                else:
                    logger.info(f" -> Skipping: {regulation.code} (already has {existing} chunks)")
                    self.stats["by_regulation"][regulation.code] = existing

            if not regulations_to_ingest:
                logger.info("All regulations already indexed. Skipping ingestion phase.")
                total_chunks = sum(existing_chunks.values())
                self.stats["chunks_processed"] = total_chunks
                if self.checkpoint_mgr:
                    self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
                    self.checkpoint_mgr.add_metric("skipped", True)
                    self.checkpoint_mgr.complete_checkpoint(success=True)
                return total_chunks

            # Ingest only missing regulations
            total_chunks = sum(existing_chunks.values())
            for i, regulation in enumerate(regulations_to_ingest, 1):
                logger.info(f"[{i}/{len(regulations_to_ingest)}] Ingesting {regulation.code}...")
                try:
                    count = await ingestion.ingest_regulation(regulation)
                    total_chunks += count
                    self.stats["by_regulation"][regulation.code] = count
                    logger.info(f" -> {count} chunks")

                    if self.checkpoint_mgr:
                        self.checkpoint_mgr.add_metric(f"chunks_{regulation.code}", count)

                except Exception as e:
                    logger.error(f" -> FAILED: {e}")
                    self.stats["by_regulation"][regulation.code] = 0

            self.stats["chunks_processed"] = total_chunks
            logger.info(f"\nTotal chunks in collection: {total_chunks}")

            # Validate ingestion results
            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
                self.checkpoint_mgr.add_metric("regulations_count", len(REGULATIONS))

                expected = EXPECTED_VALUES.get("ingestion", {})
                self.checkpoint_mgr.validate(
                    "total_chunks",
                    expected=expected.get("total_chunks", 8000),
                    actual=total_chunks,
                    min_value=expected.get("min_chunks", 7000)
                )

                reg_expected = expected.get("regulations", {})
                for reg_code, reg_exp in reg_expected.items():
                    actual = self.stats["by_regulation"].get(reg_code, 0)
                    self.checkpoint_mgr.validate(
                        f"chunks_{reg_code}",
                        expected=reg_exp.get("expected", 0),
                        actual=actual,
                        min_value=reg_exp.get("min", 0)
                    )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return total_chunks

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise

        finally:
            await ingestion.close()

    async def run_extraction_phase(self) -> int:
        """Phase 2: Extract checkpoints from chunks."""
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 2: CHECKPOINT EXTRACTION")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("extraction", "Checkpoint Extraction")

        try:
            offset = None
            total_checkpoints = 0

            while True:
                result = self.qdrant.scroll(
                    collection_name=LEGAL_CORPUS_COLLECTION,
                    limit=100,
                    offset=offset,
                    with_payload=True,
                    with_vectors=False
                )

                points, next_offset = result

                if not points:
                    break

                for point in points:
                    payload = point.payload
                    text = payload.get("text", "")

                    cps = extract_checkpoints_from_chunk(text, payload)
                    self.checkpoints.extend(cps)
                    total_checkpoints += len(cps)

                logger.info(f"Processed {len(points)} chunks, extracted {total_checkpoints} checkpoints so far...")

                if next_offset is None:
                    break
                offset = next_offset

            self.stats["checkpoints_extracted"] = len(self.checkpoints)
            logger.info(f"\nTotal checkpoints extracted: {len(self.checkpoints)}")

            by_reg = {}
            for cp in self.checkpoints:
                by_reg[cp.regulation_code] = by_reg.get(cp.regulation_code, 0) + 1
            for reg, count in sorted(by_reg.items()):
                logger.info(f" {reg}: {count} checkpoints")

            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_checkpoints", len(self.checkpoints))
                self.checkpoint_mgr.add_metric("checkpoints_by_regulation", by_reg)

                expected = EXPECTED_VALUES.get("extraction", {})
                self.checkpoint_mgr.validate(
                    "total_checkpoints",
                    expected=expected.get("total_checkpoints", 3500),
                    actual=len(self.checkpoints),
                    min_value=expected.get("min_checkpoints", 3000)
                )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return len(self.checkpoints)

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise

    async def run_control_generation_phase(self) -> int:
        """Phase 3: Generate controls from checkpoints."""
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 3: CONTROL GENERATION")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("controls", "Control Generation")

        try:
            # Group checkpoints by regulation
            by_regulation: Dict[str, List[Checkpoint]] = {}
            for cp in self.checkpoints:
                reg = cp.regulation_code
                if reg not in by_regulation:
                    by_regulation[reg] = []
                by_regulation[reg].append(cp)

            # Generate controls per regulation (group every 3-5 checkpoints)
            for regulation, checkpoints in by_regulation.items():
                logger.info(f"Generating controls for {regulation} ({len(checkpoints)} checkpoints)...")

                batch_size = 4
                for i in range(0, len(checkpoints), batch_size):
                    batch = checkpoints[i:i + batch_size]
                    control = generate_control_for_checkpoints(batch, self.stats.get("by_domain", {}))

                    if control:
                        self.controls.append(control)
                        self.stats["by_domain"][control.domain] = self.stats["by_domain"].get(control.domain, 0) + 1

            self.stats["controls_created"] = len(self.controls)
            logger.info(f"\nTotal controls created: {len(self.controls)}")

            for domain, count in sorted(self.stats["by_domain"].items()):
                logger.info(f" {domain}: {count} controls")

            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_controls", len(self.controls))
                self.checkpoint_mgr.add_metric("controls_by_domain", dict(self.stats["by_domain"]))

                expected = EXPECTED_VALUES.get("controls", {})
                self.checkpoint_mgr.validate(
                    "total_controls",
                    expected=expected.get("total_controls", 900),
                    actual=len(self.controls),
                    min_value=expected.get("min_controls", 800)
                )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return len(self.controls)

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise

    async def run_measure_generation_phase(self) -> int:
        """Phase 4: Generate measures for controls."""
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 4: MEASURE GENERATION")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("measures", "Measure Generation")

        try:
            for control in self.controls:
                measure = generate_measure_for_control(control)
                self.measures.append(measure)

            self.stats["measures_defined"] = len(self.measures)
            logger.info(f"\nTotal measures defined: {len(self.measures)}")

            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_measures", len(self.measures))

                expected = EXPECTED_VALUES.get("measures", {})
                self.checkpoint_mgr.validate(
                    "total_measures",
                    expected=expected.get("total_measures", 900),
                    actual=len(self.measures),
                    min_value=expected.get("min_measures", 800)
                )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return len(self.measures)

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise

    def save_results(self, output_dir: str = "/tmp/compliance_output"):
        """Save results to JSON files."""
        logger.info("\n" + "=" * 60)
        logger.info("SAVING RESULTS")
        logger.info("=" * 60)

        os.makedirs(output_dir, exist_ok=True)

        checkpoints_file = os.path.join(output_dir, "checkpoints.json")
        with open(checkpoints_file, "w") as f:
            json.dump([asdict(cp) for cp in self.checkpoints], f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(self.checkpoints)} checkpoints to {checkpoints_file}")

        controls_file = os.path.join(output_dir, "controls.json")
        with open(controls_file, "w") as f:
            json.dump([asdict(c) for c in self.controls], f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(self.controls)} controls to {controls_file}")

        measures_file = os.path.join(output_dir, "measures.json")
        with open(measures_file, "w") as f:
            json.dump([asdict(m) for m in self.measures], f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(self.measures)} measures to {measures_file}")

        stats_file = os.path.join(output_dir, "statistics.json")
        self.stats["generated_at"] = datetime.now().isoformat()
        with open(stats_file, "w") as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved statistics to {stats_file}")

    async def run_full_pipeline(self, force_reindex: bool = False, skip_ingestion: bool = False):
        """Run the complete pipeline.

        Args:
            force_reindex: If True, re-ingest all documents even if they exist
            skip_ingestion: If True, skip ingestion phase entirely (use existing chunks)
        """
        start_time = time.time()

        logger.info("=" * 60)
        logger.info("FULL COMPLIANCE PIPELINE (INCREMENTAL)")
        logger.info(f"Started at: {datetime.now().isoformat()}")
        logger.info(f"Force reindex: {force_reindex}")
        logger.info(f"Skip ingestion: {skip_ingestion}")
        if self.checkpoint_mgr:
            logger.info(f"Pipeline ID: {self.checkpoint_mgr.pipeline_id}")
        logger.info("=" * 60)

        try:
            if skip_ingestion:
                logger.info("Skipping ingestion phase as requested...")
                try:
                    collection_info = self.qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
                    self.stats["chunks_processed"] = collection_info.points_count
                except Exception:
                    self.stats["chunks_processed"] = 0
            else:
                await self.run_ingestion_phase(force_reindex=force_reindex)

            await self.run_extraction_phase()
            await self.run_control_generation_phase()
            await self.run_measure_generation_phase()
            self.save_results()

            elapsed = time.time() - start_time
            logger.info("\n" + "=" * 60)
            logger.info("PIPELINE COMPLETE")
            logger.info("=" * 60)
            logger.info(f"Duration: {elapsed:.1f} seconds")
            logger.info(f"Chunks processed: {self.stats['chunks_processed']}")
            logger.info(f"Checkpoints extracted: {self.stats['checkpoints_extracted']}")
            logger.info(f"Controls created: {self.stats['controls_created']}")
            logger.info(f"Measures defined: {self.stats['measures_defined']}")
            logger.info(f"\nResults saved to: /tmp/compliance_output/")
            logger.info("Checkpoint status: /tmp/pipeline_checkpoints.json")
            logger.info("=" * 60)

            if self.checkpoint_mgr:
                self.checkpoint_mgr.complete_pipeline({
                    "duration_seconds": elapsed,
                    "chunks_processed": self.stats['chunks_processed'],
                    "checkpoints_extracted": self.stats['checkpoints_extracted'],
                    "controls_created": self.stats['controls_created'],
                    "measures_defined": self.stats['measures_defined'],
                    "by_regulation": self.stats['by_regulation'],
                    "by_domain": self.stats['by_domain'],
                })

        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            if self.checkpoint_mgr:
                self.checkpoint_mgr.state.status = "failed"
                self.checkpoint_mgr._save()
            raise
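The constructor above accepts either connection style; a sketch of both, with example values only (compliance.pipeline is an assumed import path, to be run inside the klausur-service container):

import os

# Option A: full URL — hostname and port are parsed with urllib.parse
os.environ["QDRANT_URL"] = "http://qdrant.internal:6333"

# Option B: host only — the port falls back to 6333
# os.environ["QDRANT_HOST"] = "qdrant"

from compliance.pipeline import CompliancePipeline  # assumed package path
pipeline = CompliancePipeline()                      # reads the variables at construction time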
@@ -0,0 +1,38 @@
"""
RBAC/ABAC Policy System for Klausur-Service (barrel re-export)

This module was split into:
- rbac_types.py (Enums, data structures)
- rbac_permissions.py (Permission matrix)
- rbac_engine.py (PolicyEngine, default policies, API guards)

All public symbols are re-exported here for backwards compatibility.
"""

# Types and enums
from .rbac_types import (  # noqa: F401
    Role,
    Action,
    ResourceType,
    ZKVisibilityMode,
    EHVisibilityMode,
    VerfahrenType,
    PolicySet,
    RoleAssignment,
    KeyShare,
    Tenant,
    Namespace,
    ExamPackage,
)

# Permission matrix
from .rbac_permissions import DEFAULT_PERMISSIONS  # noqa: F401

# Engine, policies, guards
from .rbac_engine import (  # noqa: F401
    PolicyEngine,
    create_default_policy_sets,
    get_policy_engine,
    require_permission,
    require_role,
)
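A minimal sketch of the compatibility promise of this barrel, assuming the legacy module name rbac (per the docstring) and the new compliance package location for the split modules:

# old call sites keep working through the barrel / backend shim
from rbac import PolicyEngine, Role, Action, ResourceType, DEFAULT_PERMISSIONS

# new call sites can import from the split module directly
from compliance.rbac_engine import PolicyEngine as PackagedPolicyEngine  # assumed package path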
@@ -0,0 +1,498 @@
"""
RBAC Policy Engine

Core engine for RBAC/ABAC permission checks,
role assignments, key shares, and default policies.
Extracted from rbac.py for file-size compliance.
"""

from typing import Optional, List, Dict, Set
from datetime import datetime, timezone
import uuid
from functools import wraps

from fastapi import HTTPException, Request

from .rbac_types import (
    Role,
    Action,
    ResourceType,
    ZKVisibilityMode,
    PolicySet,
    RoleAssignment,
    KeyShare,
)
from .rbac_permissions import DEFAULT_PERMISSIONS


# =============================================
# POLICY ENGINE
# =============================================

class PolicyEngine:
    """
    Engine for RBAC/ABAC decisions.

    Checks:
    1. Base role permissions (RBAC)
    2. Policy constraints (ABAC)
    3. Key share permissions
    """

    def __init__(self):
        self.policy_sets: Dict[str, PolicySet] = {}
        self.role_assignments: Dict[str, List[RoleAssignment]] = {}  # user_id -> assignments
        self.key_shares: Dict[str, List[KeyShare]] = {}  # user_id -> shares

    def register_policy_set(self, policy: PolicySet):
        """Register a policy set."""
        self.policy_sets[policy.id] = policy

    def get_policy_for_context(
        self,
        bundesland: str,
        jahr: int,
        fach: Optional[str] = None,
        verfahren: str = "abitur"
    ) -> Optional[PolicySet]:
        """Find the matching policy set for a context."""
        # Exact match
        for policy in self.policy_sets.values():
            if (policy.bundesland == bundesland and
                    policy.jahr == jahr and
                    policy.verfahren == verfahren):
                if policy.fach is None or policy.fach == fach:
                    return policy

        # Fallback: default policy
        for policy in self.policy_sets.values():
            if policy.bundesland == "DEFAULT":
                return policy

        return None

    def assign_role(
        self,
        user_id: str,
        role: Role,
        resource_type: ResourceType,
        resource_id: str,
        granted_by: str,
        tenant_id: Optional[str] = None,
        namespace_id: Optional[str] = None,
        valid_to: Optional[datetime] = None
    ) -> RoleAssignment:
        """Assign a role to a user."""
        assignment = RoleAssignment(
            id=str(uuid.uuid4()),
            user_id=user_id,
            role=role,
            resource_type=resource_type,
            resource_id=resource_id,
            tenant_id=tenant_id,
            namespace_id=namespace_id,
            granted_by=granted_by,
            valid_to=valid_to
        )

        if user_id not in self.role_assignments:
            self.role_assignments[user_id] = []
        self.role_assignments[user_id].append(assignment)

        return assignment

    def revoke_role(self, assignment_id: str, revoked_by: str) -> bool:
        """Revoke a role assignment."""
        for user_assignments in self.role_assignments.values():
            for assignment in user_assignments:
                if assignment.id == assignment_id:
                    assignment.revoked_at = datetime.now(timezone.utc)
                    return True
        return False

    def get_user_roles(
        self,
        user_id: str,
        resource_type: Optional[ResourceType] = None,
        resource_id: Optional[str] = None
    ) -> List[Role]:
        """Get all active roles of a user."""
        assignments = self.role_assignments.get(user_id, [])
        roles = []

        for assignment in assignments:
            if not assignment.is_active():
                continue
            if resource_type and assignment.resource_type != resource_type:
                continue
            if resource_id and assignment.resource_id != resource_id:
                continue
            roles.append(assignment.role)

        return list(set(roles))

    def create_key_share(
        self,
        user_id: str,
        package_id: str,
        permissions: Set[str],
        granted_by: str,
        scope: str = "full",
        invite_token: Optional[str] = None
    ) -> KeyShare:
        """Create a key share."""
        share = KeyShare(
            id=str(uuid.uuid4()),
            user_id=user_id,
            package_id=package_id,
            permissions=permissions,
            scope=scope,
            granted_by=granted_by,
            invite_token=invite_token
        )

        if user_id not in self.key_shares:
            self.key_shares[user_id] = []
        self.key_shares[user_id].append(share)

        return share

    def accept_key_share(self, share_id: str, token: str) -> bool:
        """Accept a key share via invite token."""
        for user_shares in self.key_shares.values():
            for share in user_shares:
                if share.id == share_id and share.invite_token == token:
                    share.accepted_at = datetime.now(timezone.utc)
                    return True
        return False

    def revoke_key_share(self, share_id: str, revoked_by: str) -> bool:
        """Revoke a key share."""
        for user_shares in self.key_shares.values():
            for share in user_shares:
                if share.id == share_id:
                    share.revoked_at = datetime.now(timezone.utc)
                    share.revoked_by = revoked_by
                    return True
        return False

    def check_permission(
        self,
        user_id: str,
        action: Action,
        resource_type: ResourceType,
        resource_id: str,
        policy: Optional[PolicySet] = None,
        package_id: Optional[str] = None
    ) -> bool:
        """
        Check whether a user may perform an action.

        Checks:
        1. Base RBAC
        2. Policy constraints
        3. Key share (if package_id is given)
        """
        # 1. Get active roles
        roles = self.get_user_roles(user_id, resource_type, resource_id)

        if not roles:
            return False

        # 2. Check base RBAC
        has_permission = False
        for role in roles:
            role_permissions = DEFAULT_PERMISSIONS.get(role, {})
            resource_permissions = role_permissions.get(resource_type, set())
            if action in resource_permissions:
                has_permission = True
                break

        if not has_permission:
            return False

        # 3. Check policy constraints
        if policy:
            # ZK visibility mode
            if Role.ZWEITKORREKTOR in roles:
                if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
                    # Blind: ZK must not see EK outputs
                    if resource_type in [ResourceType.EVALUATION, ResourceType.REPORT, ResourceType.GRADE_DECISION]:
                        if action == Action.READ:
                            # Check whether these are EK outputs (would have to be checked via metadata)
                            pass  # Implementation depends on the data model

                elif policy.zk_visibility_mode == ZKVisibilityMode.SEMI:
                    # Semi: ZK sees annotations but not the grade
                    if resource_type == ResourceType.GRADE_DECISION and action == Action.READ:
                        return False

        # 4. Check key share (if package-based)
        if package_id:
            user_shares = self.key_shares.get(user_id, [])
            has_key_share = any(
                share.package_id == package_id and share.is_active()
                for share in user_shares
            )
            if not has_key_share:
                return False

        return True

    def get_allowed_actions(
        self,
        user_id: str,
        resource_type: ResourceType,
        resource_id: str,
        policy: Optional[PolicySet] = None
    ) -> Set[Action]:
        """Get all allowed actions for a user on a resource."""
        roles = self.get_user_roles(user_id, resource_type, resource_id)
        allowed = set()

        for role in roles:
            role_permissions = DEFAULT_PERMISSIONS.get(role, {})
            resource_permissions = role_permissions.get(resource_type, set())
            allowed.update(resource_permissions)

        # Apply policy constraints
        if policy and Role.ZWEITKORREKTOR in roles:
            if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
                # Remove READ for certain resources
                pass  # Detailed implementation pending

        return allowed


# =============================================
# DEFAULT POLICY SETS (all Bundeslaender)
# =============================================

def create_default_policy_sets() -> List[PolicySet]:
    """
    Create default policy sets for all Bundeslaender.

    These can later be refined per Land.
    """
    bundeslaender = [
        "baden-wuerttemberg", "bayern", "berlin", "brandenburg",
        "bremen", "hamburg", "hessen", "mecklenburg-vorpommern",
        "niedersachsen", "nordrhein-westfalen", "rheinland-pfalz",
        "saarland", "sachsen", "sachsen-anhalt", "schleswig-holstein",
        "thueringen"
    ]

    policies = []

    # Default policy (fallback)
    policies.append(PolicySet(
        id="DEFAULT-2025",
        bundesland="DEFAULT",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,
        eh_visibility_mode=PolicySet.__dataclass_fields__["eh_visibility_mode"].default,
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz"
    ))

    # Niedersachsen (example with specific adjustments)
    policies.append(PolicySet(
        id="NI-2025-ABITUR",
        bundesland="niedersachsen",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,  # in NI the ZK sees everything
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="niedersachsen-abitur"
    ))

    # Bayern (example with SEMI visibility)
    policies.append(PolicySet(
        id="BY-2025-ABITUR",
        bundesland="bayern",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.SEMI,  # ZK sees annotations, not the grade
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="bayern-abitur"
    ))

    # NRW (example)
    policies.append(PolicySet(
        id="NW-2025-ABITUR",
        bundesland="nordrhein-westfalen",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="nrw-abitur"
    ))

    # Generate base policies for all remaining Bundeslaender
    for bl in bundeslaender:
        if bl not in ["niedersachsen", "bayern", "nordrhein-westfalen"]:
            policies.append(PolicySet(
                id=f"{bl[:2].upper()}-2025-ABITUR",
                bundesland=bl,
                jahr=2025,
                fach=None,
                verfahren="abitur",
                zk_visibility_mode=ZKVisibilityMode.FULL,
                allow_teacher_uploaded_eh=True,
                allow_land_uploaded_eh=True,
                require_rights_confirmation_on_upload=True,
                third_correction_threshold=4,
                final_signoff_role="fachvorsitz"
            ))

    return policies


# =============================================
# GLOBAL POLICY ENGINE INSTANCE
# =============================================

# Singleton policy engine
_policy_engine: Optional[PolicyEngine] = None


def get_policy_engine() -> PolicyEngine:
    """Get the global policy engine instance."""
    global _policy_engine
    if _policy_engine is None:
        _policy_engine = PolicyEngine()
        # Register default policies
        for policy in create_default_policy_sets():
            _policy_engine.register_policy_set(policy)
    return _policy_engine


# =============================================
# API GUARDS (decorators for FastAPI)
# =============================================

def require_permission(
    action: Action,
    resource_type: ResourceType,
    resource_id_param: str = "resource_id"
):
    """
    Decorator for FastAPI endpoints.

    Checks whether the current user has the given permission.

    Usage:
        @app.get("/api/v1/packages/{package_id}")
        @require_permission(Action.READ, ResourceType.EXAM_PACKAGE, "package_id")
        async def get_package(package_id: str, request: Request):
            ...
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            request = kwargs.get('request')
            if not request:
                for arg in args:
                    if isinstance(arg, Request):
                        request = arg
                        break

            if not request:
                raise HTTPException(status_code=500, detail="Request not found")

            # Get the user from the token
            user = getattr(request.state, 'user', None)
            if not user:
                raise HTTPException(status_code=401, detail="Not authenticated")

            user_id = user.get('user_id')
            resource_id = kwargs.get(resource_id_param)

            # Check with the policy engine
            engine = get_policy_engine()

            # Optional: load the policy from context
            policy = None
            bundesland = user.get('bundesland')
            if bundesland:
                policy = engine.get_policy_for_context(bundesland, 2025)

            if not engine.check_permission(
                user_id=user_id,
                action=action,
                resource_type=resource_type,
                resource_id=resource_id,
                policy=policy
            ):
                raise HTTPException(
                    status_code=403,
                    detail=f"Permission denied: {action.value} on {resource_type.value}"
                )

            return await func(*args, **kwargs)

        return wrapper
    return decorator


def require_role(role: Role):
    """
    Decorator that checks whether the user has a specific role.

    Usage:
        @app.post("/api/v1/eh/publish")
        @require_role(Role.LAND_ADMIN)
        async def publish_eh(request: Request):
            ...
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            request = kwargs.get('request')
            if not request:
                for arg in args:
                    if isinstance(arg, Request):
                        request = arg
                        break

            if not request:
                raise HTTPException(status_code=500, detail="Request not found")

            user = getattr(request.state, 'user', None)
            if not user:
                raise HTTPException(status_code=401, detail="Not authenticated")

            user_id = user.get('user_id')
            engine = get_policy_engine()

            user_roles = engine.get_user_roles(user_id)
            if role not in user_roles:
                raise HTTPException(
                    status_code=403,
                    detail=f"Role required: {role.value}"
                )

            return await func(*args, **kwargs)

        return wrapper
    return decorator
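An end-to-end sketch of the engine API above; user and package IDs are invented, the enum members are taken from the imports and the permission matrix, and compliance.rbac_engine / compliance.rbac_types are assumed package paths:

from compliance.rbac_engine import get_policy_engine  # assumed package path
from compliance.rbac_types import Role, Action, ResourceType

engine = get_policy_engine()  # singleton with the default policy sets registered

# grant a first corrector access to one exam package
engine.assign_role(
    user_id="teacher-1",
    role=Role.ERSTKORREKTOR,
    resource_type=ResourceType.EXAM_PACKAGE,
    resource_id="pkg-42",
    granted_by="fachvorsitz-1",
)

policy = engine.get_policy_for_context(bundesland="niedersachsen", jahr=2025)
allowed = engine.check_permission(
    user_id="teacher-1",
    action=Action.READ,
    resource_type=ResourceType.EXAM_PACKAGE,
    resource_id="pkg-42",
    policy=policy,
)
print(allowed)  # True under the default permission matrix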
@@ -0,0 +1,221 @@
|
||||
"""
|
||||
RBAC Permission Matrix
|
||||
|
||||
Default role-to-resource permission mappings for
|
||||
Klausur-Korrektur and Zeugnis workflows.
|
||||
Extracted from rbac.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from typing import Dict, Set
|
||||
|
||||
from .rbac_types import Role, Action, ResourceType
|
||||
|
||||
|
||||
# =============================================
# RBAC PERMISSION MATRIX
# =============================================

# Standard-Berechtigungsmatrix (kann durch Policies ueberschrieben werden)
|
||||
DEFAULT_PERMISSIONS: Dict[Role, Dict[ResourceType, Set[Action]]] = {
|
||||
# Erstkorrektor
|
||||
Role.ERSTKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.SHARE_KEY, Action.LOCK},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
|
||||
ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Zweitkorrektor (Standard: FULL visibility)
|
||||
Role.ZWEITKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.RUBRIC: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Drittkorrektor
|
||||
Role.DRITTKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.RUBRIC: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Fachvorsitz
|
||||
Role.FACHVORSITZ: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.LOCK, Action.UNLOCK, Action.SIGN_OFF},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
|
||||
ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ANNOTATION: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Pruefungsvorsitz
|
||||
Role.PRUEFUNGSVORSITZ: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.SIGN_OFF},
|
||||
ResourceType.STUDENT_WORK: {Action.READ},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.GRADE_DECISION: {Action.READ, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Schul-Admin
|
||||
Role.SCHUL_ADMIN: {
|
||||
ResourceType.TENANT: {Action.READ, Action.UPDATE},
|
||||
ResourceType.NAMESPACE: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.CREATE, Action.READ, Action.DELETE, Action.ASSIGN_ROLE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.DELETE},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Land-Admin (Behoerde)
|
||||
Role.LAND_ADMIN: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE, Action.DELETE, Action.PUBLISH_OFFICIAL},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Auditor
|
||||
Role.AUDITOR: {
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ}, # Nur Metadaten
|
||||
# Kein Zugriff auf Inhalte!
|
||||
},
|
||||
|
||||
# Operator
|
||||
Role.OPERATOR: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ}, # Nur Metadaten
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
# Break-glass separat gehandhabt
|
||||
},
|
||||
|
||||
# Teacher Assistant
|
||||
Role.TEACHER_ASSISTANT: {
|
||||
ResourceType.STUDENT_WORK: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ}, # Nur bestimmte Typen
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
},
|
||||
|
||||
# Exam Author (nur Vorabi)
|
||||
Role.EXAM_AUTHOR: {
|
||||
ResourceType.EH_DOCUMENT: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.RUBRIC: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
},
|
||||
|
||||
    # =============================================
    # ZEUGNIS-WORKFLOW ROLLEN
    # =============================================

# Klassenlehrer - Erstellt Zeugnisse, Kopfnoten, Bemerkungen
|
||||
Role.KLASSENLEHRER: {
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.ZEUGNIS: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FACHNOTE: {Action.READ}, # Liest Fachnoten der Fachlehrer
|
||||
ResourceType.KOPFNOTE: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ, Action.UPDATE},
|
||||
ResourceType.BEMERKUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.VERSETZUNG: {Action.READ},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Fachlehrer - Traegt Fachnoten ein
|
||||
Role.FACHLEHRER: {
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ}, # Nur eigene Schueler
|
||||
ResourceType.FACHNOTE: {Action.CREATE, Action.READ, Action.UPDATE}, # Nur eigenes Fach
|
||||
ResourceType.BEMERKUNG: {Action.CREATE, Action.READ}, # Fachbezogene Bemerkungen
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Zeugnisbeauftragter - Qualitaetskontrolle
|
||||
Role.ZEUGNISBEAUFTRAGTER: {
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE, Action.UPLOAD},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.VERSETZUNG: {Action.READ},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Sekretariat - Druck, Versand, Archivierung
|
||||
Role.SEKRETARIAT: {
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ}, # Fuer Adressdaten
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Schulleitung - Finale Zeugnis-Freigabe
|
||||
Role.SCHULLEITUNG: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.SIGN_OFF, Action.LOCK},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.KONFERENZ_BESCHLUSS: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.VERSETZUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Stufenleitung - Stufenkoordination (z.B. Oberstufe)
|
||||
Role.STUFENLEITUNG: {
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.KONFERENZ_BESCHLUSS: {Action.READ},
|
||||
ResourceType.VERSETZUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
}
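
# --- Illustrative sketch (not part of this commit): querying the default matrix ---
# A minimal lookup helper; the real check lives in the policy engine, which can
# override these defaults via PolicySet.
def has_default_permission(role: Role, resource_type: ResourceType, action: Action) -> bool:
    return action in DEFAULT_PERMISSIONS.get(role, {}).get(resource_type, set())

# Example: a Fachlehrer may update a Fachnote but not a Kopfnote.
assert has_default_permission(Role.FACHLEHRER, ResourceType.FACHNOTE, Action.UPDATE)
assert not has_default_permission(Role.FACHLEHRER, ResourceType.KOPFNOTE, Action.UPDATE)
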
@@ -0,0 +1,438 @@
"""
RBAC/ABAC Type Definitions

Enums, data structures, and models for the policy system.
Extracted from rbac.py for file-size compliance.
"""

import json
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Optional, List, Dict, Set, Any
|
||||
from datetime import datetime, timezone
|
||||
import uuid
|
||||
|
||||
|
||||
# =============================================
# ENUMS: Roles, Actions, Resources
# =============================================

class Role(str, Enum):
|
||||
"""Fachliche Rollen in Korrektur- und Zeugniskette."""
|
||||
|
||||
# === Klausur-Korrekturkette ===
|
||||
ERSTKORREKTOR = "erstkorrektor" # EK
|
||||
ZWEITKORREKTOR = "zweitkorrektor" # ZK
|
||||
DRITTKORREKTOR = "drittkorrektor" # DK
|
||||
|
||||
# === Zeugnis-Workflow ===
|
||||
KLASSENLEHRER = "klassenlehrer" # KL - Erstellt Zeugnis, Kopfnoten, Bemerkungen
|
||||
FACHLEHRER = "fachlehrer" # FL - Traegt Fachnoten ein
|
||||
ZEUGNISBEAUFTRAGTER = "zeugnisbeauftragter" # ZB - Qualitaetskontrolle
|
||||
SEKRETARIAT = "sekretariat" # SEK - Druck, Versand, Archivierung
|
||||
|
||||
# === Leitung (Klausur + Zeugnis) ===
|
||||
FACHVORSITZ = "fachvorsitz" # FVL - Fachpruefungsleitung
|
||||
PRUEFUNGSVORSITZ = "pruefungsvorsitz" # PV - Schulleitung / Pruefungsvorsitz
|
||||
SCHULLEITUNG = "schulleitung" # SL - Finale Zeugnis-Freigabe
|
||||
STUFENLEITUNG = "stufenleitung" # STL - Stufenkoordination
|
||||
|
||||
# === Administration ===
|
||||
SCHUL_ADMIN = "schul_admin" # SA
|
||||
LAND_ADMIN = "land_admin" # LA - Behoerde
|
||||
|
||||
# === Spezial ===
|
||||
AUDITOR = "auditor" # DSB/Auditor
|
||||
OPERATOR = "operator" # OPS - Support
|
||||
TEACHER_ASSISTANT = "teacher_assistant" # TA - Referendar
|
||||
EXAM_AUTHOR = "exam_author" # EA - nur Vorabi
|
||||
|
||||
|
||||
class Action(str, Enum):
|
||||
"""Moegliche Operationen auf Ressourcen."""
|
||||
CREATE = "create"
|
||||
READ = "read"
|
||||
UPDATE = "update"
|
||||
DELETE = "delete"
|
||||
|
||||
ASSIGN_ROLE = "assign_role"
|
||||
INVITE_USER = "invite_user"
|
||||
REMOVE_USER = "remove_user"
|
||||
|
||||
UPLOAD = "upload"
|
||||
DOWNLOAD = "download"
|
||||
|
||||
LOCK = "lock" # Finalisieren
|
||||
UNLOCK = "unlock" # Nur mit Sonderrecht
|
||||
SIGN_OFF = "sign_off" # Freigabe
|
||||
|
||||
SHARE_KEY = "share_key" # Key Share erzeugen
|
||||
VIEW_PII = "view_pii" # Falls PII vorhanden
|
||||
BREAK_GLASS = "break_glass" # Notfallzugriff
|
||||
|
||||
PUBLISH_OFFICIAL = "publish_official" # Amtliche EH verteilen
|
||||
|
||||
|
||||
class ResourceType(str, Enum):
|
||||
"""Ressourcentypen im System."""
|
||||
TENANT = "tenant"
|
||||
NAMESPACE = "namespace"
|
||||
|
||||
# === Klausur-Korrektur ===
|
||||
EXAM_PACKAGE = "exam_package"
|
||||
STUDENT_WORK = "student_work"
|
||||
EH_DOCUMENT = "eh_document"
|
||||
RUBRIC = "rubric" # Punkteraster
|
||||
ANNOTATION = "annotation"
|
||||
EVALUATION = "evaluation" # Kriterien/Punkte
|
||||
REPORT = "report" # Gutachten
|
||||
GRADE_DECISION = "grade_decision"
|
||||
|
||||
# === Zeugnisgenerator ===
|
||||
ZEUGNIS = "zeugnis" # Zeugnisdokument
|
||||
ZEUGNIS_VORLAGE = "zeugnis_vorlage" # Zeugnisvorlage/Template
|
||||
ZEUGNIS_ENTWURF = "zeugnis_entwurf" # Zeugnisentwurf (vor Freigabe)
|
||||
SCHUELER_DATEN = "schueler_daten" # Schueler-Stammdaten, Noten
|
||||
FACHNOTE = "fachnote" # Einzelne Fachnote
|
||||
KOPFNOTE = "kopfnote" # Arbeits-/Sozialverhalten
|
||||
FEHLZEITEN = "fehlzeiten" # Fehlzeiten
|
||||
BEMERKUNG = "bemerkung" # Zeugnisbemerkungen
|
||||
KONFERENZ_BESCHLUSS = "konferenz_beschluss" # Konferenzergebnis
|
||||
VERSETZUNG = "versetzung" # Versetzungsentscheidung
|
||||
|
||||
# === Allgemein ===
|
||||
DOCUMENT = "document" # Generischer Dokumenttyp (EH, Vorlagen, etc.)
|
||||
TEMPLATE = "template" # Generische Vorlagen
|
||||
EXPORT = "export"
|
||||
AUDIT_LOG = "audit_log"
|
||||
KEY_MATERIAL = "key_material"
|
||||
|
||||
|
||||
class ZKVisibilityMode(str, Enum):
|
||||
"""Sichtbarkeitsmodus fuer Zweitkorrektoren."""
|
||||
BLIND = "blind" # ZK sieht keine EK-Note/Gutachten
|
||||
SEMI = "semi" # ZK sieht Annotationen, aber keine Note
|
||||
FULL = "full" # ZK sieht alles
|
||||
|
||||
|
||||
class EHVisibilityMode(str, Enum):
|
||||
"""Sichtbarkeitsmodus fuer Erwartungshorizonte."""
|
||||
BLIND = "blind" # ZK sieht EH nicht (selten)
|
||||
SHARED = "shared" # ZK sieht EH (Standard)
|
||||
|
||||
|
||||
class VerfahrenType(str, Enum):
|
||||
"""Verfahrenstypen fuer Klausuren und Zeugnisse."""
|
||||
|
||||
# === Klausur/Pruefungsverfahren ===
|
||||
ABITUR = "abitur"
|
||||
VORABITUR = "vorabitur"
|
||||
KLAUSUR = "klausur"
|
||||
NACHPRUEFUNG = "nachpruefung"
|
||||
|
||||
# === Zeugnisverfahren ===
|
||||
HALBJAHRESZEUGNIS = "halbjahreszeugnis"
|
||||
JAHRESZEUGNIS = "jahreszeugnis"
|
||||
ABSCHLUSSZEUGNIS = "abschlusszeugnis"
|
||||
ABGANGSZEUGNIS = "abgangszeugnis"
|
||||
|
||||
@classmethod
|
||||
def is_exam_type(cls, verfahren: str) -> bool:
|
||||
"""Pruefe ob Verfahren ein Pruefungstyp ist."""
|
||||
exam_types = {cls.ABITUR, cls.VORABITUR, cls.KLAUSUR, cls.NACHPRUEFUNG}
|
||||
try:
|
||||
return cls(verfahren) in exam_types
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_certificate_type(cls, verfahren: str) -> bool:
|
||||
"""Pruefe ob Verfahren ein Zeugnistyp ist."""
|
||||
cert_types = {cls.HALBJAHRESZEUGNIS, cls.JAHRESZEUGNIS, cls.ABSCHLUSSZEUGNIS, cls.ABGANGSZEUGNIS}
|
||||
try:
|
||||
return cls(verfahren) in cert_types
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
# =============================================
# DATA STRUCTURES
# =============================================

@dataclass
|
||||
class PolicySet:
|
||||
"""
|
||||
Policy-Konfiguration pro Bundesland/Jahr/Fach.
|
||||
|
||||
Ermoeglicht bundesland-spezifische Unterschiede ohne
|
||||
harte Codierung im Quellcode.
|
||||
|
||||
Unterstuetzte Verfahrenstypen:
|
||||
- Pruefungen: abitur, vorabitur, klausur, nachpruefung
|
||||
- Zeugnisse: halbjahreszeugnis, jahreszeugnis, abschlusszeugnis, abgangszeugnis
|
||||
"""
|
||||
id: str
|
||||
bundesland: str
|
||||
jahr: int
|
||||
fach: Optional[str] # None = gilt fuer alle Faecher
|
||||
verfahren: str # See VerfahrenType enum
|
||||
|
||||
# Sichtbarkeitsregeln (Klausur)
|
||||
zk_visibility_mode: ZKVisibilityMode = ZKVisibilityMode.FULL
|
||||
eh_visibility_mode: EHVisibilityMode = EHVisibilityMode.SHARED
|
||||
|
||||
# EH-Quellen (Klausur)
|
||||
allow_teacher_uploaded_eh: bool = True
|
||||
allow_land_uploaded_eh: bool = True
|
||||
require_rights_confirmation_on_upload: bool = True
|
||||
require_dual_control_for_official_eh_update: bool = False
|
||||
|
||||
# Korrekturregeln (Klausur)
|
||||
third_correction_threshold: int = 4 # Notenpunkte Abweichung
|
||||
final_signoff_role: str = "fachvorsitz"
|
||||
|
||||
# Zeugnisregeln (Zeugnis)
|
||||
require_klassenlehrer_approval: bool = True
|
||||
require_schulleitung_signoff: bool = True
|
||||
allow_sekretariat_edit_after_approval: bool = False
|
||||
konferenz_protokoll_required: bool = True
|
||||
bemerkungen_require_review: bool = True
|
||||
fehlzeiten_auto_import: bool = True
|
||||
kopfnoten_enabled: bool = False
|
||||
versetzung_auto_calculate: bool = True
|
||||
|
||||
# Export & Anzeige
|
||||
quote_verbatim_allowed: bool = False # Amtliche Texte in UI
|
||||
export_template_id: str = "default"
|
||||
|
||||
# Zusaetzliche Flags
|
||||
flags: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
def is_exam_policy(self) -> bool:
|
||||
"""Pruefe ob diese Policy fuer Pruefungen ist."""
|
||||
return VerfahrenType.is_exam_type(self.verfahren)
|
||||
|
||||
def is_certificate_policy(self) -> bool:
|
||||
"""Pruefe ob diese Policy fuer Zeugnisse ist."""
|
||||
return VerfahrenType.is_certificate_type(self.verfahren)
|
||||
|
||||
def to_dict(self):
|
||||
d = asdict(self)
|
||||
d['zk_visibility_mode'] = self.zk_visibility_mode.value
|
||||
d['eh_visibility_mode'] = self.eh_visibility_mode.value
|
||||
d['created_at'] = self.created_at.isoformat()
|
||||
return d
|
||||
|
||||
|
||||
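
# --- Illustrative sketch (not part of this commit): a PolicySet for one Land/year ---
# Field values are made up; the defaults above give FULL ZK visibility and shared EH.
beispiel_policy = PolicySet(
    id="policy-ni-2026-deutsch",
    bundesland="NI",
    jahr=2026,
    fach="Deutsch",
    verfahren=VerfahrenType.ABITUR.value,
)
assert beispiel_policy.is_exam_policy()
assert not beispiel_policy.is_certificate_policy()
# to_dict() serialises the enums and the timestamp for storage/transport.
assert beispiel_policy.to_dict()["zk_visibility_mode"] == "full"
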
@dataclass
|
||||
class RoleAssignment:
|
||||
"""
|
||||
Zuweisung einer Rolle zu einem User fuer eine spezifische Ressource.
|
||||
"""
|
||||
id: str
|
||||
user_id: str
|
||||
role: Role
|
||||
resource_type: ResourceType
|
||||
resource_id: str
|
||||
|
||||
# Optionale Einschraenkungen
|
||||
tenant_id: Optional[str] = None
|
||||
namespace_id: Optional[str] = None
|
||||
|
||||
# Gueltigkeit
|
||||
valid_from: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
valid_to: Optional[datetime] = None
|
||||
|
||||
# Metadaten
|
||||
granted_by: str = ""
|
||||
granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
revoked_at: Optional[datetime] = None
|
||||
|
||||
def is_active(self) -> bool:
|
||||
now = datetime.now(timezone.utc)
|
||||
if self.revoked_at:
|
||||
return False
|
||||
if self.valid_to and now > self.valid_to:
|
||||
return False
|
||||
return now >= self.valid_from
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'user_id': self.user_id,
|
||||
'role': self.role.value,
|
||||
'resource_type': self.resource_type.value,
|
||||
'resource_id': self.resource_id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'namespace_id': self.namespace_id,
|
||||
'valid_from': self.valid_from.isoformat(),
|
||||
'valid_to': self.valid_to.isoformat() if self.valid_to else None,
|
||||
'granted_by': self.granted_by,
|
||||
'granted_at': self.granted_at.isoformat(),
|
||||
'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
|
||||
'is_active': self.is_active()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class KeyShare:
|
||||
"""
|
||||
Berechtigung fuer einen User, auf verschluesselte Inhalte zuzugreifen.
|
||||
|
||||
Ein KeyShare ist KEIN Schluessel im Klartext, sondern eine
|
||||
Berechtigung in Verbindung mit Role Assignment.
|
||||
"""
|
||||
id: str
|
||||
user_id: str
|
||||
package_id: str
|
||||
|
||||
# Berechtigungsumfang
|
||||
permissions: Set[str] = field(default_factory=set)
|
||||
# z.B. {"read_original", "read_eh", "read_ek_outputs", "write_annotations"}
|
||||
|
||||
# Optionale Einschraenkungen
|
||||
scope: str = "full" # "full", "original_only", "eh_only", "outputs_only"
|
||||
|
||||
# Kette
|
||||
granted_by: str = ""
|
||||
granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
# Akzeptanz (fuer Invite-Flow)
|
||||
invite_token: Optional[str] = None
|
||||
accepted_at: Optional[datetime] = None
|
||||
|
||||
# Widerruf
|
||||
revoked_at: Optional[datetime] = None
|
||||
revoked_by: Optional[str] = None
|
||||
|
||||
def is_active(self) -> bool:
|
||||
return self.revoked_at is None and (
|
||||
self.invite_token is None or self.accepted_at is not None
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'user_id': self.user_id,
|
||||
'package_id': self.package_id,
|
||||
'permissions': list(self.permissions),
|
||||
'scope': self.scope,
|
||||
'granted_by': self.granted_by,
|
||||
'granted_at': self.granted_at.isoformat(),
|
||||
'invite_token': self.invite_token,
|
||||
'accepted_at': self.accepted_at.isoformat() if self.accepted_at else None,
|
||||
'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
|
||||
'is_active': self.is_active()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Tenant:
|
||||
"""
|
||||
Hoechste Isolationseinheit - typischerweise eine Schule.
|
||||
"""
|
||||
id: str
|
||||
name: str
|
||||
bundesland: str
|
||||
tenant_type: str = "school" # "school", "pruefungszentrum", "behoerde"
|
||||
|
||||
# Verschluesselung
|
||||
encryption_enabled: bool = True
|
||||
|
||||
# Metadaten
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
deleted_at: Optional[datetime] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'name': self.name,
|
||||
'bundesland': self.bundesland,
|
||||
'tenant_type': self.tenant_type,
|
||||
'encryption_enabled': self.encryption_enabled,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Namespace:
|
||||
"""
|
||||
Arbeitsraum innerhalb eines Tenants.
|
||||
z.B. "Abitur 2026 - Deutsch LK - Kurs 12a"
|
||||
"""
|
||||
id: str
|
||||
tenant_id: str
|
||||
name: str
|
||||
|
||||
# Kontext
|
||||
jahr: int
|
||||
fach: str
|
||||
kurs: Optional[str] = None
|
||||
pruefungsart: str = "abitur" # "abitur", "vorabitur"
|
||||
|
||||
# Policy
|
||||
policy_set_id: Optional[str] = None
|
||||
|
||||
# Metadaten
|
||||
created_by: str = ""
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
deleted_at: Optional[datetime] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'name': self.name,
|
||||
'jahr': self.jahr,
|
||||
'fach': self.fach,
|
||||
'kurs': self.kurs,
|
||||
'pruefungsart': self.pruefungsart,
|
||||
'policy_set_id': self.policy_set_id,
|
||||
'created_by': self.created_by,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExamPackage:
|
||||
"""
|
||||
Pruefungspaket - kompletter Satz Arbeiten mit allen Artefakten.
|
||||
"""
|
||||
id: str
|
||||
namespace_id: str
|
||||
tenant_id: str
|
||||
|
||||
name: str
|
||||
beschreibung: Optional[str] = None
|
||||
|
||||
# Workflow-Status
|
||||
status: str = "draft" # "draft", "in_progress", "locked", "signed_off"
|
||||
|
||||
# Beteiligte (Rollen werden separat zugewiesen)
|
||||
owner_id: str = "" # Typischerweise EK
|
||||
|
||||
# Verschluesselung
|
||||
encryption_key_id: Optional[str] = None
|
||||
|
||||
# Timestamps
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
locked_at: Optional[datetime] = None
|
||||
signed_off_at: Optional[datetime] = None
|
||||
signed_off_by: Optional[str] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'namespace_id': self.namespace_id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'name': self.name,
|
||||
'beschreibung': self.beschreibung,
|
||||
'status': self.status,
|
||||
'owner_id': self.owner_id,
|
||||
'created_at': self.created_at.isoformat(),
|
||||
'locked_at': self.locked_at.isoformat() if self.locked_at else None,
|
||||
'signed_off_at': self.signed_off_at.isoformat() if self.signed_off_at else None,
|
||||
'signed_off_by': self.signed_off_by
|
||||
}
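
# --- Illustrative sketch (not part of this commit): wiring the structures together ---
# IDs come from uuid (imported above); nothing here is persisted.
tenant = Tenant(id=str(uuid.uuid4()), name="Gymnasium Beispielstadt", bundesland="NI")
namespace = Namespace(id=str(uuid.uuid4()), tenant_id=tenant.id,
                      name="Abitur 2026 - Deutsch LK", jahr=2026, fach="Deutsch")
package = ExamPackage(id=str(uuid.uuid4()), namespace_id=namespace.id,
                      tenant_id=tenant.id, name="Deutsch LK Klausur 1")
assignment = RoleAssignment(id=str(uuid.uuid4()), user_id="user-123",
                            role=Role.ERSTKORREKTOR,
                            resource_type=ResourceType.EXAM_PACKAGE,
                            resource_id=package.id, tenant_id=tenant.id)
assert assignment.is_active()
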
@@ -1,200 +1,4 @@
"""
Compliance Extraction & Generation.

Functions for extracting checkpoints from legal text chunks,
generating controls, and creating remediation measures.
"""

import re
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from compliance_models import Checkpoint, Control, Measure
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_checkpoints_from_chunk(chunk_text: str, payload: Dict) -> List[Checkpoint]:
|
||||
"""
|
||||
Extract checkpoints/requirements from a chunk of text.
|
||||
|
||||
Uses pattern matching to find requirement-like statements.
|
||||
"""
|
||||
checkpoints = []
|
||||
regulation_code = payload.get("regulation_code", "UNKNOWN")
|
||||
regulation_name = payload.get("regulation_name", "Unknown")
|
||||
source_url = payload.get("source_url", "")
|
||||
chunk_id = hashlib.md5(chunk_text[:100].encode()).hexdigest()[:8]
|
||||
|
||||
# Patterns for different requirement types
|
||||
patterns = [
|
||||
# BSI-TR patterns
|
||||
(r'([OT]\.[A-Za-z_]+\d*)[:\s]+(.+?)(?=\n[OT]\.|$)', 'bsi_requirement'),
|
||||
# Article patterns (GDPR, AI Act, etc.)
|
||||
(r'(?:Artikel|Art\.?)\s+(\d+)(?:\s+Abs(?:atz)?\.?\s*(\d+))?\s*[-\u2013:]\s*(.+?)(?=\n|$)', 'article'),
|
||||
# Numbered requirements
|
||||
(r'\((\d+)\)\s+(.+?)(?=\n\(\d+\)|$)', 'numbered'),
|
||||
# "Der Verantwortliche muss" patterns
|
||||
(r'(?:Der Verantwortliche|Die Aufsichtsbeh\u00f6rde|Der Auftragsverarbeiter)\s+(muss|hat|soll)\s+(.+?)(?=\.\s|$)', 'obligation'),
|
||||
# "Es ist erforderlich" patterns
|
||||
(r'(?:Es ist erforderlich|Es muss gew\u00e4hrleistet|Es sind geeignete)\s+(.+?)(?=\.\s|$)', 'requirement'),
|
||||
]
|
||||
|
||||
for pattern, pattern_type in patterns:
|
||||
matches = re.finditer(pattern, chunk_text, re.MULTILINE | re.DOTALL)
|
||||
for match in matches:
|
||||
if pattern_type == 'bsi_requirement':
|
||||
req_id = match.group(1)
|
||||
description = match.group(2).strip()
|
||||
title = req_id
|
||||
elif pattern_type == 'article':
|
||||
article_num = match.group(1)
|
||||
paragraph = match.group(2) or ""
|
||||
title_text = match.group(3).strip()
|
||||
req_id = f"{regulation_code}-Art{article_num}"
|
||||
if paragraph:
|
||||
req_id += f"-{paragraph}"
|
||||
title = f"Art. {article_num}" + (f" Abs. {paragraph}" if paragraph else "")
|
||||
description = title_text
|
||||
elif pattern_type == 'numbered':
|
||||
num = match.group(1)
|
||||
description = match.group(2).strip()
|
||||
req_id = f"{regulation_code}-{num}"
|
||||
title = f"Anforderung {num}"
|
||||
else:
|
||||
# Generic requirement
|
||||
description = match.group(0).strip()
|
||||
req_id = f"{regulation_code}-{chunk_id}-{len(checkpoints)}"
|
||||
title = description[:50] + "..." if len(description) > 50 else description
|
||||
|
||||
# Skip very short matches
|
||||
if len(description) < 20:
|
||||
continue
|
||||
|
||||
checkpoint = Checkpoint(
|
||||
id=req_id,
|
||||
regulation_code=regulation_code,
|
||||
regulation_name=regulation_name,
|
||||
article=title if 'Art' in title else None,
|
||||
title=title,
|
||||
description=description[:500],
|
||||
original_text=description,
|
||||
chunk_id=chunk_id,
|
||||
source_url=source_url
|
||||
)
|
||||
checkpoints.append(checkpoint)
|
||||
|
||||
return checkpoints
|
||||
|
||||
|
||||
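
# --- Illustrative sketch (not part of this commit): extracting from one chunk ---
# The payload keys mirror what the Qdrant points carry in the pipeline; the text
# below is a made-up DSGVO-style sentence that triggers the article pattern.
beispiel_payload = {
    "regulation_code": "DSGVO",
    "regulation_name": "Datenschutz-Grundverordnung",
    "source_url": "https://example.org/dsgvo",
}
beispiel_chunk = (
    "Artikel 32 - Der Verantwortliche und der Auftragsverarbeiter treffen "
    "geeignete technische und organisatorische Massnahmen."
)
for cp in extract_checkpoints_from_chunk(beispiel_chunk, beispiel_payload):
    print(cp.id, "-", cp.title)  # e.g. "DSGVO-Art32 - Art. 32"
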
def generate_control_for_checkpoints(
|
||||
checkpoints: List[Checkpoint],
|
||||
domain_counts: Dict[str, int],
|
||||
) -> Optional[Control]:
|
||||
"""
|
||||
Generate a control that covers the given checkpoints.
|
||||
|
||||
This is a simplified version - in production this would use the AI assistant.
|
||||
"""
|
||||
if not checkpoints:
|
||||
return None
|
||||
|
||||
# Group by regulation
|
||||
regulation = checkpoints[0].regulation_code
|
||||
|
||||
# Determine domain based on content
|
||||
all_text = " ".join([cp.description for cp in checkpoints]).lower()
|
||||
|
||||
domain = "gov" # Default
|
||||
if any(kw in all_text for kw in ["verschl\u00fcssel", "krypto", "encrypt", "hash"]):
|
||||
domain = "crypto"
|
||||
elif any(kw in all_text for kw in ["zugang", "access", "authentif", "login", "benutzer"]):
|
||||
domain = "iam"
|
||||
elif any(kw in all_text for kw in ["datenschutz", "personenbezogen", "privacy", "einwilligung"]):
|
||||
domain = "priv"
|
||||
elif any(kw in all_text for kw in ["entwicklung", "test", "code", "software"]):
|
||||
domain = "sdlc"
|
||||
elif any(kw in all_text for kw in ["\u00fcberwach", "monitor", "log", "audit"]):
|
||||
domain = "aud"
|
||||
elif any(kw in all_text for kw in ["ki", "k\u00fcnstlich", "ai", "machine learning", "model"]):
|
||||
domain = "ai"
|
||||
elif any(kw in all_text for kw in ["betrieb", "operation", "verf\u00fcgbar", "backup"]):
|
||||
domain = "ops"
|
||||
elif any(kw in all_text for kw in ["cyber", "resilience", "sbom", "vulnerab"]):
|
||||
domain = "cra"
|
||||
|
||||
# Generate control ID
|
||||
domain_count = domain_counts.get(domain, 0) + 1
|
||||
control_id = f"{domain.upper()}-{domain_count:03d}"
|
||||
|
||||
# Create title from first checkpoint
|
||||
title = checkpoints[0].title
|
||||
if len(title) > 100:
|
||||
title = title[:97] + "..."
|
||||
|
||||
# Create description
|
||||
description = f"Control f\u00fcr {regulation}: " + checkpoints[0].description[:200]
|
||||
|
||||
# Pass criteria
|
||||
pass_criteria = f"Alle {len(checkpoints)} zugeh\u00f6rigen Anforderungen sind erf\u00fcllt und dokumentiert."
|
||||
|
||||
# Implementation guidance
|
||||
guidance = f"Implementiere Ma\u00dfnahmen zur Erf\u00fcllung der Anforderungen aus {regulation}. "
|
||||
guidance += f"Dokumentiere die Umsetzung und f\u00fchre regelm\u00e4\u00dfige Reviews durch."
|
||||
|
||||
# Determine if automated
|
||||
is_automated = any(kw in all_text for kw in ["automat", "tool", "scan", "test"])
|
||||
|
||||
control = Control(
|
||||
id=control_id,
|
||||
domain=domain,
|
||||
title=title,
|
||||
description=description,
|
||||
checkpoints=[cp.id for cp in checkpoints],
|
||||
pass_criteria=pass_criteria,
|
||||
implementation_guidance=guidance,
|
||||
is_automated=is_automated,
|
||||
automation_tool="CI/CD Pipeline" if is_automated else None,
|
||||
priority="high" if "muss" in all_text or "erforderlich" in all_text else "medium"
|
||||
)
|
||||
|
||||
return control
|
||||
|
||||
|
||||
def generate_measure_for_control(control: Control) -> Measure:
|
||||
"""Generate a remediation measure for a control."""
|
||||
measure_id = f"M-{control.id}"
|
||||
|
||||
# Determine deadline based on priority
|
||||
deadline_days = {
|
||||
"critical": 30,
|
||||
"high": 60,
|
||||
"medium": 90,
|
||||
"low": 180
|
||||
}.get(control.priority, 90)
|
||||
|
||||
# Determine responsible team
|
||||
responsible = {
|
||||
"priv": "Datenschutzbeauftragter",
|
||||
"iam": "IT-Security Team",
|
||||
"sdlc": "Entwicklungsteam",
|
||||
"crypto": "IT-Security Team",
|
||||
"ops": "Operations Team",
|
||||
"aud": "Compliance Team",
|
||||
"ai": "AI/ML Team",
|
||||
"cra": "IT-Security Team",
|
||||
"gov": "Management"
|
||||
}.get(control.domain, "Compliance Team")
|
||||
|
||||
measure = Measure(
|
||||
id=measure_id,
|
||||
control_id=control.id,
|
||||
title=f"Umsetzung: {control.title[:50]}",
|
||||
description=f"Implementierung und Dokumentation von {control.id}: {control.description[:100]}",
|
||||
responsible=responsible,
|
||||
deadline_days=deadline_days,
|
||||
status="pending"
|
||||
)
|
||||
|
||||
return measure
|
||||
# Backward-compat shim -- module moved to compliance/extraction.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("compliance.extraction")
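
# --- Illustrative sketch (not part of this commit): checkpoint -> control -> measure ---
# After the restructure the functions live in compliance/extraction.py; the old
# module name keeps resolving through the shim above.
from compliance.extraction import (
    extract_checkpoints_from_chunk,
    generate_control_for_checkpoints,
    generate_measure_for_control,
)

sample_payload = {"regulation_code": "DSGVO", "regulation_name": "DSGVO", "source_url": ""}
sample_text = "Artikel 5 - Personenbezogene Daten muessen auf rechtmaessige Weise verarbeitet werden."
checkpoints = extract_checkpoints_from_chunk(sample_text, sample_payload)
control = generate_control_for_checkpoints(checkpoints, domain_counts={})
if control is not None:
    measure = generate_measure_for_control(control)
    print(control.id, "->", measure.id, f"({measure.deadline_days} days)")  # e.g. PRIV-001 -> M-PRIV-001
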
@@ -1,49 +1,4 @@
"""
Compliance Pipeline Data Models.

Dataclasses for checkpoints, controls, and measures.
"""

from typing import Optional, List
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Checkpoint:
|
||||
"""A requirement/checkpoint extracted from legal text."""
|
||||
id: str
|
||||
regulation_code: str
|
||||
regulation_name: str
|
||||
article: Optional[str]
|
||||
title: str
|
||||
description: str
|
||||
original_text: str
|
||||
chunk_id: str
|
||||
source_url: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Control:
|
||||
"""A control derived from checkpoints."""
|
||||
id: str
|
||||
domain: str
|
||||
title: str
|
||||
description: str
|
||||
checkpoints: List[str] # List of checkpoint IDs
|
||||
pass_criteria: str
|
||||
implementation_guidance: str
|
||||
is_automated: bool
|
||||
automation_tool: Optional[str]
|
||||
priority: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Measure:
|
||||
"""A remediation measure for a control."""
|
||||
id: str
|
||||
control_id: str
|
||||
title: str
|
||||
description: str
|
||||
responsible: str
|
||||
deadline_days: int
|
||||
status: str
|
||||
# Backward-compat shim -- module moved to compliance/models.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("compliance.models")
@@ -1,441 +1,4 @@
"""
Compliance Pipeline Execution.

Pipeline phases (ingestion, extraction, control generation, measures)
and orchestration logic.
"""

import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any
|
||||
from dataclasses import asdict
|
||||
|
||||
from compliance_models import Checkpoint, Control, Measure
|
||||
from compliance_extraction import (
|
||||
extract_checkpoints_from_chunk,
|
||||
generate_control_for_checkpoints,
|
||||
generate_measure_for_control,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Import checkpoint manager
|
||||
try:
|
||||
from pipeline_checkpoints import CheckpointManager, EXPECTED_VALUES, ValidationStatus
|
||||
except ImportError:
|
||||
logger.warning("Checkpoint manager not available, running without checkpoints")
|
||||
CheckpointManager = None
|
||||
EXPECTED_VALUES = {}
|
||||
ValidationStatus = None
|
||||
|
||||
# Set environment variables for Docker network
|
||||
if not os.getenv("QDRANT_URL") and not os.getenv("QDRANT_HOST"):
|
||||
os.environ["QDRANT_HOST"] = "qdrant"
|
||||
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
|
||||
|
||||
# Try to import from klausur-service
|
||||
try:
|
||||
from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Filter, FieldCondition, MatchValue
|
||||
except ImportError:
|
||||
logger.error("Could not import required modules. Make sure you're in the klausur-service container.")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class CompliancePipeline:
|
||||
"""Handles the full compliance pipeline."""
|
||||
|
||||
def __init__(self):
|
||||
# Support both QDRANT_URL and QDRANT_HOST/PORT
|
||||
qdrant_url = os.getenv("QDRANT_URL", "")
|
||||
if qdrant_url:
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(qdrant_url)
|
||||
qdrant_host = parsed.hostname or "qdrant"
|
||||
qdrant_port = parsed.port or 6333
|
||||
else:
|
||||
qdrant_host = os.getenv("QDRANT_HOST", "qdrant")
|
||||
qdrant_port = 6333
|
||||
self.qdrant = QdrantClient(host=qdrant_host, port=qdrant_port)
|
||||
self.checkpoints: List[Checkpoint] = []
|
||||
self.controls: List[Control] = []
|
||||
self.measures: List[Measure] = []
|
||||
self.stats = {
|
||||
"chunks_processed": 0,
|
||||
"checkpoints_extracted": 0,
|
||||
"controls_created": 0,
|
||||
"measures_defined": 0,
|
||||
"by_regulation": {},
|
||||
"by_domain": {},
|
||||
}
|
||||
# Initialize checkpoint manager
|
||||
self.checkpoint_mgr = CheckpointManager() if CheckpointManager else None
|
||||
|
||||
async def run_ingestion_phase(self, force_reindex: bool = False) -> int:
|
||||
"""Phase 1: Ingest documents (incremental - only missing ones)."""
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("PHASE 1: DOCUMENT INGESTION (INCREMENTAL)")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.start_checkpoint("ingestion", "Document Ingestion")
|
||||
|
||||
ingestion = LegalCorpusIngestion()
|
||||
|
||||
try:
|
||||
# Check existing chunks per regulation
|
||||
existing_chunks = {}
|
||||
try:
|
||||
for regulation in REGULATIONS:
|
||||
count_result = self.qdrant.count(
|
||||
collection_name=LEGAL_CORPUS_COLLECTION,
|
||||
count_filter=Filter(
|
||||
must=[FieldCondition(key="regulation_code", match=MatchValue(value=regulation.code))]
|
||||
)
|
||||
)
|
||||
existing_chunks[regulation.code] = count_result.count
|
||||
logger.info(f" {regulation.code}: {count_result.count} existing chunks")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not check existing chunks: {e}")
|
||||
|
||||
# Determine which regulations need ingestion
|
||||
regulations_to_ingest = []
|
||||
for regulation in REGULATIONS:
|
||||
existing = existing_chunks.get(regulation.code, 0)
|
||||
if force_reindex or existing == 0:
|
||||
regulations_to_ingest.append(regulation)
|
||||
logger.info(f" -> Will ingest: {regulation.code} (existing: {existing}, force: {force_reindex})")
|
||||
else:
|
||||
logger.info(f" -> Skipping: {regulation.code} (already has {existing} chunks)")
|
||||
self.stats["by_regulation"][regulation.code] = existing
|
||||
|
||||
if not regulations_to_ingest:
|
||||
logger.info("All regulations already indexed. Skipping ingestion phase.")
|
||||
total_chunks = sum(existing_chunks.values())
|
||||
self.stats["chunks_processed"] = total_chunks
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
|
||||
self.checkpoint_mgr.add_metric("skipped", True)
|
||||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||||
return total_chunks
|
||||
|
||||
# Ingest only missing regulations
|
||||
total_chunks = sum(existing_chunks.values())
|
||||
for i, regulation in enumerate(regulations_to_ingest, 1):
|
||||
logger.info(f"[{i}/{len(regulations_to_ingest)}] Ingesting {regulation.code}...")
|
||||
try:
|
||||
count = await ingestion.ingest_regulation(regulation)
|
||||
total_chunks += count
|
||||
self.stats["by_regulation"][regulation.code] = count
|
||||
logger.info(f" -> {count} chunks")
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric(f"chunks_{regulation.code}", count)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f" -> FAILED: {e}")
|
||||
self.stats["by_regulation"][regulation.code] = 0
|
||||
|
||||
self.stats["chunks_processed"] = total_chunks
|
||||
logger.info(f"\nTotal chunks in collection: {total_chunks}")
|
||||
|
||||
# Validate ingestion results
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
|
||||
self.checkpoint_mgr.add_metric("regulations_count", len(REGULATIONS))
|
||||
|
||||
expected = EXPECTED_VALUES.get("ingestion", {})
|
||||
self.checkpoint_mgr.validate(
|
||||
"total_chunks",
|
||||
expected=expected.get("total_chunks", 8000),
|
||||
actual=total_chunks,
|
||||
min_value=expected.get("min_chunks", 7000)
|
||||
)
|
||||
|
||||
reg_expected = expected.get("regulations", {})
|
||||
for reg_code, reg_exp in reg_expected.items():
|
||||
actual = self.stats["by_regulation"].get(reg_code, 0)
|
||||
self.checkpoint_mgr.validate(
|
||||
f"chunks_{reg_code}",
|
||||
expected=reg_exp.get("expected", 0),
|
||||
actual=actual,
|
||||
min_value=reg_exp.get("min", 0)
|
||||
)
|
||||
|
||||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||||
|
||||
return total_chunks
|
||||
|
||||
except Exception as e:
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||||
raise
|
||||
|
||||
finally:
|
||||
await ingestion.close()
|
||||
|
||||
async def run_extraction_phase(self) -> int:
|
||||
"""Phase 2: Extract checkpoints from chunks."""
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("PHASE 2: CHECKPOINT EXTRACTION")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.start_checkpoint("extraction", "Checkpoint Extraction")
|
||||
|
||||
try:
|
||||
offset = None
|
||||
total_checkpoints = 0
|
||||
|
||||
while True:
|
||||
result = self.qdrant.scroll(
|
||||
collection_name=LEGAL_CORPUS_COLLECTION,
|
||||
limit=100,
|
||||
offset=offset,
|
||||
with_payload=True,
|
||||
with_vectors=False
|
||||
)
|
||||
|
||||
points, next_offset = result
|
||||
|
||||
if not points:
|
||||
break
|
||||
|
||||
for point in points:
|
||||
payload = point.payload
|
||||
text = payload.get("text", "")
|
||||
|
||||
cps = extract_checkpoints_from_chunk(text, payload)
|
||||
self.checkpoints.extend(cps)
|
||||
total_checkpoints += len(cps)
|
||||
|
||||
logger.info(f"Processed {len(points)} chunks, extracted {total_checkpoints} checkpoints so far...")
|
||||
|
||||
if next_offset is None:
|
||||
break
|
||||
offset = next_offset
|
||||
|
||||
self.stats["checkpoints_extracted"] = len(self.checkpoints)
|
||||
logger.info(f"\nTotal checkpoints extracted: {len(self.checkpoints)}")
|
||||
|
||||
by_reg = {}
|
||||
for cp in self.checkpoints:
|
||||
by_reg[cp.regulation_code] = by_reg.get(cp.regulation_code, 0) + 1
|
||||
for reg, count in sorted(by_reg.items()):
|
||||
logger.info(f" {reg}: {count} checkpoints")
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric("total_checkpoints", len(self.checkpoints))
|
||||
self.checkpoint_mgr.add_metric("checkpoints_by_regulation", by_reg)
|
||||
|
||||
expected = EXPECTED_VALUES.get("extraction", {})
|
||||
self.checkpoint_mgr.validate(
|
||||
"total_checkpoints",
|
||||
expected=expected.get("total_checkpoints", 3500),
|
||||
actual=len(self.checkpoints),
|
||||
min_value=expected.get("min_checkpoints", 3000)
|
||||
)
|
||||
|
||||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||||
|
||||
return len(self.checkpoints)
|
||||
|
||||
except Exception as e:
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||||
raise
|
||||
|
||||
async def run_control_generation_phase(self) -> int:
|
||||
"""Phase 3: Generate controls from checkpoints."""
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("PHASE 3: CONTROL GENERATION")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.start_checkpoint("controls", "Control Generation")
|
||||
|
||||
try:
|
||||
# Group checkpoints by regulation
|
||||
by_regulation: Dict[str, List[Checkpoint]] = {}
|
||||
for cp in self.checkpoints:
|
||||
reg = cp.regulation_code
|
||||
if reg not in by_regulation:
|
||||
by_regulation[reg] = []
|
||||
by_regulation[reg].append(cp)
|
||||
|
||||
# Generate controls per regulation (group every 3-5 checkpoints)
|
||||
for regulation, checkpoints in by_regulation.items():
|
||||
logger.info(f"Generating controls for {regulation} ({len(checkpoints)} checkpoints)...")
|
||||
|
||||
batch_size = 4
|
||||
for i in range(0, len(checkpoints), batch_size):
|
||||
batch = checkpoints[i:i + batch_size]
|
||||
control = generate_control_for_checkpoints(batch, self.stats.get("by_domain", {}))
|
||||
|
||||
if control:
|
||||
self.controls.append(control)
|
||||
self.stats["by_domain"][control.domain] = self.stats["by_domain"].get(control.domain, 0) + 1
|
||||
|
||||
self.stats["controls_created"] = len(self.controls)
|
||||
logger.info(f"\nTotal controls created: {len(self.controls)}")
|
||||
|
||||
for domain, count in sorted(self.stats["by_domain"].items()):
|
||||
logger.info(f" {domain}: {count} controls")
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric("total_controls", len(self.controls))
|
||||
self.checkpoint_mgr.add_metric("controls_by_domain", dict(self.stats["by_domain"]))
|
||||
|
||||
expected = EXPECTED_VALUES.get("controls", {})
|
||||
self.checkpoint_mgr.validate(
|
||||
"total_controls",
|
||||
expected=expected.get("total_controls", 900),
|
||||
actual=len(self.controls),
|
||||
min_value=expected.get("min_controls", 800)
|
||||
)
|
||||
|
||||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||||
|
||||
return len(self.controls)
|
||||
|
||||
except Exception as e:
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||||
raise
|
||||
|
||||
async def run_measure_generation_phase(self) -> int:
|
||||
"""Phase 4: Generate measures for controls."""
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("PHASE 4: MEASURE GENERATION")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.start_checkpoint("measures", "Measure Generation")
|
||||
|
||||
try:
|
||||
for control in self.controls:
|
||||
measure = generate_measure_for_control(control)
|
||||
self.measures.append(measure)
|
||||
|
||||
self.stats["measures_defined"] = len(self.measures)
|
||||
logger.info(f"\nTotal measures defined: {len(self.measures)}")
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.add_metric("total_measures", len(self.measures))
|
||||
|
||||
expected = EXPECTED_VALUES.get("measures", {})
|
||||
self.checkpoint_mgr.validate(
|
||||
"total_measures",
|
||||
expected=expected.get("total_measures", 900),
|
||||
actual=len(self.measures),
|
||||
min_value=expected.get("min_measures", 800)
|
||||
)
|
||||
|
||||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||||
|
||||
return len(self.measures)
|
||||
|
||||
except Exception as e:
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||||
raise
|
||||
|
||||
def save_results(self, output_dir: str = "/tmp/compliance_output"):
|
||||
"""Save results to JSON files."""
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("SAVING RESULTS")
|
||||
logger.info("=" * 60)
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
checkpoints_file = os.path.join(output_dir, "checkpoints.json")
|
||||
with open(checkpoints_file, "w") as f:
|
||||
json.dump([asdict(cp) for cp in self.checkpoints], f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"Saved {len(self.checkpoints)} checkpoints to {checkpoints_file}")
|
||||
|
||||
controls_file = os.path.join(output_dir, "controls.json")
|
||||
with open(controls_file, "w") as f:
|
||||
json.dump([asdict(c) for c in self.controls], f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"Saved {len(self.controls)} controls to {controls_file}")
|
||||
|
||||
measures_file = os.path.join(output_dir, "measures.json")
|
||||
with open(measures_file, "w") as f:
|
||||
json.dump([asdict(m) for m in self.measures], f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"Saved {len(self.measures)} measures to {measures_file}")
|
||||
|
||||
stats_file = os.path.join(output_dir, "statistics.json")
|
||||
self.stats["generated_at"] = datetime.now().isoformat()
|
||||
with open(stats_file, "w") as f:
|
||||
json.dump(self.stats, f, indent=2, ensure_ascii=False)
|
||||
logger.info(f"Saved statistics to {stats_file}")
|
||||
|
||||
async def run_full_pipeline(self, force_reindex: bool = False, skip_ingestion: bool = False):
|
||||
"""Run the complete pipeline.
|
||||
|
||||
Args:
|
||||
force_reindex: If True, re-ingest all documents even if they exist
|
||||
skip_ingestion: If True, skip ingestion phase entirely (use existing chunks)
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("FULL COMPLIANCE PIPELINE (INCREMENTAL)")
|
||||
logger.info(f"Started at: {datetime.now().isoformat()}")
|
||||
logger.info(f"Force reindex: {force_reindex}")
|
||||
logger.info(f"Skip ingestion: {skip_ingestion}")
|
||||
if self.checkpoint_mgr:
|
||||
logger.info(f"Pipeline ID: {self.checkpoint_mgr.pipeline_id}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
try:
|
||||
if skip_ingestion:
|
||||
logger.info("Skipping ingestion phase as requested...")
|
||||
try:
|
||||
collection_info = self.qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
|
||||
self.stats["chunks_processed"] = collection_info.points_count
|
||||
except Exception:
|
||||
self.stats["chunks_processed"] = 0
|
||||
else:
|
||||
await self.run_ingestion_phase(force_reindex=force_reindex)
|
||||
|
||||
await self.run_extraction_phase()
|
||||
await self.run_control_generation_phase()
|
||||
await self.run_measure_generation_phase()
|
||||
self.save_results()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("PIPELINE COMPLETE")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f"Duration: {elapsed:.1f} seconds")
|
||||
logger.info(f"Chunks processed: {self.stats['chunks_processed']}")
|
||||
logger.info(f"Checkpoints extracted: {self.stats['checkpoints_extracted']}")
|
||||
logger.info(f"Controls created: {self.stats['controls_created']}")
|
||||
logger.info(f"Measures defined: {self.stats['measures_defined']}")
|
||||
logger.info(f"\nResults saved to: /tmp/compliance_output/")
|
||||
logger.info("Checkpoint status: /tmp/pipeline_checkpoints.json")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.complete_pipeline({
|
||||
"duration_seconds": elapsed,
|
||||
"chunks_processed": self.stats['chunks_processed'],
|
||||
"checkpoints_extracted": self.stats['checkpoints_extracted'],
|
||||
"controls_created": self.stats['controls_created'],
|
||||
"measures_defined": self.stats['measures_defined'],
|
||||
"by_regulation": self.stats['by_regulation'],
|
||||
"by_domain": self.stats['by_domain'],
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Pipeline failed: {e}")
|
||||
if self.checkpoint_mgr:
|
||||
self.checkpoint_mgr.state.status = "failed"
|
||||
self.checkpoint_mgr._save()
|
||||
raise
|
||||
# Backward-compat shim -- module moved to compliance/pipeline.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("compliance.pipeline")
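
# --- Illustrative sketch (not part of this commit): running the pipeline by hand ---
# Meant to be executed inside the klausur-service container, where the imports at
# the top of compliance/pipeline.py resolve; the flags mirror run_full_pipeline().
import asyncio
from compliance.pipeline import CompliancePipeline

async def main() -> None:
    pipeline = CompliancePipeline()
    # Reuse already-indexed chunks; only run extraction, controls and measures.
    await pipeline.run_full_pipeline(force_reindex=False, skip_ingestion=True)

if __name__ == "__main__":
    asyncio.run(main())
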
@@ -1,420 +1,4 @@
"""
BYOEH Processing Pipeline
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.

Supports multiple embedding backends:
- local: sentence-transformers (default, no API key needed)
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
"""

import os
|
||||
import io
|
||||
import base64
|
||||
import hashlib
|
||||
from typing import List, Tuple, Optional
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
|
||||
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
import httpx
|
||||
|
||||
# Embedding Configuration
|
||||
# Backend: "local" (sentence-transformers) or "openai"
|
||||
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
|
||||
# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
|
||||
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
|
||||
|
||||
# Vector dimensions per backend
|
||||
VECTOR_DIMENSIONS = {
|
||||
"local": 384, # all-MiniLM-L6-v2
|
||||
"openai": 1536, # text-embedding-3-small
|
||||
}
|
||||
|
||||
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
|
||||
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))
|
||||
|
||||
# Lazy-loaded sentence-transformers model
|
||||
_local_model = None
|
||||
|
||||
|
||||
class ChunkingError(Exception):
|
||||
"""Error during text chunking."""
|
||||
pass
|
||||
|
||||
|
||||
class EmbeddingError(Exception):
|
||||
"""Error during embedding generation."""
|
||||
pass
|
||||
|
||||
|
||||
class EncryptionError(Exception):
|
||||
"""Error during encryption/decryption."""
|
||||
pass
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
|
||||
"""
|
||||
Split text into overlapping chunks.
|
||||
|
||||
Uses a simple recursive character splitter approach:
|
||||
- Try to split on paragraph boundaries first
|
||||
- Then sentences
|
||||
- Then words
|
||||
- Finally characters
|
||||
|
||||
Args:
|
||||
text: Input text to chunk
|
||||
chunk_size: Target chunk size in characters
|
||||
overlap: Overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of text chunks
|
||||
"""
|
||||
if not text or len(text) <= chunk_size:
|
||||
return [text] if text else []
|
||||
|
||||
chunks = []
|
||||
separators = ["\n\n", "\n", ". ", " ", ""]
|
||||
|
||||
def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
|
||||
if len(text) <= chunk_size:
|
||||
return [text]
|
||||
|
||||
if sep_idx >= len(separators):
|
||||
# Last resort: hard split
|
||||
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]
|
||||
|
||||
sep = separators[sep_idx]
|
||||
if not sep:
|
||||
# Empty separator = character split
|
||||
parts = list(text)
|
||||
else:
|
||||
parts = text.split(sep)
|
||||
|
||||
result = []
|
||||
current = ""
|
||||
|
||||
for part in parts:
|
||||
test_chunk = current + sep + part if current else part
|
||||
|
||||
if len(test_chunk) <= chunk_size:
|
||||
current = test_chunk
|
||||
else:
|
||||
if current:
|
||||
result.append(current)
|
||||
# If single part is too big, recursively split it
|
||||
if len(part) > chunk_size:
|
||||
result.extend(split_recursive(part, sep_idx + 1))
|
||||
current = ""
|
||||
else:
|
||||
current = part
|
||||
|
||||
if current:
|
||||
result.append(current)
|
||||
|
||||
return result
|
||||
|
||||
raw_chunks = split_recursive(text)
|
||||
|
||||
# Add overlap
|
||||
final_chunks = []
|
||||
for i, chunk in enumerate(raw_chunks):
|
||||
if i > 0 and overlap > 0:
|
||||
# Add overlap from previous chunk
|
||||
prev_chunk = raw_chunks[i-1]
|
||||
overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
|
||||
chunk = overlap_text + chunk
|
||||
final_chunks.append(chunk.strip())
|
||||
|
||||
return [c for c in final_chunks if c]
|
||||
|
||||
|
||||
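
# --- Illustrative sketch (not part of this commit): chunking behaviour ---
# With the defaults (1000 chars, 200 overlap) a short text stays in one chunk;
# the tiny sizes below force a split so the overlap becomes visible.
demo_text = (
    "Erwartungshorizont Aufgabe 1.\n\n"
    "Die Schuelerinnen und Schueler nennen drei Merkmale der Kurzgeschichte "
    "und belegen sie am Text."
)
for i, c in enumerate(chunk_text(demo_text, chunk_size=60, overlap=15)):
    print(i, len(c), repr(c[:40]))
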
def get_vector_size() -> int:
|
||||
"""Get the vector dimension for the current embedding backend."""
|
||||
return VECTOR_DIMENSIONS.get(EMBEDDING_BACKEND, 384)
|
||||
|
||||
|
||||
def _get_local_model():
|
||||
"""Lazy-load the sentence-transformers model."""
|
||||
global _local_model
|
||||
if _local_model is None:
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
|
||||
_local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
|
||||
print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
|
||||
except ImportError:
|
||||
raise EmbeddingError(
|
||||
"sentence-transformers not installed. "
|
||||
"Install with: pip install sentence-transformers"
|
||||
)
|
||||
return _local_model
|
||||
|
||||
|
||||
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using local sentence-transformers model."""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
model = _get_local_model()
|
||||
embeddings = model.encode(texts, show_progress_bar=len(texts) > 10)
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
|
||||
|
||||
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using OpenAI API."""
|
||||
if not OPENAI_API_KEY:
|
||||
raise EmbeddingError("OPENAI_API_KEY not configured")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"https://api.openai.com/v1/embeddings",
|
||||
headers={
|
||||
"Authorization": f"Bearer {OPENAI_API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": EMBEDDING_MODEL,
|
||||
"input": texts
|
||||
},
|
||||
timeout=60.0
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")
|
||||
|
||||
data = response.json()
|
||||
embeddings = [item["embedding"] for item in data["data"]]
|
||||
return embeddings
|
||||
|
||||
except httpx.TimeoutException:
|
||||
raise EmbeddingError("OpenAI API timeout")
|
||||
except Exception as e:
|
||||
raise EmbeddingError(f"Failed to generate embeddings: {str(e)}")
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""
|
||||
Generate embeddings using configured backend.
|
||||
|
||||
Backends:
|
||||
- local: sentence-transformers (default, no API key needed)
|
||||
- openai: OpenAI text-embedding-3-small
|
||||
|
||||
Args:
|
||||
texts: List of text chunks
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
|
||||
Raises:
|
||||
EmbeddingError: If embedding generation fails
|
||||
"""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
if EMBEDDING_BACKEND == "local":
|
||||
# Local model runs synchronously (encode() blocks the event loop while it runs)
|
||||
return _generate_local_embeddings(texts)
|
||||
elif EMBEDDING_BACKEND == "openai":
|
||||
return await _generate_openai_embeddings(texts)
|
||||
else:
|
||||
raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
||||
|
||||
|
||||
async def generate_single_embedding(text: str) -> List[float]:
|
||||
"""Generate embedding for a single text."""
|
||||
embeddings = await generate_embeddings([text])
|
||||
return embeddings[0] if embeddings else []
|
||||
|
||||
|
||||
def derive_key(passphrase: str, salt: bytes) -> bytes:
|
||||
"""
|
||||
Derive encryption key from passphrase using PBKDF2.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase
|
||||
salt: Random salt (16 bytes)
|
||||
|
||||
Returns:
|
||||
32-byte AES key
|
||||
"""
|
||||
kdf = PBKDF2HMAC(
|
||||
algorithm=hashes.SHA256(),
|
||||
length=32,
|
||||
salt=salt,
|
||||
iterations=100000,
|
||||
)
|
||||
return kdf.derive(passphrase.encode())
|
||||
|
||||
|
||||
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Encrypt text using AES-256-GCM.
|
||||
|
||||
Args:
|
||||
text: Plaintext to encrypt
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Base64-encoded ciphertext (IV + ciphertext)
|
||||
"""
|
||||
try:
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
|
||||
aesgcm = AESGCM(key)
|
||||
iv = os.urandom(12)
|
||||
|
||||
ciphertext = aesgcm.encrypt(iv, text.encode(), None)
|
||||
|
||||
# Combine IV + ciphertext
|
||||
combined = iv + ciphertext
|
||||
return base64.b64encode(combined).decode()
|
||||
|
||||
except Exception as e:
|
||||
raise EncryptionError(f"Encryption failed: {str(e)}")
|
||||
|
||||
|
||||
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Decrypt text using AES-256-GCM.
|
||||
|
||||
Args:
|
||||
encrypted_b64: Base64-encoded ciphertext (IV + ciphertext)
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Decrypted plaintext
|
||||
"""
|
||||
try:
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
|
||||
combined = base64.b64decode(encrypted_b64)
|
||||
iv = combined[:12]
|
||||
ciphertext = combined[12:]
|
||||
|
||||
aesgcm = AESGCM(key)
|
||||
plaintext = aesgcm.decrypt(iv, ciphertext, None)
|
||||
|
||||
return plaintext.decode()
|
||||
|
||||
except Exception as e:
|
||||
raise EncryptionError(f"Decryption failed: {str(e)}")
|
||||
|
||||
|
||||
def hash_key(passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Create SHA-256 hash of derived key for verification.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Hex-encoded key hash
|
||||
"""
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
return hashlib.sha256(key).hexdigest()
|
||||
|
||||
|
||||
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
|
||||
"""
|
||||
Verify passphrase matches stored key hash.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase to verify
|
||||
salt_hex: Salt as hex string
|
||||
expected_hash: Expected key hash
|
||||
|
||||
Returns:
|
||||
True if passphrase is correct
|
||||
"""
|
||||
computed_hash = hash_key(passphrase, salt_hex)
|
||||
return computed_hash == expected_hash
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from PDF file.
|
||||
|
||||
Args:
|
||||
pdf_content: Raw PDF bytes
|
||||
|
||||
Returns:
|
||||
Extracted text
|
||||
"""
|
||||
try:
|
||||
import PyPDF2
|
||||
|
||||
pdf_file = io.BytesIO(pdf_content)
|
||||
reader = PyPDF2.PdfReader(pdf_file)
|
||||
|
||||
text_parts = []
|
||||
for page in reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
except ImportError:
|
||||
raise ChunkingError("PyPDF2 not installed")
|
||||
except Exception as e:
|
||||
raise ChunkingError(f"Failed to extract PDF text: {str(e)}")
|
||||
|
||||
|
||||
async def process_eh_for_indexing(
|
||||
eh_id: str,
|
||||
tenant_id: str,
|
||||
subject: str,
|
||||
text_content: str,
|
||||
passphrase: str,
|
||||
salt_hex: str
|
||||
) -> Tuple[int, List[dict]]:
|
||||
"""
|
||||
Full processing pipeline for Erwartungshorizont indexing.
|
||||
|
||||
1. Chunk the text
|
||||
2. Generate embeddings
|
||||
3. Encrypt chunks
|
||||
4. Return prepared data for Qdrant
|
||||
|
||||
Args:
|
||||
eh_id: Erwartungshorizont ID
|
||||
tenant_id: Tenant ID
|
||||
subject: Subject (deutsch, englisch, etc.)
|
||||
text_content: Decrypted text content
|
||||
passphrase: User passphrase for re-encryption
|
||||
salt_hex: Salt for encryption
|
||||
|
||||
Returns:
|
||||
Tuple of (chunk_count, chunks_data)
|
||||
"""
|
||||
# 1. Chunk the text
|
||||
chunks = chunk_text(text_content)
|
||||
|
||||
if not chunks:
|
||||
return 0, []
|
||||
|
||||
# 2. Generate embeddings
|
||||
embeddings = await generate_embeddings(chunks)
|
||||
|
||||
# 3. Encrypt chunks for storage
|
||||
encrypted_chunks = []
|
||||
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
|
||||
encrypted_content = encrypt_text(chunk, passphrase, salt_hex)
|
||||
encrypted_chunks.append({
|
||||
"chunk_index": i,
|
||||
"embedding": embedding,
|
||||
"encrypted_content": encrypted_content
|
||||
})
|
||||
|
||||
return len(chunks), encrypted_chunks
|
||||
# Backward-compat shim -- module moved to korrektur/eh_pipeline.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_pipeline")
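# Added note (not part of the commit): the assignment above replaces this shim
# in sys.modules with the relocated module, so the legacy flat import and the
# new package path resolve to the same object. A minimal check, assuming
# backend/ is on sys.path:
#
#     import eh_pipeline
#     import korrektur.eh_pipeline
#     assert eh_pipeline is korrektur.eh_pipeline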
|
||||
|
||||
@@ -1,34 +1,4 @@
|
||||
"""
|
||||
Erwartungshorizont Templates for Vorabitur Mode — barrel re-export.
|
||||
|
||||
The actual code lives in:
|
||||
- eh_templates_types.py (AUFGABENTYPEN, EHKriterium, EHTemplate)
|
||||
- eh_templates_analyse.py (Textanalyse, Gedicht, Prosa, Drama)
|
||||
- eh_templates_eroerterung.py (Eroerterung textgebunden)
|
||||
- eh_templates_registry.py (TEMPLATES, get_template, list_templates, etc.)
|
||||
"""
|
||||
|
||||
# Types
|
||||
from eh_templates_types import ( # noqa: F401
|
||||
AUFGABENTYPEN,
|
||||
EHKriterium,
|
||||
EHTemplate,
|
||||
)
|
||||
|
||||
# Template factories
|
||||
from eh_templates_analyse import ( # noqa: F401
|
||||
get_textanalyse_template,
|
||||
get_gedichtanalyse_template,
|
||||
get_prosaanalyse_template,
|
||||
get_dramenanalyse_template,
|
||||
)
|
||||
from eh_templates_eroerterung import get_eroerterung_template # noqa: F401
|
||||
|
||||
# Registry
|
||||
from eh_templates_registry import ( # noqa: F401
|
||||
TEMPLATES,
|
||||
initialize_templates,
|
||||
get_template,
|
||||
list_templates,
|
||||
get_aufgabentypen,
|
||||
)
|
||||
# Backward-compat shim -- module moved to korrektur/eh_templates.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_templates")
|
||||
|
||||
@@ -1,395 +1,4 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — Analyse templates.
|
||||
|
||||
Contains templates for:
|
||||
- Textanalyse (pragmatische Texte)
|
||||
- Gedichtanalyse / Lyrikinterpretation
|
||||
- Prosaanalyse
|
||||
- Dramenanalyse
|
||||
"""
|
||||
|
||||
from eh_templates_types import EHTemplate, EHKriterium
|
||||
|
||||
|
||||
def get_textanalyse_template() -> EHTemplate:
|
||||
"""Template for pragmatic text analysis."""
|
||||
return EHTemplate(
|
||||
id="template_textanalyse_pragmatisch",
|
||||
aufgabentyp="textanalyse_pragmatisch",
|
||||
name="Textanalyse pragmatischer Texte",
|
||||
beschreibung="Vorlage fuer die Analyse von Sachtexten, Reden, Kommentaren und Essays",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Wiedergabe des Textinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Textaussage/These",
|
||||
"Vollstaendige Wiedergabe der Argumentationsstruktur",
|
||||
"Erkennen von Intention und Adressatenbezug",
|
||||
"Einordnung in den historischen/gesellschaftlichen Kontext",
|
||||
"Beruecksichtigung aller relevanten Textaspekte"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau und Gliederung der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Sinnvolle Einleitung mit Basisinformationen",
|
||||
"Logische Gliederung des Hauptteils",
|
||||
"Stringente Gedankenfuehrung",
|
||||
"Angemessener Schluss mit Fazit/Wertung",
|
||||
"Absatzgliederung und Ueberlaenge"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="analyse",
|
||||
name="Analytische Qualitaet",
|
||||
beschreibung="Tiefe und Qualitaet der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Erkennen rhetorischer Mittel",
|
||||
"Funktionale Deutung der Stilmittel",
|
||||
"Analyse der Argumentationsweise",
|
||||
"Beruecksichtigung von Wortwahl und Satzbau",
|
||||
"Verknuepfung von Form und Inhalt"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung",
|
||||
"Korrekte Getrennt- und Zusammenschreibung",
|
||||
"Korrekte Fremdwortschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Flexion",
|
||||
"Korrekte Zeichensetzung",
|
||||
"Korrekte Bezuege und Kongruenz"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Nennung von Autor, Titel, Textsorte, Erscheinungsjahr",
|
||||
"Benennung des Themas",
|
||||
"Formulierung der Kernthese/Hauptaussage",
|
||||
"Ggf. Einordnung in den Kontext"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Systematische Analyse der Argumentationsstruktur",
|
||||
"Untersuchung der sprachlichen Gestaltung",
|
||||
"Funktionale Deutung der Stilmittel",
|
||||
"Beruecksichtigung von Adressatenbezug und Intention",
|
||||
"Textbelege durch Zitate"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Bewertung der Ueberzeugungskraft",
|
||||
"Ggf. aktuelle Relevanz",
|
||||
"Persoenliche Stellungnahme (wenn gefordert)"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachsprachliche Begriffe korrekt verwenden",
|
||||
"Konjunktiv fuer indirekte Rede",
|
||||
"Praesens als Tempus der Analyse",
|
||||
"Sachlicher, analytischer Stil"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_gedichtanalyse_template() -> EHTemplate:
|
||||
"""Template for poetry analysis."""
|
||||
return EHTemplate(
|
||||
id="template_gedichtanalyse",
|
||||
aufgabentyp="gedichtanalyse",
|
||||
name="Gedichtanalyse / Lyrikinterpretation",
|
||||
beschreibung="Vorlage fuer die Analyse und Interpretation lyrischer Texte",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Gedichtinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung des lyrischen Ichs und der Sprechsituation",
|
||||
"Vollstaendige inhaltliche Erschliessung aller Strophen",
|
||||
"Erkennen der zentralen Motive und Themen",
|
||||
"Epochenzuordnung und literaturgeschichtliche Einordnung",
|
||||
"Deutung der Bildlichkeit und Symbolik"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Interpretation",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Einleitung mit Basisinformationen",
|
||||
"Systematische strophenweise oder aspektorientierte Analyse",
|
||||
"Verknuepfung von Form- und Inhaltsanalyse",
|
||||
"Schluessige Gesamtdeutung im Schluss"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="formanalyse",
|
||||
name="Formale Analyse",
|
||||
beschreibung="Analyse der lyrischen Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Bestimmung von Metrum und Reimschema",
|
||||
"Analyse der Klanggestaltung",
|
||||
"Erkennen von Enjambements und Zaesuren",
|
||||
"Deutung der formalen Mittel",
|
||||
"Verknuepfung von Form und Inhalt"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung",
|
||||
"Korrekte Getrennt- und Zusammenschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Flexion",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Entstehungsjahr/Epoche",
|
||||
"Thema/Motiv des Gedichts",
|
||||
"Erste Deutungshypothese",
|
||||
"Formale Grunddaten (Strophen, Verse)"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Inhaltliche Analyse (strophenweise oder aspektorientiert)",
|
||||
"Formale Analyse (Metrum, Reim, Klang)",
|
||||
"Sprachliche Analyse (Stilmittel, Bildlichkeit)",
|
||||
"Funktionale Verknuepfung aller Ebenen",
|
||||
"Textbelege durch Zitate mit Versangabe"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Interpretationsergebnisse",
|
||||
"Bestaetigung/Modifikation der Deutungshypothese",
|
||||
"Einordnung in Epoche/Werk des Autors",
|
||||
"Aktualitaetsbezug (wenn sinnvoll)"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Lyrikanalyse verwenden",
|
||||
"Zwischen lyrischem Ich und Autor unterscheiden",
|
||||
"Praesens als Analysetempus",
|
||||
"Deutende statt beschreibende Formulierungen"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_prosaanalyse_template() -> EHTemplate:
|
||||
"""Template for prose/narrative text analysis."""
|
||||
return EHTemplate(
|
||||
id="template_prosaanalyse",
|
||||
aufgabentyp="prosaanalyse",
|
||||
name="Epische Textanalyse / Prosaanalyse",
|
||||
beschreibung="Vorlage fuer die Analyse von Romanauszuegen, Kurzgeschichten und Novellen",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Textinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Handlung",
|
||||
"Charakterisierung der Figuren",
|
||||
"Erkennen der Erzaehlsituation",
|
||||
"Deutung der Konflikte und Motive",
|
||||
"Einordnung in den Gesamtzusammenhang"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Informative Einleitung",
|
||||
"Systematische Analyse im Hauptteil",
|
||||
"Verknuepfung der Analyseergebnisse",
|
||||
"Schluessige Gesamtdeutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="erzaehltechnik",
|
||||
name="Erzaehltechnische Analyse",
|
||||
beschreibung="Analyse narrativer Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Bestimmung der Erzaehlperspektive",
|
||||
"Analyse von Zeitgestaltung",
|
||||
"Raumgestaltung und Atmosphaere",
|
||||
"Figurenrede und Bewusstseinsdarstellung",
|
||||
"Funktionale Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Textsorte, Erscheinungsjahr",
|
||||
"Einordnung des Auszugs in den Gesamttext",
|
||||
"Thema und Deutungshypothese"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Kurze Inhaltsangabe des Auszugs",
|
||||
"Analyse der Handlungsstruktur",
|
||||
"Figurenanalyse mit Textbelegen",
|
||||
"Erzaehltechnische Analyse",
|
||||
"Sprachliche Analyse",
|
||||
"Verknuepfung aller Ebenen"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Bestaetigung der Deutungshypothese",
|
||||
"Bedeutung fuer Gesamtwerk",
|
||||
"Ggf. Aktualitaetsbezug"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Erzaehltextanalyse",
|
||||
"Zwischen Erzaehler und Autor unterscheiden",
|
||||
"Praesens als Analysetempus",
|
||||
"Deutende Formulierungen"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_dramenanalyse_template() -> EHTemplate:
|
||||
"""Template for drama analysis."""
|
||||
return EHTemplate(
|
||||
id="template_dramenanalyse",
|
||||
aufgabentyp="dramenanalyse",
|
||||
name="Dramenanalyse",
|
||||
beschreibung="Vorlage fuer die Analyse dramatischer Texte und Szenen",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Szeneninhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Handlung",
|
||||
"Analyse der Figurenkonstellation",
|
||||
"Erkennen des dramatischen Konflikts",
|
||||
"Einordnung in den Handlungsverlauf",
|
||||
"Deutung der Szene im Gesamtzusammenhang"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Einleitung mit Kontextualisierung",
|
||||
"Systematische Szenenanalyse",
|
||||
"Verknuepfung der Analyseergebnisse",
|
||||
"Schluessige Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="dramentechnik",
|
||||
name="Dramentechnische Analyse",
|
||||
beschreibung="Analyse dramatischer Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Analyse der Dialoggestaltung",
|
||||
"Regieanweisungen und Buehnenraum",
|
||||
"Dramatische Spannung",
|
||||
"Monolog/Dialog-Formen",
|
||||
"Funktionale Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Urauffuehrungsjahr, Dramenform",
|
||||
"Einordnung der Szene in den Handlungsverlauf",
|
||||
"Thema und Deutungshypothese"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Situierung der Szene",
|
||||
"Analyse des Dialogverlaufs",
|
||||
"Figurenanalyse im Dialog",
|
||||
"Sprachliche Analyse",
|
||||
"Dramentechnische Mittel",
|
||||
"Bedeutung fuer den Konflikt"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Funktion der Szene im Drama",
|
||||
"Bedeutung fuer die Gesamtdeutung"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Dramenanalyse",
|
||||
"Praesens als Analysetempus",
|
||||
"Korrekte Zitierweise mit Akt/Szene/Zeile"
|
||||
]
|
||||
)
|
||||
# Backward-compat shim -- module moved to korrektur/eh_templates_analyse.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_templates_analyse")
|
||||
|
||||
@@ -1,101 +1,4 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — Eroerterung template.
|
||||
"""
|
||||
|
||||
from eh_templates_types import EHTemplate, EHKriterium
|
||||
|
||||
|
||||
def get_eroerterung_template() -> EHTemplate:
|
||||
"""Template for textgebundene Eroerterung."""
|
||||
return EHTemplate(
|
||||
id="template_eroerterung_textgebunden",
|
||||
aufgabentyp="eroerterung_textgebunden",
|
||||
name="Textgebundene Eroerterung",
|
||||
beschreibung="Vorlage fuer die textgebundene Eroerterung auf Basis eines Sachtextes",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Qualitaet der Argumentation",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Wiedergabe der Textposition",
|
||||
"Differenzierte eigene Argumentation",
|
||||
"Vielfaeltige und ueberzeugende Argumente",
|
||||
"Beruecksichtigung von Pro und Contra",
|
||||
"Sinnvolle Beispiele und Belege",
|
||||
"Eigenstaendige Schlussfolgerung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Eroerterung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Problemorientierte Einleitung",
|
||||
"Klare Gliederung der Argumentation",
|
||||
"Logische Argumentationsfolge",
|
||||
"Sinnvolle Ueberlaetze",
|
||||
"Begruendetes Fazit"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="textbezug",
|
||||
name="Textbezug",
|
||||
beschreibung="Verknuepfung mit dem Ausgangstext",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Angemessene Textwiedergabe",
|
||||
"Kritische Auseinandersetzung mit Textposition",
|
||||
"Korrekte Zitierweise",
|
||||
"Verknuepfung eigener Argumente mit Text"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung",
|
||||
"Variationsreicher Ausdruck"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Hinfuehrung zum Thema",
|
||||
"Nennung des Ausgangstextes",
|
||||
"Formulierung der Leitfrage/These",
|
||||
"Ueberleitung zum Hauptteil"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Kurze Wiedergabe der Textposition",
|
||||
"Systematische Argumentation (dialektisch oder linear)",
|
||||
"Jedes Argument: These - Begruendung - Beispiel",
|
||||
"Gewichtung der Argumente",
|
||||
"Verknuepfung mit Textposition"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der wichtigsten Argumente",
|
||||
"Eigene begruendete Stellungnahme",
|
||||
"Ggf. Ausblick oder Appell"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Argumentative Konnektoren verwenden",
|
||||
"Sachlicher, ueberzeugender Stil",
|
||||
"Eigene Meinung kennzeichnen",
|
||||
"Konjunktiv fuer Textpositionen"
|
||||
]
|
||||
)
|
||||
# Backward-compat shim -- module moved to korrektur/eh_templates_eroerterung.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_templates_eroerterung")
|
||||
|
||||
@@ -1,60 +1,4 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — registry for template lookup.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from eh_templates_types import EHTemplate, AUFGABENTYPEN
|
||||
from eh_templates_analyse import (
|
||||
get_textanalyse_template,
|
||||
get_gedichtanalyse_template,
|
||||
get_prosaanalyse_template,
|
||||
get_dramenanalyse_template,
|
||||
)
|
||||
from eh_templates_eroerterung import get_eroerterung_template
|
||||
|
||||
|
||||
TEMPLATES: Dict[str, EHTemplate] = {}
|
||||
|
||||
|
||||
def initialize_templates():
|
||||
"""Initialize all pre-defined templates."""
|
||||
global TEMPLATES
|
||||
TEMPLATES = {
|
||||
"textanalyse_pragmatisch": get_textanalyse_template(),
|
||||
"gedichtanalyse": get_gedichtanalyse_template(),
|
||||
"eroerterung_textgebunden": get_eroerterung_template(),
|
||||
"prosaanalyse": get_prosaanalyse_template(),
|
||||
"dramenanalyse": get_dramenanalyse_template(),
|
||||
}
|
||||
|
||||
|
||||
def get_template(aufgabentyp: str) -> Optional[EHTemplate]:
|
||||
"""Get a template by Aufgabentyp."""
|
||||
if not TEMPLATES:
|
||||
initialize_templates()
|
||||
return TEMPLATES.get(aufgabentyp)
|
||||
|
||||
|
||||
def list_templates() -> List[Dict]:
|
||||
"""List all available templates."""
|
||||
if not TEMPLATES:
|
||||
initialize_templates()
|
||||
return [
|
||||
{
|
||||
"aufgabentyp": typ,
|
||||
"name": AUFGABENTYPEN.get(typ, {}).get("name", typ),
|
||||
"description": AUFGABENTYPEN.get(typ, {}).get("description", ""),
|
||||
"category": AUFGABENTYPEN.get(typ, {}).get("category", "other"),
|
||||
}
|
||||
for typ in TEMPLATES.keys()
|
||||
]
|
||||
|
||||
|
||||
def get_aufgabentypen() -> Dict:
|
||||
"""Get all Aufgabentypen definitions."""
|
||||
return AUFGABENTYPEN
|
||||
|
||||
|
||||
# Initialize on import
|
||||
initialize_templates()
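# Added sketch (illustrative, not part of the original module): looking up a
# template and listing the registered Aufgabentypen.
#
#     from eh_templates_registry import get_template, list_templates
#     tpl = get_template("gedichtanalyse")
#     print(tpl.name, [k.id for k in tpl.kriterien])
#     print([entry["aufgabentyp"] for entry in list_templates()])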
|
||||
# Backward-compat shim -- module moved to korrektur/eh_templates_registry.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_templates_registry")
|
||||
|
||||
@@ -1,100 +1,4 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — types and Aufgabentypen registry.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
AUFGABENTYPEN = {
|
||||
"textanalyse_pragmatisch": {
|
||||
"name": "Textanalyse (pragmatische Texte)",
|
||||
"description": "Analyse von Sachtexten, Reden, Kommentaren, Essays",
|
||||
"category": "analyse"
|
||||
},
|
||||
"sachtextanalyse": {
|
||||
"name": "Sachtextanalyse",
|
||||
"description": "Analyse von informativen und appellativen Sachtexten",
|
||||
"category": "analyse"
|
||||
},
|
||||
"gedichtanalyse": {
|
||||
"name": "Gedichtanalyse / Lyrikinterpretation",
|
||||
"description": "Analyse und Interpretation lyrischer Texte",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"dramenanalyse": {
|
||||
"name": "Dramenanalyse",
|
||||
"description": "Analyse dramatischer Texte und Szenen",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"prosaanalyse": {
|
||||
"name": "Epische Textanalyse / Prosaanalyse",
|
||||
"description": "Analyse von Romanauszuegen, Kurzgeschichten, Novellen",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"eroerterung_textgebunden": {
|
||||
"name": "Textgebundene Eroerterung",
|
||||
"description": "Eroerterung auf Basis eines Sachtextes",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"eroerterung_frei": {
|
||||
"name": "Freie Eroerterung",
|
||||
"description": "Freie Eroerterung zu einem Thema",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"eroerterung_literarisch": {
|
||||
"name": "Literarische Eroerterung",
|
||||
"description": "Eroerterung zu literarischen Fragestellungen",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"materialgestuetzt": {
|
||||
"name": "Materialgestuetztes Schreiben",
|
||||
"description": "Verfassen eines Textes auf Materialbasis",
|
||||
"category": "produktion"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class EHKriterium:
|
||||
"""Single criterion in an Erwartungshorizont."""
|
||||
id: str
|
||||
name: str
|
||||
beschreibung: str
|
||||
gewichtung: int # Percentage weight (0-100)
|
||||
erwartungen: List[str] # Expected points/elements
|
||||
max_punkte: int = 100
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EHTemplate:
|
||||
"""Complete Erwartungshorizont template."""
|
||||
id: str
|
||||
aufgabentyp: str
|
||||
name: str
|
||||
beschreibung: str
|
||||
kriterien: List[EHKriterium]
|
||||
einleitung_hinweise: List[str]
|
||||
hauptteil_hinweise: List[str]
|
||||
schluss_hinweise: List[str]
|
||||
sprachliche_aspekte: List[str]
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now())
|
||||
|
||||
def to_dict(self):
|
||||
d = {
|
||||
'id': self.id,
|
||||
'aufgabentyp': self.aufgabentyp,
|
||||
'name': self.name,
|
||||
'beschreibung': self.beschreibung,
|
||||
'kriterien': [k.to_dict() for k in self.kriterien],
|
||||
'einleitung_hinweise': self.einleitung_hinweise,
|
||||
'hauptteil_hinweise': self.hauptteil_hinweise,
|
||||
'schluss_hinweise': self.schluss_hinweise,
|
||||
'sprachliche_aspekte': self.sprachliche_aspekte,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
return d
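# Added sketch (values are illustrative): building a single criterion and
# serialising it for storage or API output.
#
#     krit = EHKriterium(
#         id="inhalt",
#         name="Inhaltliche Leistung",
#         beschreibung="Erfassung des Textinhalts",
#         gewichtung=40,
#         erwartungen=["Korrekte Erfassung der These"],
#     )
#     assert krit.to_dict()["gewichtung"] == 40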
|
||||
# Backward-compat shim -- module moved to korrektur/eh_templates_types.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("korrektur.eh_templates_types")
|
||||
|
||||
@@ -1,65 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Full Compliance Pipeline for Legal Corpus — Barrel Re-export.
|
||||
|
||||
Split into submodules:
|
||||
- compliance_models.py — Dataclasses (Checkpoint, Control, Measure)
|
||||
- compliance_extraction.py — Pattern extraction & control/measure generation
|
||||
- compliance_pipeline.py — Pipeline phases & orchestrator
|
||||
|
||||
Run on Mac Mini:
|
||||
nohup python full_compliance_pipeline.py > /tmp/compliance_pipeline.log 2>&1 &
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout),
|
||||
logging.FileHandler('/tmp/compliance_pipeline.log')
|
||||
]
|
||||
)
|
||||
|
||||
# Re-export all public symbols
|
||||
from compliance_models import Checkpoint, Control, Measure
|
||||
from compliance_extraction import (
|
||||
extract_checkpoints_from_chunk,
|
||||
generate_control_for_checkpoints,
|
||||
generate_measure_for_control,
|
||||
)
|
||||
from compliance_pipeline import CompliancePipeline
|
||||
|
||||
__all__ = [
|
||||
"Checkpoint",
|
||||
"Control",
|
||||
"Measure",
|
||||
"extract_checkpoints_from_chunk",
|
||||
"generate_control_for_checkpoints",
|
||||
"generate_measure_for_control",
|
||||
"CompliancePipeline",
|
||||
]
|
||||
|
||||
|
||||
async def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Run the compliance pipeline")
|
||||
parser.add_argument("--force-reindex", action="store_true",
|
||||
help="Force re-ingestion of all documents")
|
||||
parser.add_argument("--skip-ingestion", action="store_true",
|
||||
help="Skip ingestion phase, use existing chunks")
|
||||
args = parser.parse_args()
|
||||
|
||||
pipeline = CompliancePipeline()
|
||||
await pipeline.run_full_pipeline(
|
||||
force_reindex=args.force_reindex,
|
||||
skip_ingestion=args.skip_ingestion
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
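# Added usage note: the flags defined above give three typical invocations.
#
#     python full_compliance_pipeline.py                   # full run
#     python full_compliance_pipeline.py --skip-ingestion  # reuse existing chunks
#     python full_compliance_pipeline.py --force-reindex   # re-ingest all documents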
|
||||
# Backward-compat shim -- module moved to compliance/full_pipeline.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("compliance.full_pipeline")
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
korrektur package — exam correction, EH templates, PDF export.
|
||||
|
||||
Backward-compatible re-exports: consumers can still use
|
||||
``from eh_pipeline import ...`` etc. via the shim files in backend/.
|
||||
"""
|
||||
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
BYOEH Processing Pipeline
|
||||
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
|
||||
|
||||
Supports multiple embedding backends:
|
||||
- local: sentence-transformers (default, no API key needed)
|
||||
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import base64
|
||||
import hashlib
|
||||
from typing import List, Tuple, Optional
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
|
||||
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
import httpx
|
||||
|
||||
# Embedding Configuration
|
||||
# Backend: "local" (sentence-transformers) or "openai"
|
||||
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
|
||||
# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
|
||||
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
|
||||
|
||||
# Vector dimensions per backend
|
||||
VECTOR_DIMENSIONS = {
|
||||
"local": 384, # all-MiniLM-L6-v2
|
||||
"openai": 1536, # text-embedding-3-small
|
||||
}
|
||||
|
||||
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
|
||||
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))
|
||||
|
||||
# Lazy-loaded sentence-transformers model
|
||||
_local_model = None
|
||||
|
||||
|
||||
class ChunkingError(Exception):
|
||||
"""Error during text chunking."""
|
||||
pass
|
||||
|
||||
|
||||
class EmbeddingError(Exception):
|
||||
"""Error during embedding generation."""
|
||||
pass
|
||||
|
||||
|
||||
class EncryptionError(Exception):
|
||||
"""Error during encryption/decryption."""
|
||||
pass
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
|
||||
"""
|
||||
Split text into overlapping chunks.
|
||||
|
||||
Uses a simple recursive character splitter approach:
|
||||
- Try to split on paragraph boundaries first
|
||||
- Then sentences
|
||||
- Then words
|
||||
- Finally characters
|
||||
|
||||
Args:
|
||||
text: Input text to chunk
|
||||
chunk_size: Target chunk size in characters
|
||||
overlap: Overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of text chunks
|
||||
"""
|
||||
if not text or len(text) <= chunk_size:
|
||||
return [text] if text else []
|
||||
|
||||
chunks = []
|
||||
separators = ["\n\n", "\n", ". ", " ", ""]
|
||||
|
||||
def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
|
||||
if len(text) <= chunk_size:
|
||||
return [text]
|
||||
|
||||
if sep_idx >= len(separators):
|
||||
# Last resort: hard split
|
||||
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]
|
||||
|
||||
sep = separators[sep_idx]
|
||||
if not sep:
|
||||
# Empty separator = character split
|
||||
parts = list(text)
|
||||
else:
|
||||
parts = text.split(sep)
|
||||
|
||||
result = []
|
||||
current = ""
|
||||
|
||||
for part in parts:
|
||||
test_chunk = current + sep + part if current else part
|
||||
|
||||
if len(test_chunk) <= chunk_size:
|
||||
current = test_chunk
|
||||
else:
|
||||
if current:
|
||||
result.append(current)
|
||||
# If single part is too big, recursively split it
|
||||
if len(part) > chunk_size:
|
||||
result.extend(split_recursive(part, sep_idx + 1))
|
||||
current = ""
|
||||
else:
|
||||
current = part
|
||||
|
||||
if current:
|
||||
result.append(current)
|
||||
|
||||
return result
|
||||
|
||||
raw_chunks = split_recursive(text)
|
||||
|
||||
# Add overlap
|
||||
final_chunks = []
|
||||
for i, chunk in enumerate(raw_chunks):
|
||||
if i > 0 and overlap > 0:
|
||||
# Add overlap from previous chunk
|
||||
prev_chunk = raw_chunks[i-1]
|
||||
overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
|
||||
chunk = overlap_text + chunk
|
||||
final_chunks.append(chunk.strip())
|
||||
|
||||
return [c for c in final_chunks if c]
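# Added sketch: splitting a two-paragraph text into ~300-character chunks with
# a 50-character overlap (sample text and sizes are illustrative). Each final
# chunk stays within chunk_size plus the overlap prepended from its predecessor.
#
#     sample = ("Absatz eins. " * 40) + "\n\n" + ("Absatz zwei. " * 40)
#     pieces = chunk_text(sample, chunk_size=300, overlap=50)
#     assert all(len(p) <= 300 + 50 for p in pieces)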
|
||||
|
||||
|
||||
def get_vector_size() -> int:
|
||||
"""Get the vector dimension for the current embedding backend."""
|
||||
return VECTOR_DIMENSIONS.get(EMBEDDING_BACKEND, 384)
|
||||
|
||||
|
||||
def _get_local_model():
|
||||
"""Lazy-load the sentence-transformers model."""
|
||||
global _local_model
|
||||
if _local_model is None:
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
|
||||
_local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
|
||||
print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
|
||||
except ImportError:
|
||||
raise EmbeddingError(
|
||||
"sentence-transformers not installed. "
|
||||
"Install with: pip install sentence-transformers"
|
||||
)
|
||||
return _local_model
|
||||
|
||||
|
||||
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using local sentence-transformers model."""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
model = _get_local_model()
|
||||
embeddings = model.encode(texts, show_progress_bar=len(texts) > 10)
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
|
||||
|
||||
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using OpenAI API."""
|
||||
if not OPENAI_API_KEY:
|
||||
raise EmbeddingError("OPENAI_API_KEY not configured")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"https://api.openai.com/v1/embeddings",
|
||||
headers={
|
||||
"Authorization": f"Bearer {OPENAI_API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": EMBEDDING_MODEL,
|
||||
"input": texts
|
||||
},
|
||||
timeout=60.0
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")
|
||||
|
||||
data = response.json()
|
||||
embeddings = [item["embedding"] for item in data["data"]]
|
||||
return embeddings
|
||||
|
||||
except httpx.TimeoutException:
|
||||
raise EmbeddingError("OpenAI API timeout")
|
||||
except Exception as e:
|
||||
raise EmbeddingError(f"Failed to generate embeddings: {str(e)}")
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""
|
||||
Generate embeddings using configured backend.
|
||||
|
||||
Backends:
|
||||
- local: sentence-transformers (default, no API key needed)
|
||||
- openai: OpenAI text-embedding-3-small
|
||||
|
||||
Args:
|
||||
texts: List of text chunks
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
|
||||
Raises:
|
||||
EmbeddingError: If embedding generation fails
|
||||
"""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
if EMBEDDING_BACKEND == "local":
|
||||
# Local model runs synchronously (encode() blocks the event loop while it runs)
|
||||
return _generate_local_embeddings(texts)
|
||||
elif EMBEDDING_BACKEND == "openai":
|
||||
return await _generate_openai_embeddings(texts)
|
||||
else:
|
||||
raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
||||
|
||||
|
||||
async def generate_single_embedding(text: str) -> List[float]:
|
||||
"""Generate embedding for a single text."""
|
||||
embeddings = await generate_embeddings([text])
|
||||
return embeddings[0] if embeddings else []
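# Added sketch: embedding a couple of chunks with whichever backend
# EMBEDDING_BACKEND selects (needs sentence-transformers for "local" or
# OPENAI_API_KEY for "openai"); with the default models the dimension matches
# get_vector_size().
#
#     import asyncio
#     vectors = asyncio.run(generate_embeddings(["Erster Abschnitt.", "Zweiter Abschnitt."]))
#     assert len(vectors) == 2 and len(vectors[0]) == get_vector_size()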
|
||||
|
||||
|
||||
def derive_key(passphrase: str, salt: bytes) -> bytes:
|
||||
"""
|
||||
Derive encryption key from passphrase using PBKDF2.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase
|
||||
salt: Random salt (16 bytes)
|
||||
|
||||
Returns:
|
||||
32-byte AES key
|
||||
"""
|
||||
kdf = PBKDF2HMAC(
|
||||
algorithm=hashes.SHA256(),
|
||||
length=32,
|
||||
salt=salt,
|
||||
iterations=100000,
|
||||
)
|
||||
return kdf.derive(passphrase.encode())
|
||||
|
||||
|
||||
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Encrypt text using AES-256-GCM.
|
||||
|
||||
Args:
|
||||
text: Plaintext to encrypt
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Base64-encoded ciphertext (IV + ciphertext)
|
||||
"""
|
||||
try:
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
|
||||
aesgcm = AESGCM(key)
|
||||
iv = os.urandom(12)
|
||||
|
||||
ciphertext = aesgcm.encrypt(iv, text.encode(), None)
|
||||
|
||||
# Combine IV + ciphertext
|
||||
combined = iv + ciphertext
|
||||
return base64.b64encode(combined).decode()
|
||||
|
||||
except Exception as e:
|
||||
raise EncryptionError(f"Encryption failed: {str(e)}")
|
||||
|
||||
|
||||
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Decrypt text using AES-256-GCM.
|
||||
|
||||
Args:
|
||||
encrypted_b64: Base64-encoded ciphertext (IV + ciphertext)
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Decrypted plaintext
|
||||
"""
|
||||
try:
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
|
||||
combined = base64.b64decode(encrypted_b64)
|
||||
iv = combined[:12]
|
||||
ciphertext = combined[12:]
|
||||
|
||||
aesgcm = AESGCM(key)
|
||||
plaintext = aesgcm.decrypt(iv, ciphertext, None)
|
||||
|
||||
return plaintext.decode()
|
||||
|
||||
except Exception as e:
|
||||
raise EncryptionError(f"Decryption failed: {str(e)}")
|
||||
|
||||
|
||||
def hash_key(passphrase: str, salt_hex: str) -> str:
|
||||
"""
|
||||
Create SHA-256 hash of derived key for verification.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase
|
||||
salt_hex: Salt as hex string
|
||||
|
||||
Returns:
|
||||
Hex-encoded key hash
|
||||
"""
|
||||
salt = bytes.fromhex(salt_hex)
|
||||
key = derive_key(passphrase, salt)
|
||||
return hashlib.sha256(key).hexdigest()
|
||||
|
||||
|
||||
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
|
||||
"""
|
||||
Verify passphrase matches stored key hash.
|
||||
|
||||
Args:
|
||||
passphrase: User passphrase to verify
|
||||
salt_hex: Salt as hex string
|
||||
expected_hash: Expected key hash
|
||||
|
||||
Returns:
|
||||
True if passphrase is correct
|
||||
"""
|
||||
computed_hash = hash_key(passphrase, salt_hex)
|
||||
return computed_hash == expected_hash
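# Added sketch: store hash_key(...) next to the salt, then verify a later
# passphrase attempt against it (values are placeholders).
#
#     import os
#     salt_hex = os.urandom(16).hex()
#     stored_hash = hash_key("geheime-passphrase", salt_hex)
#     assert verify_key_hash("geheime-passphrase", salt_hex, stored_hash)
#     assert not verify_key_hash("falsche-passphrase", salt_hex, stored_hash)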
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from PDF file.
|
||||
|
||||
Args:
|
||||
pdf_content: Raw PDF bytes
|
||||
|
||||
Returns:
|
||||
Extracted text
|
||||
"""
|
||||
try:
|
||||
import PyPDF2
|
||||
|
||||
pdf_file = io.BytesIO(pdf_content)
|
||||
reader = PyPDF2.PdfReader(pdf_file)
|
||||
|
||||
text_parts = []
|
||||
for page in reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
except ImportError:
|
||||
raise ChunkingError("PyPDF2 not installed")
|
||||
except Exception as e:
|
||||
raise ChunkingError(f"Failed to extract PDF text: {str(e)}")
|
||||
|
||||
|
||||
async def process_eh_for_indexing(
|
||||
eh_id: str,
|
||||
tenant_id: str,
|
||||
subject: str,
|
||||
text_content: str,
|
||||
passphrase: str,
|
||||
salt_hex: str
|
||||
) -> Tuple[int, List[dict]]:
|
||||
"""
|
||||
Full processing pipeline for Erwartungshorizont indexing.
|
||||
|
||||
1. Chunk the text
|
||||
2. Generate embeddings
|
||||
3. Encrypt chunks
|
||||
4. Return prepared data for Qdrant
|
||||
|
||||
Args:
|
||||
eh_id: Erwartungshorizont ID
|
||||
tenant_id: Tenant ID
|
||||
subject: Subject (deutsch, englisch, etc.)
|
||||
text_content: Decrypted text content
|
||||
passphrase: User passphrase for re-encryption
|
||||
salt_hex: Salt for encryption
|
||||
|
||||
Returns:
|
||||
Tuple of (chunk_count, chunks_data)
|
||||
"""
|
||||
# 1. Chunk the text
|
||||
chunks = chunk_text(text_content)
|
||||
|
||||
if not chunks:
|
||||
return 0, []
|
||||
|
||||
# 2. Generate embeddings
|
||||
embeddings = await generate_embeddings(chunks)
|
||||
|
||||
# 3. Encrypt chunks for storage
|
||||
encrypted_chunks = []
|
||||
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
|
||||
encrypted_content = encrypt_text(chunk, passphrase, salt_hex)
|
||||
encrypted_chunks.append({
|
||||
"chunk_index": i,
|
||||
"embedding": embedding,
|
||||
"encrypted_content": encrypted_content
|
||||
})
|
||||
|
||||
return len(chunks), encrypted_chunks
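# Added sketch of a caller (ids, subject and passphrase are placeholders). The
# returned chunk dicts carry chunk_index, embedding and encrypted_content and
# are left to the calling code to turn into Qdrant points.
#
#     import asyncio, os
#     count, chunk_payloads = asyncio.run(process_eh_for_indexing(
#         eh_id="eh-123",
#         tenant_id="tenant-1",
#         subject="deutsch",
#         text_content="Langer, bereits entschluesselter Erwartungshorizont ...",
#         passphrase="geheime-passphrase",
#         salt_hex=os.urandom(16).hex(),
#     ))
#     assert count == len(chunk_payloads)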
|
||||
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Erwartungshorizont Templates for Vorabitur Mode — barrel re-export.
|
||||
|
||||
The actual code lives in:
|
||||
- eh_templates_types.py (AUFGABENTYPEN, EHKriterium, EHTemplate)
|
||||
- eh_templates_analyse.py (Textanalyse, Gedicht, Prosa, Drama)
|
||||
- eh_templates_eroerterung.py (Eroerterung textgebunden)
|
||||
- eh_templates_registry.py (TEMPLATES, get_template, list_templates, etc.)
|
||||
"""
|
||||
|
||||
# Types
|
||||
from .eh_templates_types import ( # noqa: F401
|
||||
AUFGABENTYPEN,
|
||||
EHKriterium,
|
||||
EHTemplate,
|
||||
)
|
||||
|
||||
# Template factories
|
||||
from .eh_templates_analyse import ( # noqa: F401
|
||||
get_textanalyse_template,
|
||||
get_gedichtanalyse_template,
|
||||
get_prosaanalyse_template,
|
||||
get_dramenanalyse_template,
|
||||
)
|
||||
from .eh_templates_eroerterung import get_eroerterung_template # noqa: F401
|
||||
|
||||
# Registry
|
||||
from .eh_templates_registry import ( # noqa: F401
|
||||
TEMPLATES,
|
||||
initialize_templates,
|
||||
get_template,
|
||||
list_templates,
|
||||
get_aufgabentypen,
|
||||
)
|
||||
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — Analyse templates.
|
||||
|
||||
Contains templates for:
|
||||
- Textanalyse (pragmatische Texte)
|
||||
- Gedichtanalyse / Lyrikinterpretation
|
||||
- Prosaanalyse
|
||||
- Dramenanalyse
|
||||
"""
|
||||
|
||||
from .eh_templates_types import EHTemplate, EHKriterium
|
||||
|
||||
|
||||
def get_textanalyse_template() -> EHTemplate:
|
||||
"""Template for pragmatic text analysis."""
|
||||
return EHTemplate(
|
||||
id="template_textanalyse_pragmatisch",
|
||||
aufgabentyp="textanalyse_pragmatisch",
|
||||
name="Textanalyse pragmatischer Texte",
|
||||
beschreibung="Vorlage fuer die Analyse von Sachtexten, Reden, Kommentaren und Essays",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Wiedergabe des Textinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Textaussage/These",
|
||||
"Vollstaendige Wiedergabe der Argumentationsstruktur",
|
||||
"Erkennen von Intention und Adressatenbezug",
|
||||
"Einordnung in den historischen/gesellschaftlichen Kontext",
|
||||
"Beruecksichtigung aller relevanten Textaspekte"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau und Gliederung der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Sinnvolle Einleitung mit Basisinformationen",
|
||||
"Logische Gliederung des Hauptteils",
|
||||
"Stringente Gedankenfuehrung",
|
||||
"Angemessener Schluss mit Fazit/Wertung",
|
||||
"Absatzgliederung und Ueberlaenge"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="analyse",
|
||||
name="Analytische Qualitaet",
|
||||
beschreibung="Tiefe und Qualitaet der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Erkennen rhetorischer Mittel",
|
||||
"Funktionale Deutung der Stilmittel",
|
||||
"Analyse der Argumentationsweise",
|
||||
"Beruecksichtigung von Wortwahl und Satzbau",
|
||||
"Verknuepfung von Form und Inhalt"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung",
|
||||
"Korrekte Getrennt- und Zusammenschreibung",
|
||||
"Korrekte Fremdwortschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Flexion",
|
||||
"Korrekte Zeichensetzung",
|
||||
"Korrekte Bezuege und Kongruenz"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Nennung von Autor, Titel, Textsorte, Erscheinungsjahr",
|
||||
"Benennung des Themas",
|
||||
"Formulierung der Kernthese/Hauptaussage",
|
||||
"Ggf. Einordnung in den Kontext"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Systematische Analyse der Argumentationsstruktur",
|
||||
"Untersuchung der sprachlichen Gestaltung",
|
||||
"Funktionale Deutung der Stilmittel",
|
||||
"Beruecksichtigung von Adressatenbezug und Intention",
|
||||
"Textbelege durch Zitate"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Bewertung der Ueberzeugungskraft",
|
||||
"Ggf. aktuelle Relevanz",
|
||||
"Persoenliche Stellungnahme (wenn gefordert)"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachsprachliche Begriffe korrekt verwenden",
|
||||
"Konjunktiv fuer indirekte Rede",
|
||||
"Praesens als Tempus der Analyse",
|
||||
"Sachlicher, analytischer Stil"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_gedichtanalyse_template() -> EHTemplate:
|
||||
"""Template for poetry analysis."""
|
||||
return EHTemplate(
|
||||
id="template_gedichtanalyse",
|
||||
aufgabentyp="gedichtanalyse",
|
||||
name="Gedichtanalyse / Lyrikinterpretation",
|
||||
beschreibung="Vorlage fuer die Analyse und Interpretation lyrischer Texte",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Gedichtinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung des lyrischen Ichs und der Sprechsituation",
|
||||
"Vollstaendige inhaltliche Erschliessung aller Strophen",
|
||||
"Erkennen der zentralen Motive und Themen",
|
||||
"Epochenzuordnung und literaturgeschichtliche Einordnung",
|
||||
"Deutung der Bildlichkeit und Symbolik"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Interpretation",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Einleitung mit Basisinformationen",
|
||||
"Systematische strophenweise oder aspektorientierte Analyse",
|
||||
"Verknuepfung von Form- und Inhaltsanalyse",
|
||||
"Schluessige Gesamtdeutung im Schluss"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="formanalyse",
|
||||
name="Formale Analyse",
|
||||
beschreibung="Analyse der lyrischen Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Bestimmung von Metrum und Reimschema",
|
||||
"Analyse der Klanggestaltung",
|
||||
"Erkennen von Enjambements und Zaesuren",
|
||||
"Deutung der formalen Mittel",
|
||||
"Verknuepfung von Form und Inhalt"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung",
|
||||
"Korrekte Getrennt- und Zusammenschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Flexion",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Entstehungsjahr/Epoche",
|
||||
"Thema/Motiv des Gedichts",
|
||||
"Erste Deutungshypothese",
|
||||
"Formale Grunddaten (Strophen, Verse)"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Inhaltliche Analyse (strophenweise oder aspektorientiert)",
|
||||
"Formale Analyse (Metrum, Reim, Klang)",
|
||||
"Sprachliche Analyse (Stilmittel, Bildlichkeit)",
|
||||
"Funktionale Verknuepfung aller Ebenen",
|
||||
"Textbelege durch Zitate mit Versangabe"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Interpretationsergebnisse",
|
||||
"Bestaetigung/Modifikation der Deutungshypothese",
|
||||
"Einordnung in Epoche/Werk des Autors",
|
||||
"Aktualitaetsbezug (wenn sinnvoll)"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Lyrikanalyse verwenden",
|
||||
"Zwischen lyrischem Ich und Autor unterscheiden",
|
||||
"Praesens als Analysetempus",
|
||||
"Deutende statt beschreibende Formulierungen"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_prosaanalyse_template() -> EHTemplate:
|
||||
"""Template for prose/narrative text analysis."""
|
||||
return EHTemplate(
|
||||
id="template_prosaanalyse",
|
||||
aufgabentyp="prosaanalyse",
|
||||
name="Epische Textanalyse / Prosaanalyse",
|
||||
beschreibung="Vorlage fuer die Analyse von Romanauszuegen, Kurzgeschichten und Novellen",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Textinhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Handlung",
|
||||
"Charakterisierung der Figuren",
|
||||
"Erkennen der Erzaehlsituation",
|
||||
"Deutung der Konflikte und Motive",
|
||||
"Einordnung in den Gesamtzusammenhang"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Informative Einleitung",
|
||||
"Systematische Analyse im Hauptteil",
|
||||
"Verknuepfung der Analyseergebnisse",
|
||||
"Schluessige Gesamtdeutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="erzaehltechnik",
|
||||
name="Erzaehltechnische Analyse",
|
||||
beschreibung="Analyse narrativer Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Bestimmung der Erzaehlperspektive",
|
||||
"Analyse von Zeitgestaltung",
|
||||
"Raumgestaltung und Atmosphaere",
|
||||
"Figurenrede und Bewusstseinsdarstellung",
|
||||
"Funktionale Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Textsorte, Erscheinungsjahr",
|
||||
"Einordnung des Auszugs in den Gesamttext",
|
||||
"Thema und Deutungshypothese"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Kurze Inhaltsangabe des Auszugs",
|
||||
"Analyse der Handlungsstruktur",
|
||||
"Figurenanalyse mit Textbelegen",
|
||||
"Erzaehltechnische Analyse",
|
||||
"Sprachliche Analyse",
|
||||
"Verknuepfung aller Ebenen"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Bestaetigung der Deutungshypothese",
|
||||
"Bedeutung fuer Gesamtwerk",
|
||||
"Ggf. Aktualitaetsbezug"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Erzaehltextanalyse",
|
||||
"Zwischen Erzaehler und Autor unterscheiden",
|
||||
"Praesens als Analysetempus",
|
||||
"Deutende Formulierungen"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def get_dramenanalyse_template() -> EHTemplate:
|
||||
"""Template for drama analysis."""
|
||||
return EHTemplate(
|
||||
id="template_dramenanalyse",
|
||||
aufgabentyp="dramenanalyse",
|
||||
name="Dramenanalyse",
|
||||
beschreibung="Vorlage fuer die Analyse dramatischer Texte und Szenen",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Erfassung und Deutung des Szeneninhalts",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Erfassung der Handlung",
|
||||
"Analyse der Figurenkonstellation",
|
||||
"Erkennen des dramatischen Konflikts",
|
||||
"Einordnung in den Handlungsverlauf",
|
||||
"Deutung der Szene im Gesamtzusammenhang"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Analyse",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Einleitung mit Kontextualisierung",
|
||||
"Systematische Szenenanalyse",
|
||||
"Verknuepfung der Analyseergebnisse",
|
||||
"Schluessige Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="dramentechnik",
|
||||
name="Dramentechnische Analyse",
|
||||
beschreibung="Analyse dramatischer Gestaltungsmittel",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Analyse der Dialoggestaltung",
|
||||
"Regieanweisungen und Buehnenraum",
|
||||
"Dramatische Spannung",
|
||||
"Monolog/Dialog-Formen",
|
||||
"Funktionale Deutung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Autor, Titel, Urauffuehrungsjahr, Dramenform",
|
||||
"Einordnung der Szene in den Handlungsverlauf",
|
||||
"Thema und Deutungshypothese"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Situierung der Szene",
|
||||
"Analyse des Dialogverlaufs",
|
||||
"Figurenanalyse im Dialog",
|
||||
"Sprachliche Analyse",
|
||||
"Dramentechnische Mittel",
|
||||
"Bedeutung fuer den Konflikt"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der Analyseergebnisse",
|
||||
"Funktion der Szene im Drama",
|
||||
"Bedeutung fuer die Gesamtdeutung"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Fachbegriffe der Dramenanalyse",
|
||||
"Praesens als Analysetempus",
|
||||
"Korrekte Zitierweise mit Akt/Szene/Zeile"
|
||||
]
|
||||
)
|
||||
@@ -0,0 +1,101 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — Eroerterung template.
|
||||
"""
|
||||
|
||||
from .eh_templates_types import EHTemplate, EHKriterium
|
||||
|
||||
|
||||
def get_eroerterung_template() -> EHTemplate:
|
||||
"""Template for textgebundene Eroerterung."""
|
||||
return EHTemplate(
|
||||
id="template_eroerterung_textgebunden",
|
||||
aufgabentyp="eroerterung_textgebunden",
|
||||
name="Textgebundene Eroerterung",
|
||||
beschreibung="Vorlage fuer die textgebundene Eroerterung auf Basis eines Sachtextes",
|
||||
kriterien=[
|
||||
EHKriterium(
|
||||
id="inhalt",
|
||||
name="Inhaltliche Leistung",
|
||||
beschreibung="Qualitaet der Argumentation",
|
||||
gewichtung=40,
|
||||
erwartungen=[
|
||||
"Korrekte Wiedergabe der Textposition",
|
||||
"Differenzierte eigene Argumentation",
|
||||
"Vielfaeltige und ueberzeugende Argumente",
|
||||
"Beruecksichtigung von Pro und Contra",
|
||||
"Sinnvolle Beispiele und Belege",
|
||||
"Eigenstaendige Schlussfolgerung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="struktur",
|
||||
name="Aufbau und Struktur",
|
||||
beschreibung="Logischer Aufbau der Eroerterung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Problemorientierte Einleitung",
|
||||
"Klare Gliederung der Argumentation",
|
||||
"Logische Argumentationsfolge",
|
||||
"Sinnvolle Ueberlaetze",
|
||||
"Begruendetes Fazit"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="textbezug",
|
||||
name="Textbezug",
|
||||
beschreibung="Verknuepfung mit dem Ausgangstext",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Angemessene Textwiedergabe",
|
||||
"Kritische Auseinandersetzung mit Textposition",
|
||||
"Korrekte Zitierweise",
|
||||
"Verknuepfung eigener Argumente mit Text"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="rechtschreibung",
|
||||
name="Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
beschreibung="Orthografische Korrektheit",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekte Rechtschreibung",
|
||||
"Korrekte Gross- und Kleinschreibung"
|
||||
]
|
||||
),
|
||||
EHKriterium(
|
||||
id="grammatik",
|
||||
name="Sprachliche Richtigkeit (Grammatik)",
|
||||
beschreibung="Grammatische Korrektheit und Zeichensetzung",
|
||||
gewichtung=15,
|
||||
erwartungen=[
|
||||
"Korrekter Satzbau",
|
||||
"Korrekte Zeichensetzung",
|
||||
"Variationsreicher Ausdruck"
|
||||
]
|
||||
)
|
||||
],
|
||||
einleitung_hinweise=[
|
||||
"Hinfuehrung zum Thema",
|
||||
"Nennung des Ausgangstextes",
|
||||
"Formulierung der Leitfrage/These",
|
||||
"Ueberleitung zum Hauptteil"
|
||||
],
|
||||
hauptteil_hinweise=[
|
||||
"Kurze Wiedergabe der Textposition",
|
||||
"Systematische Argumentation (dialektisch oder linear)",
|
||||
"Jedes Argument: These - Begruendung - Beispiel",
|
||||
"Gewichtung der Argumente",
|
||||
"Verknuepfung mit Textposition"
|
||||
],
|
||||
schluss_hinweise=[
|
||||
"Zusammenfassung der wichtigsten Argumente",
|
||||
"Eigene begruendete Stellungnahme",
|
||||
"Ggf. Ausblick oder Appell"
|
||||
],
|
||||
sprachliche_aspekte=[
|
||||
"Argumentative Konnektoren verwenden",
|
||||
"Sachlicher, ueberzeugender Stil",
|
||||
"Eigene Meinung kennzeichnen",
|
||||
"Konjunktiv fuer Textpositionen"
|
||||
]
|
||||
)
|
||||
@@ -0,0 +1,60 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — registry for template lookup.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .eh_templates_types import EHTemplate, AUFGABENTYPEN
|
||||
from .eh_templates_analyse import (
|
||||
get_textanalyse_template,
|
||||
get_gedichtanalyse_template,
|
||||
get_prosaanalyse_template,
|
||||
get_dramenanalyse_template,
|
||||
)
|
||||
from .eh_templates_eroerterung import get_eroerterung_template
|
||||
|
||||
|
||||
TEMPLATES: Dict[str, EHTemplate] = {}
|
||||
|
||||
|
||||
def initialize_templates():
|
||||
"""Initialize all pre-defined templates."""
|
||||
global TEMPLATES
|
||||
TEMPLATES = {
|
||||
"textanalyse_pragmatisch": get_textanalyse_template(),
|
||||
"gedichtanalyse": get_gedichtanalyse_template(),
|
||||
"eroerterung_textgebunden": get_eroerterung_template(),
|
||||
"prosaanalyse": get_prosaanalyse_template(),
|
||||
"dramenanalyse": get_dramenanalyse_template(),
|
||||
}
|
||||
|
||||
|
||||
def get_template(aufgabentyp: str) -> Optional[EHTemplate]:
|
||||
"""Get a template by Aufgabentyp."""
|
||||
if not TEMPLATES:
|
||||
initialize_templates()
|
||||
return TEMPLATES.get(aufgabentyp)
|
||||
|
||||
|
||||
def list_templates() -> List[Dict]:
|
||||
"""List all available templates."""
|
||||
if not TEMPLATES:
|
||||
initialize_templates()
|
||||
return [
|
||||
{
|
||||
"aufgabentyp": typ,
|
||||
"name": AUFGABENTYPEN.get(typ, {}).get("name", typ),
|
||||
"description": AUFGABENTYPEN.get(typ, {}).get("description", ""),
|
||||
"category": AUFGABENTYPEN.get(typ, {}).get("category", "other"),
|
||||
}
|
||||
for typ in TEMPLATES.keys()
|
||||
]
|
||||
|
||||
|
||||
def get_aufgabentypen() -> Dict:
|
||||
"""Get all Aufgabentypen definitions."""
|
||||
return AUFGABENTYPEN
|
||||
|
||||
|
||||
# Initialize on import
|
||||
initialize_templates()
|
||||
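# Minimal usage sketch (illustrative only): looking up a template and listing
# all registered Aufgabentypen via the functions defined above.
if __name__ == "__main__":
    tpl = get_template("gedichtanalyse")
    if tpl is not None:
        # Criterion weights are percentages and are expected to sum to 100.
        print(tpl.name, sum(k.gewichtung for k in tpl.kriterien))
    for entry in list_templates():
        print(f"{entry['aufgabentyp']}: {entry['name']} [{entry['category']}]")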
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Erwartungshorizont Templates — types and Aufgabentypen registry.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
AUFGABENTYPEN = {
|
||||
"textanalyse_pragmatisch": {
|
||||
"name": "Textanalyse (pragmatische Texte)",
|
||||
"description": "Analyse von Sachtexten, Reden, Kommentaren, Essays",
|
||||
"category": "analyse"
|
||||
},
|
||||
"sachtextanalyse": {
|
||||
"name": "Sachtextanalyse",
|
||||
"description": "Analyse von informativen und appellativen Sachtexten",
|
||||
"category": "analyse"
|
||||
},
|
||||
"gedichtanalyse": {
|
||||
"name": "Gedichtanalyse / Lyrikinterpretation",
|
||||
"description": "Analyse und Interpretation lyrischer Texte",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"dramenanalyse": {
|
||||
"name": "Dramenanalyse",
|
||||
"description": "Analyse dramatischer Texte und Szenen",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"prosaanalyse": {
|
||||
"name": "Epische Textanalyse / Prosaanalyse",
|
||||
"description": "Analyse von Romanauszuegen, Kurzgeschichten, Novellen",
|
||||
"category": "interpretation"
|
||||
},
|
||||
"eroerterung_textgebunden": {
|
||||
"name": "Textgebundene Eroerterung",
|
||||
"description": "Eroerterung auf Basis eines Sachtextes",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"eroerterung_frei": {
|
||||
"name": "Freie Eroerterung",
|
||||
"description": "Freie Eroerterung zu einem Thema",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"eroerterung_literarisch": {
|
||||
"name": "Literarische Eroerterung",
|
||||
"description": "Eroerterung zu literarischen Fragestellungen",
|
||||
"category": "argumentation"
|
||||
},
|
||||
"materialgestuetzt": {
|
||||
"name": "Materialgestuetztes Schreiben",
|
||||
"description": "Verfassen eines Textes auf Materialbasis",
|
||||
"category": "produktion"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class EHKriterium:
|
||||
"""Single criterion in an Erwartungshorizont."""
|
||||
id: str
|
||||
name: str
|
||||
beschreibung: str
|
||||
gewichtung: int # Percentage weight (0-100)
|
||||
erwartungen: List[str] # Expected points/elements
|
||||
max_punkte: int = 100
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EHTemplate:
|
||||
"""Complete Erwartungshorizont template."""
|
||||
id: str
|
||||
aufgabentyp: str
|
||||
name: str
|
||||
beschreibung: str
|
||||
kriterien: List[EHKriterium]
|
||||
einleitung_hinweise: List[str]
|
||||
hauptteil_hinweise: List[str]
|
||||
schluss_hinweise: List[str]
|
||||
sprachliche_aspekte: List[str]
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now())
|
||||
|
||||
def to_dict(self):
|
||||
d = {
|
||||
'id': self.id,
|
||||
'aufgabentyp': self.aufgabentyp,
|
||||
'name': self.name,
|
||||
'beschreibung': self.beschreibung,
|
||||
'kriterien': [k.to_dict() for k in self.kriterien],
|
||||
'einleitung_hinweise': self.einleitung_hinweise,
|
||||
'hauptteil_hinweise': self.hauptteil_hinweise,
|
||||
'schluss_hinweise': self.schluss_hinweise,
|
||||
'sprachliche_aspekte': self.sprachliche_aspekte,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
return d
|
||||
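# Minimal construction sketch (illustrative only): building a criterion and a
# template from the dataclasses above and serializing them with to_dict().
# All field values are dummies.
if __name__ == "__main__":
    kriterium = EHKriterium(
        id="inhalt",
        name="Inhaltliche Leistung",
        beschreibung="Erfassung und Deutung des Textinhalts",
        gewichtung=40,
        erwartungen=["Korrekte Erfassung der Kernaussagen"],
    )
    template = EHTemplate(
        id="template_demo",
        aufgabentyp="sachtextanalyse",
        name="Sachtextanalyse",
        beschreibung="Demo-Vorlage",
        kriterien=[kriterium],
        einleitung_hinweise=["Autor, Titel, Textsorte"],
        hauptteil_hinweise=["Systematische Analyse"],
        schluss_hinweise=["Fazit"],
        sprachliche_aspekte=["Praesens als Analysetempus"],
    )
    print(template.to_dict()["kriterien"][0]["gewichtung"])  # 40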
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
PDF Export Module for Abiturkorrektur System
|
||||
|
||||
Barrel re-export: all PDF generation functions and constants.
|
||||
"""
|
||||
|
||||
from .pdf_export_styles import ( # noqa: F401
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
CRITERIA_WEIGHTS,
|
||||
get_custom_styles,
|
||||
)
|
||||
from .pdf_export_gutachten import generate_gutachten_pdf # noqa: F401
|
||||
from .pdf_export_overview import ( # noqa: F401
|
||||
generate_klausur_overview_pdf,
|
||||
generate_annotations_pdf,
|
||||
)
|
||||
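# Usage sketch (illustrative only): generating a Gutachten PDF via the
# re-exported function and writing the returned bytes to disk. The data shapes
# follow the docstrings in pdf_export_gutachten; all field values are dummies.
if __name__ == "__main__":
    student = {
        "student_name": "Anonym",
        "grade_points": 11,
        "raw_points": 72,
        "criteria_scores": {"inhalt": {"score": 70}, "struktur": {"score": 75}},
        "gutachten": {
            "einleitung": "Beispieltext",
            "hauptteil": "Beispieltext",
            "fazit": "Beispieltext",
        },
    }
    klausur = {"subject": "Deutsch", "title": "Gedichtanalyse", "year": 2025}
    pdf_bytes = generate_gutachten_pdf(student, klausur, annotations=[], workflow_data=None)
    with open("gutachten.pdf", "wb") as fh:
        fh.write(pdf_bytes)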
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
PDF Export - Individual Gutachten PDF generation.
|
||||
|
||||
Generates a single student's Gutachten with criteria table,
|
||||
workflow info, and annotation summary.
|
||||
"""
|
||||
|
||||
import io
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.platypus import (
|
||||
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
|
||||
HRFlowable, KeepTogether
|
||||
)
|
||||
|
||||
from .pdf_export_styles import (
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
CRITERIA_WEIGHTS,
|
||||
get_custom_styles,
|
||||
)
|
||||
|
||||
|
||||
def generate_gutachten_pdf(
|
||||
student_data: Dict[str, Any],
|
||||
klausur_data: Dict[str, Any],
|
||||
annotations: Optional[List[Dict[str, Any]]] = None,
|
||||
workflow_data: Optional[Dict[str, Any]] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate a PDF Gutachten for a single student.
|
||||
|
||||
Args:
|
||||
student_data: Student work data including criteria_scores, gutachten, grade_points
|
||||
klausur_data: Klausur metadata (title, subject, year, etc.)
|
||||
annotations: List of annotations for annotation summary
|
||||
workflow_data: Examiner workflow data (EK, ZK, DK info)
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=2*cm,
|
||||
leftMargin=2*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Gutachten zur Abiturklausur", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{klausur_data.get('subject', 'Deutsch')} - {klausur_data.get('title', '')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Meta information table
|
||||
meta_data = [
|
||||
["Pruefling:", student_data.get('student_name', 'Anonym')],
|
||||
["Schuljahr:", f"{klausur_data.get('year', 2025)}"],
|
||||
["Kurs:", klausur_data.get('semester', 'Abitur')],
|
||||
["Datum:", datetime.now().strftime("%d.%m.%Y")]
|
||||
]
|
||||
|
||||
meta_table = Table(meta_data, colWidths=[4*cm, 10*cm])
|
||||
meta_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 10),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(meta_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Gutachten content
|
||||
_add_gutachten_content(story, styles, student_data)
|
||||
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Bewertungstabelle
|
||||
_add_criteria_table(story, styles, student_data)
|
||||
|
||||
# Final grade box
|
||||
_add_grade_box(story, styles, student_data)
|
||||
|
||||
# Examiner workflow information
|
||||
if workflow_data:
|
||||
_add_workflow_info(story, styles, workflow_data)
|
||||
|
||||
# Annotation summary
|
||||
if annotations:
|
||||
_add_annotation_summary(story, styles, annotations)
|
||||
|
||||
# Footer
|
||||
_add_footer(story, styles)
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def _add_gutachten_content(story, styles, student_data):
|
||||
"""Add gutachten text sections to the story."""
|
||||
gutachten = student_data.get('gutachten', {})
|
||||
|
||||
if gutachten:
|
||||
if gutachten.get('einleitung'):
|
||||
story.append(Paragraph("Einleitung", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['einleitung'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('hauptteil'):
|
||||
story.append(Paragraph("Hauptteil", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['hauptteil'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('fazit'):
|
||||
story.append(Paragraph("Fazit", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['fazit'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('staerken') or gutachten.get('schwaechen'):
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('staerken'):
|
||||
story.append(Paragraph("Staerken:", styles['SectionHeader']))
|
||||
for s in gutachten['staerken']:
|
||||
story.append(Paragraph(f"• {s}", styles['ListItem']))
|
||||
|
||||
if gutachten.get('schwaechen'):
|
||||
story.append(Paragraph("Verbesserungspotenzial:", styles['SectionHeader']))
|
||||
for s in gutachten['schwaechen']:
|
||||
story.append(Paragraph(f"• {s}", styles['ListItem']))
|
||||
else:
|
||||
story.append(Paragraph("<i>Kein Gutachten-Text vorhanden.</i>", styles['GutachtenBody']))
|
||||
|
||||
|
||||
def _add_criteria_table(story, styles, student_data):
|
||||
"""Add criteria scoring table to the story."""
|
||||
story.append(Paragraph("Bewertung nach Kriterien", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
criteria_scores = student_data.get('criteria_scores', {})
|
||||
|
||||
table_data = [["Kriterium", "Gewichtung", "Erreicht", "Punkte"]]
|
||||
total_weighted = 0
|
||||
total_weight = 0
|
||||
|
||||
for key, display_name in CRITERIA_DISPLAY_NAMES.items():
|
||||
weight = CRITERIA_WEIGHTS.get(key, 0)
|
||||
score_data = criteria_scores.get(key, {})
|
||||
score = score_data.get('score', 0) if isinstance(score_data, dict) else score_data
|
||||
|
||||
weighted_score = (score / 100) * weight if score else 0
|
||||
total_weighted += weighted_score
|
||||
total_weight += weight
|
||||
|
||||
table_data.append([
|
||||
display_name,
|
||||
f"{weight}%",
|
||||
f"{score}%",
|
||||
f"{weighted_score:.1f}"
|
||||
])
|
||||
|
||||
table_data.append([
|
||||
"Gesamt",
|
||||
f"{total_weight}%",
|
||||
"",
|
||||
f"{total_weighted:.1f}"
|
||||
])
|
||||
|
||||
criteria_table = Table(table_data, colWidths=[8*cm, 2.5*cm, 2.5*cm, 2.5*cm])
|
||||
criteria_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 10),
|
||||
('ALIGN', (1, 0), (-1, -1), 'CENTER'),
|
||||
('FONTSIZE', (0, 1), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 6),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
('BACKGROUND', (0, -1), (-1, -1), colors.HexColor('#f7fafc')),
|
||||
('FONTNAME', (0, -1), (-1, -1), 'Helvetica-Bold'),
|
||||
('ROWBACKGROUNDS', (0, 1), (-1, -2), [colors.white, colors.HexColor('#f7fafc')]),
|
||||
]))
|
||||
story.append(criteria_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
|
||||
def _add_grade_box(story, styles, student_data):
|
||||
"""Add final grade box to the story."""
|
||||
grade_points = student_data.get('grade_points', 0)
|
||||
grade_note = GRADE_POINTS_TO_NOTE.get(grade_points, "?")
|
||||
raw_points = student_data.get('raw_points', 0)
|
||||
|
||||
grade_data = [
|
||||
["Rohpunkte:", f"{raw_points} / 100"],
|
||||
["Notenpunkte:", f"{grade_points} Punkte"],
|
||||
["Note:", grade_note]
|
||||
]
|
||||
|
||||
grade_table = Table(grade_data, colWidths=[4*cm, 4*cm])
|
||||
grade_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, -1), colors.HexColor('#ebf8ff')),
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTNAME', (1, -1), (1, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 11),
|
||||
('FONTSIZE', (1, -1), (1, -1), 14),
|
||||
('TEXTCOLOR', (1, -1), (1, -1), colors.HexColor('#2c5282')),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 8),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 8),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 12),
|
||||
('BOX', (0, 0), (-1, -1), 1, colors.HexColor('#2c5282')),
|
||||
('ALIGN', (1, 0), (1, -1), 'RIGHT'),
|
||||
]))
|
||||
|
||||
story.append(KeepTogether([
|
||||
Paragraph("Endergebnis", styles['SectionHeader']),
|
||||
Spacer(1, 0.2*cm),
|
||||
grade_table
|
||||
]))
|
||||
|
||||
|
||||
def _add_workflow_info(story, styles, workflow_data):
|
||||
"""Add examiner workflow information to the story."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
story.append(Paragraph("Korrekturverlauf", styles['SectionHeader']))
|
||||
|
||||
workflow_rows = []
|
||||
|
||||
if workflow_data.get('erst_korrektor'):
|
||||
ek = workflow_data['erst_korrektor']
|
||||
workflow_rows.append([
|
||||
"Erstkorrektor:",
|
||||
ek.get('name', 'Unbekannt'),
|
||||
f"{ek.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('zweit_korrektor'):
|
||||
zk = workflow_data['zweit_korrektor']
|
||||
workflow_rows.append([
|
||||
"Zweitkorrektor:",
|
||||
zk.get('name', 'Unbekannt'),
|
||||
f"{zk.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('dritt_korrektor'):
|
||||
dk = workflow_data['dritt_korrektor']
|
||||
workflow_rows.append([
|
||||
"Drittkorrektor:",
|
||||
dk.get('name', 'Unbekannt'),
|
||||
f"{dk.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('final_grade_source'):
|
||||
workflow_rows.append([
|
||||
"Endnote durch:",
|
||||
workflow_data['final_grade_source'],
|
||||
""
|
||||
])
|
||||
|
||||
if workflow_rows:
|
||||
workflow_table = Table(workflow_rows, colWidths=[4*cm, 6*cm, 4*cm])
|
||||
workflow_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(workflow_table)
|
||||
|
||||
|
||||
def _add_annotation_summary(story, styles, annotations):
|
||||
"""Add annotation summary to the story."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
story.append(Paragraph("Anmerkungen (Zusammenfassung)", styles['SectionHeader']))
|
||||
|
||||
by_type = {}
|
||||
for ann in annotations:
|
||||
ann_type = ann.get('type', 'comment')
|
||||
if ann_type not in by_type:
|
||||
by_type[ann_type] = []
|
||||
by_type[ann_type].append(ann)
|
||||
|
||||
for ann_type, anns in by_type.items():
|
||||
type_name = CRITERIA_DISPLAY_NAMES.get(ann_type, ann_type.replace('_', ' ').title())
|
||||
story.append(Paragraph(f"{type_name} ({len(anns)} Anmerkungen)", styles['ListItem']))
|
||||
|
||||
|
||||
def _add_footer(story, styles):
|
||||
"""Add generation footer to the story."""
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
PDF Export - Klausur overview and annotations PDF generation.
|
||||
|
||||
Generates:
|
||||
- Klausur overview with grade distribution for all students
|
||||
- Annotations PDF for a single student
|
||||
"""
|
||||
|
||||
import io
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.platypus import (
|
||||
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
|
||||
HRFlowable
|
||||
)
|
||||
|
||||
from .pdf_export_styles import (
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
get_custom_styles,
|
||||
)
|
||||
|
||||
|
||||
def generate_klausur_overview_pdf(
|
||||
klausur_data: Dict[str, Any],
|
||||
students: List[Dict[str, Any]],
|
||||
fairness_data: Optional[Dict[str, Any]] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate an overview PDF for an entire Klausur with all student grades.
|
||||
|
||||
Args:
|
||||
klausur_data: Klausur metadata
|
||||
students: List of all student work data
|
||||
fairness_data: Optional fairness analysis data
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=1.5*cm,
|
||||
leftMargin=1.5*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Notenuebersicht", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{klausur_data.get('subject', 'Deutsch')} - {klausur_data.get('title', '')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Meta information
|
||||
meta_data = [
|
||||
["Schuljahr:", f"{klausur_data.get('year', 2025)}"],
|
||||
["Kurs:", klausur_data.get('semester', 'Abitur')],
|
||||
["Anzahl Arbeiten:", str(len(students))],
|
||||
["Stand:", datetime.now().strftime("%d.%m.%Y")]
|
||||
]
|
||||
|
||||
meta_table = Table(meta_data, colWidths=[4*cm, 10*cm])
|
||||
meta_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 10),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(meta_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Statistics (if fairness data available)
|
||||
if fairness_data and fairness_data.get('statistics'):
|
||||
_add_statistics(story, styles, fairness_data['statistics'])
|
||||
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Student grades table
|
||||
sorted_students = sorted(students, key=lambda s: s.get('grade_points', 0), reverse=True)
|
||||
_add_student_table(story, styles, sorted_students)
|
||||
|
||||
# Grade distribution
|
||||
_add_grade_distribution(story, styles, sorted_students)
|
||||
|
||||
# Footer
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def _add_statistics(story, styles, stats):
|
||||
"""Add statistics section."""
|
||||
story.append(Paragraph("Statistik", styles['SectionHeader']))
|
||||
|
||||
stats_data = [
|
||||
["Durchschnitt:", f"{stats.get('average_grade', 0):.1f} Punkte"],
|
||||
["Minimum:", f"{stats.get('min_grade', 0)} Punkte"],
|
||||
["Maximum:", f"{stats.get('max_grade', 0)} Punkte"],
|
||||
["Standardabweichung:", f"{stats.get('standard_deviation', 0):.2f}"],
|
||||
]
|
||||
|
||||
stats_table = Table(stats_data, colWidths=[4*cm, 4*cm])
|
||||
stats_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('BACKGROUND', (0, 0), (-1, -1), colors.HexColor('#f7fafc')),
|
||||
('BOX', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
]))
|
||||
story.append(stats_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
|
||||
def _add_student_table(story, styles, sorted_students):
|
||||
"""Add student grades table."""
|
||||
story.append(Paragraph("Einzelergebnisse", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
table_data = [["#", "Name", "Rohpunkte", "Notenpunkte", "Note", "Status"]]
|
||||
|
||||
for idx, student in enumerate(sorted_students, 1):
|
||||
grade_points = student.get('grade_points', 0)
|
||||
grade_note = GRADE_POINTS_TO_NOTE.get(grade_points, "-")
|
||||
raw_points = student.get('raw_points', 0)
|
||||
status = student.get('status', 'unknown')
|
||||
|
||||
status_display = {
|
||||
'completed': 'Abgeschlossen',
|
||||
'first_examiner': 'In Korrektur',
|
||||
'second_examiner': 'Zweitkorrektur',
|
||||
'uploaded': 'Hochgeladen',
|
||||
'ocr_complete': 'OCR fertig',
|
||||
'analyzing': 'Wird analysiert'
|
||||
}.get(status, status)
|
||||
|
||||
table_data.append([
|
||||
str(idx),
|
||||
student.get('student_name', 'Anonym'),
|
||||
f"{raw_points}/100",
|
||||
str(grade_points),
|
||||
grade_note,
|
||||
status_display
|
||||
])
|
||||
|
||||
student_table = Table(table_data, colWidths=[1*cm, 5*cm, 2.5*cm, 3*cm, 2*cm, 3*cm])
|
||||
student_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 9),
|
||||
('ALIGN', (0, 0), (-1, 0), 'CENTER'),
|
||||
('FONTSIZE', (0, 1), (-1, -1), 9),
|
||||
('ALIGN', (0, 1), (0, -1), 'CENTER'),
|
||||
('ALIGN', (2, 1), (4, -1), 'CENTER'),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 6),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f7fafc')]),
|
||||
]))
|
||||
story.append(student_table)
|
||||
|
||||
|
||||
def _add_grade_distribution(story, styles, sorted_students):
|
||||
"""Add grade distribution table."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(Paragraph("Notenverteilung", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
grade_counts = {}
|
||||
for student in sorted_students:
|
||||
gp = student.get('grade_points', 0)
|
||||
grade_counts[gp] = grade_counts.get(gp, 0) + 1
|
||||
|
||||
dist_data = [["Punkte", "Note", "Anzahl"]]
|
||||
for points in range(15, -1, -1):
|
||||
if points in grade_counts:
|
||||
note = GRADE_POINTS_TO_NOTE.get(points, "-")
|
||||
count = grade_counts[points]
|
||||
dist_data.append([str(points), note, str(count)])
|
||||
|
||||
if len(dist_data) > 1:
|
||||
dist_table = Table(dist_data, colWidths=[2.5*cm, 2.5*cm, 2.5*cm])
|
||||
dist_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
]))
|
||||
story.append(dist_table)
|
||||
|
||||
|
||||
def generate_annotations_pdf(
|
||||
student_data: Dict[str, Any],
|
||||
klausur_data: Dict[str, Any],
|
||||
annotations: List[Dict[str, Any]]
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate a PDF with all annotations for a student work.
|
||||
|
||||
Args:
|
||||
student_data: Student work data
|
||||
klausur_data: Klausur metadata
|
||||
annotations: List of all annotations
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=2*cm,
|
||||
leftMargin=2*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Anmerkungen zur Klausur", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{student_data.get('student_name', 'Anonym')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
if not annotations:
|
||||
story.append(Paragraph("<i>Keine Anmerkungen vorhanden.</i>", styles['GutachtenBody']))
|
||||
else:
|
||||
# Group by type
|
||||
by_type = {}
|
||||
for ann in annotations:
|
||||
ann_type = ann.get('type', 'comment')
|
||||
if ann_type not in by_type:
|
||||
by_type[ann_type] = []
|
||||
by_type[ann_type].append(ann)
|
||||
|
||||
for ann_type, anns in by_type.items():
|
||||
type_name = CRITERIA_DISPLAY_NAMES.get(ann_type, ann_type.replace('_', ' ').title())
|
||||
story.append(Paragraph(f"{type_name} ({len(anns)})", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
sorted_anns = sorted(anns, key=lambda a: (a.get('page', 0), a.get('position', {}).get('y', 0)))
|
||||
|
||||
for idx, ann in enumerate(sorted_anns, 1):
|
||||
page = ann.get('page', 1)
|
||||
text = ann.get('text', '')
|
||||
suggestion = ann.get('suggestion', '')
|
||||
severity = ann.get('severity', 'minor')
|
||||
|
||||
ann_text = f"<b>[S.{page}]</b> {text}"
|
||||
if suggestion:
|
||||
ann_text += f" -> <i>{suggestion}</i>"
|
||||
|
||||
if severity == 'critical':
|
||||
ann_text = f"<font color='red'>{ann_text}</font>"
|
||||
elif severity == 'major':
|
||||
ann_text = f"<font color='orange'>{ann_text}</font>"
|
||||
|
||||
story.append(Paragraph(f"{idx}. {ann_text}", styles['ListItem']))
|
||||
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
# Footer
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
PDF Export - Constants and ReportLab styles for Abiturkorrektur PDFs.
|
||||
"""
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
|
||||
|
||||
# =============================================
|
||||
# CONSTANTS
|
||||
# =============================================
|
||||
|
||||
GRADE_POINTS_TO_NOTE = {
|
||||
15: "1+", 14: "1", 13: "1-",
|
||||
12: "2+", 11: "2", 10: "2-",
|
||||
9: "3+", 8: "3", 7: "3-",
|
||||
6: "4+", 5: "4", 4: "4-",
|
||||
3: "5+", 2: "5", 1: "5-",
|
||||
0: "6"
|
||||
}
|
||||
|
||||
CRITERIA_DISPLAY_NAMES = {
|
||||
"rechtschreibung": "Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
"grammatik": "Sprachliche Richtigkeit (Grammatik)",
|
||||
"inhalt": "Inhaltliche Leistung",
|
||||
"struktur": "Aufbau und Struktur",
|
||||
"stil": "Ausdruck und Stil"
|
||||
}
|
||||
|
||||
CRITERIA_WEIGHTS = {
|
||||
"rechtschreibung": 15,
|
||||
"grammatik": 15,
|
||||
"inhalt": 40,
|
||||
"struktur": 15,
|
||||
"stil": 15
|
||||
}
|
||||
|
||||
|
||||
# =============================================
|
||||
# STYLES
|
||||
# =============================================
|
||||
|
||||
def get_custom_styles():
|
||||
"""Create custom paragraph styles for Gutachten."""
|
||||
styles = getSampleStyleSheet()
|
||||
|
||||
# Title style
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenTitle',
|
||||
parent=styles['Heading1'],
|
||||
fontSize=16,
|
||||
spaceAfter=12,
|
||||
alignment=TA_CENTER,
|
||||
textColor=colors.HexColor('#1e3a5f')
|
||||
))
|
||||
|
||||
# Subtitle style
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenSubtitle',
|
||||
parent=styles['Heading2'],
|
||||
fontSize=12,
|
||||
spaceAfter=8,
|
||||
spaceBefore=16,
|
||||
textColor=colors.HexColor('#2c5282')
|
||||
))
|
||||
|
||||
# Section header
|
||||
styles.add(ParagraphStyle(
|
||||
name='SectionHeader',
|
||||
parent=styles['Heading3'],
|
||||
fontSize=11,
|
||||
spaceAfter=6,
|
||||
spaceBefore=12,
|
||||
textColor=colors.HexColor('#2d3748'),
|
||||
borderColor=colors.HexColor('#e2e8f0'),
|
||||
borderWidth=0,
|
||||
borderPadding=0
|
||||
))
|
||||
|
||||
# Body text
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenBody',
|
||||
parent=styles['Normal'],
|
||||
fontSize=10,
|
||||
leading=14,
|
||||
alignment=TA_JUSTIFY,
|
||||
spaceAfter=6
|
||||
))
|
||||
|
||||
# Small text for footer/meta
|
||||
styles.add(ParagraphStyle(
|
||||
name='MetaText',
|
||||
parent=styles['Normal'],
|
||||
fontSize=8,
|
||||
textColor=colors.grey,
|
||||
alignment=TA_LEFT
|
||||
))
|
||||
|
||||
# List item
|
||||
styles.add(ParagraphStyle(
|
||||
name='ListItem',
|
||||
parent=styles['Normal'],
|
||||
fontSize=10,
|
||||
leftIndent=20,
|
||||
bulletIndent=10,
|
||||
spaceAfter=4
|
||||
))
|
||||
|
||||
return styles
|
||||
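# Minimal self-check sketch (illustrative only): the grade map and criterion
# weights above are internally consistent, e.g. 11 Notenpunkte map to the note
# "2" and the five weights sum to 100, so an 80% score on "inhalt" contributes
# 0.80 * 40 = 32 weighted points.
if __name__ == "__main__":
    assert GRADE_POINTS_TO_NOTE[11] == "2"
    assert sum(CRITERIA_WEIGHTS.values()) == 100
    print(get_custom_styles()['GutachtenTitle'].fontSize)  # 16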
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
PDF Extraction Module
|
||||
|
||||
NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.
|
||||
|
||||
Provides enhanced PDF text extraction using multiple backends (in embedding-service):
|
||||
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
|
||||
2. pypdf - Modern, BSD-licensed PDF library (recommended default)
|
||||
|
||||
License Compliance:
|
||||
- Default backends (unstructured, pypdf) are BSD/Apache licensed
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration (for backward compatibility - actual config in embedding-service)
|
||||
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
|
||||
PDF_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")
|
||||
|
||||
|
||||
class PDFExtractionError(Exception):
|
||||
"""Error during PDF extraction."""
|
||||
pass
|
||||
|
||||
|
||||
class PDFExtractionResult:
|
||||
"""Result of PDF extraction with metadata."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text: str,
|
||||
backend_used: str,
|
||||
pages: int = 0,
|
||||
elements: Optional[List[Dict]] = None,
|
||||
tables: Optional[List[Dict]] = None,
|
||||
metadata: Optional[Dict] = None,
|
||||
):
|
||||
self.text = text
|
||||
self.backend_used = backend_used
|
||||
self.pages = pages
|
||||
self.elements = elements or []
|
||||
self.tables = tables or []
|
||||
self.metadata = metadata or {}
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
return {
|
||||
"text": self.text,
|
||||
"backend_used": self.backend_used,
|
||||
"pages": self.pages,
|
||||
"element_count": len(self.elements),
|
||||
"table_count": len(self.tables),
|
||||
"metadata": self.metadata,
|
||||
}
|
||||
|
||||
|
||||
def _detect_available_backends() -> List[str]:
|
||||
"""Get available backends from embedding-service."""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=5.0) as client:
|
||||
response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
return data.get("available_pdf_backends", ["pypdf"])
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reach embedding-service: {e}")
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def extract_text_from_pdf_enhanced(
|
||||
pdf_content: bytes,
|
||||
backend: str = PDF_BACKEND,
|
||||
fallback: bool = True,
|
||||
) -> PDFExtractionResult:
|
||||
"""
|
||||
Extract text from PDF using embedding-service.
|
||||
|
||||
Args:
|
||||
pdf_content: PDF file content as bytes
|
||||
backend: Preferred backend (auto, unstructured, pypdf)
|
||||
fallback: If True, try other backends if preferred fails
|
||||
|
||||
Returns:
|
||||
PDFExtractionResult with extracted text and metadata
|
||||
"""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=120.0) as client:
|
||||
response = client.post(
|
||||
f"{EMBEDDING_SERVICE_URL}/extract-pdf",
|
||||
content=pdf_content,
|
||||
headers={"Content-Type": "application/octet-stream"}
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
return PDFExtractionResult(
|
||||
text=data.get("text", ""),
|
||||
backend_used=data.get("backend_used", "unknown"),
|
||||
pages=data.get("pages", 0),
|
||||
tables=[{"count": data.get("table_count", 0)}] if data.get("table_count", 0) > 0 else [],
|
||||
metadata={"embedding_service": True}
|
||||
)
|
||||
except httpx.TimeoutException:
|
||||
raise PDFExtractionError("PDF extraction timeout")
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise PDFExtractionError(f"PDF extraction error: {e.response.status_code}")
|
||||
except Exception as e:
|
||||
raise PDFExtractionError(f"Failed to extract PDF: {str(e)}")
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from PDF (simple interface).
|
||||
|
||||
This is a drop-in replacement for the original function
|
||||
that uses the embedding-service internally.
|
||||
"""
|
||||
result = extract_text_from_pdf_enhanced(pdf_content)
|
||||
return result.text
|
||||
|
||||
|
||||
def get_pdf_extraction_info() -> dict:
|
||||
"""Get information about PDF extraction configuration."""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=5.0) as client:
|
||||
response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
available = data.get("available_pdf_backends", [])
|
||||
return {
|
||||
"configured_backend": data.get("pdf_backend", PDF_BACKEND),
|
||||
"available_backends": available,
|
||||
"recommended": "unstructured" if "unstructured" in available else "pypdf",
|
||||
"backend_licenses": {
|
||||
"unstructured": "Apache-2.0",
|
||||
"pypdf": "BSD-3-Clause",
|
||||
},
|
||||
"commercial_safe_backends": available,
|
||||
"embedding_service_url": EMBEDDING_SERVICE_URL,
|
||||
"embedding_service_available": True,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reach embedding-service: {e}")
|
||||
|
||||
# Fallback when embedding-service is not available
|
||||
return {
|
||||
"configured_backend": PDF_BACKEND,
|
||||
"available_backends": [],
|
||||
"recommended": None,
|
||||
"backend_licenses": {},
|
||||
"commercial_safe_backends": [],
|
||||
"embedding_service_url": EMBEDDING_SERVICE_URL,
|
||||
"embedding_service_available": False,
|
||||
}
|
||||
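# Usage sketch (illustrative only): extracting text from a local PDF through
# the embedding-service. Requires EMBEDDING_SERVICE_URL to point at a reachable
# instance; the file path is a placeholder.
if __name__ == "__main__":
    with open("example.pdf", "rb") as fh:
        content = fh.read()
    try:
        result = extract_text_from_pdf_enhanced(content)
        print(result.backend_used, result.pages, len(result.text))
    except PDFExtractionError as exc:
        print(f"Extraction failed: {exc}")
    print(get_pdf_extraction_info()["embedding_service_available"])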
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
metrics package — PostgreSQL metrics database operations.
|
||||
|
||||
Backward-compatible re-exports: consumers can still use
|
||||
``from metrics_db import ...`` etc. via the shim files in backend/.
|
||||
"""
|
||||
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database Service — Barrel Re-export
|
||||
|
||||
Split into:
|
||||
- metrics_db_core.py — Pool, feedback, metrics, relevance
|
||||
- metrics_db_schema.py — Table initialization (DDL)
|
||||
- metrics_db_zeugnis.py — Zeugnis source/document/stats operations
|
||||
|
||||
All public names are re-exported here for backward compatibility.
|
||||
"""
|
||||
|
||||
# Schema: table initialization
|
||||
from .db_schema import init_metrics_tables # noqa: F401
|
||||
|
||||
# Core: pool, feedback, search logs, metrics, relevance
|
||||
from .db_core import ( # noqa: F401
|
||||
DATABASE_URL,
|
||||
get_pool,
|
||||
store_feedback,
|
||||
log_search,
|
||||
log_upload,
|
||||
calculate_metrics,
|
||||
get_recent_feedback,
|
||||
get_upload_history,
|
||||
store_relevance_judgment,
|
||||
calculate_precision_recall,
|
||||
)
|
||||
|
||||
# Zeugnis operations
|
||||
from .db_zeugnis import ( # noqa: F401
|
||||
get_zeugnis_sources,
|
||||
upsert_zeugnis_source,
|
||||
get_zeugnis_documents,
|
||||
get_zeugnis_stats,
|
||||
log_zeugnis_event,
|
||||
)
|
||||
@@ -0,0 +1,459 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Core Operations
|
||||
|
||||
Connection pool, table initialization, feedback storage, search logging,
|
||||
upload history, metrics calculation, and relevance judgments.
|
||||
|
||||
Extracted from metrics_db.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Database Configuration - uses test default if not configured (for CI)
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://test:test@localhost:5432/test_metrics")
|
||||
|
||||
# Connection pool
|
||||
_pool = None
|
||||
|
||||
|
||||
async def get_pool():
|
||||
"""Get or create database connection pool."""
|
||||
global _pool
|
||||
if _pool is None:
|
||||
try:
|
||||
import asyncpg
|
||||
_pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
|
||||
except ImportError:
|
||||
print("Warning: asyncpg not installed. Metrics storage disabled.")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to connect to PostgreSQL: {e}")
|
||||
return None
|
||||
return _pool
|
||||
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Feedback Storage
|
||||
# =============================================================================
|
||||
|
||||
async def store_feedback(
|
||||
result_id: str,
|
||||
rating: int,
|
||||
query_text: Optional[str] = None,
|
||||
collection_name: Optional[str] = None,
|
||||
score: Optional[float] = None,
|
||||
notes: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Store search result feedback."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_search_feedback
|
||||
(result_id, query_text, collection_name, score, rating, notes, user_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
""",
|
||||
result_id, query_text, collection_name, score, rating, notes, user_id
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to store feedback: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def log_search(
|
||||
query_text: str,
|
||||
collection_name: str,
|
||||
result_count: int,
|
||||
latency_ms: int,
|
||||
top_score: Optional[float] = None,
|
||||
filters: Optional[Dict] = None,
|
||||
) -> bool:
|
||||
"""Log a search for metrics tracking."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
import json
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_search_logs
|
||||
(query_text, collection_name, result_count, latency_ms, top_score, filters)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""",
|
||||
query_text, collection_name, result_count, latency_ms, top_score,
|
||||
json.dumps(filters) if filters else None
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log search: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def log_upload(
|
||||
filename: str,
|
||||
collection_name: str,
|
||||
year: int,
|
||||
pdfs_extracted: int,
|
||||
minio_path: Optional[str] = None,
|
||||
uploaded_by: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Log an upload for history tracking."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_upload_history
|
||||
(filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""",
|
||||
filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log upload: {e}")
|
||||
return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Metrics Calculation
|
||||
# =============================================================================
|
||||
|
||||
async def calculate_metrics(
|
||||
collection_name: Optional[str] = None,
|
||||
days: int = 7,
|
||||
) -> Dict:
|
||||
"""
|
||||
Calculate RAG quality metrics from stored feedback.
|
||||
|
||||
Returns:
|
||||
Dict with precision, recall, MRR, latency, etc.
|
||||
"""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available", "connected": False}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
|
||||
collection_filter = ""
|
||||
params = [since]
|
||||
if collection_name:
|
||||
collection_filter = "AND collection_name = $2"
|
||||
params.append(collection_name)
|
||||
|
||||
total_feedback = await conn.fetchval(
|
||||
f"""
|
||||
SELECT COUNT(*) FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
rating_dist = await conn.fetch(
|
||||
f"""
|
||||
SELECT rating, COUNT(*) as count
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
GROUP BY rating
|
||||
ORDER BY rating DESC
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
avg_rating = await conn.fetchval(
|
||||
f"""
|
||||
SELECT AVG(rating) FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
score_dist = await conn.fetch(
|
||||
f"""
|
||||
SELECT
|
||||
CASE
|
||||
WHEN score >= 0.9 THEN '0.9+'
|
||||
WHEN score >= 0.7 THEN '0.7-0.9'
|
||||
WHEN score >= 0.5 THEN '0.5-0.7'
|
||||
ELSE '<0.5'
|
||||
END as range,
|
||||
COUNT(*) as count
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 AND score IS NOT NULL {collection_filter}
|
||||
GROUP BY range
|
||||
ORDER BY range DESC
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
latency_stats = await conn.fetchrow(
|
||||
f"""
|
||||
SELECT
|
||||
AVG(latency_ms) as avg_latency,
|
||||
COUNT(*) as total_searches,
|
||||
AVG(result_count) as avg_results
|
||||
FROM rag_search_logs
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
precision_at_5 = await conn.fetchval(
|
||||
f"""
|
||||
SELECT
|
||||
CASE WHEN COUNT(*) > 0
|
||||
THEN CAST(SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*)
|
||||
ELSE 0 END
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
mrr = (avg_rating or 0) / 5.0
|
||||
|
||||
error_count = sum(
|
||||
r['count'] for r in rating_dist if r['rating'] and r['rating'] <= 2
|
||||
)
|
||||
error_rate = (error_count / total_feedback * 100) if total_feedback > 0 else 0
|
||||
|
||||
total_scored = sum(s['count'] for s in score_dist)
|
||||
score_distribution = {}
|
||||
for s in score_dist:
|
||||
if total_scored > 0:
|
||||
score_distribution[s['range']] = round(s['count'] / total_scored * 100)
|
||||
else:
|
||||
score_distribution[s['range']] = 0
|
||||
|
||||
return {
|
||||
"connected": True,
|
||||
"period_days": days,
|
||||
"precision_at_5": round(precision_at_5, 2),
|
||||
"recall_at_10": round(precision_at_5 * 1.1, 2),
|
||||
"mrr": round(mrr, 2),
|
||||
"avg_latency_ms": round(latency_stats['avg_latency'] or 0),
|
||||
"total_ratings": total_feedback,
|
||||
"total_searches": latency_stats['total_searches'] or 0,
|
||||
"error_rate": round(error_rate, 1),
|
||||
"score_distribution": score_distribution,
|
||||
"rating_distribution": {
|
||||
str(r['rating']): r['count'] for r in rating_dist if r['rating']
|
||||
},
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to calculate metrics: {e}")
|
||||
return {"error": str(e), "connected": False}
|
||||
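# Worked example (illustrative numbers) for the proxy metrics above: with 20
# ratings in the window, 14 ratings >= 4 give precision_at_5 = 14 / 20 = 0.70;
# 3 ratings <= 2 give error_rate = 3 / 20 * 100 = 15.0; an average rating of
# 4.1 yields the MRR proxy 4.1 / 5.0 = 0.82.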
|
||||
|
||||
async def get_recent_feedback(limit: int = 20) -> List[Dict]:
|
||||
"""Get recent feedback entries."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT result_id, rating, query_text, collection_name, score, notes, created_at
|
||||
FROM rag_search_feedback
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""",
|
||||
limit
|
||||
)
|
||||
return [
|
||||
{
|
||||
"result_id": r['result_id'],
|
||||
"rating": r['rating'],
|
||||
"query_text": r['query_text'],
|
||||
"collection_name": r['collection_name'],
|
||||
"score": r['score'],
|
||||
"notes": r['notes'],
|
||||
"created_at": r['created_at'].isoformat() if r['created_at'] else None,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
except Exception as e:
|
||||
print(f"Failed to get recent feedback: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def get_upload_history(limit: int = 20) -> List[Dict]:
|
||||
"""Get recent upload history."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by, created_at
|
||||
FROM rag_upload_history
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""",
|
||||
limit
|
||||
)
|
||||
return [
|
||||
{
|
||||
"filename": r['filename'],
|
||||
"collection_name": r['collection_name'],
|
||||
"year": r['year'],
|
||||
"pdfs_extracted": r['pdfs_extracted'],
|
||||
"minio_path": r['minio_path'],
|
||||
"uploaded_by": r['uploaded_by'],
|
||||
"created_at": r['created_at'].isoformat() if r['created_at'] else None,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
except Exception as e:
|
||||
print(f"Failed to get upload history: {e}")
|
||||
return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Relevance Judgments (Binary Precision/Recall)
|
||||
# =============================================================================
|
||||
|
||||
async def store_relevance_judgment(
|
||||
query_id: str,
|
||||
query_text: str,
|
||||
result_id: str,
|
||||
is_relevant: bool,
|
||||
result_rank: Optional[int] = None,
|
||||
collection_name: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Store binary relevance judgment for Precision/Recall calculation."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_relevance_judgments
|
||||
(query_id, query_text, result_id, result_rank, is_relevant, collection_name, user_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
ON CONFLICT DO NOTHING
|
||||
""",
|
||||
query_id, query_text, result_id, result_rank, is_relevant, collection_name, user_id
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to store relevance judgment: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def calculate_precision_recall(
|
||||
collection_name: Optional[str] = None,
|
||||
days: int = 7,
|
||||
k: int = 10,
|
||||
) -> Dict:
|
||||
"""
|
||||
Calculate true Precision@k and Recall@k from binary relevance judgments.
|
||||
|
||||
Precision@k = (Relevant docs in top k) / k
|
||||
Recall@k = (Relevant docs in top k) / (Total relevant docs for query)
|
||||
"""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available", "connected": False}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
|
||||
collection_filter = ""
|
||||
params = [since, k]
|
||||
if collection_name:
|
||||
collection_filter = "AND collection_name = $3"
|
||||
params.append(collection_name)
|
||||
|
||||
precision_result = await conn.fetchval(
|
||||
f"""
|
||||
WITH query_precision AS (
|
||||
SELECT
|
||||
query_id,
|
||||
COUNT(CASE WHEN is_relevant THEN 1 END)::FLOAT /
|
||||
GREATEST(COUNT(*), 1) as precision
|
||||
FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1
|
||||
AND (result_rank IS NULL OR result_rank <= $2)
|
||||
{collection_filter}
|
||||
GROUP BY query_id
|
||||
)
|
||||
SELECT AVG(precision) FROM query_precision
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
recall_result = await conn.fetchval(
|
||||
f"""
|
||||
WITH query_recall AS (
|
||||
SELECT
|
||||
query_id,
|
||||
COUNT(CASE WHEN is_relevant AND (result_rank IS NULL OR result_rank <= $2) THEN 1 END)::FLOAT /
|
||||
GREATEST(COUNT(CASE WHEN is_relevant THEN 1 END), 1) as recall
|
||||
FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1
|
||||
{collection_filter}
|
||||
GROUP BY query_id
|
||||
)
|
||||
SELECT AVG(recall) FROM query_recall
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
# collection_filter above is bound to $3 for the precision/recall queries;
# these COUNT queries only pass $1 (and optionally the collection name), so
# they need their own filter bound to $2.
count_filter = "AND collection_name = $2" if collection_name else ""
count_params = [since] + ([collection_name] if collection_name else [])

total_judgments = await conn.fetchval(
f"""
SELECT COUNT(*) FROM rag_relevance_judgments
WHERE created_at >= $1 {count_filter}
""",
*count_params
)

unique_queries = await conn.fetchval(
f"""
SELECT COUNT(DISTINCT query_id) FROM rag_relevance_judgments
WHERE created_at >= $1 {count_filter}
""",
*count_params
)
|
||||
|
||||
return {
|
||||
"connected": True,
|
||||
"period_days": days,
|
||||
"k": k,
|
||||
"precision_at_k": round(precision_result, 3),
|
||||
"recall_at_k": round(recall_result, 3),
|
||||
"f1_score": round(
|
||||
2 * precision_result * recall_result / max(precision_result + recall_result, 0.001), 3
|
||||
),
|
||||
"total_judgments": total_judgments or 0,
|
||||
"unique_queries": unique_queries or 0,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to calculate precision/recall: {e}")
|
||||
return {"error": str(e), "connected": False}
|
||||
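# A minimal usage sketch of the relevance-judgment flow defined above
# (metrics/db_core.py). It assumes the metrics package from this commit is
# importable and DATABASE_URL points at a reachable PostgreSQL instance;
# the query and result ids below are purely illustrative.
import asyncio

from metrics.db_core import store_relevance_judgment, calculate_precision_recall


async def demo():
    # One query, three judged results: ranks 1 and 3 relevant, rank 2 not.
    for rank, (result_id, relevant) in enumerate(
        [("doc-a", True), ("doc-b", False), ("doc-c", True)], start=1
    ):
        await store_relevance_judgment(
            query_id="q-001",
            query_text="Bruchrechnung Klasse 6",
            result_id=result_id,
            is_relevant=relevant,
            result_rank=rank,
            collection_name="nibis",
        )
    # Averages Precision@k / Recall@k per query over the stored judgments.
    print(await calculate_precision_recall(collection_name="nibis", k=10))


if __name__ == "__main__":
    asyncio.run(demo())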
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Schema Initialization
|
||||
|
||||
Table creation DDL for all metrics, feedback, and zeugnis tables.
|
||||
|
||||
Extracted from metrics_db_core.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
from .db_core import get_pool
|
||||
|
||||
|
||||
async def init_metrics_tables() -> bool:
|
||||
"""Initialize metrics tables in PostgreSQL."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
create_tables_sql = """
|
||||
-- RAG Search Feedback Table
|
||||
CREATE TABLE IF NOT EXISTS rag_search_feedback (
|
||||
id SERIAL PRIMARY KEY,
|
||||
result_id VARCHAR(255) NOT NULL,
|
||||
query_text TEXT,
|
||||
collection_name VARCHAR(100),
|
||||
score FLOAT,
|
||||
rating INTEGER CHECK (rating >= 1 AND rating <= 5),
|
||||
notes TEXT,
|
||||
user_id VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- Index for efficient querying
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_created_at ON rag_search_feedback(created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_collection ON rag_search_feedback(collection_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_rating ON rag_search_feedback(rating);
|
||||
|
||||
-- RAG Search Logs Table (for latency tracking)
|
||||
CREATE TABLE IF NOT EXISTS rag_search_logs (
|
||||
id SERIAL PRIMARY KEY,
|
||||
query_text TEXT NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
result_count INTEGER,
|
||||
latency_ms INTEGER,
|
||||
top_score FLOAT,
|
||||
filters JSONB,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_search_logs_created_at ON rag_search_logs(created_at);
|
||||
|
||||
-- RAG Upload History Table
|
||||
CREATE TABLE IF NOT EXISTS rag_upload_history (
|
||||
id SERIAL PRIMARY KEY,
|
||||
filename VARCHAR(500) NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
year INTEGER,
|
||||
pdfs_extracted INTEGER,
|
||||
minio_path VARCHAR(1000),
|
||||
uploaded_by VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_upload_history_created_at ON rag_upload_history(created_at);
|
||||
|
||||
-- Binary relevance judgments for true Precision/Recall
|
||||
CREATE TABLE IF NOT EXISTS rag_relevance_judgments (
|
||||
id SERIAL PRIMARY KEY,
|
||||
query_id VARCHAR(255) NOT NULL,
|
||||
query_text TEXT NOT NULL,
|
||||
result_id VARCHAR(255) NOT NULL,
|
||||
result_rank INTEGER,
|
||||
is_relevant BOOLEAN NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
user_id VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_query ON rag_relevance_judgments(query_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_created_at ON rag_relevance_judgments(created_at);
|
||||
|
||||
-- Zeugnisse Source Tracking
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_sources (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
bundesland VARCHAR(10) NOT NULL,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
base_url TEXT,
|
||||
license_type VARCHAR(50) NOT NULL,
|
||||
training_allowed BOOLEAN DEFAULT FALSE,
|
||||
verified_by VARCHAR(100),
|
||||
verified_at TIMESTAMP,
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_sources_bundesland ON zeugnis_sources(bundesland);
|
||||
|
||||
-- Zeugnisse Seed URLs
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_seed_urls (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
|
||||
url TEXT NOT NULL,
|
||||
doc_type VARCHAR(50),
|
||||
status VARCHAR(20) DEFAULT 'pending',
|
||||
last_crawled TIMESTAMP,
|
||||
error_message TEXT,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_source ON zeugnis_seed_urls(source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_status ON zeugnis_seed_urls(status);
|
||||
|
||||
-- Zeugnisse Documents
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_documents (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
seed_url_id VARCHAR(36) REFERENCES zeugnis_seed_urls(id),
|
||||
title VARCHAR(500),
|
||||
url TEXT NOT NULL,
|
||||
content_hash VARCHAR(64),
|
||||
minio_path TEXT,
|
||||
training_allowed BOOLEAN DEFAULT FALSE,
|
||||
indexed_in_qdrant BOOLEAN DEFAULT FALSE,
|
||||
file_size INTEGER,
|
||||
content_type VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_seed ON zeugnis_documents(seed_url_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_hash ON zeugnis_documents(content_hash);
|
||||
|
||||
-- Zeugnisse Document Versions
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_document_versions (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
|
||||
version INTEGER NOT NULL,
|
||||
content_hash VARCHAR(64),
|
||||
minio_path TEXT,
|
||||
change_summary TEXT,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_versions_doc ON zeugnis_document_versions(document_id);
|
||||
|
||||
-- Zeugnisse Usage Events (Audit Trail)
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_usage_events (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
|
||||
event_type VARCHAR(50) NOT NULL,
|
||||
user_id VARCHAR(100),
|
||||
details JSONB,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_doc ON zeugnis_usage_events(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_type ON zeugnis_usage_events(event_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_created ON zeugnis_usage_events(created_at);
|
||||
|
||||
-- Crawler Queue
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_crawler_queue (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
|
||||
priority INTEGER DEFAULT 5,
|
||||
status VARCHAR(20) DEFAULT 'pending',
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
documents_found INTEGER DEFAULT 0,
|
||||
documents_indexed INTEGER DEFAULT 0,
|
||||
error_count INTEGER DEFAULT 0,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_crawler_queue_status ON zeugnis_crawler_queue(status);
|
||||
"""
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(create_tables_sql)
|
||||
print("RAG metrics tables initialized")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to initialize metrics tables: {e}")
|
||||
return False
|
||||
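# Sketch of wiring init_metrics_tables() (metrics/db_schema.py above) into
# application startup. The FastAPI lifespan hook shown here is illustrative
# wiring, not part of this commit; the DDL itself is safe to re-run because
# every statement uses CREATE ... IF NOT EXISTS.
from contextlib import asynccontextmanager

from fastapi import FastAPI

from metrics.db_schema import init_metrics_tables


@asynccontextmanager
async def lifespan(app: FastAPI):
    if not await init_metrics_tables():
        print("Warning: metrics tables not initialized (no database pool)")
    yield


app = FastAPI(lifespan=lifespan)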
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Zeugnis Operations
|
||||
|
||||
Zeugnis source management, document queries, statistics, and event logging.
|
||||
|
||||
Extracted from metrics_db.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from .db_core import get_pool
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Zeugnis Database Operations
|
||||
# =============================================================================
|
||||
|
||||
async def get_zeugnis_sources() -> List[Dict]:
|
||||
"""Get all zeugnis sources (Bundeslaender)."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT id, bundesland, name, base_url, license_type, training_allowed,
|
||||
verified_by, verified_at, created_at, updated_at
|
||||
FROM zeugnis_sources
|
||||
ORDER BY bundesland
|
||||
"""
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis sources: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def upsert_zeugnis_source(
|
||||
id: str,
|
||||
bundesland: str,
|
||||
name: str,
|
||||
license_type: str,
|
||||
training_allowed: bool,
|
||||
base_url: Optional[str] = None,
|
||||
verified_by: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Insert or update a zeugnis source."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_sources (id, bundesland, name, base_url, license_type, training_allowed, verified_by, verified_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, NOW())
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
base_url = EXCLUDED.base_url,
|
||||
license_type = EXCLUDED.license_type,
|
||||
training_allowed = EXCLUDED.training_allowed,
|
||||
verified_by = EXCLUDED.verified_by,
|
||||
verified_at = NOW(),
|
||||
updated_at = NOW()
|
||||
""",
|
||||
id, bundesland, name, base_url, license_type, training_allowed, verified_by
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to upsert zeugnis source: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def get_zeugnis_documents(
|
||||
bundesland: Optional[str] = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> List[Dict]:
|
||||
"""Get zeugnis documents with optional filtering."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
if bundesland:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
WHERE s.bundesland = $1
|
||||
ORDER BY d.created_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
""",
|
||||
bundesland, limit, offset
|
||||
)
|
||||
else:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
ORDER BY d.created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
""",
|
||||
limit, offset
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis documents: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def get_zeugnis_stats() -> Dict:
|
||||
"""Get zeugnis crawler statistics."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available"}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
sources = await conn.fetchval("SELECT COUNT(*) FROM zeugnis_sources")
|
||||
documents = await conn.fetchval("SELECT COUNT(*) FROM zeugnis_documents")
|
||||
|
||||
indexed = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_documents WHERE indexed_in_qdrant = true"
|
||||
)
|
||||
|
||||
training_allowed = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_documents WHERE training_allowed = true"
|
||||
)
|
||||
|
||||
per_bundesland = await conn.fetch(
|
||||
"""
|
||||
SELECT s.bundesland, s.name, s.training_allowed, COUNT(d.id) as doc_count
|
||||
FROM zeugnis_sources s
|
||||
LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
|
||||
LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
|
||||
GROUP BY s.bundesland, s.name, s.training_allowed
|
||||
ORDER BY s.bundesland
|
||||
"""
|
||||
)
|
||||
|
||||
active_crawls = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_crawler_queue WHERE status = 'running'"
|
||||
)
|
||||
|
||||
return {
|
||||
"total_sources": sources or 0,
|
||||
"total_documents": documents or 0,
|
||||
"indexed_documents": indexed or 0,
|
||||
"training_allowed_documents": training_allowed or 0,
|
||||
"active_crawls": active_crawls or 0,
|
||||
"per_bundesland": [dict(r) for r in per_bundesland],
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis stats: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
async def log_zeugnis_event(
|
||||
document_id: str,
|
||||
event_type: str,
|
||||
user_id: Optional[str] = None,
|
||||
details: Optional[Dict] = None,
|
||||
) -> bool:
|
||||
"""Log a zeugnis usage event for audit trail."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
import json
|
||||
import uuid
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_usage_events (id, document_id, event_type, user_id, details)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
""",
|
||||
str(uuid.uuid4()), document_id, event_type, user_id,
|
||||
json.dumps(details) if details else None
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log zeugnis event: {e}")
|
||||
return False
|
||||
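# Minimal sketch of the zeugnis operations above (metrics/db_zeugnis.py):
# register a source, log a usage event, read the stats. All ids and values
# are made up; log_zeugnis_event only succeeds if document_id references an
# existing row in zeugnis_documents.
import asyncio
import uuid

from metrics.db_zeugnis import (
    upsert_zeugnis_source,
    log_zeugnis_event,
    get_zeugnis_stats,
)


async def demo():
    await upsert_zeugnis_source(
        id=str(uuid.uuid4()),
        bundesland="NI",
        name="Niedersachsen (example source)",
        license_type="public",
        training_allowed=False,
        verified_by="admin",
    )
    await log_zeugnis_event(
        document_id="00000000-0000-0000-0000-000000000000",
        event_type="viewed",
        user_id="teacher-42",
        details={"source": "demo"},
    )
    print(await get_zeugnis_stats())


asyncio.run(demo())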
@@ -1,36 +1,4 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database Service — Barrel Re-export
|
||||
|
||||
Split into:
|
||||
- metrics_db_core.py — Pool, feedback, metrics, relevance
|
||||
- metrics_db_schema.py — Table initialization (DDL)
|
||||
- metrics_db_zeugnis.py — Zeugnis source/document/stats operations
|
||||
|
||||
All public names are re-exported here for backward compatibility.
|
||||
"""
|
||||
|
||||
# Schema: table initialization
|
||||
from metrics_db_schema import init_metrics_tables # noqa: F401
|
||||
|
||||
# Core: pool, feedback, search logs, metrics, relevance
|
||||
from metrics_db_core import ( # noqa: F401
|
||||
DATABASE_URL,
|
||||
get_pool,
|
||||
store_feedback,
|
||||
log_search,
|
||||
log_upload,
|
||||
calculate_metrics,
|
||||
get_recent_feedback,
|
||||
get_upload_history,
|
||||
store_relevance_judgment,
|
||||
calculate_precision_recall,
|
||||
)
|
||||
|
||||
# Zeugnis operations
|
||||
from metrics_db_zeugnis import ( # noqa: F401
|
||||
get_zeugnis_sources,
|
||||
upsert_zeugnis_source,
|
||||
get_zeugnis_documents,
|
||||
get_zeugnis_stats,
|
||||
log_zeugnis_event,
|
||||
)
|
||||
# Backward-compat shim -- module moved to metrics/db.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("metrics.db")
|
||||
|
||||
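# How the backward-compat shim above works: executing backend/metrics_db.py
# swaps the entry in sys.modules for the relocated module, so legacy imports
# keep resolving to the new package. Sketch only; it assumes backend/ and the
# new package root are both on sys.path and that metrics.db re-exports the
# same public names the old flat module did.
import metrics_db   # runs the shim, which replaces sys.modules["metrics_db"]
import metrics.db

assert metrics_db is metrics.db   # both names refer to the same module object
get_pool = metrics_db.get_pool    # old attribute access keeps working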
@@ -1,459 +1,4 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Core Operations
|
||||
|
||||
Connection pool, table initialization, feedback storage, search logging,
|
||||
upload history, metrics calculation, and relevance judgments.
|
||||
|
||||
Extracted from metrics_db.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Database Configuration - uses test default if not configured (for CI)
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://test:test@localhost:5432/test_metrics")
|
||||
|
||||
# Connection pool
|
||||
_pool = None
|
||||
|
||||
|
||||
async def get_pool():
|
||||
"""Get or create database connection pool."""
|
||||
global _pool
|
||||
if _pool is None:
|
||||
try:
|
||||
import asyncpg
|
||||
_pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
|
||||
except ImportError:
|
||||
print("Warning: asyncpg not installed. Metrics storage disabled.")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to connect to PostgreSQL: {e}")
|
||||
return None
|
||||
return _pool
|
||||
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Feedback Storage
|
||||
# =============================================================================
|
||||
|
||||
async def store_feedback(
|
||||
result_id: str,
|
||||
rating: int,
|
||||
query_text: Optional[str] = None,
|
||||
collection_name: Optional[str] = None,
|
||||
score: Optional[float] = None,
|
||||
notes: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Store search result feedback."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_search_feedback
|
||||
(result_id, query_text, collection_name, score, rating, notes, user_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
""",
|
||||
result_id, query_text, collection_name, score, rating, notes, user_id
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to store feedback: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def log_search(
|
||||
query_text: str,
|
||||
collection_name: str,
|
||||
result_count: int,
|
||||
latency_ms: int,
|
||||
top_score: Optional[float] = None,
|
||||
filters: Optional[Dict] = None,
|
||||
) -> bool:
|
||||
"""Log a search for metrics tracking."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
import json
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_search_logs
|
||||
(query_text, collection_name, result_count, latency_ms, top_score, filters)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""",
|
||||
query_text, collection_name, result_count, latency_ms, top_score,
|
||||
json.dumps(filters) if filters else None
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log search: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def log_upload(
|
||||
filename: str,
|
||||
collection_name: str,
|
||||
year: int,
|
||||
pdfs_extracted: int,
|
||||
minio_path: Optional[str] = None,
|
||||
uploaded_by: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Log an upload for history tracking."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_upload_history
|
||||
(filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""",
|
||||
filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log upload: {e}")
|
||||
return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Metrics Calculation
|
||||
# =============================================================================
|
||||
|
||||
async def calculate_metrics(
|
||||
collection_name: Optional[str] = None,
|
||||
days: int = 7,
|
||||
) -> Dict:
|
||||
"""
|
||||
Calculate RAG quality metrics from stored feedback.
|
||||
|
||||
Returns:
|
||||
Dict with precision, recall, MRR, latency, etc.
|
||||
"""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available", "connected": False}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
|
||||
collection_filter = ""
|
||||
params = [since]
|
||||
if collection_name:
|
||||
collection_filter = "AND collection_name = $2"
|
||||
params.append(collection_name)
|
||||
|
||||
total_feedback = await conn.fetchval(
|
||||
f"""
|
||||
SELECT COUNT(*) FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
rating_dist = await conn.fetch(
|
||||
f"""
|
||||
SELECT rating, COUNT(*) as count
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
GROUP BY rating
|
||||
ORDER BY rating DESC
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
avg_rating = await conn.fetchval(
|
||||
f"""
|
||||
SELECT AVG(rating) FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
score_dist = await conn.fetch(
|
||||
f"""
|
||||
SELECT
|
||||
CASE
|
||||
WHEN score >= 0.9 THEN '0.9+'
|
||||
WHEN score >= 0.7 THEN '0.7-0.9'
|
||||
WHEN score >= 0.5 THEN '0.5-0.7'
|
||||
ELSE '<0.5'
|
||||
END as range,
|
||||
COUNT(*) as count
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 AND score IS NOT NULL {collection_filter}
|
||||
GROUP BY range
|
||||
ORDER BY range DESC
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
latency_stats = await conn.fetchrow(
|
||||
f"""
|
||||
SELECT
|
||||
AVG(latency_ms) as avg_latency,
|
||||
COUNT(*) as total_searches,
|
||||
AVG(result_count) as avg_results
|
||||
FROM rag_search_logs
|
||||
WHERE created_at >= $1 {collection_filter.replace('collection_name', 'collection_name')}
|
||||
""",
|
||||
*params
|
||||
)
|
||||
|
||||
precision_at_5 = await conn.fetchval(
|
||||
f"""
|
||||
SELECT
|
||||
CASE WHEN COUNT(*) > 0
|
||||
THEN CAST(SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*)
|
||||
ELSE 0 END
|
||||
FROM rag_search_feedback
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
mrr = (avg_rating or 0) / 5.0
|
||||
|
||||
error_count = sum(
|
||||
r['count'] for r in rating_dist if r['rating'] and r['rating'] <= 2
|
||||
)
|
||||
error_rate = (error_count / total_feedback * 100) if total_feedback > 0 else 0
|
||||
|
||||
total_scored = sum(s['count'] for s in score_dist)
|
||||
score_distribution = {}
|
||||
for s in score_dist:
|
||||
if total_scored > 0:
|
||||
score_distribution[s['range']] = round(s['count'] / total_scored * 100)
|
||||
else:
|
||||
score_distribution[s['range']] = 0
|
||||
|
||||
return {
|
||||
"connected": True,
|
||||
"period_days": days,
|
||||
"precision_at_5": round(precision_at_5, 2),
|
||||
"recall_at_10": round(precision_at_5 * 1.1, 2),
|
||||
"mrr": round(mrr, 2),
|
||||
"avg_latency_ms": round(latency_stats['avg_latency'] or 0),
|
||||
"total_ratings": total_feedback,
|
||||
"total_searches": latency_stats['total_searches'] or 0,
|
||||
"error_rate": round(error_rate, 1),
|
||||
"score_distribution": score_distribution,
|
||||
"rating_distribution": {
|
||||
str(r['rating']): r['count'] for r in rating_dist if r['rating']
|
||||
},
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to calculate metrics: {e}")
|
||||
return {"error": str(e), "connected": False}
|
||||
|
||||
|
||||
async def get_recent_feedback(limit: int = 20) -> List[Dict]:
|
||||
"""Get recent feedback entries."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT result_id, rating, query_text, collection_name, score, notes, created_at
|
||||
FROM rag_search_feedback
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""",
|
||||
limit
|
||||
)
|
||||
return [
|
||||
{
|
||||
"result_id": r['result_id'],
|
||||
"rating": r['rating'],
|
||||
"query_text": r['query_text'],
|
||||
"collection_name": r['collection_name'],
|
||||
"score": r['score'],
|
||||
"notes": r['notes'],
|
||||
"created_at": r['created_at'].isoformat() if r['created_at'] else None,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
except Exception as e:
|
||||
print(f"Failed to get recent feedback: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def get_upload_history(limit: int = 20) -> List[Dict]:
|
||||
"""Get recent upload history."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by, created_at
|
||||
FROM rag_upload_history
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1
|
||||
""",
|
||||
limit
|
||||
)
|
||||
return [
|
||||
{
|
||||
"filename": r['filename'],
|
||||
"collection_name": r['collection_name'],
|
||||
"year": r['year'],
|
||||
"pdfs_extracted": r['pdfs_extracted'],
|
||||
"minio_path": r['minio_path'],
|
||||
"uploaded_by": r['uploaded_by'],
|
||||
"created_at": r['created_at'].isoformat() if r['created_at'] else None,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
except Exception as e:
|
||||
print(f"Failed to get upload history: {e}")
|
||||
return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Relevance Judgments (Binary Precision/Recall)
|
||||
# =============================================================================
|
||||
|
||||
async def store_relevance_judgment(
|
||||
query_id: str,
|
||||
query_text: str,
|
||||
result_id: str,
|
||||
is_relevant: bool,
|
||||
result_rank: Optional[int] = None,
|
||||
collection_name: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Store binary relevance judgment for Precision/Recall calculation."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO rag_relevance_judgments
|
||||
(query_id, query_text, result_id, result_rank, is_relevant, collection_name, user_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
ON CONFLICT DO NOTHING
|
||||
""",
|
||||
query_id, query_text, result_id, result_rank, is_relevant, collection_name, user_id
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to store relevance judgment: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def calculate_precision_recall(
|
||||
collection_name: Optional[str] = None,
|
||||
days: int = 7,
|
||||
k: int = 10,
|
||||
) -> Dict:
|
||||
"""
|
||||
Calculate true Precision@k and Recall@k from binary relevance judgments.
|
||||
|
||||
Precision@k = (Relevant docs in top k) / k
|
||||
Recall@k = (Relevant docs in top k) / (Total relevant docs for query)
|
||||
"""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available", "connected": False}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
|
||||
collection_filter = ""
|
||||
params = [since, k]
|
||||
if collection_name:
|
||||
collection_filter = "AND collection_name = $3"
|
||||
params.append(collection_name)
|
||||
|
||||
precision_result = await conn.fetchval(
|
||||
f"""
|
||||
WITH query_precision AS (
|
||||
SELECT
|
||||
query_id,
|
||||
COUNT(CASE WHEN is_relevant THEN 1 END)::FLOAT /
|
||||
GREATEST(COUNT(*), 1) as precision
|
||||
FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1
|
||||
AND (result_rank IS NULL OR result_rank <= $2)
|
||||
{collection_filter}
|
||||
GROUP BY query_id
|
||||
)
|
||||
SELECT AVG(precision) FROM query_precision
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
recall_result = await conn.fetchval(
|
||||
f"""
|
||||
WITH query_recall AS (
|
||||
SELECT
|
||||
query_id,
|
||||
COUNT(CASE WHEN is_relevant AND (result_rank IS NULL OR result_rank <= $2) THEN 1 END)::FLOAT /
|
||||
GREATEST(COUNT(CASE WHEN is_relevant THEN 1 END), 1) as recall
|
||||
FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1
|
||||
{collection_filter}
|
||||
GROUP BY query_id
|
||||
)
|
||||
SELECT AVG(recall) FROM query_recall
|
||||
""",
|
||||
*params
|
||||
) or 0
|
||||
|
||||
total_judgments = await conn.fetchval(
|
||||
f"""
|
||||
SELECT COUNT(*) FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
since, *([collection_name] if collection_name else [])
|
||||
)
|
||||
|
||||
unique_queries = await conn.fetchval(
|
||||
f"""
|
||||
SELECT COUNT(DISTINCT query_id) FROM rag_relevance_judgments
|
||||
WHERE created_at >= $1 {collection_filter}
|
||||
""",
|
||||
since, *([collection_name] if collection_name else [])
|
||||
)
|
||||
|
||||
return {
|
||||
"connected": True,
|
||||
"period_days": days,
|
||||
"k": k,
|
||||
"precision_at_k": round(precision_result, 3),
|
||||
"recall_at_k": round(recall_result, 3),
|
||||
"f1_score": round(
|
||||
2 * precision_result * recall_result / max(precision_result + recall_result, 0.001), 3
|
||||
),
|
||||
"total_judgments": total_judgments or 0,
|
||||
"unique_queries": unique_queries or 0,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to calculate precision/recall: {e}")
|
||||
return {"error": str(e), "connected": False}
|
||||
# Backward-compat shim -- module moved to metrics/db_core.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("metrics.db_core")
|
||||
|
||||
@@ -1,182 +1,4 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Schema Initialization
|
||||
|
||||
Table creation DDL for all metrics, feedback, and zeugnis tables.
|
||||
|
||||
Extracted from metrics_db_core.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
from metrics_db_core import get_pool
|
||||
|
||||
|
||||
async def init_metrics_tables() -> bool:
|
||||
"""Initialize metrics tables in PostgreSQL."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
create_tables_sql = """
|
||||
-- RAG Search Feedback Table
|
||||
CREATE TABLE IF NOT EXISTS rag_search_feedback (
|
||||
id SERIAL PRIMARY KEY,
|
||||
result_id VARCHAR(255) NOT NULL,
|
||||
query_text TEXT,
|
||||
collection_name VARCHAR(100),
|
||||
score FLOAT,
|
||||
rating INTEGER CHECK (rating >= 1 AND rating <= 5),
|
||||
notes TEXT,
|
||||
user_id VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- Index for efficient querying
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_created_at ON rag_search_feedback(created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_collection ON rag_search_feedback(collection_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_feedback_rating ON rag_search_feedback(rating);
|
||||
|
||||
-- RAG Search Logs Table (for latency tracking)
|
||||
CREATE TABLE IF NOT EXISTS rag_search_logs (
|
||||
id SERIAL PRIMARY KEY,
|
||||
query_text TEXT NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
result_count INTEGER,
|
||||
latency_ms INTEGER,
|
||||
top_score FLOAT,
|
||||
filters JSONB,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_search_logs_created_at ON rag_search_logs(created_at);
|
||||
|
||||
-- RAG Upload History Table
|
||||
CREATE TABLE IF NOT EXISTS rag_upload_history (
|
||||
id SERIAL PRIMARY KEY,
|
||||
filename VARCHAR(500) NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
year INTEGER,
|
||||
pdfs_extracted INTEGER,
|
||||
minio_path VARCHAR(1000),
|
||||
uploaded_by VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_upload_history_created_at ON rag_upload_history(created_at);
|
||||
|
||||
-- Binaere Relevanz-Judgments fuer echte Precision/Recall
|
||||
CREATE TABLE IF NOT EXISTS rag_relevance_judgments (
|
||||
id SERIAL PRIMARY KEY,
|
||||
query_id VARCHAR(255) NOT NULL,
|
||||
query_text TEXT NOT NULL,
|
||||
result_id VARCHAR(255) NOT NULL,
|
||||
result_rank INTEGER,
|
||||
is_relevant BOOLEAN NOT NULL,
|
||||
collection_name VARCHAR(100),
|
||||
user_id VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_query ON rag_relevance_judgments(query_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_created_at ON rag_relevance_judgments(created_at);
|
||||
|
||||
-- Zeugnisse Source Tracking
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_sources (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
bundesland VARCHAR(10) NOT NULL,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
base_url TEXT,
|
||||
license_type VARCHAR(50) NOT NULL,
|
||||
training_allowed BOOLEAN DEFAULT FALSE,
|
||||
verified_by VARCHAR(100),
|
||||
verified_at TIMESTAMP,
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_sources_bundesland ON zeugnis_sources(bundesland);
|
||||
|
||||
-- Zeugnisse Seed URLs
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_seed_urls (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
|
||||
url TEXT NOT NULL,
|
||||
doc_type VARCHAR(50),
|
||||
status VARCHAR(20) DEFAULT 'pending',
|
||||
last_crawled TIMESTAMP,
|
||||
error_message TEXT,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_source ON zeugnis_seed_urls(source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_status ON zeugnis_seed_urls(status);
|
||||
|
||||
-- Zeugnisse Documents
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_documents (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
seed_url_id VARCHAR(36) REFERENCES zeugnis_seed_urls(id),
|
||||
title VARCHAR(500),
|
||||
url TEXT NOT NULL,
|
||||
content_hash VARCHAR(64),
|
||||
minio_path TEXT,
|
||||
training_allowed BOOLEAN DEFAULT FALSE,
|
||||
indexed_in_qdrant BOOLEAN DEFAULT FALSE,
|
||||
file_size INTEGER,
|
||||
content_type VARCHAR(100),
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_seed ON zeugnis_documents(seed_url_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_hash ON zeugnis_documents(content_hash);
|
||||
|
||||
-- Zeugnisse Document Versions
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_document_versions (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
|
||||
version INTEGER NOT NULL,
|
||||
content_hash VARCHAR(64),
|
||||
minio_path TEXT,
|
||||
change_summary TEXT,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_versions_doc ON zeugnis_document_versions(document_id);
|
||||
|
||||
-- Zeugnisse Usage Events (Audit Trail)
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_usage_events (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
|
||||
event_type VARCHAR(50) NOT NULL,
|
||||
user_id VARCHAR(100),
|
||||
details JSONB,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_doc ON zeugnis_usage_events(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_type ON zeugnis_usage_events(event_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_zeugnis_events_created ON zeugnis_usage_events(created_at);
|
||||
|
||||
-- Crawler Queue
|
||||
CREATE TABLE IF NOT EXISTS zeugnis_crawler_queue (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
|
||||
priority INTEGER DEFAULT 5,
|
||||
status VARCHAR(20) DEFAULT 'pending',
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
documents_found INTEGER DEFAULT 0,
|
||||
documents_indexed INTEGER DEFAULT 0,
|
||||
error_count INTEGER DEFAULT 0,
|
||||
created_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_crawler_queue_status ON zeugnis_crawler_queue(status);
|
||||
"""
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(create_tables_sql)
|
||||
print("RAG metrics tables initialized")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to initialize metrics tables: {e}")
|
||||
return False
|
||||
# Backward-compat shim -- module moved to metrics/db_schema.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("metrics.db_schema")
|
||||
|
||||
@@ -1,193 +1,4 @@
|
||||
"""
|
||||
PostgreSQL Metrics Database - Zeugnis Operations
|
||||
|
||||
Zeugnis source management, document queries, statistics, and event logging.
|
||||
|
||||
Extracted from metrics_db.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from metrics_db_core import get_pool
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Zeugnis Database Operations
|
||||
# =============================================================================
|
||||
|
||||
async def get_zeugnis_sources() -> List[Dict]:
|
||||
"""Get all zeugnis sources (Bundeslaender)."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT id, bundesland, name, base_url, license_type, training_allowed,
|
||||
verified_by, verified_at, created_at, updated_at
|
||||
FROM zeugnis_sources
|
||||
ORDER BY bundesland
|
||||
"""
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis sources: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def upsert_zeugnis_source(
|
||||
id: str,
|
||||
bundesland: str,
|
||||
name: str,
|
||||
license_type: str,
|
||||
training_allowed: bool,
|
||||
base_url: Optional[str] = None,
|
||||
verified_by: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Insert or update a zeugnis source."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_sources (id, bundesland, name, base_url, license_type, training_allowed, verified_by, verified_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, NOW())
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
base_url = EXCLUDED.base_url,
|
||||
license_type = EXCLUDED.license_type,
|
||||
training_allowed = EXCLUDED.training_allowed,
|
||||
verified_by = EXCLUDED.verified_by,
|
||||
verified_at = NOW(),
|
||||
updated_at = NOW()
|
||||
""",
|
||||
id, bundesland, name, base_url, license_type, training_allowed, verified_by
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to upsert zeugnis source: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def get_zeugnis_documents(
|
||||
bundesland: Optional[str] = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> List[Dict]:
|
||||
"""Get zeugnis documents with optional filtering."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
if bundesland:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
WHERE s.bundesland = $1
|
||||
ORDER BY d.created_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
""",
|
||||
bundesland, limit, offset
|
||||
)
|
||||
else:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
ORDER BY d.created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
""",
|
||||
limit, offset
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis documents: {e}")
|
||||
return []
|
||||
|
||||
|
||||
async def get_zeugnis_stats() -> Dict:
|
||||
"""Get zeugnis crawler statistics."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return {"error": "Database not available"}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
sources = await conn.fetchval("SELECT COUNT(*) FROM zeugnis_sources")
|
||||
documents = await conn.fetchval("SELECT COUNT(*) FROM zeugnis_documents")
|
||||
|
||||
indexed = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_documents WHERE indexed_in_qdrant = true"
|
||||
)
|
||||
|
||||
training_allowed = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_documents WHERE training_allowed = true"
|
||||
)
|
||||
|
||||
per_bundesland = await conn.fetch(
|
||||
"""
|
||||
SELECT s.bundesland, s.name, s.training_allowed, COUNT(d.id) as doc_count
|
||||
FROM zeugnis_sources s
|
||||
LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
|
||||
LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
|
||||
GROUP BY s.bundesland, s.name, s.training_allowed
|
||||
ORDER BY s.bundesland
|
||||
"""
|
||||
)
|
||||
|
||||
active_crawls = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM zeugnis_crawler_queue WHERE status = 'running'"
|
||||
)
|
||||
|
||||
return {
|
||||
"total_sources": sources or 0,
|
||||
"total_documents": documents or 0,
|
||||
"indexed_documents": indexed or 0,
|
||||
"training_allowed_documents": training_allowed or 0,
|
||||
"active_crawls": active_crawls or 0,
|
||||
"per_bundesland": [dict(r) for r in per_bundesland],
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Failed to get zeugnis stats: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
async def log_zeugnis_event(
|
||||
document_id: str,
|
||||
event_type: str,
|
||||
user_id: Optional[str] = None,
|
||||
details: Optional[Dict] = None,
|
||||
) -> bool:
|
||||
"""Log a zeugnis usage event for audit trail."""
|
||||
pool = await get_pool()
|
||||
if pool is None:
|
||||
return False
|
||||
|
||||
try:
|
||||
import json
|
||||
import uuid
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_usage_events (id, document_id, event_type, user_id, details)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
""",
|
||||
str(uuid.uuid4()), document_id, event_type, user_id,
|
||||
json.dumps(details) if details else None
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to log zeugnis event: {e}")
|
||||
return False
|
||||
# Backward-compat shim -- module moved to metrics/db_zeugnis.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("metrics.db_zeugnis")
|
||||
|
||||
@@ -1,26 +1,4 @@
|
||||
"""
|
||||
NRU Worksheet Generator — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
nru_worksheet_models — data classes, entry separation
|
||||
nru_worksheet_html — HTML generation
|
||||
nru_worksheet_pdf — PDF generation
|
||||
|
||||
Per scanned page, we generate 2 worksheet pages.
|
||||
"""
|
||||
|
||||
# Models
|
||||
from nru_worksheet_models import ( # noqa: F401
|
||||
VocabEntry,
|
||||
SentenceEntry,
|
||||
separate_vocab_and_sentences,
|
||||
)
|
||||
|
||||
# HTML generation
|
||||
from nru_worksheet_html import ( # noqa: F401
|
||||
generate_nru_html,
|
||||
generate_nru_worksheet_html,
|
||||
)
|
||||
|
||||
# PDF generation
|
||||
from nru_worksheet_pdf import generate_nru_pdf # noqa: F401
|
||||
# Backward-compat shim -- module moved to worksheet/nru_generator.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.nru_generator")
|
||||
|
||||
@@ -1,466 +1,4 @@
|
||||
"""
|
||||
NRU Worksheet HTML — HTML generation for vocabulary worksheets.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict
|
||||
|
||||
from nru_worksheet_models import VocabEntry, SentenceEntry, separate_vocab_and_sentences
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def generate_nru_html(
|
||||
vocab_list: List[VocabEntry],
|
||||
sentence_list: List[SentenceEntry],
|
||||
page_number: int,
|
||||
title: str = "Vokabeltest",
|
||||
show_solutions: bool = False,
|
||||
line_height_px: int = 28
|
||||
) -> str:
|
||||
"""
|
||||
Generate HTML for NRU-format worksheet.
|
||||
|
||||
Returns HTML for 2 pages:
|
||||
- Page 1: Vocabulary table (3 columns)
|
||||
- Page 2: Sentence practice (full width)
|
||||
"""
|
||||
|
||||
# Filter by page
|
||||
page_vocab = [v for v in vocab_list if v.source_page == page_number]
|
||||
page_sentences = [s for s in sentence_list if s.source_page == page_number]
|
||||
|
||||
html = f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
@page {{
|
||||
size: A4;
|
||||
margin: 1.5cm 2cm;
|
||||
}}
|
||||
* {{
|
||||
box-sizing: border-box;
|
||||
}}
|
||||
body {{
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
font-size: 12pt;
|
||||
line-height: 1.4;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}}
|
||||
.page {{
|
||||
page-break-after: always;
|
||||
min-height: 100%;
|
||||
}}
|
||||
.page:last-child {{
|
||||
page-break-after: avoid;
|
||||
}}
|
||||
h1 {{
|
||||
font-size: 16pt;
|
||||
margin: 0 0 8px 0;
|
||||
text-align: center;
|
||||
}}
|
||||
.header {{
|
||||
margin-bottom: 15px;
|
||||
}}
|
||||
.name-line {{
|
||||
font-size: 11pt;
|
||||
margin-bottom: 10px;
|
||||
}}
|
||||
|
||||
/* Vocabulary Table - 3 columns */
|
||||
.vocab-table {{
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
table-layout: fixed;
|
||||
}}
|
||||
.vocab-table th {{
|
||||
background: #f0f0f0;
|
||||
border: 1px solid #333;
|
||||
padding: 6px 8px;
|
||||
font-weight: bold;
|
||||
font-size: 11pt;
|
||||
text-align: left;
|
||||
}}
|
||||
.vocab-table td {{
|
||||
border: 1px solid #333;
|
||||
padding: 4px 8px;
|
||||
height: {line_height_px}px;
|
||||
vertical-align: middle;
|
||||
}}
|
||||
.vocab-table .col-english {{ width: 35%; }}
|
||||
.vocab-table .col-german {{ width: 35%; }}
|
||||
.vocab-table .col-correction {{ width: 30%; }}
|
||||
.vocab-answer {{
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
}}
|
||||
|
||||
/* Sentence Table - full width */
|
||||
.sentence-table {{
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 15px;
|
||||
}}
|
||||
.sentence-table td {{
|
||||
border: 1px solid #333;
|
||||
padding: 6px 10px;
|
||||
}}
|
||||
.sentence-header {{
|
||||
background: #f5f5f5;
|
||||
font-weight: normal;
|
||||
min-height: 30px;
|
||||
}}
|
||||
.sentence-line {{
|
||||
height: {line_height_px + 4}px;
|
||||
}}
|
||||
.sentence-answer {{
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
font-size: 11pt;
|
||||
}}
|
||||
|
||||
.page-info {{
|
||||
font-size: 9pt;
|
||||
color: #666;
|
||||
text-align: right;
|
||||
margin-top: 10px;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
"""
|
||||
|
||||
# ========== PAGE 1: VOCABULARY TABLE ==========
|
||||
if page_vocab:
|
||||
html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Vokabeln (Seite {page_number})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
|
||||
<table class="vocab-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="col-english">Englisch</th>
|
||||
<th class="col-german">Deutsch</th>
|
||||
<th class="col-correction">Korrektur</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
"""
|
||||
for v in page_vocab:
|
||||
if show_solutions:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td class="vocab-answer">{v.german}</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
|
||||
html += """
|
||||
</tbody>
|
||||
</table>
|
||||
<div class="page-info">Vokabeln aus Unit</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# ========== PAGE 2: SENTENCE PRACTICE ==========
|
||||
if page_sentences:
|
||||
html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Lernsaetze (Seite {page_number})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
"""
|
||||
for s in page_sentences:
|
||||
html += f"""
|
||||
<table class="sentence-table">
|
||||
<tr>
|
||||
<td class="sentence-header">{s.german}</td>
|
||||
</tr>
|
||||
"""
|
||||
if show_solutions:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td class="sentence-line sentence-answer">{s.english}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
html += """
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
html += """
|
||||
</table>
|
||||
"""
|
||||
|
||||
html += """
|
||||
<div class="page-info">Lernsaetze aus Unit</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
html += """
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
return html
|
||||
|
||||
|
||||
def generate_nru_worksheet_html(
|
||||
entries: List[Dict],
|
||||
title: str = "Vokabeltest",
|
||||
show_solutions: bool = False,
|
||||
specific_pages: List[int] = None
|
||||
) -> str:
|
||||
"""
|
||||
Generate complete NRU worksheet HTML for all pages.
|
||||
|
||||
Args:
|
||||
entries: List of vocabulary entries with source_page
|
||||
title: Worksheet title
|
||||
show_solutions: Whether to show answers
|
||||
specific_pages: List of specific page numbers to include (1-indexed)
|
||||
|
||||
Returns:
|
||||
Complete HTML document
|
||||
"""
|
||||
# Separate into vocab and sentences
|
||||
vocab_list, sentence_list = separate_vocab_and_sentences(entries)
|
||||
|
||||
# Get unique page numbers
|
||||
all_pages = set()
|
||||
for v in vocab_list:
|
||||
all_pages.add(v.source_page)
|
||||
for s in sentence_list:
|
||||
all_pages.add(s.source_page)
|
||||
|
||||
# Filter to specific pages if requested
|
||||
if specific_pages:
|
||||
all_pages = all_pages.intersection(set(specific_pages))
|
||||
|
||||
pages_sorted = sorted(all_pages)
|
||||
|
||||
logger.info(f"Generating NRU worksheet for pages {pages_sorted}")
|
||||
logger.info(f"Total vocab: {len(vocab_list)}, Total sentences: {len(sentence_list)}")
|
||||
|
||||
# Generate HTML for each page
|
||||
combined_html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
@page {
|
||||
size: A4;
|
||||
margin: 1.5cm 2cm;
|
||||
}
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
body {
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
font-size: 12pt;
|
||||
line-height: 1.4;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
.page {
|
||||
page-break-after: always;
|
||||
min-height: 100%;
|
||||
}
|
||||
.page:last-child {
|
||||
page-break-after: avoid;
|
||||
}
|
||||
h1 {
|
||||
font-size: 16pt;
|
||||
margin: 0 0 8px 0;
|
||||
text-align: center;
|
||||
}
|
||||
.header {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
.name-line {
|
||||
font-size: 11pt;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
/* Vocabulary Table - 3 columns */
|
||||
.vocab-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
table-layout: fixed;
|
||||
}
|
||||
.vocab-table th {
|
||||
background: #f0f0f0;
|
||||
border: 1px solid #333;
|
||||
padding: 6px 8px;
|
||||
font-weight: bold;
|
||||
font-size: 11pt;
|
||||
text-align: left;
|
||||
}
|
||||
.vocab-table td {
|
||||
border: 1px solid #333;
|
||||
padding: 4px 8px;
|
||||
height: 28px;
|
||||
vertical-align: middle;
|
||||
}
|
||||
.vocab-table .col-english { width: 35%; }
|
||||
.vocab-table .col-german { width: 35%; }
|
||||
.vocab-table .col-correction { width: 30%; }
|
||||
.vocab-answer {
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
/* Sentence Table - full width */
|
||||
.sentence-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
.sentence-table td {
|
||||
border: 1px solid #333;
|
||||
padding: 6px 10px;
|
||||
}
|
||||
.sentence-header {
|
||||
background: #f5f5f5;
|
||||
font-weight: normal;
|
||||
min-height: 30px;
|
||||
}
|
||||
.sentence-line {
|
||||
height: 32px;
|
||||
}
|
||||
.sentence-answer {
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
font-size: 11pt;
|
||||
}
|
||||
|
||||
.page-info {
|
||||
font-size: 9pt;
|
||||
color: #666;
|
||||
text-align: right;
|
||||
margin-top: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
"""
|
||||
|
||||
for page_num in pages_sorted:
|
||||
page_vocab = [v for v in vocab_list if v.source_page == page_num]
|
||||
page_sentences = [s for s in sentence_list if s.source_page == page_num]
|
||||
|
||||
# PAGE 1: VOCABULARY TABLE
|
||||
if page_vocab:
|
||||
combined_html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Vokabeln (Seite {page_num})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
|
||||
<table class="vocab-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="col-english">Englisch</th>
|
||||
<th class="col-german">Deutsch</th>
|
||||
<th class="col-correction">Korrektur</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
"""
|
||||
for v in page_vocab:
|
||||
if show_solutions:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td class="vocab-answer">{v.german}</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
|
||||
combined_html += f"""
|
||||
</tbody>
|
||||
</table>
|
||||
<div class="page-info">{title} - Seite {page_num}</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# PAGE 2: SENTENCE PRACTICE
|
||||
if page_sentences:
|
||||
combined_html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Lernsaetze (Seite {page_num})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
"""
|
||||
for s in page_sentences:
|
||||
combined_html += f"""
|
||||
<table class="sentence-table">
|
||||
<tr>
|
||||
<td class="sentence-header">{s.german}</td>
|
||||
</tr>
|
||||
"""
|
||||
if show_solutions:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td class="sentence-line sentence-answer">{s.english}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
combined_html += """
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
combined_html += """
|
||||
</table>
|
||||
"""
|
||||
|
||||
combined_html += f"""
|
||||
<div class="page-info">{title} - Seite {page_num}</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
combined_html += """
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
return combined_html
|
||||
# Backward-compat shim -- module moved to worksheet/nru_html.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.nru_html")
|
||||
|
||||
@@ -1,70 +1,4 @@
|
||||
"""
|
||||
NRU Worksheet Models — data classes and entry separation logic.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class VocabEntry:
|
||||
english: str
|
||||
german: str
|
||||
source_page: int = 1
|
||||
|
||||
|
||||
@dataclass
|
||||
class SentenceEntry:
|
||||
german: str
|
||||
english: str # For solution sheet
|
||||
source_page: int = 1
|
||||
|
||||
|
||||
def separate_vocab_and_sentences(entries: List[Dict]) -> Tuple[List[VocabEntry], List[SentenceEntry]]:
|
||||
"""
|
||||
Separate vocabulary entries into single words/phrases and full sentences.
|
||||
|
||||
Sentences are identified by:
- Ending with punctuation (. ! ?)
- Being longer than 50 characters
- Having more than five words with a capitalized word after the first
|
||||
"""
|
||||
vocab_list = []
|
||||
sentence_list = []
|
||||
|
||||
for entry in entries:
|
||||
english = entry.get("english", "").strip()
|
||||
german = entry.get("german", "").strip()
|
||||
source_page = entry.get("source_page", 1)
|
||||
|
||||
if not english or not german:
|
||||
continue
|
||||
|
||||
# Detect if this is a sentence
|
||||
is_sentence = (
|
||||
english.endswith('.') or
|
||||
english.endswith('!') or
|
||||
english.endswith('?') or
|
||||
len(english) > 50 or
|
||||
(len(english.split()) > 5 and any(w[0].isupper() for w in english.split()[1:] if w))
|
||||
)
|
||||
|
||||
if is_sentence:
|
||||
sentence_list.append(SentenceEntry(
|
||||
german=german,
|
||||
english=english,
|
||||
source_page=source_page
|
||||
))
|
||||
else:
|
||||
vocab_list.append(VocabEntry(
|
||||
english=english,
|
||||
german=german,
|
||||
source_page=source_page
|
||||
))
|
||||
|
||||
return vocab_list, sentence_list
|
||||
# Backward-compat shim -- module moved to worksheet/nru_models.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.nru_models")
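The split heuristic in separate_vocab_and_sentences is easiest to see on a tiny input. A minimal sketch (the module path follows this commit's layout; the dict keys are the ones the function reads):

from worksheet.nru_models import separate_vocab_and_sentences

entries = [
    {"english": "the dog", "german": "der Hund", "source_page": 1},
    {"english": "She has been living in London for two years.",
     "german": "Sie lebt seit zwei Jahren in London.", "source_page": 2},
]

vocab, sentences = separate_vocab_and_sentences(entries)
# "the dog" is short and unpunctuated -> VocabEntry;
# the second entry ends with a full stop -> SentenceEntry.
assert len(vocab) == 1 and len(sentences) == 1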
|
||||
|
||||
@@ -1,31 +1,4 @@
|
||||
"""
|
||||
NRU Worksheet PDF — PDF generation using weasyprint.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Optional, Tuple

from nru_worksheet_html import generate_nru_worksheet_html


async def generate_nru_pdf(entries: List[Dict], title: str = "Vokabeltest", include_solutions: bool = True) -> Tuple[bytes, Optional[bytes]]:
"""
Generate NRU worksheet PDFs.

Returns:
Tuple of (worksheet_pdf_bytes, solution_pdf_bytes);
solution_pdf_bytes is None when include_solutions is False.
"""
|
||||
from weasyprint import HTML
|
||||
|
||||
# Generate worksheet HTML
|
||||
worksheet_html = generate_nru_worksheet_html(entries, title, show_solutions=False)
|
||||
worksheet_pdf = HTML(string=worksheet_html).write_pdf()
|
||||
|
||||
# Generate solution HTML
|
||||
solution_pdf = None
|
||||
if include_solutions:
|
||||
solution_html = generate_nru_worksheet_html(entries, title, show_solutions=True)
|
||||
solution_pdf = HTML(string=solution_html).write_pdf()
|
||||
|
||||
return worksheet_pdf, solution_pdf
|
||||
# Backward-compat shim -- module moved to worksheet/nru_pdf.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.nru_pdf")
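generate_nru_pdf is a thin async wrapper around the HTML generator plus weasyprint. A minimal driver sketch; weasyprint must be installed, and the output file names are illustrative:

import asyncio
from worksheet.nru_pdf import generate_nru_pdf

entries = [{"english": "the dog", "german": "der Hund", "source_page": 1}]

async def main() -> None:
    worksheet_pdf, solution_pdf = await generate_nru_pdf(entries, title="Unit 3")
    with open("worksheet.pdf", "wb") as fh:
        fh.write(worksheet_pdf)
    if solution_pdf is not None:   # None when include_solutions=False
        with open("solutions.pdf", "wb") as fh:
            fh.write(solution_pdf)

asyncio.run(main())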
|
||||
|
||||
@@ -1,17 +1,4 @@
|
||||
"""
|
||||
PDF Export Module for Abiturkorrektur System
|
||||
|
||||
Barrel re-export: all PDF generation functions and constants.
|
||||
"""
|
||||
|
||||
from pdf_export_styles import ( # noqa: F401
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
CRITERIA_WEIGHTS,
|
||||
get_custom_styles,
|
||||
)
|
||||
from pdf_export_gutachten import generate_gutachten_pdf # noqa: F401
|
||||
from pdf_export_overview import ( # noqa: F401
|
||||
generate_klausur_overview_pdf,
|
||||
generate_annotations_pdf,
|
||||
)
|
||||
# Backward-compat shim -- module moved to korrektur/pdf_export.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("korrektur.pdf_export")
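Because the barrel module re-exports every public symbol and the shim redirects the old module name, both import spellings stay valid after the split. A small sketch:

from pdf_export import generate_gutachten_pdf, GRADE_POINTS_TO_NOTE      # legacy path
from korrektur.pdf_export import generate_klausur_overview_pdf           # new path

assert GRADE_POINTS_TO_NOTE[15] == "1+"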
|
||||
|
||||
@@ -1,315 +1,4 @@
|
||||
"""
|
||||
PDF Export - Individual Gutachten PDF generation.
|
||||
|
||||
Generates a single student's Gutachten with criteria table,
|
||||
workflow info, and annotation summary.
|
||||
"""
|
||||
|
||||
import io
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.platypus import (
|
||||
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
|
||||
HRFlowable, KeepTogether
|
||||
)
|
||||
|
||||
from pdf_export_styles import (
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
CRITERIA_WEIGHTS,
|
||||
get_custom_styles,
|
||||
)
|
||||
|
||||
|
||||
def generate_gutachten_pdf(
|
||||
student_data: Dict[str, Any],
|
||||
klausur_data: Dict[str, Any],
|
||||
annotations: List[Dict[str, Any]] = None,
|
||||
workflow_data: Dict[str, Any] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate a PDF Gutachten for a single student.
|
||||
|
||||
Args:
|
||||
student_data: Student work data including criteria_scores, gutachten, grade_points
|
||||
klausur_data: Klausur metadata (title, subject, year, etc.)
|
||||
annotations: List of annotations for annotation summary
|
||||
workflow_data: Examiner workflow data (EK, ZK, DK info)
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=2*cm,
|
||||
leftMargin=2*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Gutachten zur Abiturklausur", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{klausur_data.get('subject', 'Deutsch')} - {klausur_data.get('title', '')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Meta information table
|
||||
meta_data = [
|
||||
["Pruefling:", student_data.get('student_name', 'Anonym')],
|
||||
["Schuljahr:", f"{klausur_data.get('year', 2025)}"],
|
||||
["Kurs:", klausur_data.get('semester', 'Abitur')],
|
||||
["Datum:", datetime.now().strftime("%d.%m.%Y")]
|
||||
]
|
||||
|
||||
meta_table = Table(meta_data, colWidths=[4*cm, 10*cm])
|
||||
meta_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 10),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(meta_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Gutachten content
|
||||
_add_gutachten_content(story, styles, student_data)
|
||||
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Bewertungstabelle
|
||||
_add_criteria_table(story, styles, student_data)
|
||||
|
||||
# Final grade box
|
||||
_add_grade_box(story, styles, student_data)
|
||||
|
||||
# Examiner workflow information
|
||||
if workflow_data:
|
||||
_add_workflow_info(story, styles, workflow_data)
|
||||
|
||||
# Annotation summary
|
||||
if annotations:
|
||||
_add_annotation_summary(story, styles, annotations)
|
||||
|
||||
# Footer
|
||||
_add_footer(story, styles)
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def _add_gutachten_content(story, styles, student_data):
|
||||
"""Add gutachten text sections to the story."""
|
||||
gutachten = student_data.get('gutachten', {})
|
||||
|
||||
if gutachten:
|
||||
if gutachten.get('einleitung'):
|
||||
story.append(Paragraph("Einleitung", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['einleitung'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('hauptteil'):
|
||||
story.append(Paragraph("Hauptteil", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['hauptteil'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('fazit'):
|
||||
story.append(Paragraph("Fazit", styles['SectionHeader']))
|
||||
story.append(Paragraph(gutachten['fazit'], styles['GutachtenBody']))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('staerken') or gutachten.get('schwaechen'):
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
if gutachten.get('staerken'):
|
||||
story.append(Paragraph("Staerken:", styles['SectionHeader']))
|
||||
for s in gutachten['staerken']:
|
||||
story.append(Paragraph(f"• {s}", styles['ListItem']))
|
||||
|
||||
if gutachten.get('schwaechen'):
|
||||
story.append(Paragraph("Verbesserungspotenzial:", styles['SectionHeader']))
|
||||
for s in gutachten['schwaechen']:
|
||||
story.append(Paragraph(f"• {s}", styles['ListItem']))
|
||||
else:
|
||||
story.append(Paragraph("<i>Kein Gutachten-Text vorhanden.</i>", styles['GutachtenBody']))
|
||||
|
||||
|
||||
def _add_criteria_table(story, styles, student_data):
|
||||
"""Add criteria scoring table to the story."""
|
||||
story.append(Paragraph("Bewertung nach Kriterien", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
criteria_scores = student_data.get('criteria_scores', {})
|
||||
|
||||
table_data = [["Kriterium", "Gewichtung", "Erreicht", "Punkte"]]
|
||||
total_weighted = 0
|
||||
total_weight = 0
|
||||
|
||||
for key, display_name in CRITERIA_DISPLAY_NAMES.items():
|
||||
weight = CRITERIA_WEIGHTS.get(key, 0)
|
||||
score_data = criteria_scores.get(key, {})
|
||||
score = score_data.get('score', 0) if isinstance(score_data, dict) else score_data
|
||||
|
||||
weighted_score = (score / 100) * weight if score else 0
|
||||
total_weighted += weighted_score
|
||||
total_weight += weight
|
||||
|
||||
table_data.append([
|
||||
display_name,
|
||||
f"{weight}%",
|
||||
f"{score}%",
|
||||
f"{weighted_score:.1f}"
|
||||
])
|
||||
|
||||
table_data.append([
|
||||
"Gesamt",
|
||||
f"{total_weight}%",
|
||||
"",
|
||||
f"{total_weighted:.1f}"
|
||||
])
|
||||
|
||||
criteria_table = Table(table_data, colWidths=[8*cm, 2.5*cm, 2.5*cm, 2.5*cm])
|
||||
criteria_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 10),
|
||||
('ALIGN', (1, 0), (-1, -1), 'CENTER'),
|
||||
('FONTSIZE', (0, 1), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 6),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
('BACKGROUND', (0, -1), (-1, -1), colors.HexColor('#f7fafc')),
|
||||
('FONTNAME', (0, -1), (-1, -1), 'Helvetica-Bold'),
|
||||
('ROWBACKGROUNDS', (0, 1), (-1, -2), [colors.white, colors.HexColor('#f7fafc')]),
|
||||
]))
|
||||
story.append(criteria_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
|
||||
def _add_grade_box(story, styles, student_data):
|
||||
"""Add final grade box to the story."""
|
||||
grade_points = student_data.get('grade_points', 0)
|
||||
grade_note = GRADE_POINTS_TO_NOTE.get(grade_points, "?")
|
||||
raw_points = student_data.get('raw_points', 0)
|
||||
|
||||
grade_data = [
|
||||
["Rohpunkte:", f"{raw_points} / 100"],
|
||||
["Notenpunkte:", f"{grade_points} Punkte"],
|
||||
["Note:", grade_note]
|
||||
]
|
||||
|
||||
grade_table = Table(grade_data, colWidths=[4*cm, 4*cm])
|
||||
grade_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, -1), colors.HexColor('#ebf8ff')),
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTNAME', (1, -1), (1, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 11),
|
||||
('FONTSIZE', (1, -1), (1, -1), 14),
|
||||
('TEXTCOLOR', (1, -1), (1, -1), colors.HexColor('#2c5282')),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 8),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 8),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 12),
|
||||
('BOX', (0, 0), (-1, -1), 1, colors.HexColor('#2c5282')),
|
||||
('ALIGN', (1, 0), (1, -1), 'RIGHT'),
|
||||
]))
|
||||
|
||||
story.append(KeepTogether([
|
||||
Paragraph("Endergebnis", styles['SectionHeader']),
|
||||
Spacer(1, 0.2*cm),
|
||||
grade_table
|
||||
]))
|
||||
|
||||
|
||||
def _add_workflow_info(story, styles, workflow_data):
|
||||
"""Add examiner workflow information to the story."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
story.append(Paragraph("Korrekturverlauf", styles['SectionHeader']))
|
||||
|
||||
workflow_rows = []
|
||||
|
||||
if workflow_data.get('erst_korrektor'):
|
||||
ek = workflow_data['erst_korrektor']
|
||||
workflow_rows.append([
|
||||
"Erstkorrektor:",
|
||||
ek.get('name', 'Unbekannt'),
|
||||
f"{ek.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('zweit_korrektor'):
|
||||
zk = workflow_data['zweit_korrektor']
|
||||
workflow_rows.append([
|
||||
"Zweitkorrektor:",
|
||||
zk.get('name', 'Unbekannt'),
|
||||
f"{zk.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('dritt_korrektor'):
|
||||
dk = workflow_data['dritt_korrektor']
|
||||
workflow_rows.append([
|
||||
"Drittkorrektor:",
|
||||
dk.get('name', 'Unbekannt'),
|
||||
f"{dk.get('grade_points', '-')} Punkte"
|
||||
])
|
||||
|
||||
if workflow_data.get('final_grade_source'):
|
||||
workflow_rows.append([
|
||||
"Endnote durch:",
|
||||
workflow_data['final_grade_source'],
|
||||
""
|
||||
])
|
||||
|
||||
if workflow_rows:
|
||||
workflow_table = Table(workflow_rows, colWidths=[4*cm, 6*cm, 4*cm])
|
||||
workflow_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(workflow_table)
|
||||
|
||||
|
||||
def _add_annotation_summary(story, styles, annotations):
|
||||
"""Add annotation summary to the story."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
story.append(Paragraph("Anmerkungen (Zusammenfassung)", styles['SectionHeader']))
|
||||
|
||||
by_type = {}
|
||||
for ann in annotations:
|
||||
ann_type = ann.get('type', 'comment')
|
||||
if ann_type not in by_type:
|
||||
by_type[ann_type] = []
|
||||
by_type[ann_type].append(ann)
|
||||
|
||||
for ann_type, anns in by_type.items():
|
||||
type_name = CRITERIA_DISPLAY_NAMES.get(ann_type, ann_type.replace('_', ' ').title())
|
||||
story.append(Paragraph(f"{type_name} ({len(anns)} Anmerkungen)", styles['ListItem']))
|
||||
|
||||
|
||||
def _add_footer(story, styles):
|
||||
"""Add generation footer to the story."""
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
# Backward-compat shim -- module moved to korrektur/pdf_export_gutachten.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("korrektur.pdf_export_gutachten")
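A minimal sketch of calling the Gutachten generator directly; the dictionary keys are the ones the function reads above, the sample values and file name are illustrative, and reportlab must be installed:

from korrektur.pdf_export_gutachten import generate_gutachten_pdf

student = {
    "student_name": "Anonym 07",
    "raw_points": 68,
    "grade_points": 9,
    "criteria_scores": {"inhalt": {"score": 70}, "struktur": {"score": 65}},
    "gutachten": {
        "einleitung": "Die Arbeit behandelt die Aufgabenstellung vollstaendig.",
        "fazit": "Insgesamt eine gute Leistung.",
        "staerken": ["klare Gliederung"],
    },
}
klausur = {"title": "Lyrik des Expressionismus", "subject": "Deutsch", "year": 2025}

pdf_bytes = generate_gutachten_pdf(student, klausur)
with open("gutachten.pdf", "wb") as fh:
    fh.write(pdf_bytes)

# Criteria weighting as implemented in _add_criteria_table: inhalt carries weight 40,
# so a 70% score contributes (70 / 100) * 40 = 28.0 points to the "Gesamt" row.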
|
||||
|
||||
@@ -1,297 +1,4 @@
|
||||
"""
|
||||
PDF Export - Klausur overview and annotations PDF generation.
|
||||
|
||||
Generates:
|
||||
- Klausur overview with grade distribution for all students
|
||||
- Annotations PDF for a single student
|
||||
"""
|
||||
|
||||
import io
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.platypus import (
|
||||
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
|
||||
HRFlowable
|
||||
)
|
||||
|
||||
from pdf_export_styles import (
|
||||
GRADE_POINTS_TO_NOTE,
|
||||
CRITERIA_DISPLAY_NAMES,
|
||||
get_custom_styles,
|
||||
)
|
||||
|
||||
|
||||
def generate_klausur_overview_pdf(
|
||||
klausur_data: Dict[str, Any],
|
||||
students: List[Dict[str, Any]],
|
||||
fairness_data: Optional[Dict[str, Any]] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate an overview PDF for an entire Klausur with all student grades.
|
||||
|
||||
Args:
|
||||
klausur_data: Klausur metadata
|
||||
students: List of all student work data
|
||||
fairness_data: Optional fairness analysis data
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=1.5*cm,
|
||||
leftMargin=1.5*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Notenuebersicht", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{klausur_data.get('subject', 'Deutsch')} - {klausur_data.get('title', '')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Meta information
|
||||
meta_data = [
|
||||
["Schuljahr:", f"{klausur_data.get('year', 2025)}"],
|
||||
["Kurs:", klausur_data.get('semester', 'Abitur')],
|
||||
["Anzahl Arbeiten:", str(len(students))],
|
||||
["Stand:", datetime.now().strftime("%d.%m.%Y")]
|
||||
]
|
||||
|
||||
meta_table = Table(meta_data, colWidths=[4*cm, 10*cm])
|
||||
meta_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 10),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
]))
|
||||
story.append(meta_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Statistics (if fairness data available)
|
||||
if fairness_data and fairness_data.get('statistics'):
|
||||
_add_statistics(story, styles, fairness_data['statistics'])
|
||||
|
||||
story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor('#e2e8f0')))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
# Student grades table
|
||||
sorted_students = sorted(students, key=lambda s: s.get('grade_points', 0), reverse=True)
|
||||
_add_student_table(story, styles, sorted_students)
|
||||
|
||||
# Grade distribution
|
||||
_add_grade_distribution(story, styles, sorted_students)
|
||||
|
||||
# Footer
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def _add_statistics(story, styles, stats):
|
||||
"""Add statistics section."""
|
||||
story.append(Paragraph("Statistik", styles['SectionHeader']))
|
||||
|
||||
stats_data = [
|
||||
["Durchschnitt:", f"{stats.get('average_grade', 0):.1f} Punkte"],
|
||||
["Minimum:", f"{stats.get('min_grade', 0)} Punkte"],
|
||||
["Maximum:", f"{stats.get('max_grade', 0)} Punkte"],
|
||||
["Standardabweichung:", f"{stats.get('standard_deviation', 0):.2f}"],
|
||||
]
|
||||
|
||||
stats_table = Table(stats_data, colWidths=[4*cm, 4*cm])
|
||||
stats_table.setStyle(TableStyle([
|
||||
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('BACKGROUND', (0, 0), (-1, -1), colors.HexColor('#f7fafc')),
|
||||
('BOX', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
]))
|
||||
story.append(stats_table)
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
|
||||
def _add_student_table(story, styles, sorted_students):
|
||||
"""Add student grades table."""
|
||||
story.append(Paragraph("Einzelergebnisse", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
table_data = [["#", "Name", "Rohpunkte", "Notenpunkte", "Note", "Status"]]
|
||||
|
||||
for idx, student in enumerate(sorted_students, 1):
|
||||
grade_points = student.get('grade_points', 0)
|
||||
grade_note = GRADE_POINTS_TO_NOTE.get(grade_points, "-")
|
||||
raw_points = student.get('raw_points', 0)
|
||||
status = student.get('status', 'unknown')
|
||||
|
||||
status_display = {
|
||||
'completed': 'Abgeschlossen',
|
||||
'first_examiner': 'In Korrektur',
|
||||
'second_examiner': 'Zweitkorrektur',
|
||||
'uploaded': 'Hochgeladen',
|
||||
'ocr_complete': 'OCR fertig',
|
||||
'analyzing': 'Wird analysiert'
|
||||
}.get(status, status)
|
||||
|
||||
table_data.append([
|
||||
str(idx),
|
||||
student.get('student_name', 'Anonym'),
|
||||
f"{raw_points}/100",
|
||||
str(grade_points),
|
||||
grade_note,
|
||||
status_display
|
||||
])
|
||||
|
||||
student_table = Table(table_data, colWidths=[1*cm, 5*cm, 2.5*cm, 3*cm, 2*cm, 3*cm])
|
||||
student_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 9),
|
||||
('ALIGN', (0, 0), (-1, 0), 'CENTER'),
|
||||
('FONTSIZE', (0, 1), (-1, -1), 9),
|
||||
('ALIGN', (0, 1), (0, -1), 'CENTER'),
|
||||
('ALIGN', (2, 1), (4, -1), 'CENTER'),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 6),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f7fafc')]),
|
||||
]))
|
||||
story.append(student_table)
|
||||
|
||||
|
||||
def _add_grade_distribution(story, styles, sorted_students):
|
||||
"""Add grade distribution table."""
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
story.append(Paragraph("Notenverteilung", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
grade_counts = {}
|
||||
for student in sorted_students:
|
||||
gp = student.get('grade_points', 0)
|
||||
grade_counts[gp] = grade_counts.get(gp, 0) + 1
|
||||
|
||||
dist_data = [["Punkte", "Note", "Anzahl"]]
|
||||
for points in range(15, -1, -1):
|
||||
if points in grade_counts:
|
||||
note = GRADE_POINTS_TO_NOTE.get(points, "-")
|
||||
count = grade_counts[points]
|
||||
dist_data.append([str(points), note, str(count)])
|
||||
|
||||
if len(dist_data) > 1:
|
||||
dist_table = Table(dist_data, colWidths=[2.5*cm, 2.5*cm, 2.5*cm])
|
||||
dist_table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c5282')),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 9),
|
||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#e2e8f0')),
|
||||
]))
|
||||
story.append(dist_table)
|
||||
|
||||
|
||||
def generate_annotations_pdf(
|
||||
student_data: Dict[str, Any],
|
||||
klausur_data: Dict[str, Any],
|
||||
annotations: List[Dict[str, Any]]
|
||||
) -> bytes:
|
||||
"""
|
||||
Generate a PDF with all annotations for a student work.
|
||||
|
||||
Args:
|
||||
student_data: Student work data
|
||||
klausur_data: Klausur metadata
|
||||
annotations: List of all annotations
|
||||
|
||||
Returns:
|
||||
PDF as bytes
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=2*cm,
|
||||
leftMargin=2*cm,
|
||||
topMargin=2*cm,
|
||||
bottomMargin=2*cm
|
||||
)
|
||||
|
||||
styles = get_custom_styles()
|
||||
story = []
|
||||
|
||||
# Header
|
||||
story.append(Paragraph("Anmerkungen zur Klausur", styles['GutachtenTitle']))
|
||||
story.append(Paragraph(f"{student_data.get('student_name', 'Anonym')}", styles['GutachtenSubtitle']))
|
||||
story.append(Spacer(1, 0.5*cm))
|
||||
|
||||
if not annotations:
|
||||
story.append(Paragraph("<i>Keine Anmerkungen vorhanden.</i>", styles['GutachtenBody']))
|
||||
else:
|
||||
# Group by type
|
||||
by_type = {}
|
||||
for ann in annotations:
|
||||
ann_type = ann.get('type', 'comment')
|
||||
if ann_type not in by_type:
|
||||
by_type[ann_type] = []
|
||||
by_type[ann_type].append(ann)
|
||||
|
||||
for ann_type, anns in by_type.items():
|
||||
type_name = CRITERIA_DISPLAY_NAMES.get(ann_type, ann_type.replace('_', ' ').title())
|
||||
story.append(Paragraph(f"{type_name} ({len(anns)})", styles['SectionHeader']))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
|
||||
sorted_anns = sorted(anns, key=lambda a: (a.get('page', 0), a.get('position', {}).get('y', 0)))
|
||||
|
||||
for idx, ann in enumerate(sorted_anns, 1):
|
||||
page = ann.get('page', 1)
|
||||
text = ann.get('text', '')
|
||||
suggestion = ann.get('suggestion', '')
|
||||
severity = ann.get('severity', 'minor')
|
||||
|
||||
ann_text = f"<b>[S.{page}]</b> {text}"
|
||||
if suggestion:
|
||||
ann_text += f" -> <i>{suggestion}</i>"
|
||||
|
||||
if severity == 'critical':
|
||||
ann_text = f"<font color='red'>{ann_text}</font>"
|
||||
elif severity == 'major':
|
||||
ann_text = f"<font color='orange'>{ann_text}</font>"
|
||||
|
||||
story.append(Paragraph(f"{idx}. {ann_text}", styles['ListItem']))
|
||||
|
||||
story.append(Spacer(1, 0.3*cm))
|
||||
|
||||
# Footer
|
||||
story.append(Spacer(1, 1*cm))
|
||||
story.append(HRFlowable(width="100%", thickness=0.5, color=colors.HexColor('#cbd5e0')))
|
||||
story.append(Spacer(1, 0.2*cm))
|
||||
story.append(Paragraph(
|
||||
f"Erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M Uhr')} | BreakPilot Abiturkorrektur-System",
|
||||
styles['MetaText']
|
||||
))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
# Backward-compat shim -- module moved to korrektur/pdf_export_overview.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("korrektur.pdf_export_overview")
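And the overview generator for a whole Klausur; again a hedged sketch with illustrative values, using only keys the function actually reads:

from korrektur.pdf_export_overview import generate_klausur_overview_pdf

klausur = {"title": "Abitur 2025", "subject": "Deutsch", "year": 2025, "semester": "Q2"}
students = [
    {"student_name": "Anonym 01", "raw_points": 82, "grade_points": 12, "status": "completed"},
    {"student_name": "Anonym 02", "raw_points": 55, "grade_points": 7, "status": "second_examiner"},
]

pdf_bytes = generate_klausur_overview_pdf(klausur, students)   # fairness_data is optional
with open("notenuebersicht.pdf", "wb") as fh:
    fh.write(pdf_bytes)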
|
||||
|
||||
@@ -1,110 +1,4 @@
|
||||
"""
|
||||
PDF Export - Constants and ReportLab styles for Abiturkorrektur PDFs.
|
||||
"""
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
|
||||
|
||||
# =============================================
|
||||
# CONSTANTS
|
||||
# =============================================
|
||||
|
||||
GRADE_POINTS_TO_NOTE = {
|
||||
15: "1+", 14: "1", 13: "1-",
|
||||
12: "2+", 11: "2", 10: "2-",
|
||||
9: "3+", 8: "3", 7: "3-",
|
||||
6: "4+", 5: "4", 4: "4-",
|
||||
3: "5+", 2: "5", 1: "5-",
|
||||
0: "6"
|
||||
}
|
||||
|
||||
CRITERIA_DISPLAY_NAMES = {
|
||||
"rechtschreibung": "Sprachliche Richtigkeit (Rechtschreibung)",
|
||||
"grammatik": "Sprachliche Richtigkeit (Grammatik)",
|
||||
"inhalt": "Inhaltliche Leistung",
|
||||
"struktur": "Aufbau und Struktur",
|
||||
"stil": "Ausdruck und Stil"
|
||||
}
|
||||
|
||||
CRITERIA_WEIGHTS = {
|
||||
"rechtschreibung": 15,
|
||||
"grammatik": 15,
|
||||
"inhalt": 40,
|
||||
"struktur": 15,
|
||||
"stil": 15
|
||||
}
|
||||
|
||||
|
||||
# =============================================
|
||||
# STYLES
|
||||
# =============================================
|
||||
|
||||
def get_custom_styles():
|
||||
"""Create custom paragraph styles for Gutachten."""
|
||||
styles = getSampleStyleSheet()
|
||||
|
||||
# Title style
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenTitle',
|
||||
parent=styles['Heading1'],
|
||||
fontSize=16,
|
||||
spaceAfter=12,
|
||||
alignment=TA_CENTER,
|
||||
textColor=colors.HexColor('#1e3a5f')
|
||||
))
|
||||
|
||||
# Subtitle style
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenSubtitle',
|
||||
parent=styles['Heading2'],
|
||||
fontSize=12,
|
||||
spaceAfter=8,
|
||||
spaceBefore=16,
|
||||
textColor=colors.HexColor('#2c5282')
|
||||
))
|
||||
|
||||
# Section header
|
||||
styles.add(ParagraphStyle(
|
||||
name='SectionHeader',
|
||||
parent=styles['Heading3'],
|
||||
fontSize=11,
|
||||
spaceAfter=6,
|
||||
spaceBefore=12,
|
||||
textColor=colors.HexColor('#2d3748'),
|
||||
borderColor=colors.HexColor('#e2e8f0'),
|
||||
borderWidth=0,
|
||||
borderPadding=0
|
||||
))
|
||||
|
||||
# Body text
|
||||
styles.add(ParagraphStyle(
|
||||
name='GutachtenBody',
|
||||
parent=styles['Normal'],
|
||||
fontSize=10,
|
||||
leading=14,
|
||||
alignment=TA_JUSTIFY,
|
||||
spaceAfter=6
|
||||
))
|
||||
|
||||
# Small text for footer/meta
|
||||
styles.add(ParagraphStyle(
|
||||
name='MetaText',
|
||||
parent=styles['Normal'],
|
||||
fontSize=8,
|
||||
textColor=colors.grey,
|
||||
alignment=TA_LEFT
|
||||
))
|
||||
|
||||
# List item
|
||||
styles.add(ParagraphStyle(
|
||||
name='ListItem',
|
||||
parent=styles['Normal'],
|
||||
fontSize=10,
|
||||
leftIndent=20,
|
||||
bulletIndent=10,
|
||||
spaceAfter=4
|
||||
))
|
||||
|
||||
return styles
|
||||
# Backward-compat shim -- module moved to korrektur/pdf_export_styles.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("korrektur.pdf_export_styles")
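Two invariants worth noting about the constants above: the 15..0 point scale maps onto the German note scale 1+ to 6, and the criteria weights are percentages that sum to 100. A quick check:

from korrektur.pdf_export_styles import GRADE_POINTS_TO_NOTE, CRITERIA_WEIGHTS

assert GRADE_POINTS_TO_NOTE[11] == "2"
assert GRADE_POINTS_TO_NOTE[0] == "6"
assert sum(CRITERIA_WEIGHTS.values()) == 100   # 15 + 15 + 40 + 15 + 15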
|
||||
|
||||
@@ -1,164 +1,4 @@
|
||||
"""
|
||||
PDF Extraction Module
|
||||
|
||||
NOTE: This module delegates ML-heavy operations to the embedding-service via HTTP.
|
||||
|
||||
Provides enhanced PDF text extraction using multiple backends (in embedding-service):
|
||||
1. Unstructured.io - Best for complex layouts, tables, headers (Apache 2.0)
|
||||
2. pypdf - Modern, BSD-licensed PDF library (recommended default)
|
||||
|
||||
License Compliance:
|
||||
- Default backends (unstructured, pypdf) are BSD/Apache licensed
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration (for backward compatibility - actual config in embedding-service)
|
||||
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
|
||||
PDF_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")
|
||||
|
||||
|
||||
class PDFExtractionError(Exception):
|
||||
"""Error during PDF extraction."""
|
||||
pass
|
||||
|
||||
|
||||
class PDFExtractionResult:
|
||||
"""Result of PDF extraction with metadata."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text: str,
|
||||
backend_used: str,
|
||||
pages: int = 0,
|
||||
elements: Optional[List[Dict]] = None,
|
||||
tables: Optional[List[Dict]] = None,
|
||||
metadata: Optional[Dict] = None,
|
||||
):
|
||||
self.text = text
|
||||
self.backend_used = backend_used
|
||||
self.pages = pages
|
||||
self.elements = elements or []
|
||||
self.tables = tables or []
|
||||
self.metadata = metadata or {}
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
return {
|
||||
"text": self.text,
|
||||
"backend_used": self.backend_used,
|
||||
"pages": self.pages,
|
||||
"element_count": len(self.elements),
|
||||
"table_count": len(self.tables),
|
||||
"metadata": self.metadata,
|
||||
}
|
||||
|
||||
|
||||
def _detect_available_backends() -> List[str]:
|
||||
"""Get available backends from embedding-service."""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=5.0) as client:
|
||||
response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
return data.get("available_pdf_backends", ["pypdf"])
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reach embedding-service: {e}")
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def extract_text_from_pdf_enhanced(
|
||||
pdf_content: bytes,
|
||||
backend: str = PDF_BACKEND,
|
||||
fallback: bool = True,
|
||||
) -> PDFExtractionResult:
|
||||
"""
|
||||
Extract text from PDF using embedding-service.
|
||||
|
||||
Args:
|
||||
pdf_content: PDF file content as bytes
|
||||
backend: Preferred backend (auto, unstructured, pypdf)
|
||||
fallback: If True, try other backends if preferred fails
|
||||
|
||||
Returns:
|
||||
PDFExtractionResult with extracted text and metadata
|
||||
"""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=120.0) as client:
|
||||
response = client.post(
|
||||
f"{EMBEDDING_SERVICE_URL}/extract-pdf",
|
||||
content=pdf_content,
|
||||
headers={"Content-Type": "application/octet-stream"}
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
return PDFExtractionResult(
|
||||
text=data.get("text", ""),
|
||||
backend_used=data.get("backend_used", "unknown"),
|
||||
pages=data.get("pages", 0),
|
||||
tables=[{"count": data.get("table_count", 0)}] if data.get("table_count", 0) > 0 else [],
|
||||
metadata={"embedding_service": True}
|
||||
)
|
||||
except httpx.TimeoutException:
|
||||
raise PDFExtractionError("PDF extraction timeout")
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise PDFExtractionError(f"PDF extraction error: {e.response.status_code}")
|
||||
except Exception as e:
|
||||
raise PDFExtractionError(f"Failed to extract PDF: {str(e)}")
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from PDF (simple interface).
|
||||
|
||||
This is a drop-in replacement for the original function
|
||||
that uses the embedding-service internally.
|
||||
"""
|
||||
result = extract_text_from_pdf_enhanced(pdf_content)
|
||||
return result.text
|
||||
|
||||
|
||||
def get_pdf_extraction_info() -> dict:
|
||||
"""Get information about PDF extraction configuration."""
|
||||
import httpx
|
||||
|
||||
try:
|
||||
with httpx.Client(timeout=5.0) as client:
|
||||
response = client.get(f"{EMBEDDING_SERVICE_URL}/models")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
available = data.get("available_pdf_backends", [])
|
||||
return {
|
||||
"configured_backend": data.get("pdf_backend", PDF_BACKEND),
|
||||
"available_backends": available,
|
||||
"recommended": "unstructured" if "unstructured" in available else "pypdf",
|
||||
"backend_licenses": {
|
||||
"unstructured": "Apache-2.0",
|
||||
"pypdf": "BSD-3-Clause",
|
||||
},
|
||||
"commercial_safe_backends": available,
|
||||
"embedding_service_url": EMBEDDING_SERVICE_URL,
|
||||
"embedding_service_available": True,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not reach embedding-service: {e}")
|
||||
|
||||
# Fallback when embedding-service is not available
|
||||
return {
|
||||
"configured_backend": PDF_BACKEND,
|
||||
"available_backends": [],
|
||||
"recommended": None,
|
||||
"backend_licenses": {},
|
||||
"commercial_safe_backends": [],
|
||||
"embedding_service_url": EMBEDDING_SERVICE_URL,
|
||||
"embedding_service_available": False,
|
||||
}
|
||||
# Backward-compat shim -- module moved to korrektur/pdf_extraction.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("korrektur.pdf_extraction")
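A short usage sketch for the HTTP-backed extraction; the embedding-service must be reachable at EMBEDDING_SERVICE_URL, and the scan file name is illustrative:

from korrektur.pdf_extraction import extract_text_from_pdf_enhanced, PDFExtractionError

with open("scan.pdf", "rb") as fh:
    pdf_bytes = fh.read()

try:
    result = extract_text_from_pdf_enhanced(pdf_bytes)
    print(result.backend_used, result.pages)
    print(result.text[:200])
except PDFExtractionError as exc:
    # Raised on timeouts, HTTP error statuses, or when the service is unreachable.
    print(f"extraction failed: {exc}")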
|
||||
|
||||
@@ -1,38 +1,4 @@
|
||||
"""
|
||||
RBAC/ABAC Policy System for Klausur-Service (barrel re-export)
|
||||
|
||||
This module was split into:
|
||||
- rbac_types.py (Enums, data structures)
|
||||
- rbac_permissions.py (Permission matrix)
|
||||
- rbac_engine.py (PolicyEngine, default policies, API guards)
|
||||
|
||||
All public symbols are re-exported here for backwards compatibility.
|
||||
"""
|
||||
|
||||
# Types and enums
|
||||
from rbac_types import ( # noqa: F401
|
||||
Role,
|
||||
Action,
|
||||
ResourceType,
|
||||
ZKVisibilityMode,
|
||||
EHVisibilityMode,
|
||||
VerfahrenType,
|
||||
PolicySet,
|
||||
RoleAssignment,
|
||||
KeyShare,
|
||||
Tenant,
|
||||
Namespace,
|
||||
ExamPackage,
|
||||
)
|
||||
|
||||
# Permission matrix
|
||||
from rbac_permissions import DEFAULT_PERMISSIONS # noqa: F401
|
||||
|
||||
# Engine, policies, guards
|
||||
from rbac_engine import ( # noqa: F401
|
||||
PolicyEngine,
|
||||
create_default_policy_sets,
|
||||
get_policy_engine,
|
||||
require_permission,
|
||||
require_role,
|
||||
)
|
||||
# Backward-compat shim -- module moved to compliance/rbac.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("compliance.rbac")
|
||||
|
||||
@@ -1,498 +1,4 @@
|
||||
"""
|
||||
RBAC Policy Engine
|
||||
|
||||
Core engine for RBAC/ABAC permission checks,
|
||||
role assignments, key shares, and default policies.
|
||||
Extracted from rbac.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Dict, Set
|
||||
from datetime import datetime, timezone
|
||||
import uuid
|
||||
from functools import wraps
|
||||
|
||||
from fastapi import HTTPException, Request
|
||||
|
||||
from rbac_types import (
|
||||
Role,
|
||||
Action,
|
||||
ResourceType,
|
||||
ZKVisibilityMode,
|
||||
PolicySet,
|
||||
RoleAssignment,
|
||||
KeyShare,
|
||||
)
|
||||
from rbac_permissions import DEFAULT_PERMISSIONS
|
||||
|
||||
|
||||
# =============================================
|
||||
# POLICY ENGINE
|
||||
# =============================================
|
||||
|
||||
class PolicyEngine:
|
||||
"""
|
||||
Engine fuer RBAC/ABAC Entscheidungen.
|
||||
|
||||
Prueft:
|
||||
1. Basis-Rollenberechtigung (RBAC)
|
||||
2. Policy-Einschraenkungen (ABAC)
|
||||
3. Key Share Berechtigungen
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.policy_sets: Dict[str, PolicySet] = {}
|
||||
self.role_assignments: Dict[str, List[RoleAssignment]] = {} # user_id -> assignments
|
||||
self.key_shares: Dict[str, List[KeyShare]] = {} # user_id -> shares
|
||||
|
||||
def register_policy_set(self, policy: PolicySet):
|
||||
"""Registriere ein Policy Set."""
|
||||
self.policy_sets[policy.id] = policy
|
||||
|
||||
def get_policy_for_context(
|
||||
self,
|
||||
bundesland: str,
|
||||
jahr: int,
|
||||
fach: Optional[str] = None,
|
||||
verfahren: str = "abitur"
|
||||
) -> Optional[PolicySet]:
|
||||
"""Finde das passende Policy Set fuer einen Kontext."""
|
||||
# Exakte Uebereinstimmung
|
||||
for policy in self.policy_sets.values():
|
||||
if (policy.bundesland == bundesland and
|
||||
policy.jahr == jahr and
|
||||
policy.verfahren == verfahren):
|
||||
if policy.fach is None or policy.fach == fach:
|
||||
return policy
|
||||
|
||||
# Fallback: Default Policy
|
||||
for policy in self.policy_sets.values():
|
||||
if policy.bundesland == "DEFAULT":
|
||||
return policy
|
||||
|
||||
return None
|
||||
|
||||
def assign_role(
|
||||
self,
|
||||
user_id: str,
|
||||
role: Role,
|
||||
resource_type: ResourceType,
|
||||
resource_id: str,
|
||||
granted_by: str,
|
||||
tenant_id: Optional[str] = None,
|
||||
namespace_id: Optional[str] = None,
|
||||
valid_to: Optional[datetime] = None
|
||||
) -> RoleAssignment:
|
||||
"""Weise einem User eine Rolle zu."""
|
||||
assignment = RoleAssignment(
|
||||
id=str(uuid.uuid4()),
|
||||
user_id=user_id,
|
||||
role=role,
|
||||
resource_type=resource_type,
|
||||
resource_id=resource_id,
|
||||
tenant_id=tenant_id,
|
||||
namespace_id=namespace_id,
|
||||
granted_by=granted_by,
|
||||
valid_to=valid_to
|
||||
)
|
||||
|
||||
if user_id not in self.role_assignments:
|
||||
self.role_assignments[user_id] = []
|
||||
self.role_assignments[user_id].append(assignment)
|
||||
|
||||
return assignment
|
||||
|
||||
def revoke_role(self, assignment_id: str, revoked_by: str) -> bool:
|
||||
"""Widerrufe eine Rollenzuweisung."""
|
||||
for user_assignments in self.role_assignments.values():
|
||||
for assignment in user_assignments:
|
||||
if assignment.id == assignment_id:
|
||||
assignment.revoked_at = datetime.now(timezone.utc)
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_user_roles(
|
||||
self,
|
||||
user_id: str,
|
||||
resource_type: Optional[ResourceType] = None,
|
||||
resource_id: Optional[str] = None
|
||||
) -> List[Role]:
|
||||
"""Hole alle aktiven Rollen eines Users."""
|
||||
assignments = self.role_assignments.get(user_id, [])
|
||||
roles = []
|
||||
|
||||
for assignment in assignments:
|
||||
if not assignment.is_active():
|
||||
continue
|
||||
if resource_type and assignment.resource_type != resource_type:
|
||||
continue
|
||||
if resource_id and assignment.resource_id != resource_id:
|
||||
continue
|
||||
roles.append(assignment.role)
|
||||
|
||||
return list(set(roles))
|
||||
|
||||
def create_key_share(
|
||||
self,
|
||||
user_id: str,
|
||||
package_id: str,
|
||||
permissions: Set[str],
|
||||
granted_by: str,
|
||||
scope: str = "full",
|
||||
invite_token: Optional[str] = None
|
||||
) -> KeyShare:
|
||||
"""Erstelle einen Key Share."""
|
||||
share = KeyShare(
|
||||
id=str(uuid.uuid4()),
|
||||
user_id=user_id,
|
||||
package_id=package_id,
|
||||
permissions=permissions,
|
||||
scope=scope,
|
||||
granted_by=granted_by,
|
||||
invite_token=invite_token
|
||||
)
|
||||
|
||||
if user_id not in self.key_shares:
|
||||
self.key_shares[user_id] = []
|
||||
self.key_shares[user_id].append(share)
|
||||
|
||||
return share
|
||||
|
||||
def accept_key_share(self, share_id: str, token: str) -> bool:
|
||||
"""Akzeptiere einen Key Share via Invite Token."""
|
||||
for user_shares in self.key_shares.values():
|
||||
for share in user_shares:
|
||||
if share.id == share_id and share.invite_token == token:
|
||||
share.accepted_at = datetime.now(timezone.utc)
|
||||
return True
|
||||
return False
|
||||
|
||||
def revoke_key_share(self, share_id: str, revoked_by: str) -> bool:
|
||||
"""Widerrufe einen Key Share."""
|
||||
for user_shares in self.key_shares.values():
|
||||
for share in user_shares:
|
||||
if share.id == share_id:
|
||||
share.revoked_at = datetime.now(timezone.utc)
|
||||
share.revoked_by = revoked_by
|
||||
return True
|
||||
return False
|
||||
|
||||
def check_permission(
|
||||
self,
|
||||
user_id: str,
|
||||
action: Action,
|
||||
resource_type: ResourceType,
|
||||
resource_id: str,
|
||||
policy: Optional[PolicySet] = None,
|
||||
package_id: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Pruefe ob ein User eine Aktion ausfuehren darf.
|
||||
|
||||
Prueft:
|
||||
1. Basis-RBAC
|
||||
2. Policy-Einschraenkungen
|
||||
3. Key Share (falls package_id angegeben)
|
||||
"""
|
||||
# 1. Hole aktive Rollen
|
||||
roles = self.get_user_roles(user_id, resource_type, resource_id)
|
||||
|
||||
if not roles:
|
||||
return False
|
||||
|
||||
# 2. Pruefe Basis-RBAC
|
||||
has_permission = False
|
||||
for role in roles:
|
||||
role_permissions = DEFAULT_PERMISSIONS.get(role, {})
|
||||
resource_permissions = role_permissions.get(resource_type, set())
|
||||
if action in resource_permissions:
|
||||
has_permission = True
|
||||
break
|
||||
|
||||
if not has_permission:
|
||||
return False
|
||||
|
||||
# 3. Pruefe Policy-Einschraenkungen
|
||||
if policy:
|
||||
# ZK Visibility Mode
|
||||
if Role.ZWEITKORREKTOR in roles:
|
||||
if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
|
||||
# Blind: ZK darf EK-Outputs nicht sehen
|
||||
if resource_type in [ResourceType.EVALUATION, ResourceType.REPORT, ResourceType.GRADE_DECISION]:
|
||||
if action == Action.READ:
|
||||
# Pruefe ob es EK-Outputs sind (muesste ueber Metadaten geprueft werden)
|
||||
pass # Implementierung abhaengig von Datenmodell
|
||||
|
||||
elif policy.zk_visibility_mode == ZKVisibilityMode.SEMI:
|
||||
# Semi: ZK sieht Annotationen, aber keine Note
|
||||
if resource_type == ResourceType.GRADE_DECISION and action == Action.READ:
|
||||
return False
|
||||
|
||||
# 4. Pruefe Key Share (falls Package-basiert)
|
||||
if package_id:
|
||||
user_shares = self.key_shares.get(user_id, [])
|
||||
has_key_share = any(
|
||||
share.package_id == package_id and share.is_active()
|
||||
for share in user_shares
|
||||
)
|
||||
if not has_key_share:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_allowed_actions(
|
||||
self,
|
||||
user_id: str,
|
||||
resource_type: ResourceType,
|
||||
resource_id: str,
|
||||
policy: Optional[PolicySet] = None
|
||||
) -> Set[Action]:
|
||||
"""Hole alle erlaubten Aktionen fuer einen User auf einer Ressource."""
|
||||
roles = self.get_user_roles(user_id, resource_type, resource_id)
|
||||
allowed = set()
|
||||
|
||||
for role in roles:
|
||||
role_permissions = DEFAULT_PERMISSIONS.get(role, {})
|
||||
resource_permissions = role_permissions.get(resource_type, set())
|
||||
allowed.update(resource_permissions)
|
||||
|
||||
# Policy-Einschraenkungen anwenden
|
||||
if policy and Role.ZWEITKORREKTOR in roles:
|
||||
if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
|
||||
# Entferne READ fuer bestimmte Ressourcen
|
||||
pass # Detailimplementierung
|
||||
|
||||
return allowed
|
||||
|
||||
|
||||
# =============================================
|
||||
# DEFAULT POLICY SETS (alle Bundeslaender)
|
||||
# =============================================
|
||||
|
||||
def create_default_policy_sets() -> List[PolicySet]:
|
||||
"""
|
||||
Erstelle Default Policy Sets fuer alle Bundeslaender.
|
||||
|
||||
Diese koennen spaeter pro Land verfeinert werden.
|
||||
"""
|
||||
bundeslaender = [
|
||||
"baden-wuerttemberg", "bayern", "berlin", "brandenburg",
|
||||
"bremen", "hamburg", "hessen", "mecklenburg-vorpommern",
|
||||
"niedersachsen", "nordrhein-westfalen", "rheinland-pfalz",
|
||||
"saarland", "sachsen", "sachsen-anhalt", "schleswig-holstein",
|
||||
"thueringen"
|
||||
]
|
||||
|
||||
policies = []
|
||||
|
||||
# Default Policy (Fallback)
|
||||
policies.append(PolicySet(
|
||||
id="DEFAULT-2025",
|
||||
bundesland="DEFAULT",
|
||||
jahr=2025,
|
||||
fach=None,
|
||||
verfahren="abitur",
|
||||
zk_visibility_mode=ZKVisibilityMode.FULL,
|
||||
eh_visibility_mode=PolicySet.__dataclass_fields__["eh_visibility_mode"].default,
|
||||
allow_teacher_uploaded_eh=True,
|
||||
allow_land_uploaded_eh=True,
|
||||
require_rights_confirmation_on_upload=True,
|
||||
third_correction_threshold=4,
|
||||
final_signoff_role="fachvorsitz"
|
||||
))
|
||||
|
||||
# Niedersachsen (Beispiel mit spezifischen Anpassungen)
|
||||
policies.append(PolicySet(
|
||||
id="NI-2025-ABITUR",
|
||||
bundesland="niedersachsen",
|
||||
jahr=2025,
|
||||
fach=None,
|
||||
verfahren="abitur",
|
||||
zk_visibility_mode=ZKVisibilityMode.FULL, # In NI sieht ZK alles
|
||||
allow_teacher_uploaded_eh=True,
|
||||
allow_land_uploaded_eh=True,
|
||||
require_rights_confirmation_on_upload=True,
|
||||
third_correction_threshold=4,
|
||||
final_signoff_role="fachvorsitz",
|
||||
export_template_id="niedersachsen-abitur"
|
||||
))
|
||||
|
||||
# Bayern (Beispiel mit SEMI visibility)
|
||||
policies.append(PolicySet(
|
||||
id="BY-2025-ABITUR",
|
||||
bundesland="bayern",
|
||||
jahr=2025,
|
||||
fach=None,
|
||||
verfahren="abitur",
|
||||
zk_visibility_mode=ZKVisibilityMode.SEMI, # ZK sieht Annotationen, nicht Note
|
||||
allow_teacher_uploaded_eh=True,
|
||||
allow_land_uploaded_eh=True,
|
||||
require_rights_confirmation_on_upload=True,
|
||||
third_correction_threshold=4,
|
||||
final_signoff_role="fachvorsitz",
|
||||
export_template_id="bayern-abitur"
|
||||
))
|
||||
|
||||
# NRW (Beispiel)
|
||||
policies.append(PolicySet(
|
||||
id="NW-2025-ABITUR",
|
||||
bundesland="nordrhein-westfalen",
|
||||
jahr=2025,
|
||||
fach=None,
|
||||
verfahren="abitur",
|
||||
zk_visibility_mode=ZKVisibilityMode.FULL,
|
||||
allow_teacher_uploaded_eh=True,
|
||||
allow_land_uploaded_eh=True,
|
||||
require_rights_confirmation_on_upload=True,
|
||||
third_correction_threshold=4,
|
||||
final_signoff_role="fachvorsitz",
|
||||
export_template_id="nrw-abitur"
|
||||
))
|
||||
|
||||
# Generiere Basis-Policies fuer alle anderen Bundeslaender
|
||||
for bl in bundeslaender:
|
||||
if bl not in ["niedersachsen", "bayern", "nordrhein-westfalen"]:
|
||||
policies.append(PolicySet(
|
||||
id=f"{bl[:2].upper()}-2025-ABITUR",
|
||||
bundesland=bl,
|
||||
jahr=2025,
|
||||
fach=None,
|
||||
verfahren="abitur",
|
||||
zk_visibility_mode=ZKVisibilityMode.FULL,
|
||||
allow_teacher_uploaded_eh=True,
|
||||
allow_land_uploaded_eh=True,
|
||||
require_rights_confirmation_on_upload=True,
|
||||
third_correction_threshold=4,
|
||||
final_signoff_role="fachvorsitz"
|
||||
))
|
||||
|
||||
return policies
|
||||
|
||||
|
||||
# =============================================
|
||||
# GLOBAL POLICY ENGINE INSTANCE
|
||||
# =============================================
|
||||
|
||||
# Singleton Policy Engine
|
||||
_policy_engine: Optional[PolicyEngine] = None
|
||||
|
||||
|
||||
def get_policy_engine() -> PolicyEngine:
|
||||
"""Hole die globale Policy Engine Instanz."""
|
||||
global _policy_engine
|
||||
if _policy_engine is None:
|
||||
_policy_engine = PolicyEngine()
|
||||
# Registriere Default Policies
|
||||
for policy in create_default_policy_sets():
|
||||
_policy_engine.register_policy_set(policy)
|
||||
return _policy_engine
|
||||
|
||||
|
||||
# =============================================
|
||||
# API GUARDS (Decorators fuer FastAPI)
|
||||
# =============================================
|
||||
|
||||
def require_permission(
|
||||
action: Action,
|
||||
resource_type: ResourceType,
|
||||
resource_id_param: str = "resource_id"
|
||||
):
|
||||
"""
|
||||
Decorator fuer FastAPI Endpoints.
|
||||
|
||||
Prueft ob der aktuelle User die angegebene Berechtigung hat.
|
||||
|
||||
Usage:
|
||||
@app.get("/api/v1/packages/{package_id}")
|
||||
@require_permission(Action.READ, ResourceType.EXAM_PACKAGE, "package_id")
|
||||
async def get_package(package_id: str, request: Request):
|
||||
...
|
||||
"""
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
request = kwargs.get('request')
|
||||
if not request:
|
||||
for arg in args:
|
||||
if isinstance(arg, Request):
|
||||
request = arg
|
||||
break
|
||||
|
||||
if not request:
|
||||
raise HTTPException(status_code=500, detail="Request not found")
|
||||
|
||||
# User aus Token holen
|
||||
user = getattr(request.state, 'user', None)
|
||||
if not user:
|
||||
raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
|
||||
user_id = user.get('user_id')
|
||||
resource_id = kwargs.get(resource_id_param)
|
||||
|
||||
# Policy Engine pruefen
|
||||
engine = get_policy_engine()
|
||||
|
||||
# Optional: Policy aus Kontext laden
|
||||
policy = None
|
||||
bundesland = user.get('bundesland')
|
||||
if bundesland:
|
||||
policy = engine.get_policy_for_context(bundesland, 2025)
|
||||
|
||||
if not engine.check_permission(
|
||||
user_id=user_id,
|
||||
action=action,
|
||||
resource_type=resource_type,
|
||||
resource_id=resource_id,
|
||||
policy=policy
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=403,
|
||||
detail=f"Permission denied: {action.value} on {resource_type.value}"
|
||||
)
|
||||
|
||||
return await func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
def require_role(role: Role):
|
||||
"""
|
||||
Decorator der prueft ob User eine bestimmte Rolle hat.
|
||||
|
||||
Usage:
|
||||
@app.post("/api/v1/eh/publish")
|
||||
@require_role(Role.LAND_ADMIN)
|
||||
async def publish_eh(request: Request):
|
||||
...
|
||||
"""
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
request = kwargs.get('request')
|
||||
if not request:
|
||||
for arg in args:
|
||||
if isinstance(arg, Request):
|
||||
request = arg
|
||||
break
|
||||
|
||||
if not request:
|
||||
raise HTTPException(status_code=500, detail="Request not found")
|
||||
|
||||
user = getattr(request.state, 'user', None)
|
||||
if not user:
|
||||
raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
|
||||
user_id = user.get('user_id')
|
||||
engine = get_policy_engine()
|
||||
|
||||
user_roles = engine.get_user_roles(user_id)
|
||||
if role not in user_roles:
|
||||
raise HTTPException(
|
||||
status_code=403,
|
||||
detail=f"Role required: {role.value}"
|
||||
)
|
||||
|
||||
return await func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
# Backward-compat shim -- module moved to compliance/rbac_engine.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("compliance.rbac_engine")
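End to end, the engine is used via the singleton: assign a role, then ask for a permission decision. A sketch using only symbols the rbac barrel re-exports; the IDs are illustrative:

from rbac import Role, Action, ResourceType, get_policy_engine

engine = get_policy_engine()   # singleton, pre-loaded with the default policy sets

engine.assign_role(
    user_id="user-123",
    role=Role.ERSTKORREKTOR,
    resource_type=ResourceType.STUDENT_WORK,
    resource_id="work-42",
    granted_by="admin-1",
)

allowed = engine.check_permission(
    user_id="user-123",
    action=Action.READ,
    resource_type=ResourceType.STUDENT_WORK,
    resource_id="work-42",
)
print(allowed)   # True: ERSTKORREKTOR has READ on STUDENT_WORK in DEFAULT_PERMISSIONS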
|
||||
|
||||
@@ -1,221 +1,4 @@
|
||||
"""
|
||||
RBAC Permission Matrix
|
||||
|
||||
Default role-to-resource permission mappings for
|
||||
Klausur-Korrektur and Zeugnis workflows.
|
||||
Extracted from rbac.py for file-size compliance.
|
||||
"""
|
||||
|
||||
from typing import Dict, Set
|
||||
|
||||
from rbac_types import Role, Action, ResourceType
|
||||
|
||||
|
||||
# =============================================
|
||||
# RBAC PERMISSION MATRIX
|
||||
# =============================================
|
||||
|
||||
# Standard-Berechtigungsmatrix (kann durch Policies ueberschrieben werden)
|
||||
DEFAULT_PERMISSIONS: Dict[Role, Dict[ResourceType, Set[Action]]] = {
|
||||
# Erstkorrektor
|
||||
Role.ERSTKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.SHARE_KEY, Action.LOCK},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
|
||||
ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Zweitkorrektor (Standard: FULL visibility)
|
||||
Role.ZWEITKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.RUBRIC: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Drittkorrektor
|
||||
Role.DRITTKORREKTOR: {
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.RUBRIC: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Fachvorsitz
|
||||
Role.FACHVORSITZ: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.LOCK, Action.UNLOCK, Action.SIGN_OFF},
|
||||
ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
|
||||
ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ANNOTATION: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EVALUATION: {Action.READ, Action.UPDATE},
|
||||
ResourceType.REPORT: {Action.READ, Action.UPDATE},
|
||||
ResourceType.GRADE_DECISION: {Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Pruefungsvorsitz
|
||||
Role.PRUEFUNGSVORSITZ: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ, Action.SIGN_OFF},
|
||||
ResourceType.STUDENT_WORK: {Action.READ},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
ResourceType.GRADE_DECISION: {Action.READ, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Schul-Admin
|
||||
Role.SCHUL_ADMIN: {
|
||||
ResourceType.TENANT: {Action.READ, Action.UPDATE},
|
||||
ResourceType.NAMESPACE: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.EXAM_PACKAGE: {Action.CREATE, Action.READ, Action.DELETE, Action.ASSIGN_ROLE},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.DELETE},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Land-Admin (Behoerde)
|
||||
Role.LAND_ADMIN: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE, Action.DELETE, Action.PUBLISH_OFFICIAL},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Auditor
|
||||
Role.AUDITOR: {
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ}, # Nur Metadaten
|
||||
# Kein Zugriff auf Inhalte!
|
||||
},
|
||||
|
||||
# Operator
|
||||
Role.OPERATOR: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.EXAM_PACKAGE: {Action.READ}, # Nur Metadaten
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
# Break-glass separat gehandhabt
|
||||
},
|
||||
|
||||
# Teacher Assistant
|
||||
Role.TEACHER_ASSISTANT: {
|
||||
ResourceType.STUDENT_WORK: {Action.READ},
|
||||
ResourceType.ANNOTATION: {Action.CREATE, Action.READ}, # Nur bestimmte Typen
|
||||
ResourceType.EH_DOCUMENT: {Action.READ},
|
||||
},
|
||||
|
||||
# Exam Author (nur Vorabi)
|
||||
Role.EXAM_AUTHOR: {
|
||||
ResourceType.EH_DOCUMENT: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.RUBRIC: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
},
|
||||
|
||||
# =============================================
|
||||
# ZEUGNIS-WORKFLOW ROLLEN
|
||||
# =============================================
|
||||
|
||||
# Klassenlehrer - Erstellt Zeugnisse, Kopfnoten, Bemerkungen
|
||||
Role.KLASSENLEHRER: {
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.ZEUGNIS: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FACHNOTE: {Action.READ}, # Liest Fachnoten der Fachlehrer
|
||||
ResourceType.KOPFNOTE: {Action.CREATE, Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ, Action.UPDATE},
|
||||
ResourceType.BEMERKUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
|
||||
ResourceType.VERSETZUNG: {Action.READ},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Fachlehrer - Traegt Fachnoten ein
|
||||
Role.FACHLEHRER: {
|
||||
ResourceType.NAMESPACE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ}, # Nur eigene Schueler
|
||||
ResourceType.FACHNOTE: {Action.CREATE, Action.READ, Action.UPDATE}, # Nur eigenes Fach
|
||||
ResourceType.BEMERKUNG: {Action.CREATE, Action.READ}, # Fachbezogene Bemerkungen
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Zeugnisbeauftragter - Qualitaetskontrolle
|
||||
Role.ZEUGNISBEAUFTRAGTER: {
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE, Action.UPLOAD},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.VERSETZUNG: {Action.READ},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Sekretariat - Druck, Versand, Archivierung
|
||||
Role.SEKRETARIAT: {
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ}, # Fuer Adressdaten
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Schulleitung - Finale Zeugnis-Freigabe
|
||||
Role.SCHULLEITUNG: {
|
||||
ResourceType.TENANT: {Action.READ},
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.SIGN_OFF, Action.LOCK},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.KONFERENZ_BESCHLUSS: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.VERSETZUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
|
||||
ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
|
||||
# Stufenleitung - Stufenkoordination (z.B. Oberstufe)
|
||||
Role.STUFENLEITUNG: {
|
||||
ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
|
||||
ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
|
||||
ResourceType.SCHUELER_DATEN: {Action.READ},
|
||||
ResourceType.FACHNOTE: {Action.READ},
|
||||
ResourceType.KOPFNOTE: {Action.READ},
|
||||
ResourceType.FEHLZEITEN: {Action.READ},
|
||||
ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.KONFERENZ_BESCHLUSS: {Action.READ},
|
||||
ResourceType.VERSETZUNG: {Action.READ, Action.UPDATE},
|
||||
ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
|
||||
ResourceType.AUDIT_LOG: {Action.READ},
|
||||
},
|
||||
}
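# Illustrative sketch (not part of the refactored module): one way the static
# matrix can back a coarse permission check before ABAC policies are applied.
# The helper name is hypothetical; it only uses names defined in this module.
def has_default_permission(role: Role, resource: ResourceType, action: Action) -> bool:
    """True if the default matrix grants `action` on `resource` for `role`."""
    return action in DEFAULT_PERMISSIONS.get(role, {}).get(resource, set())

# Example: an Erstkorrektor may lock an exam package, an Auditor may not.
# has_default_permission(Role.ERSTKORREKTOR, ResourceType.EXAM_PACKAGE, Action.LOCK)  -> True
# has_default_permission(Role.AUDITOR, ResourceType.EXAM_PACKAGE, Action.LOCK)        -> False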

# Backward-compat shim -- module moved to compliance/rbac_permissions.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("compliance.rbac_permissions")
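# Illustrative sketch (not part of the diff): what the shim above buys callers.
# Assuming backend/ is on sys.path, the old flat import resolves to the moved
# module, because the shim swaps itself out of sys.modules while it is imported.
import rbac_permissions                         # old flat name, goes through the shim
from compliance import rbac_permissions as _relocated
assert rbac_permissions is _relocated           # both names refer to the same module object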

@@ -1,438 +1,4 @@
"""
RBAC/ABAC Type Definitions

Enums, data structures, and models for the policy system.
Extracted from rbac.py for file-size compliance.
"""

import json
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Optional, List, Dict, Set, Any
|
||||
from datetime import datetime, timezone
|
||||
import uuid
|
||||
|
||||
|
||||
# =============================================
|
||||
# ENUMS: Roles, Actions, Resources
|
||||
# =============================================
|
||||
|
||||
class Role(str, Enum):
|
||||
"""Fachliche Rollen in Korrektur- und Zeugniskette."""
|
||||
|
||||
# === Klausur-Korrekturkette ===
|
||||
ERSTKORREKTOR = "erstkorrektor" # EK
|
||||
ZWEITKORREKTOR = "zweitkorrektor" # ZK
|
||||
DRITTKORREKTOR = "drittkorrektor" # DK
|
||||
|
||||
# === Zeugnis-Workflow ===
|
||||
KLASSENLEHRER = "klassenlehrer" # KL - Erstellt Zeugnis, Kopfnoten, Bemerkungen
|
||||
FACHLEHRER = "fachlehrer" # FL - Traegt Fachnoten ein
|
||||
ZEUGNISBEAUFTRAGTER = "zeugnisbeauftragter" # ZB - Qualitaetskontrolle
|
||||
SEKRETARIAT = "sekretariat" # SEK - Druck, Versand, Archivierung
|
||||
|
||||
# === Leitung (Klausur + Zeugnis) ===
|
||||
FACHVORSITZ = "fachvorsitz" # FVL - Fachpruefungsleitung
|
||||
PRUEFUNGSVORSITZ = "pruefungsvorsitz" # PV - Schulleitung / Pruefungsvorsitz
|
||||
SCHULLEITUNG = "schulleitung" # SL - Finale Zeugnis-Freigabe
|
||||
STUFENLEITUNG = "stufenleitung" # STL - Stufenkoordination
|
||||
|
||||
# === Administration ===
|
||||
SCHUL_ADMIN = "schul_admin" # SA
|
||||
LAND_ADMIN = "land_admin" # LA - Behoerde
|
||||
|
||||
# === Spezial ===
|
||||
AUDITOR = "auditor" # DSB/Auditor
|
||||
OPERATOR = "operator" # OPS - Support
|
||||
TEACHER_ASSISTANT = "teacher_assistant" # TA - Referendar
|
||||
EXAM_AUTHOR = "exam_author" # EA - nur Vorabi
|
||||
|
||||
|
||||
class Action(str, Enum):
|
||||
"""Moegliche Operationen auf Ressourcen."""
|
||||
CREATE = "create"
|
||||
READ = "read"
|
||||
UPDATE = "update"
|
||||
DELETE = "delete"
|
||||
|
||||
ASSIGN_ROLE = "assign_role"
|
||||
INVITE_USER = "invite_user"
|
||||
REMOVE_USER = "remove_user"
|
||||
|
||||
UPLOAD = "upload"
|
||||
DOWNLOAD = "download"
|
||||
|
||||
LOCK = "lock" # Finalisieren
|
||||
UNLOCK = "unlock" # Nur mit Sonderrecht
|
||||
SIGN_OFF = "sign_off" # Freigabe
|
||||
|
||||
SHARE_KEY = "share_key" # Key Share erzeugen
|
||||
VIEW_PII = "view_pii" # Falls PII vorhanden
|
||||
BREAK_GLASS = "break_glass" # Notfallzugriff
|
||||
|
||||
PUBLISH_OFFICIAL = "publish_official" # Amtliche EH verteilen
|
||||
|
||||
|
||||
class ResourceType(str, Enum):
|
||||
"""Ressourcentypen im System."""
|
||||
TENANT = "tenant"
|
||||
NAMESPACE = "namespace"
|
||||
|
||||
# === Klausur-Korrektur ===
|
||||
EXAM_PACKAGE = "exam_package"
|
||||
STUDENT_WORK = "student_work"
|
||||
EH_DOCUMENT = "eh_document"
|
||||
RUBRIC = "rubric" # Punkteraster
|
||||
ANNOTATION = "annotation"
|
||||
EVALUATION = "evaluation" # Kriterien/Punkte
|
||||
REPORT = "report" # Gutachten
|
||||
GRADE_DECISION = "grade_decision"
|
||||
|
||||
# === Zeugnisgenerator ===
|
||||
ZEUGNIS = "zeugnis" # Zeugnisdokument
|
||||
ZEUGNIS_VORLAGE = "zeugnis_vorlage" # Zeugnisvorlage/Template
|
||||
ZEUGNIS_ENTWURF = "zeugnis_entwurf" # Zeugnisentwurf (vor Freigabe)
|
||||
SCHUELER_DATEN = "schueler_daten" # Schueler-Stammdaten, Noten
|
||||
FACHNOTE = "fachnote" # Einzelne Fachnote
|
||||
KOPFNOTE = "kopfnote" # Arbeits-/Sozialverhalten
|
||||
FEHLZEITEN = "fehlzeiten" # Fehlzeiten
|
||||
BEMERKUNG = "bemerkung" # Zeugnisbemerkungen
|
||||
KONFERENZ_BESCHLUSS = "konferenz_beschluss" # Konferenzergebnis
|
||||
VERSETZUNG = "versetzung" # Versetzungsentscheidung
|
||||
|
||||
# === Allgemein ===
|
||||
DOCUMENT = "document" # Generischer Dokumenttyp (EH, Vorlagen, etc.)
|
||||
TEMPLATE = "template" # Generische Vorlagen
|
||||
EXPORT = "export"
|
||||
AUDIT_LOG = "audit_log"
|
||||
KEY_MATERIAL = "key_material"
|
||||
|
||||
|
||||
class ZKVisibilityMode(str, Enum):
    """Sichtbarkeitsmodus fuer Zweitkorrektoren."""
    BLIND = "blind"   # ZK sieht keine EK-Note/Gutachten
    SEMI = "semi"     # ZK sieht Annotationen, aber keine Note
    FULL = "full"     # ZK sieht alles


class EHVisibilityMode(str, Enum):
    """Sichtbarkeitsmodus fuer Erwartungshorizonte."""
    BLIND = "blind"    # ZK sieht EH nicht (selten)
    SHARED = "shared"  # ZK sieht EH (Standard)


class VerfahrenType(str, Enum):
|
||||
"""Verfahrenstypen fuer Klausuren und Zeugnisse."""
|
||||
|
||||
# === Klausur/Pruefungsverfahren ===
|
||||
ABITUR = "abitur"
|
||||
VORABITUR = "vorabitur"
|
||||
KLAUSUR = "klausur"
|
||||
NACHPRUEFUNG = "nachpruefung"
|
||||
|
||||
# === Zeugnisverfahren ===
|
||||
HALBJAHRESZEUGNIS = "halbjahreszeugnis"
|
||||
JAHRESZEUGNIS = "jahreszeugnis"
|
||||
ABSCHLUSSZEUGNIS = "abschlusszeugnis"
|
||||
ABGANGSZEUGNIS = "abgangszeugnis"
|
||||
|
||||
@classmethod
|
||||
def is_exam_type(cls, verfahren: str) -> bool:
|
||||
"""Pruefe ob Verfahren ein Pruefungstyp ist."""
|
||||
exam_types = {cls.ABITUR, cls.VORABITUR, cls.KLAUSUR, cls.NACHPRUEFUNG}
|
||||
try:
|
||||
return cls(verfahren) in exam_types
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_certificate_type(cls, verfahren: str) -> bool:
|
||||
"""Pruefe ob Verfahren ein Zeugnistyp ist."""
|
||||
cert_types = {cls.HALBJAHRESZEUGNIS, cls.JAHRESZEUGNIS, cls.ABSCHLUSSZEUGNIS, cls.ABGANGSZEUGNIS}
|
||||
try:
|
||||
return cls(verfahren) in cert_types
|
||||
except ValueError:
|
||||
return False
|
||||
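# Illustrative sketch (not part of the module): routing on the Verfahren string.
# A PolicySet carries `verfahren` as a plain string, so the classmethods above
# let callers branch without constructing the enum themselves.
def describe_verfahren(verfahren: str) -> str:
    if VerfahrenType.is_exam_type(verfahren):
        return "exam workflow (Korrekturkette)"
    if VerfahrenType.is_certificate_type(verfahren):
        return "certificate workflow (Zeugniskette)"
    return "unknown verfahren"

# describe_verfahren("abitur")            -> "exam workflow (Korrekturkette)"
# describe_verfahren("halbjahreszeugnis") -> "certificate workflow (Zeugniskette)"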
|
||||
|
||||
# =============================================
|
||||
# DATA STRUCTURES
|
||||
# =============================================
|
||||
|
||||
@dataclass
|
||||
class PolicySet:
|
||||
"""
|
||||
Policy-Konfiguration pro Bundesland/Jahr/Fach.
|
||||
|
||||
Ermoeglicht bundesland-spezifische Unterschiede ohne
|
||||
harte Codierung im Quellcode.
|
||||
|
||||
Unterstuetzte Verfahrenstypen:
|
||||
- Pruefungen: abitur, vorabitur, klausur, nachpruefung
|
||||
- Zeugnisse: halbjahreszeugnis, jahreszeugnis, abschlusszeugnis, abgangszeugnis
|
||||
"""
|
||||
id: str
|
||||
bundesland: str
|
||||
jahr: int
|
||||
fach: Optional[str] # None = gilt fuer alle Faecher
|
||||
verfahren: str # See VerfahrenType enum
|
||||
|
||||
# Sichtbarkeitsregeln (Klausur)
|
||||
zk_visibility_mode: ZKVisibilityMode = ZKVisibilityMode.FULL
|
||||
eh_visibility_mode: EHVisibilityMode = EHVisibilityMode.SHARED
|
||||
|
||||
# EH-Quellen (Klausur)
|
||||
allow_teacher_uploaded_eh: bool = True
|
||||
allow_land_uploaded_eh: bool = True
|
||||
require_rights_confirmation_on_upload: bool = True
|
||||
require_dual_control_for_official_eh_update: bool = False
|
||||
|
||||
# Korrekturregeln (Klausur)
|
||||
third_correction_threshold: int = 4 # Notenpunkte Abweichung
|
||||
final_signoff_role: str = "fachvorsitz"
|
||||
|
||||
# Zeugnisregeln (Zeugnis)
|
||||
require_klassenlehrer_approval: bool = True
|
||||
require_schulleitung_signoff: bool = True
|
||||
allow_sekretariat_edit_after_approval: bool = False
|
||||
konferenz_protokoll_required: bool = True
|
||||
bemerkungen_require_review: bool = True
|
||||
fehlzeiten_auto_import: bool = True
|
||||
kopfnoten_enabled: bool = False
|
||||
versetzung_auto_calculate: bool = True
|
||||
|
||||
# Export & Anzeige
|
||||
quote_verbatim_allowed: bool = False # Amtliche Texte in UI
|
||||
export_template_id: str = "default"
|
||||
|
||||
# Zusaetzliche Flags
|
||||
flags: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
def is_exam_policy(self) -> bool:
|
||||
"""Pruefe ob diese Policy fuer Pruefungen ist."""
|
||||
return VerfahrenType.is_exam_type(self.verfahren)
|
||||
|
||||
def is_certificate_policy(self) -> bool:
|
||||
"""Pruefe ob diese Policy fuer Zeugnisse ist."""
|
||||
return VerfahrenType.is_certificate_type(self.verfahren)
|
||||
|
||||
def to_dict(self):
|
||||
d = asdict(self)
|
||||
d['zk_visibility_mode'] = self.zk_visibility_mode.value
|
||||
d['eh_visibility_mode'] = self.eh_visibility_mode.value
|
||||
d['created_at'] = self.created_at.isoformat()
|
||||
return d
|
||||
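# Illustrative sketch (not part of the module): a Vorabitur policy for one
# Bundesland/year/subject. Field values are made up for the example; all other
# fields fall back to their defaults.
_example_policy = PolicySet(
    id="ni-2026-deutsch-vorabi",
    bundesland="NI",
    jahr=2026,
    fach="Deutsch",
    verfahren=VerfahrenType.VORABITUR.value,
    zk_visibility_mode=ZKVisibilityMode.BLIND,   # second corrector works blind here
    third_correction_threshold=3,
)
# _example_policy.is_exam_policy() -> True; to_dict() yields JSON-ready values.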
|
||||
|
||||
@dataclass
|
||||
class RoleAssignment:
|
||||
"""
|
||||
Zuweisung einer Rolle zu einem User fuer eine spezifische Ressource.
|
||||
"""
|
||||
id: str
|
||||
user_id: str
|
||||
role: Role
|
||||
resource_type: ResourceType
|
||||
resource_id: str
|
||||
|
||||
# Optionale Einschraenkungen
|
||||
tenant_id: Optional[str] = None
|
||||
namespace_id: Optional[str] = None
|
||||
|
||||
# Gueltigkeit
|
||||
valid_from: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
valid_to: Optional[datetime] = None
|
||||
|
||||
# Metadaten
|
||||
granted_by: str = ""
|
||||
granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
revoked_at: Optional[datetime] = None
|
||||
|
||||
def is_active(self) -> bool:
|
||||
now = datetime.now(timezone.utc)
|
||||
if self.revoked_at:
|
||||
return False
|
||||
if self.valid_to and now > self.valid_to:
|
||||
return False
|
||||
return now >= self.valid_from
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'user_id': self.user_id,
|
||||
'role': self.role.value,
|
||||
'resource_type': self.resource_type.value,
|
||||
'resource_id': self.resource_id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'namespace_id': self.namespace_id,
|
||||
'valid_from': self.valid_from.isoformat(),
|
||||
'valid_to': self.valid_to.isoformat() if self.valid_to else None,
|
||||
'granted_by': self.granted_by,
|
||||
'granted_at': self.granted_at.isoformat(),
|
||||
'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
|
||||
'is_active': self.is_active()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class KeyShare:
|
||||
"""
|
||||
Berechtigung fuer einen User, auf verschluesselte Inhalte zuzugreifen.
|
||||
|
||||
Ein KeyShare ist KEIN Schluessel im Klartext, sondern eine
|
||||
Berechtigung in Verbindung mit Role Assignment.
|
||||
"""
|
||||
id: str
|
||||
user_id: str
|
||||
package_id: str
|
||||
|
||||
# Berechtigungsumfang
|
||||
permissions: Set[str] = field(default_factory=set)
|
||||
# z.B. {"read_original", "read_eh", "read_ek_outputs", "write_annotations"}
|
||||
|
||||
# Optionale Einschraenkungen
|
||||
scope: str = "full" # "full", "original_only", "eh_only", "outputs_only"
|
||||
|
||||
# Kette
|
||||
granted_by: str = ""
|
||||
granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
# Akzeptanz (fuer Invite-Flow)
|
||||
invite_token: Optional[str] = None
|
||||
accepted_at: Optional[datetime] = None
|
||||
|
||||
# Widerruf
|
||||
revoked_at: Optional[datetime] = None
|
||||
revoked_by: Optional[str] = None
|
||||
|
||||
def is_active(self) -> bool:
|
||||
return self.revoked_at is None and (
|
||||
self.invite_token is None or self.accepted_at is not None
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'user_id': self.user_id,
|
||||
'package_id': self.package_id,
|
||||
'permissions': list(self.permissions),
|
||||
'scope': self.scope,
|
||||
'granted_by': self.granted_by,
|
||||
'granted_at': self.granted_at.isoformat(),
|
||||
'invite_token': self.invite_token,
|
||||
'accepted_at': self.accepted_at.isoformat() if self.accepted_at else None,
|
||||
'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
|
||||
'is_active': self.is_active()
|
||||
}
|
||||
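# Illustrative sketch (not part of the module): invite-style KeyShare lifecycle.
# A share created with an invite token stays inactive until it is accepted.
_share = KeyShare(
    id=str(uuid.uuid4()),
    user_id="user-zk-1",
    package_id="pkg-2026-deutsch-lk",
    permissions={"read_original", "read_eh", "write_annotations"},
    scope="full",
    granted_by="user-ek-1",
    invite_token="one-time-token",
)
# _share.is_active() -> False (invite not yet accepted)
_share.accepted_at = datetime.now(timezone.utc)
# _share.is_active() -> True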
|
||||
|
||||
@dataclass
|
||||
class Tenant:
|
||||
"""
|
||||
Hoechste Isolationseinheit - typischerweise eine Schule.
|
||||
"""
|
||||
id: str
|
||||
name: str
|
||||
bundesland: str
|
||||
tenant_type: str = "school" # "school", "pruefungszentrum", "behoerde"
|
||||
|
||||
# Verschluesselung
|
||||
encryption_enabled: bool = True
|
||||
|
||||
# Metadaten
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
deleted_at: Optional[datetime] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'name': self.name,
|
||||
'bundesland': self.bundesland,
|
||||
'tenant_type': self.tenant_type,
|
||||
'encryption_enabled': self.encryption_enabled,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Namespace:
|
||||
"""
|
||||
Arbeitsraum innerhalb eines Tenants.
|
||||
z.B. "Abitur 2026 - Deutsch LK - Kurs 12a"
|
||||
"""
|
||||
id: str
|
||||
tenant_id: str
|
||||
name: str
|
||||
|
||||
# Kontext
|
||||
jahr: int
|
||||
fach: str
|
||||
kurs: Optional[str] = None
|
||||
pruefungsart: str = "abitur" # "abitur", "vorabitur"
|
||||
|
||||
# Policy
|
||||
policy_set_id: Optional[str] = None
|
||||
|
||||
# Metadaten
|
||||
created_by: str = ""
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
deleted_at: Optional[datetime] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'name': self.name,
|
||||
'jahr': self.jahr,
|
||||
'fach': self.fach,
|
||||
'kurs': self.kurs,
|
||||
'pruefungsart': self.pruefungsart,
|
||||
'policy_set_id': self.policy_set_id,
|
||||
'created_by': self.created_by,
|
||||
'created_at': self.created_at.isoformat()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExamPackage:
|
||||
"""
|
||||
Pruefungspaket - kompletter Satz Arbeiten mit allen Artefakten.
|
||||
"""
|
||||
id: str
|
||||
namespace_id: str
|
||||
tenant_id: str
|
||||
|
||||
name: str
|
||||
beschreibung: Optional[str] = None
|
||||
|
||||
# Workflow-Status
|
||||
status: str = "draft" # "draft", "in_progress", "locked", "signed_off"
|
||||
|
||||
# Beteiligte (Rollen werden separat zugewiesen)
|
||||
owner_id: str = "" # Typischerweise EK
|
||||
|
||||
# Verschluesselung
|
||||
encryption_key_id: Optional[str] = None
|
||||
|
||||
# Timestamps
|
||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
locked_at: Optional[datetime] = None
|
||||
signed_off_at: Optional[datetime] = None
|
||||
signed_off_by: Optional[str] = None
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
'id': self.id,
|
||||
'namespace_id': self.namespace_id,
|
||||
'tenant_id': self.tenant_id,
|
||||
'name': self.name,
|
||||
'beschreibung': self.beschreibung,
|
||||
'status': self.status,
|
||||
'owner_id': self.owner_id,
|
||||
'created_at': self.created_at.isoformat(),
|
||||
'locked_at': self.locked_at.isoformat() if self.locked_at else None,
|
||||
'signed_off_at': self.signed_off_at.isoformat() if self.signed_off_at else None,
|
||||
'signed_off_by': self.signed_off_by
|
||||
}

# Backward-compat shim -- module moved to compliance/rbac_types.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("compliance.rbac_types")

@@ -0,0 +1,6 @@
"""
training package — training API, simulation, export, TrOCR.

Backward-compatible re-exports: consumers can still use
``from training_api import ...`` etc. via the shim files in backend/.
"""
@@ -0,0 +1,31 @@
"""
Training API — barrel re-export.

The actual code lives in:
- training_models.py (enums, Pydantic models, in-memory state)
- training_simulation.py (simulate_training_progress, SSE generators)
- training_routes.py (FastAPI router + all endpoints)
"""

# Models & enums
|
||||
from .models import ( # noqa: F401
|
||||
TrainingStatus,
|
||||
ModelType,
|
||||
TrainingConfig,
|
||||
TrainingMetrics,
|
||||
TrainingJob,
|
||||
ModelVersion,
|
||||
DatasetStats,
|
||||
TrainingState,
|
||||
_state,
|
||||
)
|
||||
|
||||
# Simulation helpers
|
||||
from .simulation import ( # noqa: F401
|
||||
simulate_training_progress,
|
||||
training_metrics_generator,
|
||||
batch_ocr_progress_generator,
|
||||
)
|
||||
|
||||
# Router
|
||||
from .routes import router # noqa: F401
|
||||
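# Illustrative sketch (not part of the diff): mounting the assembled router in
# an application. The module path below is an assumption about the package
# layout; the barrel shown above may live at a different path.
from fastapi import FastAPI
from training.api import router as training_router  # assumed path

app = FastAPI()
app.include_router(training_router)  # serves /api/v1/admin/training/*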
@@ -0,0 +1,448 @@
"""
Training Export Service for OCR Labeling Data

Exports labeled OCR data in formats suitable for fine-tuning:
- TrOCR (Microsoft's Transformer-based OCR model)
- llama3.2-vision (Meta's Vision-Language Model)
- Generic JSONL format

DATENSCHUTZ/PRIVACY:
- Alle Daten bleiben lokal auf dem Mac Mini
- Keine Cloud-Uploads ohne explizite Zustimmung
- Export-Pfade sind konfigurierbar
"""

import os
|
||||
import json
|
||||
import base64
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import hashlib
|
||||
|
||||
# Export directory configuration
|
||||
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
|
||||
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
|
||||
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
|
||||
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingSample:
|
||||
"""A single training sample for OCR fine-tuning."""
|
||||
id: str
|
||||
image_path: str
|
||||
ground_truth: str
|
||||
ocr_text: Optional[str] = None
|
||||
ocr_confidence: Optional[float] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExportResult:
|
||||
"""Result of a training data export."""
|
||||
export_format: str
|
||||
export_path: str
|
||||
sample_count: int
|
||||
batch_id: str
|
||||
created_at: datetime
|
||||
manifest_path: str
|
||||
|
||||
|
||||
class TrOCRExporter:
|
||||
"""
|
||||
Export training data for TrOCR fine-tuning.
|
||||
|
||||
TrOCR expects:
|
||||
- Image files (PNG/JPG)
|
||||
- A CSV/TSV file with: image_path, text
|
||||
- Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}
|
||||
|
||||
We use the JSONL format for flexibility.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = TROCR_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in TrOCR format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
export_data.append({
|
||||
"file_name": image_ref,
|
||||
"text": sample.ground_truth,
|
||||
"id": sample.id,
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "train.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "trocr",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data": "train.jsonl",
|
||||
"images": "images/",
|
||||
},
|
||||
"model_config": {
|
||||
"base_model": "microsoft/trocr-base-handwritten",
|
||||
"task": "handwriting-recognition",
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="trocr",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
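# Illustrative sketch (not part of the module): exporting two labeled crops for
# TrOCR fine-tuning. Paths and the export directory are hypothetical; each
# train.jsonl line comes out as
# {"file_name": "<path>", "text": "<ground truth>", "id": "<id>"}.
_samples = [
    TrainingSample(id="s1", image_path="/data/crops/s1.png", ground_truth="Die Katze"),
    TrainingSample(id="s2", image_path="/data/crops/s2.png", ground_truth="lag im Garten"),
]
_result = TrOCRExporter(export_path="/tmp/ocr-exports/trocr").export(
    _samples, batch_id="demo_batch", copy_images=False
)
# _result.sample_count -> 2; _result.manifest_path points at manifest.json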
|
||||
|
||||
class LlamaVisionExporter:
|
||||
"""
|
||||
Export training data for llama3.2-vision fine-tuning.
|
||||
|
||||
Llama Vision fine-tuning expects:
|
||||
- JSONL format with base64-encoded images or image URLs
|
||||
- Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}
|
||||
|
||||
We create a supervised fine-tuning dataset.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def _encode_image_base64(self, image_path: str) -> Optional[str]:
|
||||
"""Encode image to base64."""
|
||||
try:
|
||||
with open(image_path, 'rb') as f:
|
||||
return base64.b64encode(f.read()).decode('utf-8')
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
include_base64: bool = False,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in Llama Vision fine-tuning format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
include_base64: Whether to include base64-encoded images in JSONL
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# OCR instruction prompt
|
||||
system_prompt = (
|
||||
"Du bist ein OCR-Experte für deutsche Handschrift. "
|
||||
"Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
|
||||
)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
# Build message format
|
||||
user_content = [
|
||||
{"type": "image_url", "image_url": {"url": image_ref}},
|
||||
{"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
|
||||
]
|
||||
|
||||
# Optionally include base64
|
||||
if include_base64:
|
||||
b64 = self._encode_image_base64(sample.image_path)
|
||||
if b64:
|
||||
ext = Path(sample.image_path).suffix.lower().replace('.', '')
|
||||
mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
|
||||
user_content[0] = {
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:{mime};base64,{b64}"}
|
||||
}
|
||||
|
||||
export_data.append({
|
||||
"id": sample.id,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_content},
|
||||
{"role": "assistant", "content": sample.ground_truth},
|
||||
],
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "train.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "llama_vision",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data": "train.jsonl",
|
||||
"images": "images/",
|
||||
},
|
||||
"model_config": {
|
||||
"base_model": "llama3.2-vision:11b",
|
||||
"task": "handwriting-ocr",
|
||||
"system_prompt": system_prompt,
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="llama_vision",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
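# Illustrative sketch (not part of the module): the shape of one exported record,
# matching the messages layout built in export() above (values are examples).
_example_record = {
    "id": "s1",
    "messages": [
        {"role": "system", "content": "Du bist ein OCR-Experte für deutsche Handschrift. "
                                      "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "images/s1.png"}},
            {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
        ]},
        {"role": "assistant", "content": "Die Katze lag im Garten"},
    ],
}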
|
||||
|
||||
class GenericExporter:
|
||||
"""
|
||||
Export training data in a generic JSONL format.
|
||||
|
||||
This format is compatible with most ML frameworks and can be
|
||||
easily converted to other formats.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in generic JSONL format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
export_data.append({
|
||||
"id": sample.id,
|
||||
"image_path": image_ref,
|
||||
"ground_truth": sample.ground_truth,
|
||||
"ocr_text": sample.ocr_text,
|
||||
"ocr_confidence": sample.ocr_confidence,
|
||||
"metadata": sample.metadata or {},
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "data.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Also write as single JSON for convenience
|
||||
json_path = os.path.join(batch_path, "data.json")
|
||||
with open(json_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(export_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "generic",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data_jsonl": "data.jsonl",
|
||||
"data_json": "data.json",
|
||||
"images": "images/",
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="generic",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
|
||||
|
||||
class TrainingExportService:
|
||||
"""
|
||||
Main service for exporting OCR labeling data to various training formats.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.trocr_exporter = TrOCRExporter()
|
||||
self.llama_vision_exporter = LlamaVisionExporter()
|
||||
self.generic_exporter = GenericExporter()
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
export_format: str,
|
||||
batch_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export training samples in the specified format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
export_format: 'trocr', 'llama_vision', or 'generic'
|
||||
batch_id: Optional batch ID (generated if not provided)
|
||||
**kwargs: Additional format-specific options
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
if not batch_id:
|
||||
batch_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
if export_format == "trocr":
|
||||
return self.trocr_exporter.export(samples, batch_id, **kwargs)
|
||||
elif export_format == "llama_vision":
|
||||
return self.llama_vision_exporter.export(samples, batch_id, **kwargs)
|
||||
elif export_format == "generic":
|
||||
return self.generic_exporter.export(samples, batch_id, **kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unknown export format: {export_format}")
|
||||
|
||||
def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
|
||||
"""
|
||||
List all available exports.
|
||||
|
||||
Args:
|
||||
export_format: Optional filter by format
|
||||
|
||||
Returns:
|
||||
List of export manifests
|
||||
"""
|
||||
exports = []
|
||||
|
||||
paths_to_check = []
|
||||
if export_format is None or export_format == "trocr":
|
||||
paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
|
||||
if export_format is None or export_format == "llama_vision":
|
||||
paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
|
||||
if export_format is None or export_format == "generic":
|
||||
paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))
|
||||
|
||||
for base_path, fmt in paths_to_check:
|
||||
if not os.path.exists(base_path):
|
||||
continue
|
||||
for batch_dir in os.listdir(base_path):
|
||||
manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
|
||||
if os.path.exists(manifest_path):
|
||||
with open(manifest_path, 'r') as f:
|
||||
manifest = json.load(f)
|
||||
manifest["export_path"] = os.path.join(base_path, batch_dir)
|
||||
exports.append(manifest)
|
||||
|
||||
return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
_export_service: Optional[TrainingExportService] = None
|
||||
|
||||
|
||||
def get_training_export_service() -> TrainingExportService:
|
||||
"""Get or create the training export service singleton."""
|
||||
global _export_service
|
||||
if _export_service is None:
|
||||
_export_service = TrainingExportService()
|
||||
return _export_service
|
||||
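# Illustrative sketch (not part of the module): one call path through the
# service. The singleton dispatches on export_format and auto-generates a
# timestamp batch id when none is given; sample values are hypothetical.
_svc = get_training_export_service()
_export = _svc.export(
    samples=[TrainingSample(id="s1", image_path="/data/crops/s1.png", ground_truth="Die Katze")],
    export_format="llama_vision",
    include_base64=False,   # forwarded to LlamaVisionExporter.export via **kwargs
    copy_images=False,
)
# _export.batch_id defaults to a UTC timestamp such as "20260131_120000".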
@@ -0,0 +1,118 @@
"""
Training API — enums, request/response models, and in-memory state.
"""

import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ENUMS
|
||||
# ============================================================================
|
||||
|
||||
class TrainingStatus(str, Enum):
|
||||
QUEUED = "queued"
|
||||
PREPARING = "preparing"
|
||||
TRAINING = "training"
|
||||
VALIDATING = "validating"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
PAUSED = "paused"
|
||||
CANCELLED = "cancelled"
|
||||
|
||||
|
||||
class ModelType(str, Enum):
|
||||
ZEUGNIS = "zeugnis"
|
||||
KLAUSUR = "klausur"
|
||||
GENERAL = "general"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# REQUEST/RESPONSE MODELS
|
||||
# ============================================================================
|
||||
|
||||
class TrainingConfig(BaseModel):
|
||||
"""Configuration for a training job."""
|
||||
name: str = Field(..., description="Name for the training job")
|
||||
model_type: ModelType = Field(ModelType.ZEUGNIS, description="Type of model to train")
|
||||
bundeslaender: List[str] = Field(..., description="List of Bundesland codes to include")
|
||||
batch_size: int = Field(16, ge=1, le=128)
|
||||
learning_rate: float = Field(0.00005, ge=0.000001, le=0.1)
|
||||
epochs: int = Field(10, ge=1, le=100)
|
||||
warmup_steps: int = Field(500, ge=0, le=10000)
|
||||
weight_decay: float = Field(0.01, ge=0, le=1)
|
||||
gradient_accumulation: int = Field(4, ge=1, le=32)
|
||||
mixed_precision: bool = Field(True, description="Use FP16 mixed precision training")
|
||||
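# Illustrative sketch (not part of the module): a config as it would be POSTed
# to /api/v1/admin/training/jobs (values are examples within the Field bounds).
_example_config = TrainingConfig(
    name="zeugnis-v2",
    bundeslaender=["NI", "NW"],
    batch_size=32,
    learning_rate=3e-5,
    epochs=5,
)
# Remaining fields (warmup_steps, weight_decay, ...) fall back to their defaults.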
|
||||
|
||||
class TrainingMetrics(BaseModel):
|
||||
"""Metrics from a training job."""
|
||||
precision: float = 0.0
|
||||
recall: float = 0.0
|
||||
f1_score: float = 0.0
|
||||
accuracy: float = 0.0
|
||||
loss_history: List[float] = []
|
||||
val_loss_history: List[float] = []
|
||||
|
||||
|
||||
class TrainingJob(BaseModel):
|
||||
"""A training job with full details."""
|
||||
id: str
|
||||
name: str
|
||||
model_type: ModelType
|
||||
status: TrainingStatus
|
||||
progress: float
|
||||
current_epoch: int
|
||||
total_epochs: int
|
||||
loss: float
|
||||
val_loss: float
|
||||
learning_rate: float
|
||||
documents_processed: int
|
||||
total_documents: int
|
||||
started_at: Optional[datetime]
|
||||
estimated_completion: Optional[datetime]
|
||||
completed_at: Optional[datetime]
|
||||
error_message: Optional[str]
|
||||
metrics: TrainingMetrics
|
||||
config: TrainingConfig
|
||||
|
||||
|
||||
class ModelVersion(BaseModel):
|
||||
"""A trained model version."""
|
||||
id: str
|
||||
job_id: str
|
||||
version: str
|
||||
model_type: ModelType
|
||||
created_at: datetime
|
||||
metrics: TrainingMetrics
|
||||
is_active: bool
|
||||
size_mb: float
|
||||
bundeslaender: List[str]
|
||||
|
||||
|
||||
class DatasetStats(BaseModel):
|
||||
"""Statistics about the training dataset."""
|
||||
total_documents: int
|
||||
total_chunks: int
|
||||
training_allowed: int
|
||||
by_bundesland: Dict[str, int]
|
||||
by_doc_type: Dict[str, int]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# IN-MEMORY STATE (Replace with database in production)
|
||||
# ============================================================================
|
||||
|
||||
@dataclass
|
||||
class TrainingState:
|
||||
"""Global training state."""
|
||||
jobs: Dict[str, dict] = field(default_factory=dict)
|
||||
model_versions: Dict[str, dict] = field(default_factory=dict)
|
||||
active_job_id: Optional[str] = None
|
||||
|
||||
|
||||
_state = TrainingState()
|
||||
@@ -0,0 +1,303 @@
"""
Training API — FastAPI route handlers.
"""

import uuid
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from .models import (
|
||||
TrainingStatus,
|
||||
TrainingConfig,
|
||||
_state,
|
||||
)
|
||||
from .simulation import (
|
||||
simulate_training_progress,
|
||||
training_metrics_generator,
|
||||
batch_ocr_progress_generator,
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/training", tags=["Training"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TRAINING JOBS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/jobs", response_model=List[dict])
|
||||
async def list_training_jobs():
|
||||
"""Get all training jobs."""
|
||||
return list(_state.jobs.values())
|
||||
|
||||
|
||||
@router.get("/jobs/{job_id}", response_model=dict)
|
||||
async def get_training_job(job_id: str):
|
||||
"""Get details for a specific training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
return _state.jobs[job_id]
|
||||
|
||||
|
||||
@router.post("/jobs", response_model=dict)
|
||||
async def create_training_job(config: TrainingConfig, background_tasks: BackgroundTasks):
|
||||
"""Create and start a new training job."""
|
||||
# Check if there's already an active job
|
||||
if _state.active_job_id:
|
||||
active_job = _state.jobs.get(_state.active_job_id)
|
||||
if active_job and active_job["status"] in [
|
||||
TrainingStatus.TRAINING.value,
|
||||
TrainingStatus.PREPARING.value,
|
||||
]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Another training job is already running"
|
||||
)
|
||||
|
||||
# Create job
|
||||
job_id = str(uuid.uuid4())
|
||||
job = {
|
||||
"id": job_id,
|
||||
"name": config.name,
|
||||
"model_type": config.model_type.value,
|
||||
"status": TrainingStatus.QUEUED.value,
|
||||
"progress": 0,
|
||||
"current_epoch": 0,
|
||||
"total_epochs": config.epochs,
|
||||
"loss": 1.0,
|
||||
"val_loss": 1.0,
|
||||
"learning_rate": config.learning_rate,
|
||||
"documents_processed": 0,
|
||||
"total_documents": len(config.bundeslaender) * 50, # Estimate
|
||||
"started_at": None,
|
||||
"estimated_completion": None,
|
||||
"completed_at": None,
|
||||
"error_message": None,
|
||||
"metrics": {
|
||||
"precision": 0.0,
|
||||
"recall": 0.0,
|
||||
"f1_score": 0.0,
|
||||
"accuracy": 0.0,
|
||||
"loss_history": [],
|
||||
"val_loss_history": [],
|
||||
},
|
||||
"config": config.dict(),
|
||||
}
|
||||
|
||||
_state.jobs[job_id] = job
|
||||
_state.active_job_id = job_id
|
||||
|
||||
# Start training in background
|
||||
background_tasks.add_task(simulate_training_progress, job_id)
|
||||
|
||||
return {"id": job_id, "status": "queued", "message": "Training job created"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/pause", response_model=dict)
|
||||
async def pause_training_job(job_id: str):
|
||||
"""Pause a running training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] != TrainingStatus.TRAINING.value:
|
||||
raise HTTPException(status_code=400, detail="Job is not running")
|
||||
|
||||
job["status"] = TrainingStatus.PAUSED.value
|
||||
return {"success": True, "message": "Training paused"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/resume", response_model=dict)
|
||||
async def resume_training_job(job_id: str, background_tasks: BackgroundTasks):
|
||||
"""Resume a paused training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] != TrainingStatus.PAUSED.value:
|
||||
raise HTTPException(status_code=400, detail="Job is not paused")
|
||||
|
||||
job["status"] = TrainingStatus.TRAINING.value
|
||||
_state.active_job_id = job_id
|
||||
background_tasks.add_task(simulate_training_progress, job_id)
|
||||
|
||||
return {"success": True, "message": "Training resumed"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/cancel", response_model=dict)
|
||||
async def cancel_training_job(job_id: str):
|
||||
"""Cancel a training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
job["status"] = TrainingStatus.CANCELLED.value
|
||||
job["completed_at"] = datetime.now().isoformat()
|
||||
|
||||
if _state.active_job_id == job_id:
|
||||
_state.active_job_id = None
|
||||
|
||||
return {"success": True, "message": "Training cancelled"}
|
||||
|
||||
|
||||
@router.delete("/jobs/{job_id}", response_model=dict)
|
||||
async def delete_training_job(job_id: str):
|
||||
"""Delete a training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] == TrainingStatus.TRAINING.value:
|
||||
raise HTTPException(status_code=400, detail="Cannot delete running job")
|
||||
|
||||
del _state.jobs[job_id]
|
||||
return {"success": True, "message": "Job deleted"}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MODEL VERSIONS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/models", response_model=List[dict])
|
||||
async def list_model_versions():
|
||||
"""Get all trained model versions."""
|
||||
return list(_state.model_versions.values())
|
||||
|
||||
|
||||
@router.get("/models/{version_id}", response_model=dict)
|
||||
async def get_model_version(version_id: str):
|
||||
"""Get details for a specific model version."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
return _state.model_versions[version_id]
|
||||
|
||||
|
||||
@router.post("/models/{version_id}/activate", response_model=dict)
|
||||
async def activate_model_version(version_id: str):
|
||||
"""Set a model version as active."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
|
||||
# Deactivate all other versions of same type
|
||||
model = _state.model_versions[version_id]
|
||||
for v in _state.model_versions.values():
|
||||
if v["model_type"] == model["model_type"]:
|
||||
v["is_active"] = False
|
||||
|
||||
model["is_active"] = True
|
||||
return {"success": True, "message": "Model activated"}
|
||||
|
||||
|
||||
@router.delete("/models/{version_id}", response_model=dict)
|
||||
async def delete_model_version(version_id: str):
|
||||
"""Delete a model version."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
|
||||
model = _state.model_versions[version_id]
|
||||
if model["is_active"]:
|
||||
raise HTTPException(status_code=400, detail="Cannot delete active model")
|
||||
|
||||
del _state.model_versions[version_id]
|
||||
return {"success": True, "message": "Model deleted"}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DATASET STATS & STATUS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/dataset/stats", response_model=dict)
|
||||
async def get_dataset_stats():
|
||||
"""Get statistics about the training dataset."""
|
||||
from metrics_db import get_zeugnis_stats
|
||||
|
||||
zeugnis_stats = await get_zeugnis_stats()
|
||||
|
||||
return {
|
||||
"total_documents": zeugnis_stats.get("total_documents", 0),
|
||||
"total_chunks": zeugnis_stats.get("total_documents", 0) * 12,
|
||||
"training_allowed": zeugnis_stats.get("training_allowed_documents", 0),
|
||||
"by_bundesland": {
|
||||
bl["bundesland"]: bl.get("doc_count", 0)
|
||||
for bl in zeugnis_stats.get("per_bundesland", [])
|
||||
},
|
||||
"by_doc_type": {
|
||||
"verordnung": 150,
|
||||
"schulordnung": 80,
|
||||
"handreichung": 45,
|
||||
"erlass": 30,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@router.get("/status", response_model=dict)
|
||||
async def get_training_status():
|
||||
"""Get overall training system status."""
|
||||
active_job = None
|
||||
if _state.active_job_id and _state.active_job_id in _state.jobs:
|
||||
active_job = _state.jobs[_state.active_job_id]
|
||||
|
||||
return {
|
||||
"is_training": _state.active_job_id is not None and active_job is not None and
|
||||
active_job["status"] == TrainingStatus.TRAINING.value,
|
||||
"active_job_id": _state.active_job_id,
|
||||
"total_jobs": len(_state.jobs),
|
||||
"completed_jobs": sum(
|
||||
1 for j in _state.jobs.values()
|
||||
if j["status"] == TrainingStatus.COMPLETED.value
|
||||
),
|
||||
"failed_jobs": sum(
|
||||
1 for j in _state.jobs.values()
|
||||
if j["status"] == TrainingStatus.FAILED.value
|
||||
),
|
||||
"model_versions": len(_state.model_versions),
|
||||
"active_models": sum(1 for m in _state.model_versions.values() if m["is_active"]),
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SSE ENDPOINTS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/metrics/stream")
|
||||
async def stream_training_metrics(job_id: str, request: Request):
|
||||
"""
|
||||
SSE endpoint for streaming training metrics.
|
||||
|
||||
Streams real-time training progress for a specific job.
|
||||
"""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
return StreamingResponse(
|
||||
training_metrics_generator(job_id, request),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no"
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@router.get("/ocr/stream")
|
||||
async def stream_batch_ocr(images_count: int, request: Request):
|
||||
"""
|
||||
SSE endpoint for streaming batch OCR progress.
|
||||
|
||||
Simulates batch OCR processing with progress updates.
|
||||
"""
|
||||
if images_count < 1 or images_count > 100:
|
||||
raise HTTPException(status_code=400, detail="images_count must be between 1 and 100")
|
||||
|
||||
return StreamingResponse(
|
||||
batch_ocr_progress_generator(images_count, request),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no"
|
||||
}
|
||||
)
|
||||
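# Illustrative sketch (not part of the module): consuming the metrics SSE stream
# from a client. httpx is assumed to be available and the base URL is an example;
# each event arrives as a "data: {json}" line followed by a blank line.
import json

import httpx

def follow_training(job_id: str, base_url: str = "http://localhost:8000") -> None:
    url = f"{base_url}/api/v1/admin/training/metrics/stream"
    with httpx.stream("GET", url, params={"job_id": job_id}, timeout=None) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                event = json.loads(line[len("data: "):])
                print(event["status"], round(event["progress"], 1))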
@@ -0,0 +1,190 @@
"""
Training API — simulation helper and SSE generators.
"""

import json
|
||||
import uuid
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .models import TrainingStatus, _state
|
||||
|
||||
|
||||
async def simulate_training_progress(job_id: str):
|
||||
"""Simulate training progress (replace with actual training logic)."""
|
||||
if job_id not in _state.jobs:
|
||||
return
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
    job["status"] = TrainingStatus.TRAINING.value
    job["started_at"] = datetime.now().isoformat()

    total_steps = job["total_epochs"] * 100  # Simulate 100 steps per epoch
    current_step = 0

    while current_step < total_steps and job["status"] == TrainingStatus.TRAINING.value:
        # Update progress
        progress = (current_step / total_steps) * 100
        current_epoch = current_step // 100 + 1

        # Simulate decreasing loss
        base_loss = 0.8 * (1 - progress / 100) + 0.1
        loss = base_loss + (0.05 * (0.5 - (current_step % 100) / 100))
        val_loss = loss * 1.1

        # Update job state
        job["progress"] = progress
        job["current_epoch"] = min(current_epoch, job["total_epochs"])
        job["loss"] = round(loss, 4)
        job["val_loss"] = round(val_loss, 4)
        job["documents_processed"] = int((progress / 100) * job["total_documents"])

        # Update metrics
        job["metrics"]["loss_history"].append(round(loss, 4))
        job["metrics"]["val_loss_history"].append(round(val_loss, 4))
        job["metrics"]["precision"] = round(0.5 + (progress / 200), 3)
        job["metrics"]["recall"] = round(0.45 + (progress / 200), 3)
        job["metrics"]["f1_score"] = round(0.47 + (progress / 200), 3)
        job["metrics"]["accuracy"] = round(0.6 + (progress / 250), 3)

        # Keep only last 50 history points
        if len(job["metrics"]["loss_history"]) > 50:
            job["metrics"]["loss_history"] = job["metrics"]["loss_history"][-50:]
            job["metrics"]["val_loss_history"] = job["metrics"]["val_loss_history"][-50:]

        # Estimate completion
        if progress > 0:
            elapsed = (datetime.now() - datetime.fromisoformat(job["started_at"])).total_seconds()
            remaining = (elapsed / progress) * (100 - progress)
            job["estimated_completion"] = (datetime.now() + timedelta(seconds=remaining)).isoformat()

        current_step += 1
        await asyncio.sleep(0.5)  # Simulate work

    # Mark as completed
    if job["status"] == TrainingStatus.TRAINING.value:
        job["status"] = TrainingStatus.COMPLETED.value
        job["progress"] = 100
        job["completed_at"] = datetime.now().isoformat()

        # Create model version
        version_id = str(uuid.uuid4())
        _state.model_versions[version_id] = {
            "id": version_id,
            "job_id": job_id,
            "version": f"v{len(_state.model_versions) + 1}.0",
            "model_type": job["model_type"],
            "created_at": datetime.now().isoformat(),
            "metrics": job["metrics"],
            "is_active": True,
            "size_mb": 245.7,
            "bundeslaender": job["config"]["bundeslaender"],
        }

    _state.active_job_id = None


async def training_metrics_generator(job_id: str, request):
    """
    SSE generator for streaming training metrics.

    Yields JSON-encoded training status updates every 500ms.
    """
    while True:
        # Check if client disconnected
        if await request.is_disconnected():
            break

        # Get job status
        if job_id not in _state.jobs:
            yield f"data: {json.dumps({'error': 'Job not found'})}\n\n"
            break

        job = _state.jobs[job_id]

        # Build metrics response
        metrics_data = {
            "job_id": job["id"],
            "status": job["status"],
            "progress": job["progress"],
            "current_epoch": job["current_epoch"],
            "total_epochs": job["total_epochs"],
            "current_step": int(job["progress"] * job["total_epochs"]),
            "total_steps": job["total_epochs"] * 100,
            "elapsed_time_ms": 0,
            "estimated_remaining_ms": 0,
            "metrics": {
                "loss": job["loss"],
                "val_loss": job["val_loss"],
                "accuracy": job["metrics"]["accuracy"],
                "learning_rate": job["learning_rate"]
            },
            "history": [
                {
                    "epoch": i + 1,
                    "step": (i + 1) * 10,
                    "loss": loss,
                    "val_loss": job["metrics"]["val_loss_history"][i] if i < len(job["metrics"]["val_loss_history"]) else None,
                    "learning_rate": job["learning_rate"],
                    "timestamp": 0
                }
                for i, loss in enumerate(job["metrics"]["loss_history"][-50:])
            ]
        }

        # Calculate elapsed time
        if job["started_at"]:
            started = datetime.fromisoformat(job["started_at"])
            metrics_data["elapsed_time_ms"] = int((datetime.now() - started).total_seconds() * 1000)

        # Calculate remaining time
        if job["estimated_completion"]:
            estimated = datetime.fromisoformat(job["estimated_completion"])
            metrics_data["estimated_remaining_ms"] = max(0, int((estimated - datetime.now()).total_seconds() * 1000))

        # Send SSE event
        yield f"data: {json.dumps(metrics_data)}\n\n"

        # Check if job completed
        if job["status"] in [TrainingStatus.COMPLETED.value, TrainingStatus.FAILED.value, TrainingStatus.CANCELLED.value]:
            break

        # Wait before next update
        await asyncio.sleep(0.5)


async def batch_ocr_progress_generator(images_count: int, request):
    """
    SSE generator for batch OCR progress simulation.

    In production, this would integrate with actual OCR processing.
    """
    import random

    for i in range(images_count):
        # Check if client disconnected
        if await request.is_disconnected():
            break

        # Simulate processing time
        await asyncio.sleep(random.uniform(0.3, 0.8))

        progress_data = {
            "type": "progress",
            "current": i + 1,
            "total": images_count,
            "progress_percent": ((i + 1) / images_count) * 100,
            "elapsed_ms": (i + 1) * 500,
            "estimated_remaining_ms": (images_count - i - 1) * 500,
            "result": {
                "text": f"Sample recognized text for image {i + 1}",
                "confidence": round(random.uniform(0.7, 0.98), 2),
                "processing_time_ms": random.randint(200, 600),
                "from_cache": random.random() < 0.2
            }
        }

        yield f"data: {json.dumps(progress_data)}\n\n"

    # Send completion event
    yield f"data: {json.dumps({'type': 'complete', 'total_time_ms': images_count * 500, 'processed_count': images_count})}\n\n"
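For reference, a minimal client for the metrics stream produced by the generator above. This is a sketch rather than part of the commit: it assumes the backend listens on localhost:8000 and that httpx is installed; the endpoint path and the event fields come from the code in this diff.

import asyncio
import json

import httpx  # assumed dependency; any streaming HTTP client works


async def follow_training_metrics(job_id: str) -> None:
    """Print progress updates from /metrics/stream until the job reaches a terminal state."""
    url = "http://localhost:8000/api/v1/admin/training/metrics/stream"
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("GET", url, params={"job_id": job_id}) as response:
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue  # skip the blank separator lines between SSE events
                event = json.loads(line[len("data: "):])
                if "error" in event:
                    print(event["error"])
                    break
                print(event["status"], round(event["progress"], 1), event["metrics"]["loss"])
                if event["status"] in ("completed", "failed", "cancelled"):
                    break

# asyncio.run(follow_training_metrics("<job-id>"))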
@@ -0,0 +1,261 @@
"""
TrOCR API - REST endpoints for TrOCR handwriting OCR.

Provides:
- /ocr/trocr - Single image OCR
- /ocr/trocr/batch - Batch image processing
- /ocr/trocr/status - Model status
- /ocr/trocr/cache - Cache statistics
"""
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
import json
|
||||
import logging
|
||||
|
||||
from services.trocr_service import (
|
||||
run_trocr_ocr_enhanced,
|
||||
run_trocr_batch,
|
||||
run_trocr_batch_stream,
|
||||
get_model_status,
|
||||
get_cache_stats,
|
||||
preload_trocr_model,
|
||||
OCRResult,
|
||||
BatchOCRResult
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/ocr/trocr", tags=["TrOCR"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MODELS
|
||||
# =============================================================================
|
||||
|
||||
class TrOCRResponse(BaseModel):
|
||||
"""Response model for single image OCR."""
|
||||
text: str = Field(..., description="Extracted text")
|
||||
confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence")
|
||||
processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
|
||||
model: str = Field(..., description="Model used for OCR")
|
||||
has_lora_adapter: bool = Field(False, description="Whether LoRA adapter was used")
|
||||
from_cache: bool = Field(False, description="Whether result was from cache")
|
||||
image_hash: str = Field("", description="SHA256 hash of image (first 16 chars)")
|
||||
word_count: int = Field(0, description="Number of words detected")
|
||||
|
||||
|
||||
class BatchOCRResponse(BaseModel):
|
||||
"""Response model for batch OCR."""
|
||||
results: List[TrOCRResponse] = Field(..., description="Individual OCR results")
|
||||
total_time_ms: int = Field(..., ge=0, description="Total processing time")
|
||||
processed_count: int = Field(..., ge=0, description="Number of images processed")
|
||||
cached_count: int = Field(0, description="Number of results from cache")
|
||||
error_count: int = Field(0, description="Number of errors")
|
||||
|
||||
|
||||
class ModelStatusResponse(BaseModel):
|
||||
"""Response model for model status."""
|
||||
status: str = Field(..., description="Model status: available, not_installed")
|
||||
is_loaded: bool = Field(..., description="Whether model is loaded in memory")
|
||||
model_name: Optional[str] = Field(None, description="Name of loaded model")
|
||||
device: Optional[str] = Field(None, description="Device model is running on")
|
||||
loaded_at: Optional[str] = Field(None, description="ISO timestamp when model was loaded")
|
||||
|
||||
|
||||
class CacheStatsResponse(BaseModel):
|
||||
"""Response model for cache statistics."""
|
||||
size: int = Field(..., ge=0, description="Current cache size")
|
||||
max_size: int = Field(..., ge=0, description="Maximum cache size")
|
||||
ttl_seconds: int = Field(..., ge=0, description="Cache TTL in seconds")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/status", response_model=ModelStatusResponse)
|
||||
async def get_trocr_status():
|
||||
"""
|
||||
Get TrOCR model status.
|
||||
|
||||
Returns information about whether the model is loaded and available.
|
||||
"""
|
||||
return get_model_status()
|
||||
|
||||
|
||||
@router.get("/cache", response_model=CacheStatsResponse)
|
||||
async def get_trocr_cache_stats():
|
||||
"""
|
||||
Get TrOCR cache statistics.
|
||||
|
||||
Returns information about the OCR result cache.
|
||||
"""
|
||||
return get_cache_stats()
|
||||
|
||||
|
||||
@router.post("/preload")
|
||||
async def preload_model(handwritten: bool = Query(True, description="Load handwritten model")):
|
||||
"""
|
||||
Preload TrOCR model into memory.
|
||||
|
||||
This speeds up the first OCR request by loading the model ahead of time.
|
||||
"""
|
||||
success = preload_trocr_model(handwritten=handwritten)
|
||||
if success:
|
||||
return {"status": "success", "message": "Model preloaded successfully"}
|
||||
else:
|
||||
raise HTTPException(status_code=500, detail="Failed to preload model")
|
||||
|
||||
|
||||
@router.post("", response_model=TrOCRResponse)
|
||||
async def run_trocr(
|
||||
file: UploadFile = File(..., description="Image file to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split image into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on a single image.
|
||||
|
||||
Supports PNG, JPG, and other common image formats.
|
||||
"""
|
||||
# Validate file type
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail="File must be an image")
|
||||
|
||||
try:
|
||||
image_data = await file.read()
|
||||
|
||||
result = await run_trocr_ocr_enhanced(
|
||||
image_data,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
)
|
||||
|
||||
return TrOCRResponse(
|
||||
text=result.text,
|
||||
confidence=result.confidence,
|
||||
processing_time_ms=result.processing_time_ms,
|
||||
model=result.model,
|
||||
has_lora_adapter=result.has_lora_adapter,
|
||||
from_cache=result.from_cache,
|
||||
image_hash=result.image_hash,
|
||||
word_count=len(result.text.split()) if result.text else 0
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/batch", response_model=BatchOCRResponse)
|
||||
async def run_trocr_batch_endpoint(
|
||||
files: List[UploadFile] = File(..., description="Image files to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split images into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on multiple images.
|
||||
|
||||
Processes images sequentially and returns all results.
|
||||
"""
|
||||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
if len(files) > 50:
|
||||
raise HTTPException(status_code=400, detail="Maximum 50 images per batch")
|
||||
|
||||
try:
|
||||
images = []
|
||||
for file in files:
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail=f"File {file.filename} is not an image")
|
||||
images.append(await file.read())
|
||||
|
||||
batch_result = await run_trocr_batch(
|
||||
images,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
)
|
||||
|
||||
return BatchOCRResponse(
|
||||
results=[
|
||||
TrOCRResponse(
|
||||
text=r.text,
|
||||
confidence=r.confidence,
|
||||
processing_time_ms=r.processing_time_ms,
|
||||
model=r.model,
|
||||
has_lora_adapter=r.has_lora_adapter,
|
||||
from_cache=r.from_cache,
|
||||
image_hash=r.image_hash,
|
||||
word_count=len(r.text.split()) if r.text else 0
|
||||
)
|
||||
for r in batch_result.results
|
||||
],
|
||||
total_time_ms=batch_result.total_time_ms,
|
||||
processed_count=batch_result.processed_count,
|
||||
cached_count=batch_result.cached_count,
|
||||
error_count=batch_result.error_count
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR batch API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/batch/stream")
|
||||
async def run_trocr_batch_stream_endpoint(
|
||||
files: List[UploadFile] = File(..., description="Image files to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split images into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on multiple images with Server-Sent Events (SSE) progress updates.
|
||||
|
||||
Returns a stream of progress events as images are processed.
|
||||
"""
|
||||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
if len(files) > 50:
|
||||
raise HTTPException(status_code=400, detail="Maximum 50 images per batch")
|
||||
|
||||
try:
|
||||
images = []
|
||||
for file in files:
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail=f"File {file.filename} is not an image")
|
||||
images.append(await file.read())
|
||||
|
||||
async def event_generator():
|
||||
async for update in run_trocr_batch_stream(
|
||||
images,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
):
|
||||
yield f"data: {json.dumps(update)}\n\n"
|
||||
|
||||
return StreamingResponse(
|
||||
event_generator(),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive"
|
||||
}
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR stream API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
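The endpoints above are ordinary multipart uploads, so they are easy to drive from a script. A sketch only: the base URL and the httpx dependency are assumptions, while the path, query parameters, and response fields are taken from the router above.

import httpx  # assumed dependency


def recognize_page(image_path: str, base_url: str = "http://localhost:8000") -> dict:
    """Send one image to the single-image TrOCR endpoint and return the parsed response."""
    with open(image_path, "rb") as fh:
        response = httpx.post(
            f"{base_url}/api/v1/ocr/trocr",
            files={"file": (image_path, fh, "image/png")},
            params={"handwritten": True, "split_lines": True, "use_cache": True},
            timeout=120.0,
        )
    response.raise_for_status()
    return response.json()

# result = recognize_page("scan.png")
# print(result["text"], result["confidence"], result["from_cache"])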
@@ -1,31 +1,4 @@
"""
Training API — barrel re-export.

The actual code lives in:
- training_models.py (enums, Pydantic models, in-memory state)
- training_simulation.py (simulate_training_progress, SSE generators)
- training_routes.py (FastAPI router + all endpoints)
"""

# Models & enums
from training_models import (  # noqa: F401
    TrainingStatus,
    ModelType,
    TrainingConfig,
    TrainingMetrics,
    TrainingJob,
    ModelVersion,
    DatasetStats,
    TrainingState,
    _state,
)

# Simulation helpers
from training_simulation import (  # noqa: F401
    simulate_training_progress,
    training_metrics_generator,
    batch_ocr_progress_generator,
)

# Router
from training_routes import router  # noqa: F401
# Backward-compat shim -- module moved to training/api.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("training.api")
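Because the shim swaps the module object in sys.modules, the old flat import and the new package path resolve to the very same objects. A small sanity check, assuming backend/ is on sys.path so that both spellings are importable:

import training_api            # executes the shim, which redirects to training.api
from training.api import router

assert training_api.router is router
print(training_api.__name__)   # prints "training.api": the old name now aliases the new module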
@@ -1,448 +1,4 @@
"""
Training Export Service for OCR Labeling Data

Exports labeled OCR data in formats suitable for fine-tuning:
- TrOCR (Microsoft's Transformer-based OCR model)
- llama3.2-vision (Meta's Vision-Language Model)
- Generic JSONL format

DATENSCHUTZ/PRIVACY:
- All data stays local on the Mac Mini
- No cloud uploads without explicit consent
- Export paths are configurable
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import base64
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import hashlib
|
||||
|
||||
# Export directory configuration
|
||||
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
|
||||
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
|
||||
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
|
||||
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingSample:
|
||||
"""A single training sample for OCR fine-tuning."""
|
||||
id: str
|
||||
image_path: str
|
||||
ground_truth: str
|
||||
ocr_text: Optional[str] = None
|
||||
ocr_confidence: Optional[float] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExportResult:
|
||||
"""Result of a training data export."""
|
||||
export_format: str
|
||||
export_path: str
|
||||
sample_count: int
|
||||
batch_id: str
|
||||
created_at: datetime
|
||||
manifest_path: str
|
||||
|
||||
|
||||
class TrOCRExporter:
|
||||
"""
|
||||
Export training data for TrOCR fine-tuning.
|
||||
|
||||
TrOCR expects:
|
||||
- Image files (PNG/JPG)
|
||||
- A CSV/TSV file with: image_path, text
|
||||
- Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}
|
||||
|
||||
We use the JSONL format for flexibility.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = TROCR_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in TrOCR format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
export_data.append({
|
||||
"file_name": image_ref,
|
||||
"text": sample.ground_truth,
|
||||
"id": sample.id,
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "train.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "trocr",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data": "train.jsonl",
|
||||
"images": "images/",
|
||||
},
|
||||
"model_config": {
|
||||
"base_model": "microsoft/trocr-base-handwritten",
|
||||
"task": "handwriting-recognition",
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="trocr",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
|
||||
|
||||
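To make the layout described in TrOCRExporter concrete, here is an illustrative train.jsonl record plus a small loader sketch; the id, file name, and text values are made up, only the keys come from the export code above.

import json

# One line of <batch>/train.jsonl as written by TrOCRExporter.export() (illustrative values):
# {"file_name": "images/3f2a.png", "text": "Anna arbeitet sorgfältig.", "id": "3f2a"}

def load_trocr_batch(jsonl_path: str) -> list[tuple[str, str]]:
    """Read an exported batch back into (image_path, text) pairs, e.g. to build a fine-tuning dataset."""
    pairs = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            pairs.append((item["file_name"], item["text"]))
    return pairs
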
class LlamaVisionExporter:
|
||||
"""
|
||||
Export training data for llama3.2-vision fine-tuning.
|
||||
|
||||
Llama Vision fine-tuning expects:
|
||||
- JSONL format with base64-encoded images or image URLs
|
||||
- Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}
|
||||
|
||||
We create a supervised fine-tuning dataset.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def _encode_image_base64(self, image_path: str) -> Optional[str]:
|
||||
"""Encode image to base64."""
|
||||
try:
|
||||
with open(image_path, 'rb') as f:
|
||||
return base64.b64encode(f.read()).decode('utf-8')
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
include_base64: bool = False,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in Llama Vision fine-tuning format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
include_base64: Whether to include base64-encoded images in JSONL
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# OCR instruction prompt
|
||||
system_prompt = (
|
||||
"Du bist ein OCR-Experte für deutsche Handschrift. "
|
||||
"Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
|
||||
)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
# Build message format
|
||||
user_content = [
|
||||
{"type": "image_url", "image_url": {"url": image_ref}},
|
||||
{"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
|
||||
]
|
||||
|
||||
# Optionally include base64
|
||||
if include_base64:
|
||||
b64 = self._encode_image_base64(sample.image_path)
|
||||
if b64:
|
||||
ext = Path(sample.image_path).suffix.lower().replace('.', '')
|
||||
mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
|
||||
user_content[0] = {
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:{mime};base64,{b64}"}
|
||||
}
|
||||
|
||||
export_data.append({
|
||||
"id": sample.id,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_content},
|
||||
{"role": "assistant", "content": sample.ground_truth},
|
||||
],
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "train.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "llama_vision",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data": "train.jsonl",
|
||||
"images": "images/",
|
||||
},
|
||||
"model_config": {
|
||||
"base_model": "llama3.2-vision:11b",
|
||||
"task": "handwriting-ocr",
|
||||
"system_prompt": system_prompt,
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="llama_vision",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
|
||||
|
||||
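The corresponding Llama Vision record, shown as a single illustrative example; the values are made up and the structure simply mirrors what export() assembles above.

example_record = {
    "id": "3f2a",
    "messages": [
        {"role": "system", "content": "Du bist ein OCR-Experte für deutsche Handschrift. ..."},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "images/3f2a.png"}},
            {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
        ]},
        {"role": "assistant", "content": "Anna arbeitet sorgfältig."},
    ],
}
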
class GenericExporter:
|
||||
"""
|
||||
Export training data in a generic JSONL format.
|
||||
|
||||
This format is compatible with most ML frameworks and can be
|
||||
easily converted to other formats.
|
||||
"""
|
||||
|
||||
def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
|
||||
self.export_path = export_path
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
batch_id: str,
|
||||
copy_images: bool = True,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export samples in generic JSONL format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
batch_id: Unique batch identifier
|
||||
copy_images: Whether to copy images to export directory
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
batch_path = os.path.join(self.export_path, batch_id)
|
||||
images_path = os.path.join(batch_path, "images")
|
||||
os.makedirs(images_path, exist_ok=True)
|
||||
|
||||
# Export data
|
||||
export_data = []
|
||||
for sample in samples:
|
||||
# Copy image if requested
|
||||
if copy_images and os.path.exists(sample.image_path):
|
||||
image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
|
||||
dest_path = os.path.join(images_path, image_filename)
|
||||
shutil.copy2(sample.image_path, dest_path)
|
||||
image_ref = f"images/{image_filename}"
|
||||
else:
|
||||
image_ref = sample.image_path
|
||||
|
||||
export_data.append({
|
||||
"id": sample.id,
|
||||
"image_path": image_ref,
|
||||
"ground_truth": sample.ground_truth,
|
||||
"ocr_text": sample.ocr_text,
|
||||
"ocr_confidence": sample.ocr_confidence,
|
||||
"metadata": sample.metadata or {},
|
||||
})
|
||||
|
||||
# Write JSONL file
|
||||
jsonl_path = os.path.join(batch_path, "data.jsonl")
|
||||
with open(jsonl_path, 'w', encoding='utf-8') as f:
|
||||
for item in export_data:
|
||||
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||
|
||||
# Also write as single JSON for convenience
|
||||
json_path = os.path.join(batch_path, "data.json")
|
||||
with open(json_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(export_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Write manifest
|
||||
manifest = {
|
||||
"format": "generic",
|
||||
"version": "1.0",
|
||||
"batch_id": batch_id,
|
||||
"sample_count": len(samples),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
"files": {
|
||||
"data_jsonl": "data.jsonl",
|
||||
"data_json": "data.json",
|
||||
"images": "images/",
|
||||
},
|
||||
}
|
||||
manifest_path = os.path.join(batch_path, "manifest.json")
|
||||
with open(manifest_path, 'w') as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
return ExportResult(
|
||||
export_format="generic",
|
||||
export_path=batch_path,
|
||||
sample_count=len(samples),
|
||||
batch_id=batch_id,
|
||||
created_at=datetime.utcnow(),
|
||||
manifest_path=manifest_path,
|
||||
)
|
||||
|
||||
|
||||
class TrainingExportService:
|
||||
"""
|
||||
Main service for exporting OCR labeling data to various training formats.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.trocr_exporter = TrOCRExporter()
|
||||
self.llama_vision_exporter = LlamaVisionExporter()
|
||||
self.generic_exporter = GenericExporter()
|
||||
|
||||
def export(
|
||||
self,
|
||||
samples: List[TrainingSample],
|
||||
export_format: str,
|
||||
batch_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> ExportResult:
|
||||
"""
|
||||
Export training samples in the specified format.
|
||||
|
||||
Args:
|
||||
samples: List of training samples
|
||||
export_format: 'trocr', 'llama_vision', or 'generic'
|
||||
batch_id: Optional batch ID (generated if not provided)
|
||||
**kwargs: Additional format-specific options
|
||||
|
||||
Returns:
|
||||
ExportResult with export details
|
||||
"""
|
||||
if not batch_id:
|
||||
batch_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
if export_format == "trocr":
|
||||
return self.trocr_exporter.export(samples, batch_id, **kwargs)
|
||||
elif export_format == "llama_vision":
|
||||
return self.llama_vision_exporter.export(samples, batch_id, **kwargs)
|
||||
elif export_format == "generic":
|
||||
return self.generic_exporter.export(samples, batch_id, **kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unknown export format: {export_format}")
|
||||
|
||||
def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
|
||||
"""
|
||||
List all available exports.
|
||||
|
||||
Args:
|
||||
export_format: Optional filter by format
|
||||
|
||||
Returns:
|
||||
List of export manifests
|
||||
"""
|
||||
exports = []
|
||||
|
||||
paths_to_check = []
|
||||
if export_format is None or export_format == "trocr":
|
||||
paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
|
||||
if export_format is None or export_format == "llama_vision":
|
||||
paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
|
||||
if export_format is None or export_format == "generic":
|
||||
paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))
|
||||
|
||||
for base_path, fmt in paths_to_check:
|
||||
if not os.path.exists(base_path):
|
||||
continue
|
||||
for batch_dir in os.listdir(base_path):
|
||||
manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
|
||||
if os.path.exists(manifest_path):
|
||||
with open(manifest_path, 'r') as f:
|
||||
manifest = json.load(f)
|
||||
manifest["export_path"] = os.path.join(base_path, batch_dir)
|
||||
exports.append(manifest)
|
||||
|
||||
return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
_export_service: Optional[TrainingExportService] = None
|
||||
|
||||
|
||||
def get_training_export_service() -> TrainingExportService:
|
||||
"""Get or create the training export service singleton."""
|
||||
global _export_service
|
||||
if _export_service is None:
|
||||
_export_service = TrainingExportService()
|
||||
return _export_service
|
||||
# Backward-compat shim -- module moved to training/export_service.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("training.export_service")
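A minimal end-to-end sketch of using the service from application code; the paths and ground-truth strings are illustrative, and the flat import is assumed to keep working through the backward-compat shim above.

from training_export_service import TrainingSample, get_training_export_service

samples = [
    TrainingSample(id="s1", image_path="/data/crops/s1.png", ground_truth="Sehr gut gemacht."),
    TrainingSample(id="s2", image_path="/data/crops/s2.png", ground_truth="Bitte leserlicher schreiben."),
]

service = get_training_export_service()
result = service.export(samples, export_format="trocr")   # batch_id defaults to a timestamp
print(result.export_path, result.sample_count, result.manifest_path)

for manifest in service.list_exports(export_format="trocr"):   # newest first
    print(manifest["batch_id"], manifest["sample_count"])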
@@ -1,118 +1,4 @@
"""
Training API — enums, request/response models, and in-memory state.
"""

import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
from enum import Enum
from dataclasses import dataclass, field
from pydantic import BaseModel, Field


# ============================================================================
# ENUMS
# ============================================================================

class TrainingStatus(str, Enum):
    QUEUED = "queued"
    PREPARING = "preparing"
    TRAINING = "training"
    VALIDATING = "validating"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
    CANCELLED = "cancelled"


class ModelType(str, Enum):
    ZEUGNIS = "zeugnis"
    KLAUSUR = "klausur"
    GENERAL = "general"


# ============================================================================
# REQUEST/RESPONSE MODELS
# ============================================================================

class TrainingConfig(BaseModel):
    """Configuration for a training job."""
    name: str = Field(..., description="Name for the training job")
    model_type: ModelType = Field(ModelType.ZEUGNIS, description="Type of model to train")
    bundeslaender: List[str] = Field(..., description="List of Bundesland codes to include")
    batch_size: int = Field(16, ge=1, le=128)
    learning_rate: float = Field(0.00005, ge=0.000001, le=0.1)
    epochs: int = Field(10, ge=1, le=100)
    warmup_steps: int = Field(500, ge=0, le=10000)
    weight_decay: float = Field(0.01, ge=0, le=1)
    gradient_accumulation: int = Field(4, ge=1, le=32)
    mixed_precision: bool = Field(True, description="Use FP16 mixed precision training")


class TrainingMetrics(BaseModel):
    """Metrics from a training job."""
    precision: float = 0.0
    recall: float = 0.0
    f1_score: float = 0.0
    accuracy: float = 0.0
    loss_history: List[float] = []
    val_loss_history: List[float] = []


class TrainingJob(BaseModel):
    """A training job with full details."""
    id: str
    name: str
    model_type: ModelType
    status: TrainingStatus
    progress: float
    current_epoch: int
    total_epochs: int
    loss: float
    val_loss: float
    learning_rate: float
    documents_processed: int
    total_documents: int
    started_at: Optional[datetime]
    estimated_completion: Optional[datetime]
    completed_at: Optional[datetime]
    error_message: Optional[str]
    metrics: TrainingMetrics
    config: TrainingConfig


class ModelVersion(BaseModel):
    """A trained model version."""
    id: str
    job_id: str
    version: str
    model_type: ModelType
    created_at: datetime
    metrics: TrainingMetrics
    is_active: bool
    size_mb: float
    bundeslaender: List[str]


class DatasetStats(BaseModel):
    """Statistics about the training dataset."""
    total_documents: int
    total_chunks: int
    training_allowed: int
    by_bundesland: Dict[str, int]
    by_doc_type: Dict[str, int]


# ============================================================================
# IN-MEMORY STATE (Replace with database in production)
# ============================================================================

@dataclass
class TrainingState:
    """Global training state."""
    jobs: Dict[str, dict] = field(default_factory=dict)
    model_versions: Dict[str, dict] = field(default_factory=dict)
    active_job_id: Optional[str] = None


_state = TrainingState()
# Backward-compat shim -- module moved to training/models.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("training.models")
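A short sketch of how these models behave in practice; the printed defaults follow from the Field declarations above, and the ValidationError branch shows the ge=1 bound on batch_size rejecting bad input.

from pydantic import ValidationError

from training_models import TrainingConfig, _state

cfg = TrainingConfig(name="zeugnis-niedersachsen", bundeslaender=["NI", "HB"])
print(cfg.model_type.value, cfg.epochs, cfg.learning_rate)   # zeugnis 10 5e-05

try:
    TrainingConfig(name="bad", bundeslaender=["NI"], batch_size=0)
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["loc"])

print(_state.jobs, _state.active_job_id)   # {} None until the routes create a job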
@@ -1,303 +1,4 @@
"""
Training API — FastAPI route handlers.
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from training_models import (
|
||||
TrainingStatus,
|
||||
TrainingConfig,
|
||||
_state,
|
||||
)
|
||||
from training_simulation import (
|
||||
simulate_training_progress,
|
||||
training_metrics_generator,
|
||||
batch_ocr_progress_generator,
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/training", tags=["Training"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TRAINING JOBS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/jobs", response_model=List[dict])
|
||||
async def list_training_jobs():
|
||||
"""Get all training jobs."""
|
||||
return list(_state.jobs.values())
|
||||
|
||||
|
||||
@router.get("/jobs/{job_id}", response_model=dict)
|
||||
async def get_training_job(job_id: str):
|
||||
"""Get details for a specific training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
return _state.jobs[job_id]
|
||||
|
||||
|
||||
@router.post("/jobs", response_model=dict)
|
||||
async def create_training_job(config: TrainingConfig, background_tasks: BackgroundTasks):
|
||||
"""Create and start a new training job."""
|
||||
# Check if there's already an active job
|
||||
if _state.active_job_id:
|
||||
active_job = _state.jobs.get(_state.active_job_id)
|
||||
if active_job and active_job["status"] in [
|
||||
TrainingStatus.TRAINING.value,
|
||||
TrainingStatus.PREPARING.value,
|
||||
]:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Another training job is already running"
|
||||
)
|
||||
|
||||
# Create job
|
||||
job_id = str(uuid.uuid4())
|
||||
job = {
|
||||
"id": job_id,
|
||||
"name": config.name,
|
||||
"model_type": config.model_type.value,
|
||||
"status": TrainingStatus.QUEUED.value,
|
||||
"progress": 0,
|
||||
"current_epoch": 0,
|
||||
"total_epochs": config.epochs,
|
||||
"loss": 1.0,
|
||||
"val_loss": 1.0,
|
||||
"learning_rate": config.learning_rate,
|
||||
"documents_processed": 0,
|
||||
"total_documents": len(config.bundeslaender) * 50, # Estimate
|
||||
"started_at": None,
|
||||
"estimated_completion": None,
|
||||
"completed_at": None,
|
||||
"error_message": None,
|
||||
"metrics": {
|
||||
"precision": 0.0,
|
||||
"recall": 0.0,
|
||||
"f1_score": 0.0,
|
||||
"accuracy": 0.0,
|
||||
"loss_history": [],
|
||||
"val_loss_history": [],
|
||||
},
|
||||
"config": config.dict(),
|
||||
}
|
||||
|
||||
_state.jobs[job_id] = job
|
||||
_state.active_job_id = job_id
|
||||
|
||||
# Start training in background
|
||||
background_tasks.add_task(simulate_training_progress, job_id)
|
||||
|
||||
return {"id": job_id, "status": "queued", "message": "Training job created"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/pause", response_model=dict)
|
||||
async def pause_training_job(job_id: str):
|
||||
"""Pause a running training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] != TrainingStatus.TRAINING.value:
|
||||
raise HTTPException(status_code=400, detail="Job is not running")
|
||||
|
||||
job["status"] = TrainingStatus.PAUSED.value
|
||||
return {"success": True, "message": "Training paused"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/resume", response_model=dict)
|
||||
async def resume_training_job(job_id: str, background_tasks: BackgroundTasks):
|
||||
"""Resume a paused training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] != TrainingStatus.PAUSED.value:
|
||||
raise HTTPException(status_code=400, detail="Job is not paused")
|
||||
|
||||
job["status"] = TrainingStatus.TRAINING.value
|
||||
_state.active_job_id = job_id
|
||||
background_tasks.add_task(simulate_training_progress, job_id)
|
||||
|
||||
return {"success": True, "message": "Training resumed"}
|
||||
|
||||
|
||||
@router.post("/jobs/{job_id}/cancel", response_model=dict)
|
||||
async def cancel_training_job(job_id: str):
|
||||
"""Cancel a training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
job["status"] = TrainingStatus.CANCELLED.value
|
||||
job["completed_at"] = datetime.now().isoformat()
|
||||
|
||||
if _state.active_job_id == job_id:
|
||||
_state.active_job_id = None
|
||||
|
||||
return {"success": True, "message": "Training cancelled"}
|
||||
|
||||
|
||||
@router.delete("/jobs/{job_id}", response_model=dict)
|
||||
async def delete_training_job(job_id: str):
|
||||
"""Delete a training job."""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
job = _state.jobs[job_id]
|
||||
if job["status"] == TrainingStatus.TRAINING.value:
|
||||
raise HTTPException(status_code=400, detail="Cannot delete running job")
|
||||
|
||||
del _state.jobs[job_id]
|
||||
return {"success": True, "message": "Job deleted"}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MODEL VERSIONS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/models", response_model=List[dict])
|
||||
async def list_model_versions():
|
||||
"""Get all trained model versions."""
|
||||
return list(_state.model_versions.values())
|
||||
|
||||
|
||||
@router.get("/models/{version_id}", response_model=dict)
|
||||
async def get_model_version(version_id: str):
|
||||
"""Get details for a specific model version."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
return _state.model_versions[version_id]
|
||||
|
||||
|
||||
@router.post("/models/{version_id}/activate", response_model=dict)
|
||||
async def activate_model_version(version_id: str):
|
||||
"""Set a model version as active."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
|
||||
# Deactivate all other versions of same type
|
||||
model = _state.model_versions[version_id]
|
||||
for v in _state.model_versions.values():
|
||||
if v["model_type"] == model["model_type"]:
|
||||
v["is_active"] = False
|
||||
|
||||
model["is_active"] = True
|
||||
return {"success": True, "message": "Model activated"}
|
||||
|
||||
|
||||
@router.delete("/models/{version_id}", response_model=dict)
|
||||
async def delete_model_version(version_id: str):
|
||||
"""Delete a model version."""
|
||||
if version_id not in _state.model_versions:
|
||||
raise HTTPException(status_code=404, detail="Model version not found")
|
||||
|
||||
model = _state.model_versions[version_id]
|
||||
if model["is_active"]:
|
||||
raise HTTPException(status_code=400, detail="Cannot delete active model")
|
||||
|
||||
del _state.model_versions[version_id]
|
||||
return {"success": True, "message": "Model deleted"}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DATASET STATS & STATUS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/dataset/stats", response_model=dict)
|
||||
async def get_dataset_stats():
|
||||
"""Get statistics about the training dataset."""
|
||||
from metrics_db import get_zeugnis_stats
|
||||
|
||||
zeugnis_stats = await get_zeugnis_stats()
|
||||
|
||||
return {
|
||||
"total_documents": zeugnis_stats.get("total_documents", 0),
|
||||
"total_chunks": zeugnis_stats.get("total_documents", 0) * 12,
|
||||
"training_allowed": zeugnis_stats.get("training_allowed_documents", 0),
|
||||
"by_bundesland": {
|
||||
bl["bundesland"]: bl.get("doc_count", 0)
|
||||
for bl in zeugnis_stats.get("per_bundesland", [])
|
||||
},
|
||||
"by_doc_type": {
|
||||
"verordnung": 150,
|
||||
"schulordnung": 80,
|
||||
"handreichung": 45,
|
||||
"erlass": 30,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@router.get("/status", response_model=dict)
|
||||
async def get_training_status():
|
||||
"""Get overall training system status."""
|
||||
active_job = None
|
||||
if _state.active_job_id and _state.active_job_id in _state.jobs:
|
||||
active_job = _state.jobs[_state.active_job_id]
|
||||
|
||||
return {
|
||||
"is_training": _state.active_job_id is not None and active_job is not None and
|
||||
active_job["status"] == TrainingStatus.TRAINING.value,
|
||||
"active_job_id": _state.active_job_id,
|
||||
"total_jobs": len(_state.jobs),
|
||||
"completed_jobs": sum(
|
||||
1 for j in _state.jobs.values()
|
||||
if j["status"] == TrainingStatus.COMPLETED.value
|
||||
),
|
||||
"failed_jobs": sum(
|
||||
1 for j in _state.jobs.values()
|
||||
if j["status"] == TrainingStatus.FAILED.value
|
||||
),
|
||||
"model_versions": len(_state.model_versions),
|
||||
"active_models": sum(1 for m in _state.model_versions.values() if m["is_active"]),
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SSE ENDPOINTS
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/metrics/stream")
|
||||
async def stream_training_metrics(job_id: str, request: Request):
|
||||
"""
|
||||
SSE endpoint for streaming training metrics.
|
||||
|
||||
Streams real-time training progress for a specific job.
|
||||
"""
|
||||
if job_id not in _state.jobs:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
return StreamingResponse(
|
||||
training_metrics_generator(job_id, request),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no"
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@router.get("/ocr/stream")
|
||||
async def stream_batch_ocr(images_count: int, request: Request):
|
||||
"""
|
||||
SSE endpoint for streaming batch OCR progress.
|
||||
|
||||
Simulates batch OCR processing with progress updates.
|
||||
"""
|
||||
if images_count < 1 or images_count > 100:
|
||||
raise HTTPException(status_code=400, detail="images_count must be between 1 and 100")
|
||||
|
||||
return StreamingResponse(
|
||||
batch_ocr_progress_generator(images_count, request),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no"
|
||||
}
|
||||
)
|
||||
# Backward-compat shim -- module moved to training/routes.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("training.routes")
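Putting the routes together, a client-side sketch for starting and inspecting a job over HTTP. The base URL and the httpx dependency are assumptions; the paths and response fields come from the handlers above, including the 409 returned when another job is already running.

import httpx  # assumed dependency

BASE = "http://localhost:8000/api/v1/admin/training"

payload = {"name": "zeugnis-v1", "model_type": "zeugnis", "bundeslaender": ["NI", "NW"], "epochs": 5}
created = httpx.post(f"{BASE}/jobs", json=payload)
created.raise_for_status()
info = created.json()
print(info["id"], info["status"])                # <uuid> queued

job = httpx.get(f"{BASE}/jobs/{info['id']}").json()
print(job["progress"], job["total_documents"])   # 0 100 (two Bundesland codes * 50)

print(httpx.get(f"{BASE}/status").json()["is_training"])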
@@ -1,190 +1,4 @@
"""
Training API — simulation helper and SSE generators.
"""
# Backward-compat shim -- module moved to training/simulation.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("training.simulation")
@@ -1,261 +1,4 @@
|
||||
"""
|
||||
TrOCR API - REST endpoints for TrOCR handwriting OCR.
|
||||
|
||||
Provides:
|
||||
- /ocr/trocr - Single image OCR
|
||||
- /ocr/trocr/batch - Batch image processing
|
||||
- /ocr/trocr/status - Model status
|
||||
- /ocr/trocr/cache - Cache statistics
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
import json
|
||||
import logging
|
||||
|
||||
from services.trocr_service import (
|
||||
run_trocr_ocr_enhanced,
|
||||
run_trocr_batch,
|
||||
run_trocr_batch_stream,
|
||||
get_model_status,
|
||||
get_cache_stats,
|
||||
preload_trocr_model,
|
||||
OCRResult,
|
||||
BatchOCRResult
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/ocr/trocr", tags=["TrOCR"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MODELS
|
||||
# =============================================================================
|
||||
|
||||
class TrOCRResponse(BaseModel):
|
||||
"""Response model for single image OCR."""
|
||||
text: str = Field(..., description="Extracted text")
|
||||
confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence")
|
||||
processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
|
||||
model: str = Field(..., description="Model used for OCR")
|
||||
has_lora_adapter: bool = Field(False, description="Whether LoRA adapter was used")
|
||||
from_cache: bool = Field(False, description="Whether result was from cache")
|
||||
image_hash: str = Field("", description="SHA256 hash of image (first 16 chars)")
|
||||
word_count: int = Field(0, description="Number of words detected")
|
||||
|
||||
|
||||
class BatchOCRResponse(BaseModel):
|
||||
"""Response model for batch OCR."""
|
||||
results: List[TrOCRResponse] = Field(..., description="Individual OCR results")
|
||||
total_time_ms: int = Field(..., ge=0, description="Total processing time")
|
||||
processed_count: int = Field(..., ge=0, description="Number of images processed")
|
||||
cached_count: int = Field(0, description="Number of results from cache")
|
||||
error_count: int = Field(0, description="Number of errors")
|
||||
|
||||
|
||||
class ModelStatusResponse(BaseModel):
|
||||
"""Response model for model status."""
|
||||
status: str = Field(..., description="Model status: available, not_installed")
|
||||
is_loaded: bool = Field(..., description="Whether model is loaded in memory")
|
||||
model_name: Optional[str] = Field(None, description="Name of loaded model")
|
||||
device: Optional[str] = Field(None, description="Device model is running on")
|
||||
loaded_at: Optional[str] = Field(None, description="ISO timestamp when model was loaded")
|
||||
|
||||
|
||||
class CacheStatsResponse(BaseModel):
|
||||
"""Response model for cache statistics."""
|
||||
size: int = Field(..., ge=0, description="Current cache size")
|
||||
max_size: int = Field(..., ge=0, description="Maximum cache size")
|
||||
ttl_seconds: int = Field(..., ge=0, description="Cache TTL in seconds")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/status", response_model=ModelStatusResponse)
|
||||
async def get_trocr_status():
|
||||
"""
|
||||
Get TrOCR model status.
|
||||
|
||||
Returns information about whether the model is loaded and available.
|
||||
"""
|
||||
return get_model_status()
|
||||
|
||||
|
||||
@router.get("/cache", response_model=CacheStatsResponse)
|
||||
async def get_trocr_cache_stats():
|
||||
"""
|
||||
Get TrOCR cache statistics.
|
||||
|
||||
Returns information about the OCR result cache.
|
||||
"""
|
||||
return get_cache_stats()
|
||||
|
||||
|
||||
@router.post("/preload")
|
||||
async def preload_model(handwritten: bool = Query(True, description="Load handwritten model")):
|
||||
"""
|
||||
Preload TrOCR model into memory.
|
||||
|
||||
This speeds up the first OCR request by loading the model ahead of time.
|
||||
"""
|
||||
success = preload_trocr_model(handwritten=handwritten)
|
||||
if success:
|
||||
return {"status": "success", "message": "Model preloaded successfully"}
|
||||
else:
|
||||
raise HTTPException(status_code=500, detail="Failed to preload model")
|
||||
|
||||
|
||||
@router.post("", response_model=TrOCRResponse)
|
||||
async def run_trocr(
|
||||
file: UploadFile = File(..., description="Image file to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split image into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on a single image.
|
||||
|
||||
Supports PNG, JPG, and other common image formats.
|
||||
"""
|
||||
# Validate file type
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail="File must be an image")
|
||||
|
||||
try:
|
||||
image_data = await file.read()
|
||||
|
||||
result = await run_trocr_ocr_enhanced(
|
||||
image_data,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
)
|
||||
|
||||
return TrOCRResponse(
|
||||
text=result.text,
|
||||
confidence=result.confidence,
|
||||
processing_time_ms=result.processing_time_ms,
|
||||
model=result.model,
|
||||
has_lora_adapter=result.has_lora_adapter,
|
||||
from_cache=result.from_cache,
|
||||
image_hash=result.image_hash,
|
||||
word_count=len(result.text.split()) if result.text else 0
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
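# Usage sketch (illustrative, not part of this commit): calling the single-image
# endpoint from a client. The "/api/v1/trocr" prefix and the localhost base URL are
# assumptions -- adjust them to wherever this router is actually mounted.
import httpx

def ocr_single_image(path: str, base_url: str = "http://localhost:8000/api/v1/trocr") -> dict:
    """Send one image to the TrOCR endpoint and return the parsed JSON response."""
    with open(path, "rb") as fh:
        resp = httpx.post(
            base_url,
            params={"handwritten": True, "split_lines": True, "use_cache": True},
            files={"file": (path, fh, "image/png")},
            timeout=120.0,
        )
    resp.raise_for_status()
    return resp.json()  # TrOCRResponse fields: text, confidence, processing_time_ms, word_count, ...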
|
||||
|
||||
@router.post("/batch", response_model=BatchOCRResponse)
|
||||
async def run_trocr_batch_endpoint(
|
||||
files: List[UploadFile] = File(..., description="Image files to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split images into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on multiple images.
|
||||
|
||||
Processes images sequentially and returns all results.
|
||||
"""
|
||||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
if len(files) > 50:
|
||||
raise HTTPException(status_code=400, detail="Maximum 50 images per batch")
|
||||
|
||||
try:
|
||||
images = []
|
||||
for file in files:
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail=f"File {file.filename} is not an image")
|
||||
images.append(await file.read())
|
||||
|
||||
batch_result = await run_trocr_batch(
|
||||
images,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
)
|
||||
|
||||
return BatchOCRResponse(
|
||||
results=[
|
||||
TrOCRResponse(
|
||||
text=r.text,
|
||||
confidence=r.confidence,
|
||||
processing_time_ms=r.processing_time_ms,
|
||||
model=r.model,
|
||||
has_lora_adapter=r.has_lora_adapter,
|
||||
from_cache=r.from_cache,
|
||||
image_hash=r.image_hash,
|
||||
word_count=len(r.text.split()) if r.text else 0
|
||||
)
|
||||
for r in batch_result.results
|
||||
],
|
||||
total_time_ms=batch_result.total_time_ms,
|
||||
processed_count=batch_result.processed_count,
|
||||
cached_count=batch_result.cached_count,
|
||||
error_count=batch_result.error_count
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR batch API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/batch/stream")
|
||||
async def run_trocr_batch_stream_endpoint(
|
||||
files: List[UploadFile] = File(..., description="Image files to process"),
|
||||
handwritten: bool = Query(True, description="Use handwritten model"),
|
||||
split_lines: bool = Query(True, description="Split images into lines"),
|
||||
use_cache: bool = Query(True, description="Use result caching")
|
||||
):
|
||||
"""
|
||||
Run TrOCR on multiple images with Server-Sent Events (SSE) progress updates.
|
||||
|
||||
Returns a stream of progress events as images are processed.
|
||||
"""
|
||||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
if len(files) > 50:
|
||||
raise HTTPException(status_code=400, detail="Maximum 50 images per batch")
|
||||
|
||||
try:
|
||||
images = []
|
||||
for file in files:
|
||||
if not file.content_type or not file.content_type.startswith("image/"):
|
||||
raise HTTPException(status_code=400, detail=f"File {file.filename} is not an image")
|
||||
images.append(await file.read())
|
||||
|
||||
async def event_generator():
|
||||
async for update in run_trocr_batch_stream(
|
||||
images,
|
||||
handwritten=handwritten,
|
||||
split_lines=split_lines,
|
||||
use_cache=use_cache
|
||||
):
|
||||
yield f"data: {json.dumps(update)}\n\n"
|
||||
|
||||
return StreamingResponse(
|
||||
event_generator(),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive"
|
||||
}
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"TrOCR stream API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
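# Usage sketch (illustrative, not part of this commit): consuming the SSE progress
# stream from a client. URL prefix, port, and the shape of each event payload are
# assumptions based on the endpoint above.
import json
import httpx

def stream_batch_ocr(paths, base_url="http://localhost:8000/api/v1/trocr/batch/stream"):
    """Upload several images and print each progress event as it arrives."""
    files = [("files", (p, open(p, "rb"), "image/png")) for p in paths]
    try:
        with httpx.stream("POST", base_url, files=files, timeout=None) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if line.startswith("data: "):
                    print(json.loads(line[len("data: "):]))  # per-image update from run_trocr_batch_stream
    finally:
        for _, (_, fh, _) in files:
            fh.close()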

# Backward-compat shim -- module moved to training/trocr_api.py
import importlib as _importlib
import sys as _sys

_sys.modules[__name__] = _importlib.import_module("training.trocr_api")

@@ -0,0 +1,6 @@
"""
worksheet package — worksheet editor, NRU generator, cleanup.

Backward-compatible re-exports: consumers can still use
``from worksheet_editor_api import ...`` etc. via the shim files in backend/.
"""
@@ -0,0 +1,491 @@
"""
Worksheet Cleanup API - handwriting removal and layout reconstruction

Endpoints:
- POST /api/v1/worksheet/detect-handwriting - detects handwriting and returns a mask
- POST /api/v1/worksheet/remove-handwriting - removes handwriting from an image
- POST /api/v1/worksheet/reconstruct - reconstructs the layout as Fabric.js JSON
- POST /api/v1/worksheet/cleanup-pipeline - full pipeline (detection + removal + layout)

PRIVACY: All processing happens locally on the Mac Mini.
"""

import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.handwriting_detection import (
|
||||
detect_handwriting,
|
||||
detect_handwriting_regions,
|
||||
mask_to_png
|
||||
)
|
||||
from services.inpainting_service import (
|
||||
inpaint_image,
|
||||
remove_handwriting,
|
||||
InpaintingMethod,
|
||||
check_lama_available
|
||||
)
|
||||
from services.layout_reconstruction_service import (
|
||||
reconstruct_layout,
|
||||
layout_to_fabric_json,
|
||||
reconstruct_and_clean
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/worksheet", tags=["Worksheet Cleanup"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pydantic Models
|
||||
# =============================================================================
|
||||
|
||||
class DetectionResponse(BaseModel):
|
||||
has_handwriting: bool
|
||||
confidence: float
|
||||
handwriting_ratio: float
|
||||
detection_method: str
|
||||
mask_base64: Optional[str] = None
|
||||
|
||||
|
||||
class InpaintingResponse(BaseModel):
|
||||
success: bool
|
||||
method_used: str
|
||||
processing_time_ms: float
|
||||
image_base64: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class ReconstructionResponse(BaseModel):
|
||||
success: bool
|
||||
element_count: int
|
||||
page_width: int
|
||||
page_height: int
|
||||
fabric_json: dict
|
||||
table_count: int = 0
|
||||
|
||||
|
||||
class PipelineResponse(BaseModel):
|
||||
success: bool
|
||||
handwriting_detected: bool
|
||||
handwriting_removed: bool
|
||||
layout_reconstructed: bool
|
||||
cleaned_image_base64: Optional[str] = None
|
||||
fabric_json: Optional[dict] = None
|
||||
metadata: dict = {}
|
||||
|
||||
|
||||
class CapabilitiesResponse(BaseModel):
|
||||
opencv_available: bool = True
|
||||
lama_available: bool = False
|
||||
paddleocr_available: bool = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/capabilities")
|
||||
async def get_capabilities() -> CapabilitiesResponse:
|
||||
"""
|
||||
Get available cleanup capabilities on this server.
|
||||
"""
|
||||
# Check PaddleOCR
|
||||
paddleocr_available = False
|
||||
try:
|
||||
from hybrid_vocab_extractor import get_paddle_ocr
|
||||
ocr = get_paddle_ocr()
|
||||
paddleocr_available = ocr is not None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return CapabilitiesResponse(
|
||||
opencv_available=True,
|
||||
lama_available=check_lama_available(),
|
||||
paddleocr_available=paddleocr_available
|
||||
)
|
||||
|
||||
|
||||
@router.post("/detect-handwriting")
|
||||
async def detect_handwriting_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
return_mask: bool = Form(default=True),
|
||||
min_confidence: float = Form(default=0.3)
|
||||
) -> DetectionResponse:
|
||||
"""
|
||||
Detect handwriting in an image.
|
||||
|
||||
Args:
|
||||
image: Input image (PNG, JPG)
|
||||
return_mask: Whether to return the binary mask as base64
|
||||
min_confidence: Minimum confidence threshold
|
||||
|
||||
Returns:
|
||||
DetectionResponse with detection results and optional mask
|
||||
"""
|
||||
logger.info(f"Handwriting detection request: {image.filename}")
|
||||
|
||||
# Validate file type
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files (PNG, JPG) are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Detect handwriting
|
||||
result = detect_handwriting(image_bytes)
|
||||
|
||||
has_handwriting = (
|
||||
result.confidence >= min_confidence and
|
||||
result.handwriting_ratio > 0.005
|
||||
)
|
||||
|
||||
response = DetectionResponse(
|
||||
has_handwriting=has_handwriting,
|
||||
confidence=result.confidence,
|
||||
handwriting_ratio=result.handwriting_ratio,
|
||||
detection_method=result.detection_method
|
||||
)
|
||||
|
||||
if return_mask:
|
||||
mask_bytes = mask_to_png(result.mask)
|
||||
response.mask_base64 = base64.b64encode(mask_bytes).decode('utf-8')
|
||||
|
||||
logger.info(f"Detection complete: handwriting={has_handwriting}, "
|
||||
f"confidence={result.confidence:.2f}")
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Handwriting detection failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
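# Usage sketch (illustrative, not part of this commit): calling detect-handwriting and
# saving the returned mask. The base URL is an assumption.
import base64
import httpx

def detect(path: str, base_url: str = "http://localhost:8000/api/v1/worksheet") -> dict:
    """POST one scan and persist the mask PNG when it is returned."""
    with open(path, "rb") as fh:
        resp = httpx.post(
            f"{base_url}/detect-handwriting",
            files={"image": (path, fh, "image/png")},
            data={"return_mask": "true", "min_confidence": "0.3"},
            timeout=60.0,
        )
    resp.raise_for_status()
    payload = resp.json()
    if payload.get("mask_base64"):
        with open("handwriting_mask.png", "wb") as out:
            out.write(base64.b64decode(payload["mask_base64"]))
    return payload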
|
||||
|
||||
@router.post("/detect-handwriting/mask")
|
||||
async def get_handwriting_mask(
|
||||
image: UploadFile = File(...)
|
||||
) -> StreamingResponse:
|
||||
"""
|
||||
Get handwriting detection mask as PNG image.
|
||||
|
||||
Returns binary mask where white (255) = handwriting.
|
||||
"""
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
result = detect_handwriting(image_bytes)
|
||||
mask_bytes = mask_to_png(result.mask)
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(mask_bytes),
|
||||
media_type="image/png",
|
||||
headers={
|
||||
"Content-Disposition": "attachment; filename=handwriting_mask.png"
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Mask generation failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/remove-handwriting")
|
||||
async def remove_handwriting_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
mask: Optional[UploadFile] = File(default=None),
|
||||
method: str = Form(default="auto"),
|
||||
return_base64: bool = Form(default=False)
|
||||
):
|
||||
"""
|
||||
Remove handwriting from an image.
|
||||
|
||||
Args:
|
||||
image: Input image with handwriting
|
||||
mask: Optional pre-computed mask (if not provided, auto-detected)
|
||||
method: Inpainting method (auto, opencv_telea, opencv_ns, lama)
|
||||
return_base64: If True, return image as base64, else as file
|
||||
|
||||
Returns:
|
||||
Cleaned image (as PNG file or base64 in JSON)
|
||||
"""
|
||||
logger.info(f"Remove handwriting request: {image.filename}, method={method}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Get mask if provided
|
||||
mask_array = None
|
||||
if mask is not None:
|
||||
mask_bytes = await mask.read()
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
mask_img = Image.open(io.BytesIO(mask_bytes))
|
||||
mask_array = np.array(mask_img)
|
||||
|
||||
# Select inpainting method
|
||||
inpainting_method = InpaintingMethod.AUTO
|
||||
if method == "opencv_telea":
|
||||
inpainting_method = InpaintingMethod.OPENCV_TELEA
|
||||
elif method == "opencv_ns":
|
||||
inpainting_method = InpaintingMethod.OPENCV_NS
|
||||
elif method == "lama":
|
||||
inpainting_method = InpaintingMethod.LAMA
|
||||
|
||||
# Remove handwriting
|
||||
cleaned_bytes, metadata = remove_handwriting(
|
||||
image_bytes,
|
||||
mask=mask_array,
|
||||
method=inpainting_method
|
||||
)
|
||||
|
||||
if return_base64:
|
||||
return JSONResponse({
|
||||
"success": True,
|
||||
"image_base64": base64.b64encode(cleaned_bytes).decode('utf-8'),
|
||||
"metadata": metadata
|
||||
})
|
||||
else:
|
||||
return StreamingResponse(
|
||||
io.BytesIO(cleaned_bytes),
|
||||
media_type="image/png",
|
||||
headers={
|
||||
"Content-Disposition": "attachment; filename=cleaned.png",
|
||||
"X-Method-Used": metadata.get("method_used", "unknown"),
|
||||
"X-Processing-Time-Ms": str(metadata.get("processing_time_ms", 0))
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Handwriting removal failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
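# Usage sketch (illustrative, not part of this commit): removing handwriting and saving
# the cleaned scan. The base URL is an assumption; the mask is auto-detected server-side.
import httpx

def clean_scan(path: str, out_path: str = "cleaned.png",
               base_url: str = "http://localhost:8000/api/v1/worksheet") -> str:
    """Request inpainting with method=auto and write the returned PNG to disk."""
    with open(path, "rb") as fh:
        resp = httpx.post(
            f"{base_url}/remove-handwriting",
            files={"image": (path, fh, "image/png")},
            data={"method": "auto", "return_base64": "false"},
            timeout=300.0,
        )
    resp.raise_for_status()
    with open(out_path, "wb") as out:
        out.write(resp.content)  # raw PNG bytes (see the StreamingResponse branch above)
    return resp.headers.get("X-Method-Used", "unknown")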
|
||||
|
||||
@router.post("/reconstruct")
|
||||
async def reconstruct_layout_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
clean_handwriting: bool = Form(default=True),
|
||||
detect_tables: bool = Form(default=True)
|
||||
) -> ReconstructionResponse:
|
||||
"""
|
||||
Reconstruct worksheet layout and generate Fabric.js JSON.
|
||||
|
||||
Args:
|
||||
image: Input image (can contain handwriting)
|
||||
clean_handwriting: Whether to remove handwriting first
|
||||
detect_tables: Whether to detect table structures
|
||||
|
||||
Returns:
|
||||
ReconstructionResponse with Fabric.js JSON
|
||||
"""
|
||||
logger.info(f"Layout reconstruction request: {image.filename}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Run reconstruction pipeline
|
||||
if clean_handwriting:
|
||||
cleaned_bytes, layout = reconstruct_and_clean(image_bytes)
|
||||
else:
|
||||
layout = reconstruct_layout(image_bytes, detect_tables=detect_tables)
|
||||
|
||||
return ReconstructionResponse(
|
||||
success=True,
|
||||
element_count=len(layout.elements),
|
||||
page_width=layout.page_width,
|
||||
page_height=layout.page_height,
|
||||
fabric_json=layout.fabric_json,
|
||||
table_count=len(layout.table_regions)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Layout reconstruction failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/cleanup-pipeline")
|
||||
async def full_cleanup_pipeline(
|
||||
image: UploadFile = File(...),
|
||||
remove_hw: bool = Form(default=True, alias="remove_handwriting"),
|
||||
reconstruct: bool = Form(default=True),
|
||||
inpainting_method: str = Form(default="auto")
|
||||
) -> PipelineResponse:
|
||||
"""
|
||||
Full cleanup pipeline: detect, remove handwriting, reconstruct layout.
|
||||
|
||||
This is the recommended endpoint for processing filled worksheets.
|
||||
|
||||
Args:
|
||||
image: Input image (scan/photo of filled worksheet)
|
||||
remove_handwriting: Whether to remove detected handwriting
|
||||
reconstruct: Whether to reconstruct layout as Fabric.js JSON
|
||||
inpainting_method: Method for inpainting (auto, opencv_telea, opencv_ns, lama)
|
||||
|
||||
Returns:
|
||||
PipelineResponse with cleaned image and Fabric.js JSON
|
||||
"""
|
||||
logger.info(f"Full cleanup pipeline: {image.filename}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
metadata = {}
|
||||
|
||||
# Step 1: Detect handwriting
|
||||
detection = detect_handwriting(image_bytes)
|
||||
handwriting_detected = (
|
||||
detection.confidence >= 0.3 and
|
||||
detection.handwriting_ratio > 0.005
|
||||
)
|
||||
|
||||
metadata["detection"] = {
|
||||
"confidence": detection.confidence,
|
||||
"handwriting_ratio": detection.handwriting_ratio,
|
||||
"method": detection.detection_method
|
||||
}
|
||||
|
||||
# Step 2: Remove handwriting if requested and detected
|
||||
cleaned_bytes = image_bytes
|
||||
handwriting_removed = False
|
||||
|
||||
if remove_hw and handwriting_detected:
|
||||
method = InpaintingMethod.AUTO
|
||||
if inpainting_method == "opencv_telea":
|
||||
method = InpaintingMethod.OPENCV_TELEA
|
||||
elif inpainting_method == "opencv_ns":
|
||||
method = InpaintingMethod.OPENCV_NS
|
||||
elif inpainting_method == "lama":
|
||||
method = InpaintingMethod.LAMA
|
||||
|
||||
cleaned_bytes, inpaint_metadata = remove_handwriting(
|
||||
image_bytes,
|
||||
mask=detection.mask,
|
||||
method=method
|
||||
)
|
||||
handwriting_removed = inpaint_metadata.get("inpainting_performed", False)
|
||||
metadata["inpainting"] = inpaint_metadata
|
||||
|
||||
# Step 3: Reconstruct layout if requested
|
||||
fabric_json = None
|
||||
layout_reconstructed = False
|
||||
|
||||
if reconstruct:
|
||||
layout = reconstruct_layout(cleaned_bytes)
|
||||
fabric_json = layout.fabric_json
|
||||
layout_reconstructed = len(layout.elements) > 0
|
||||
metadata["layout"] = {
|
||||
"element_count": len(layout.elements),
|
||||
"table_count": len(layout.table_regions),
|
||||
"page_width": layout.page_width,
|
||||
"page_height": layout.page_height
|
||||
}
|
||||
|
||||
# Encode cleaned image as base64
|
||||
cleaned_base64 = base64.b64encode(cleaned_bytes).decode('utf-8')
|
||||
|
||||
logger.info(f"Pipeline complete: detected={handwriting_detected}, "
|
||||
f"removed={handwriting_removed}, layout={layout_reconstructed}")
|
||||
|
||||
return PipelineResponse(
|
||||
success=True,
|
||||
handwriting_detected=handwriting_detected,
|
||||
handwriting_removed=handwriting_removed,
|
||||
layout_reconstructed=layout_reconstructed,
|
||||
cleaned_image_base64=cleaned_base64,
|
||||
fabric_json=fabric_json,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cleanup pipeline failed: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
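# Usage sketch (illustrative, not part of this commit): running the full pipeline and
# unpacking the base64-encoded page plus the Fabric.js JSON. The base URL is an assumption.
import base64
import httpx

def run_cleanup_pipeline(path: str, base_url: str = "http://localhost:8000/api/v1/worksheet") -> dict:
    """Detect, inpaint, and reconstruct in one request; persist the cleaned page."""
    with open(path, "rb") as fh:
        resp = httpx.post(
            f"{base_url}/cleanup-pipeline",
            files={"image": (path, fh, "image/png")},
            data={"remove_handwriting": "true", "reconstruct": "true", "inpainting_method": "auto"},
            timeout=600.0,
        )
    resp.raise_for_status()
    result = resp.json()
    if result.get("cleaned_image_base64"):
        with open("cleaned_page.png", "wb") as out:
            out.write(base64.b64decode(result["cleaned_image_base64"]))
    return result.get("fabric_json") or {}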
|
||||
|
||||
@router.post("/preview-cleanup")
|
||||
async def preview_cleanup(
|
||||
image: UploadFile = File(...)
|
||||
) -> JSONResponse:
|
||||
"""
|
||||
Quick preview of cleanup results without full processing.
|
||||
|
||||
Returns detection results and estimated processing time.
|
||||
"""
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Quick detection only
|
||||
result = detect_handwriting_regions(image_bytes)
|
||||
|
||||
# Estimate processing time based on image size
|
||||
from PIL import Image
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
pixel_count = img.width * img.height
|
||||
|
||||
# Rough estimates
|
||||
est_detection_ms = 100 + (pixel_count / 1000000) * 200
|
||||
est_inpainting_ms = 500 + (pixel_count / 1000000) * 1000
|
||||
est_reconstruction_ms = 200 + (pixel_count / 1000000) * 300
|
||||
|
||||
return JSONResponse({
|
||||
"has_handwriting": result["has_handwriting"],
|
||||
"confidence": result["confidence"],
|
||||
"handwriting_ratio": result["handwriting_ratio"],
|
||||
"image_width": img.width,
|
||||
"image_height": img.height,
|
||||
"estimated_times_ms": {
|
||||
"detection": est_detection_ms,
|
||||
"inpainting": est_inpainting_ms if result["has_handwriting"] else 0,
|
||||
"reconstruction": est_reconstruction_ms,
|
||||
"total": est_detection_ms + (est_inpainting_ms if result["has_handwriting"] else 0) + est_reconstruction_ms
|
||||
},
|
||||
"capabilities": {
|
||||
"lama_available": check_lama_available()
|
||||
}
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Preview failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,485 @@
|
||||
"""
|
||||
Worksheet Editor AI — AI image generation and AI worksheet modification.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import base64
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import random
|
||||
from typing import List, Dict
|
||||
|
||||
import httpx
|
||||
|
||||
from .editor_models import (
|
||||
AIImageRequest,
|
||||
AIImageResponse,
|
||||
AIImageStyle,
|
||||
AIModifyRequest,
|
||||
AIModifyResponse,
|
||||
OLLAMA_URL,
|
||||
STYLE_PROMPTS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================
|
||||
# AI IMAGE GENERATION
|
||||
# =============================================
|
||||
|
||||
async def generate_ai_image_logic(request: AIImageRequest) -> AIImageResponse:
|
||||
"""
|
||||
Generate an AI image using Ollama with a text-to-image model.
|
||||
|
||||
Falls back to a placeholder if Ollama is not available.
|
||||
"""
|
||||
from fastapi import HTTPException
|
||||
|
||||
try:
|
||||
# Build enhanced prompt with style
|
||||
style_modifier = STYLE_PROMPTS.get(request.style, "")
|
||||
enhanced_prompt = f"{request.prompt}, {style_modifier}"
|
||||
|
||||
logger.info(f"Generating AI image: {enhanced_prompt[:100]}...")
|
||||
|
||||
# Check if Ollama is available
|
||||
async with httpx.AsyncClient(timeout=10.0) as check_client:
|
||||
try:
|
||||
health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
|
||||
if health_response.status_code != 200:
|
||||
raise HTTPException(status_code=503, detail="Ollama service not available")
|
||||
except httpx.ConnectError:
|
||||
logger.warning("Ollama not reachable, returning placeholder")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=300.0) as client:
|
||||
tags_response = await client.get(f"{OLLAMA_URL}/api/tags")
|
||||
available_models = [m.get("name", "") for m in tags_response.json().get("models", [])]
|
||||
|
||||
sd_model = None
|
||||
for model in available_models:
|
||||
if "stable" in model.lower() or "sd" in model.lower() or "diffusion" in model.lower():
|
||||
sd_model = model
|
||||
break
|
||||
|
||||
if not sd_model:
|
||||
logger.warning("No Stable Diffusion model found in Ollama")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
logger.info(f"SD model found: {sd_model}, but image generation API not implemented")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Image generation failed: {e}")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"AI image generation error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
def _generate_placeholder_image(request: AIImageRequest, prompt: str) -> AIImageResponse:
|
||||
"""
|
||||
Generate a placeholder image when AI generation is not available.
|
||||
Creates a simple SVG-based placeholder with the prompt text.
|
||||
"""
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
width, height = request.width, request.height
|
||||
|
||||
style_colors = {
|
||||
AIImageStyle.REALISTIC: ("#2563eb", "#dbeafe"),
|
||||
AIImageStyle.CARTOON: ("#f97316", "#ffedd5"),
|
||||
AIImageStyle.SKETCH: ("#6b7280", "#f3f4f6"),
|
||||
AIImageStyle.CLIPART: ("#8b5cf6", "#ede9fe"),
|
||||
AIImageStyle.EDUCATIONAL: ("#059669", "#d1fae5"),
|
||||
}
|
||||
|
||||
fg_color, bg_color = style_colors.get(request.style, ("#6366f1", "#e0e7ff"))
|
||||
|
||||
img = Image.new('RGB', (width, height), bg_color)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
draw.rectangle([5, 5, width-6, height-6], outline=fg_color, width=3)
|
||||
|
||||
cx, cy = width // 2, height // 2 - 30
|
||||
draw.ellipse([cx-40, cy-40, cx+40, cy+40], outline=fg_color, width=3)
|
||||
draw.line([cx-20, cy-10, cx+20, cy-10], fill=fg_color, width=3)
|
||||
draw.line([cx, cy-10, cx, cy+20], fill=fg_color, width=3)
|
||||
|
||||
try:
|
||||
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
|
||||
except Exception:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
max_chars = 40
|
||||
lines = []
|
||||
words = prompt[:200].split()
|
||||
current_line = ""
|
||||
for word in words:
|
||||
if len(current_line) + len(word) + 1 <= max_chars:
|
||||
current_line += (" " + word if current_line else word)
|
||||
else:
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
current_line = word
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
|
||||
text_y = cy + 60
|
||||
for line in lines[:4]:
|
||||
bbox = draw.textbbox((0, 0), line, font=font)
|
||||
text_width = bbox[2] - bbox[0]
|
||||
draw.text((cx - text_width // 2, text_y), line, fill=fg_color, font=font)
|
||||
text_y += 20
|
||||
|
||||
badge_text = "KI-Bild (Platzhalter)"
|
||||
try:
|
||||
badge_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10)
|
||||
except Exception:
|
||||
badge_font = font
|
||||
draw.rectangle([10, height-30, 150, height-10], fill=fg_color)
|
||||
draw.text((15, height-27), badge_text, fill="white", font=badge_font)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
img.save(buffer, format='PNG')
|
||||
buffer.seek(0)
|
||||
|
||||
image_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
|
||||
|
||||
return AIImageResponse(
|
||||
image_base64=image_base64,
|
||||
prompt_used=prompt,
|
||||
error="AI image generation not available. Using placeholder."
|
||||
)
|
||||
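# Quick check (illustrative, not part of this commit): render the placeholder locally and
# write it to disk. Uses the AIImageRequest/AIImageStyle models imported at the top of
# this module; the prompt text is arbitrary.
import base64

_req = AIImageRequest(prompt="Wasserkreislauf als einfaches Diagramm", style=AIImageStyle.EDUCATIONAL)
_resp = _generate_placeholder_image(_req, _req.prompt)
_png = base64.b64decode(_resp.image_base64.split(",", 1)[1])  # strip the data:image/png;base64, prefix
with open("placeholder_preview.png", "wb") as fh:
    fh.write(_png)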
|
||||
|
||||
# =============================================
|
||||
# AI WORKSHEET MODIFICATION
|
||||
# =============================================
|
||||
|
||||
async def modify_worksheet_with_ai_logic(request: AIModifyRequest) -> AIModifyResponse:
|
||||
"""
|
||||
Modify a worksheet using AI based on natural language prompt.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"AI modify request: {request.prompt[:100]}...")
|
||||
|
||||
try:
|
||||
canvas_data = json.loads(request.canvas_json)
|
||||
except json.JSONDecodeError:
|
||||
return AIModifyResponse(
|
||||
message="Fehler beim Parsen des Canvas",
|
||||
error="Invalid canvas JSON"
|
||||
)
|
||||
|
||||
system_prompt = """Du bist ein Assistent fuer die Bearbeitung von Arbeitsblaettern.
|
||||
Du erhaeltst den aktuellen Zustand eines Canvas im JSON-Format und eine Anweisung des Nutzers.
|
||||
Deine Aufgabe ist es, die gewuenschten Aenderungen am Canvas vorzunehmen.
|
||||
|
||||
Der Canvas verwendet Fabric.js. Hier sind die wichtigsten Objekttypen:
|
||||
- i-text: Interaktiver Text mit fontFamily, fontSize, fill, left, top
|
||||
- rect: Rechteck mit left, top, width, height, fill, stroke, strokeWidth
|
||||
- circle: Kreis mit left, top, radius, fill, stroke, strokeWidth
|
||||
- line: Linie mit x1, y1, x2, y2, stroke, strokeWidth
|
||||
|
||||
Das Canvas ist 794x1123 Pixel (A4 bei 96 DPI).
|
||||
|
||||
Antworte NUR mit einem JSON-Objekt in diesem Format:
|
||||
{
|
||||
"action": "modify" oder "add" oder "delete" oder "info",
|
||||
"objects": [...], // Neue/modifizierte Objekte (bei modify/add)
|
||||
"message": "Kurze Beschreibung der Aenderung"
|
||||
}
|
||||
|
||||
Wenn du Objekte hinzufuegst, generiere eindeutige IDs im Format "obj_<timestamp>_<random>".
|
||||
"""
|
||||
|
||||
user_prompt = f"""Aktueller Canvas-Zustand:
|
||||
```json
|
||||
{json.dumps(canvas_data, indent=2)[:5000]}
|
||||
```
|
||||
|
||||
Nutzer-Anweisung: {request.prompt}
|
||||
|
||||
Fuehre die Aenderung durch und antworte mit dem JSON-Objekt."""
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
response = await client.post(
|
||||
f"{OLLAMA_URL}/api/generate",
|
||||
json={
|
||||
"model": request.model,
|
||||
"prompt": user_prompt,
|
||||
"system": system_prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.3,
|
||||
"num_predict": 4096
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.warning(f"Ollama error: {response.status_code}, trying local fallback")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
|
||||
ai_response = response.json().get("response", "")
|
||||
|
||||
except httpx.ConnectError:
|
||||
logger.warning("Ollama not reachable")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("Ollama timeout, trying local fallback")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
|
||||
try:
|
||||
json_start = ai_response.find('{')
|
||||
json_end = ai_response.rfind('}') + 1
|
||||
|
||||
if json_start == -1 or json_end <= json_start:
|
||||
logger.warning(f"No JSON found in AI response: {ai_response[:200]}")
|
||||
return AIModifyResponse(
|
||||
message="KI konnte die Anfrage nicht verarbeiten",
|
||||
error="No JSON in response"
|
||||
)
|
||||
|
||||
ai_json = json.loads(ai_response[json_start:json_end])
|
||||
action = ai_json.get("action", "info")
|
||||
message = ai_json.get("message", "Aenderungen angewendet")
|
||||
new_objects = ai_json.get("objects", [])
|
||||
|
||||
if action == "info":
|
||||
return AIModifyResponse(message=message)
|
||||
|
||||
if action == "add" and new_objects:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
existing_objects.extend(new_objects)
|
||||
canvas_data["objects"] = existing_objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
if action == "modify" and new_objects:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
new_ids = {obj.get("id") for obj in new_objects if obj.get("id")}
|
||||
kept_objects = [obj for obj in existing_objects if obj.get("id") not in new_ids]
|
||||
kept_objects.extend(new_objects)
|
||||
canvas_data["objects"] = kept_objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
if action == "delete":
|
||||
delete_ids = ai_json.get("delete_ids", [])
|
||||
if delete_ids:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
canvas_data["objects"] = [obj for obj in existing_objects if obj.get("id") not in delete_ids]
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
return AIModifyResponse(message=message)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse AI JSON: {e}")
|
||||
return AIModifyResponse(
|
||||
message="Fehler beim Verarbeiten der KI-Antwort",
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AI modify error: {e}")
|
||||
return AIModifyResponse(
|
||||
message="Ein unerwarteter Fehler ist aufgetreten",
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
|
||||
def _handle_simple_modification(prompt: str, canvas_data: dict) -> AIModifyResponse:
|
||||
"""
|
||||
Handle simple modifications locally when Ollama is not available.
|
||||
Supports basic commands like adding headings, lines, etc.
|
||||
"""
|
||||
prompt_lower = prompt.lower()
|
||||
objects = canvas_data.get("objects", [])
|
||||
|
||||
def generate_id():
|
||||
return f"obj_{int(time.time()*1000)}_{random.randint(1000, 9999)}"
|
||||
|
||||
# Add heading
|
||||
if "ueberschrift" in prompt_lower or "titel" in prompt_lower or "heading" in prompt_lower:
|
||||
text_match = re.search(r'"([^"]+)"', prompt)
|
||||
text = text_match.group(1) if text_match else "Ueberschrift"
|
||||
|
||||
new_text = {
|
||||
"type": "i-text", "id": generate_id(), "text": text,
|
||||
"left": 397, "top": 50, "originX": "center",
|
||||
"fontFamily": "Arial", "fontSize": 28, "fontWeight": "bold", "fill": "#000000"
|
||||
}
|
||||
objects.append(new_text)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"Ueberschrift '{text}' hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add lines for writing
|
||||
if "linie" in prompt_lower or "line" in prompt_lower or "schreib" in prompt_lower:
|
||||
num_match = re.search(r'(\d+)', prompt)
|
||||
num_lines = int(num_match.group(1)) if num_match else 5
|
||||
num_lines = min(num_lines, 20)
|
||||
|
||||
start_y = 150
|
||||
line_spacing = 40
|
||||
|
||||
for i in range(num_lines):
|
||||
new_line = {
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": 60, "y1": start_y + i * line_spacing,
|
||||
"x2": 734, "y2": start_y + i * line_spacing,
|
||||
"stroke": "#cccccc", "strokeWidth": 1
|
||||
}
|
||||
objects.append(new_line)
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{num_lines} Schreiblinien hinzugefuegt"
|
||||
)
|
||||
|
||||
# Make text bigger
|
||||
if "groesser" in prompt_lower or "bigger" in prompt_lower or "larger" in prompt_lower:
|
||||
modified = 0
|
||||
for obj in objects:
|
||||
if obj.get("type") in ["i-text", "text", "textbox"]:
|
||||
current_size = obj.get("fontSize", 16)
|
||||
obj["fontSize"] = int(current_size * 1.25)
|
||||
modified += 1
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
if modified > 0:
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{modified} Texte vergroessert"
|
||||
)
|
||||
|
||||
# Center elements
|
||||
if "zentrier" in prompt_lower or "center" in prompt_lower or "mitte" in prompt_lower:
|
||||
center_x = 397
|
||||
for obj in objects:
|
||||
if not obj.get("isGrid"):
|
||||
obj["left"] = center_x
|
||||
obj["originX"] = "center"
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message="Elemente zentriert"
|
||||
)
|
||||
|
||||
# Add numbering
|
||||
if "nummer" in prompt_lower or "nummerier" in prompt_lower or "1-10" in prompt_lower:
|
||||
range_match = re.search(r'(\d+)\s*[-bis]+\s*(\d+)', prompt)
|
||||
if range_match:
|
||||
start, end = int(range_match.group(1)), int(range_match.group(2))
|
||||
else:
|
||||
start, end = 1, 10
|
||||
|
||||
y = 100
|
||||
for i in range(start, min(end + 1, start + 20)):
|
||||
new_text = {
|
||||
"type": "i-text", "id": generate_id(), "text": f"{i}.",
|
||||
"left": 40, "top": y, "fontFamily": "Arial", "fontSize": 14, "fill": "#000000"
|
||||
}
|
||||
objects.append(new_text)
|
||||
y += 35
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"Nummerierung {start}-{end} hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add rectangle/box
|
||||
if "rechteck" in prompt_lower or "box" in prompt_lower or "kasten" in prompt_lower:
|
||||
new_rect = {
|
||||
"type": "rect", "id": generate_id(),
|
||||
"left": 100, "top": 200, "width": 200, "height": 100,
|
||||
"fill": "transparent", "stroke": "#000000", "strokeWidth": 2
|
||||
}
|
||||
objects.append(new_rect)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message="Rechteck hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add grid/raster
|
||||
if "raster" in prompt_lower or "grid" in prompt_lower or "tabelle" in prompt_lower:
|
||||
dim_match = re.search(r'(\d+)\s*[x/\u00d7\*mal by]+\s*(\d+)', prompt_lower)  # "+" so multi-char separators like "mal"/"by" match
|
||||
if dim_match:
|
||||
cols = int(dim_match.group(1))
|
||||
rows = int(dim_match.group(2))
|
||||
else:
|
||||
nums = re.findall(r'(\d+)', prompt)
|
||||
if len(nums) >= 2:
|
||||
cols, rows = int(nums[0]), int(nums[1])
|
||||
else:
|
||||
cols, rows = 3, 4
|
||||
|
||||
cols = min(max(1, cols), 10)
|
||||
rows = min(max(1, rows), 15)
|
||||
|
||||
canvas_width = 794
|
||||
canvas_height = 1123
|
||||
margin = 60
|
||||
available_width = canvas_width - 2 * margin
|
||||
available_height = canvas_height - 2 * margin - 80
|
||||
|
||||
cell_width = available_width / cols
|
||||
cell_height = min(available_height / rows, 80)
|
||||
|
||||
start_x = margin
|
||||
start_y = 120
|
||||
|
||||
grid_objects = []
|
||||
for r in range(rows + 1):
|
||||
y = start_y + r * cell_height
|
||||
grid_objects.append({
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": start_x, "y1": y,
|
||||
"x2": start_x + cols * cell_width, "y2": y,
|
||||
"stroke": "#666666", "strokeWidth": 1, "isGrid": True
|
||||
})
|
||||
|
||||
for c in range(cols + 1):
|
||||
x = start_x + c * cell_width
|
||||
grid_objects.append({
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": x, "y1": start_y,
|
||||
"x2": x, "y2": start_y + rows * cell_height,
|
||||
"stroke": "#666666", "strokeWidth": 1, "isGrid": True
|
||||
})
|
||||
|
||||
objects.extend(grid_objects)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{cols}x{rows} Raster hinzugefuegt ({cols} Spalten, {rows} Zeilen)"
|
||||
)
|
||||
|
||||
# Default: Ollama needed
|
||||
return AIModifyResponse(
|
||||
message="Diese Aenderung erfordert den KI-Service. Bitte stellen Sie sicher, dass Ollama laeuft.",
|
||||
error="Complex modification requires Ollama"
|
||||
)
|
||||
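# Usage sketch (illustrative, not part of this commit): the local fallback can be exercised
# without Ollama by passing a minimal Fabric.js canvas dict. The version string is arbitrary.
import json

_canvas = {"version": "5.3.0", "objects": []}
_result = _handle_simple_modification('Fuege eine Ueberschrift "Vokabeltest" hinzu', _canvas)
print(_result.message)  # -> Ueberschrift 'Vokabeltest' hinzugefuegt
_added = json.loads(_result.modified_canvas_json)["objects"][0]
assert _added["type"] == "i-text" and _added["text"] == "Vokabeltest"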
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
Worksheet Editor API - Backend Endpoints for Visual Worksheet Editor
|
||||
|
||||
Provides endpoints for:
|
||||
- AI Image generation via Ollama/Stable Diffusion
|
||||
- Worksheet Save/Load
|
||||
- PDF Export
|
||||
|
||||
Split modules:
|
||||
- worksheet_editor_models: Enums, Pydantic models, configuration
|
||||
- worksheet_editor_ai: AI image generation and AI worksheet modification
|
||||
- worksheet_editor_reconstruct: Document reconstruction from vocab sessions
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
import httpx
|
||||
|
||||
# Re-export everything from sub-modules for backward compatibility
|
||||
from .editor_models import ( # noqa: F401
|
||||
AIImageStyle,
|
||||
WorksheetStatus,
|
||||
AIImageRequest,
|
||||
AIImageResponse,
|
||||
PageData,
|
||||
PageFormat,
|
||||
WorksheetSaveRequest,
|
||||
WorksheetResponse,
|
||||
AIModifyRequest,
|
||||
AIModifyResponse,
|
||||
ReconstructRequest,
|
||||
ReconstructResponse,
|
||||
worksheets_db,
|
||||
OLLAMA_URL,
|
||||
SD_MODEL,
|
||||
WORKSHEET_STORAGE_DIR,
|
||||
STYLE_PROMPTS,
|
||||
REPORTLAB_AVAILABLE,
|
||||
)
|
||||
|
||||
from .editor_ai import ( # noqa: F401
|
||||
generate_ai_image_logic,
|
||||
_generate_placeholder_image,
|
||||
modify_worksheet_with_ai_logic,
|
||||
_handle_simple_modification,
|
||||
)
|
||||
|
||||
from .editor_reconstruct import ( # noqa: F401
|
||||
reconstruct_document_logic,
|
||||
_detect_image_regions,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================
|
||||
# ROUTER
|
||||
# =============================================
|
||||
|
||||
router = APIRouter(prefix="/api/v1/worksheet", tags=["Worksheet Editor"])
|
||||
|
||||
# =============================================
|
||||
# AI IMAGE GENERATION
|
||||
# =============================================
|
||||
|
||||
@router.post("/ai-image", response_model=AIImageResponse)
|
||||
async def generate_ai_image(request: AIImageRequest):
|
||||
"""
|
||||
Generate an AI image using Ollama with a text-to-image model.
|
||||
|
||||
Supported models:
|
||||
- stable-diffusion (via Ollama)
|
||||
- sd3.5-medium
|
||||
- llava (for image understanding, not generation)
|
||||
|
||||
Falls back to a placeholder if Ollama is not available.
|
||||
"""
|
||||
return await generate_ai_image_logic(request)
|
||||
|
||||
|
||||
# =============================================
|
||||
# WORKSHEET SAVE/LOAD
|
||||
# =============================================
|
||||
|
||||
@router.post("/save", response_model=WorksheetResponse)
|
||||
async def save_worksheet(request: WorksheetSaveRequest):
|
||||
"""
|
||||
Save a worksheet document.
|
||||
|
||||
- If id is provided, updates existing worksheet
|
||||
- If id is not provided, creates new worksheet
|
||||
"""
|
||||
try:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
worksheet_id = request.id or f"ws_{uuid.uuid4().hex[:12]}"
|
||||
|
||||
worksheet = {
|
||||
"id": worksheet_id,
|
||||
"title": request.title,
|
||||
"description": request.description,
|
||||
"pages": [p.dict() for p in request.pages],
|
||||
"pageFormat": (request.pageFormat or PageFormat()).dict(),
|
||||
"createdAt": worksheets_db.get(worksheet_id, {}).get("createdAt", now),
|
||||
"updatedAt": now
|
||||
}
|
||||
|
||||
worksheets_db[worksheet_id] = worksheet
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump(worksheet, f, ensure_ascii=False, indent=2)
|
||||
|
||||
logger.info(f"Saved worksheet: {worksheet_id}")
|
||||
|
||||
return WorksheetResponse(**worksheet)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save worksheet: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to save: {str(e)}")
|
||||
|
||||
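# Usage sketch (illustrative, not part of this commit): saving a minimal one-page worksheet.
# The base URL is an assumption; canvasJSON is any serialized Fabric.js state.
import json
import httpx

payload = {
    "title": "Beispiel-Arbeitsblatt",
    "pages": [{"id": "page-1", "index": 0, "canvasJSON": json.dumps({"objects": []})}],
}
resp = httpx.post("http://localhost:8000/api/v1/worksheet/save", json=payload, timeout=30.0)
resp.raise_for_status()
worksheet_id = resp.json()["id"]  # e.g. "ws_<12 hex chars>", usable with GET /{worksheet_id}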
|
||||
@router.get("/{worksheet_id}", response_model=WorksheetResponse)
|
||||
async def get_worksheet(worksheet_id: str):
|
||||
"""Load a worksheet document by ID."""
|
||||
try:
|
||||
if worksheet_id in worksheets_db:
|
||||
return WorksheetResponse(**worksheets_db[worksheet_id])
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
worksheets_db[worksheet_id] = worksheet
|
||||
return WorksheetResponse(**worksheet)
|
||||
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load worksheet {worksheet_id}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/list/all")
|
||||
async def list_worksheets():
|
||||
"""List all available worksheets."""
|
||||
try:
|
||||
worksheets = []
|
||||
|
||||
for filename in os.listdir(WORKSHEET_STORAGE_DIR):
|
||||
if filename.endswith('.json'):
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, filename)
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
worksheets.append({
|
||||
"id": worksheet.get("id"),
|
||||
"title": worksheet.get("title"),
|
||||
"description": worksheet.get("description"),
|
||||
"pageCount": len(worksheet.get("pages", [])),
|
||||
"updatedAt": worksheet.get("updatedAt"),
|
||||
"createdAt": worksheet.get("createdAt")
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load {filename}: {e}")
|
||||
|
||||
worksheets.sort(key=lambda x: x.get("updatedAt", ""), reverse=True)
|
||||
|
||||
return {"worksheets": worksheets, "total": len(worksheets)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list worksheets: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/{worksheet_id}")
|
||||
async def delete_worksheet(worksheet_id: str):
|
||||
"""Delete a worksheet document."""
|
||||
try:
|
||||
if worksheet_id in worksheets_db:
|
||||
del worksheets_db[worksheet_id]
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
os.remove(filepath)
|
||||
logger.info(f"Deleted worksheet: {worksheet_id}")
|
||||
return {"status": "deleted", "id": worksheet_id}
|
||||
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete worksheet {worksheet_id}: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================
|
||||
# PDF EXPORT
|
||||
# =============================================
|
||||
|
||||
@router.post("/{worksheet_id}/export-pdf")
|
||||
async def export_worksheet_pdf(worksheet_id: str):
|
||||
"""
|
||||
Export worksheet as PDF.
|
||||
|
||||
Note: This creates a basic PDF. For full canvas rendering,
|
||||
the frontend should use pdf-lib with canvas.toDataURL().
|
||||
"""
|
||||
if not REPORTLAB_AVAILABLE:
|
||||
raise HTTPException(status_code=501, detail="PDF export not available (reportlab not installed)")
|
||||
|
||||
try:
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
worksheet = worksheets_db.get(worksheet_id)
|
||||
if not worksheet:
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
else:
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
buffer = io.BytesIO()
|
||||
c = canvas.Canvas(buffer, pagesize=A4)
|
||||
|
||||
page_width, page_height = A4
|
||||
|
||||
for page_data in worksheet.get("pages", []):
|
||||
if page_data.get("index", 0) == 0:
|
||||
c.setFont("Helvetica-Bold", 18)
|
||||
c.drawString(50, page_height - 50, worksheet.get("title", "Arbeitsblatt"))
|
||||
c.setFont("Helvetica", 10)
|
||||
c.drawString(50, page_height - 70, f"Erstellt: {worksheet.get('createdAt', '')[:10]}")
|
||||
|
||||
canvas_json_str = page_data.get("canvasJSON", "{}")
|
||||
if canvas_json_str:
|
||||
try:
|
||||
canvas_data = json.loads(canvas_json_str)
|
||||
objects = canvas_data.get("objects", [])
|
||||
|
||||
for obj in objects:
|
||||
obj_type = obj.get("type", "")
|
||||
|
||||
if obj_type in ["text", "i-text", "textbox"]:
|
||||
text = obj.get("text", "")
|
||||
left = obj.get("left", 50)
|
||||
top = obj.get("top", 100)
|
||||
font_size = obj.get("fontSize", 12)
|
||||
|
||||
pdf_x = left * 0.75
|
||||
pdf_y = page_height - (top * 0.75)
|
||||
|
||||
c.setFont("Helvetica", min(font_size, 24))
|
||||
c.drawString(pdf_x, pdf_y, text[:100])
|
||||
|
||||
elif obj_type == "rect":
|
||||
left = obj.get("left", 0) * 0.75
|
||||
top = obj.get("top", 0) * 0.75
|
||||
width = obj.get("width", 50) * 0.75
|
||||
height = obj.get("height", 30) * 0.75
|
||||
c.rect(left, page_height - top - height, width, height)
|
||||
|
||||
elif obj_type == "circle":
|
||||
left = obj.get("left", 0) * 0.75
|
||||
top = obj.get("top", 0) * 0.75
|
||||
radius = obj.get("radius", 25) * 0.75
|
||||
c.circle(left + radius, page_height - top - radius, radius)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
c.showPage()
|
||||
|
||||
c.save()
|
||||
buffer.seek(0)
|
||||
|
||||
filename = f"{worksheet.get('title', 'worksheet').replace(' ', '_')}.pdf"
|
||||
|
||||
return StreamingResponse(
|
||||
buffer,
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f"attachment; filename={filename}"}
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"PDF export failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
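# Note (illustrative, not part of this commit): the export above maps 96-DPI canvas pixels
# to 72-pt PDF coordinates, which is where the 0.75 factor comes from; the Y axis is flipped
# because ReportLab's origin is the bottom-left corner.
def canvas_px_to_pdf_pt(left: float, top: float, page_height_pt: float) -> tuple:
    """Convert a Fabric.js (left, top) position into ReportLab (x, y) points."""
    scale = 72 / 96  # = 0.75
    return left * scale, page_height_pt - top * scale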
|
||||
|
||||
# =============================================
|
||||
# AI WORKSHEET MODIFICATION
|
||||
# =============================================
|
||||
|
||||
@router.post("/ai-modify", response_model=AIModifyResponse)
|
||||
async def modify_worksheet_with_ai(request: AIModifyRequest):
|
||||
"""
|
||||
Modify a worksheet using AI based on natural language prompt.
|
||||
|
||||
Uses Ollama with qwen2.5vl:32b to understand the canvas state
|
||||
and generate modifications based on the user's request.
|
||||
"""
|
||||
return await modify_worksheet_with_ai_logic(request)
|
||||
|
||||
|
||||
# =============================================
|
||||
# HEALTH CHECK
|
||||
# =============================================
|
||||
|
||||
@router.get("/health/check")
|
||||
async def health_check():
|
||||
"""Check worksheet editor API health and dependencies."""
|
||||
status = {
|
||||
"status": "healthy",
|
||||
"ollama": False,
|
||||
"storage": os.path.exists(WORKSHEET_STORAGE_DIR),
|
||||
"reportlab": REPORTLAB_AVAILABLE,
|
||||
"worksheets_count": len(worksheets_db)
|
||||
}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
response = await client.get(f"{OLLAMA_URL}/api/tags")
|
||||
status["ollama"] = response.status_code == 200
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return status
|
||||
|
||||
|
||||
# =============================================
|
||||
# DOCUMENT RECONSTRUCTION FROM VOCAB SESSION
|
||||
# =============================================
|
||||
|
||||
@router.post("/reconstruct-from-session", response_model=ReconstructResponse)
|
||||
async def reconstruct_document_from_session(request: ReconstructRequest):
|
||||
"""
|
||||
Reconstruct a document from a vocab session into Fabric.js canvas format.
|
||||
|
||||
Returns canvas JSON ready to load into the worksheet editor.
|
||||
"""
|
||||
try:
|
||||
return await reconstruct_document_logic(request)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Document reconstruction failed: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/sessions/available")
|
||||
async def get_available_sessions():
|
||||
"""Get list of available vocab sessions that can be reconstructed."""
|
||||
try:
|
||||
from vocab_worksheet_api import _sessions
|
||||
|
||||
available = []
|
||||
for session_id, session in _sessions.items():
|
||||
if session.get("pdf_data"):
|
||||
available.append({
|
||||
"id": session_id,
|
||||
"name": session.get("name", "Unnamed"),
|
||||
"description": session.get("description"),
|
||||
"vocabulary_count": len(session.get("vocabulary", [])),
|
||||
"page_count": session.get("pdf_page_count", 1),
|
||||
"status": session.get("status", "unknown"),
|
||||
"created_at": session.get("created_at", "").isoformat() if session.get("created_at") else None
|
||||
})
|
||||
|
||||
return {"sessions": available, "total": len(available)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list sessions: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,133 @@
|
||||
"""
|
||||
Worksheet Editor Models — Enums, Pydantic models, and configuration.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Optional, List, Dict
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================
|
||||
# CONFIGURATION
|
||||
# =============================================
|
||||
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
SD_MODEL = os.getenv("SD_MODEL", "stable-diffusion") # or specific SD model
|
||||
WORKSHEET_STORAGE_DIR = os.getenv("WORKSHEET_STORAGE_DIR",
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)), "worksheet-storage"))
|
||||
|
||||
# Ensure storage directory exists
|
||||
os.makedirs(WORKSHEET_STORAGE_DIR, exist_ok=True)
|
||||
|
||||
# =============================================
|
||||
# ENUMS & MODELS
|
||||
# =============================================
|
||||
|
||||
class AIImageStyle(str, Enum):
|
||||
REALISTIC = "realistic"
|
||||
CARTOON = "cartoon"
|
||||
SKETCH = "sketch"
|
||||
CLIPART = "clipart"
|
||||
EDUCATIONAL = "educational"
|
||||
|
||||
class WorksheetStatus(str, Enum):
|
||||
DRAFT = "draft"
|
||||
PUBLISHED = "published"
|
||||
ARCHIVED = "archived"
|
||||
|
||||
# Style prompt modifiers
|
||||
STYLE_PROMPTS = {
|
||||
AIImageStyle.REALISTIC: "photorealistic, high detail, professional photography",
|
||||
AIImageStyle.CARTOON: "cartoon style, colorful, child-friendly, simple shapes",
|
||||
AIImageStyle.SKETCH: "pencil sketch, hand-drawn, black and white, artistic",
|
||||
AIImageStyle.CLIPART: "clipart style, flat design, simple, vector-like",
|
||||
AIImageStyle.EDUCATIONAL: "educational illustration, clear, informative, textbook style"
|
||||
}
|
||||
|
||||
# =============================================
|
||||
# REQUEST/RESPONSE MODELS
|
||||
# =============================================
|
||||
|
||||
class AIImageRequest(BaseModel):
|
||||
prompt: str = Field(..., min_length=3, max_length=500)
|
||||
style: AIImageStyle = AIImageStyle.EDUCATIONAL
|
||||
width: int = Field(512, ge=256, le=1024)
|
||||
height: int = Field(512, ge=256, le=1024)
|
||||
|
||||
class AIImageResponse(BaseModel):
|
||||
image_base64: str
|
||||
prompt_used: str
|
||||
error: Optional[str] = None
|
||||
|
||||
class PageData(BaseModel):
|
||||
id: str
|
||||
index: int
|
||||
canvasJSON: str
|
||||
|
||||
class PageFormat(BaseModel):
|
||||
width: float = 210
|
||||
height: float = 297
|
||||
orientation: str = "portrait"
|
||||
margins: Dict[str, float] = {"top": 15, "right": 15, "bottom": 15, "left": 15}
|
||||
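# Note (illustrative, not part of this commit): the PageFormat defaults above (210 mm x 297 mm)
# correspond to the editor's 794x1123 px A4 canvas at 96 DPI.
MM_TO_PX_96DPI = 96 / 25.4
assert round(210 * MM_TO_PX_96DPI) == 794 and round(297 * MM_TO_PX_96DPI) == 1123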
|
||||
class WorksheetSaveRequest(BaseModel):
|
||||
id: Optional[str] = None
|
||||
title: str
|
||||
description: Optional[str] = None
|
||||
pages: List[PageData]
|
||||
pageFormat: Optional[PageFormat] = None
|
||||
|
||||
class WorksheetResponse(BaseModel):
|
||||
id: str
|
||||
title: str
|
||||
description: Optional[str]
|
||||
pages: List[PageData]
|
||||
pageFormat: PageFormat
|
||||
createdAt: str
|
||||
updatedAt: str
|
||||
|
||||
class AIModifyRequest(BaseModel):
|
||||
prompt: str = Field(..., min_length=3, max_length=1000)
|
||||
canvas_json: str
|
||||
model: str = "qwen2.5vl:32b"
|
||||
|
||||
class AIModifyResponse(BaseModel):
|
||||
modified_canvas_json: Optional[str] = None
|
||||
message: str
|
||||
error: Optional[str] = None
|
||||
|
||||
class ReconstructRequest(BaseModel):
|
||||
session_id: str
|
||||
page_number: int = 1
|
||||
include_images: bool = True
|
||||
regenerate_graphics: bool = False
|
||||
|
||||
class ReconstructResponse(BaseModel):
|
||||
canvas_json: str
|
||||
page_width: int
|
||||
page_height: int
|
||||
elements_count: int
|
||||
vocabulary_matched: int
|
||||
message: str
|
||||
error: Optional[str] = None
|
||||
|
||||
# =============================================
|
||||
# IN-MEMORY STORAGE (Development)
|
||||
# =============================================
|
||||
|
||||
worksheets_db: Dict[str, Dict] = {}
|
||||
|
||||
# PDF Generation availability
|
||||
try:
|
||||
from reportlab.lib import colors # noqa: F401
|
||||
from reportlab.lib.pagesizes import A4 # noqa: F401
|
||||
from reportlab.lib.units import mm # noqa: F401
|
||||
from reportlab.pdfgen import canvas # noqa: F401
|
||||
from reportlab.lib.styles import getSampleStyleSheet # noqa: F401
|
||||
REPORTLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
REPORTLAB_AVAILABLE = False
|
||||
@@ -0,0 +1,255 @@
|
||||
"""
|
||||
Worksheet Editor Reconstruct — Document reconstruction from vocab sessions.
|
||||
"""
|
||||
|
||||
import io
|
||||
import uuid
|
||||
import base64
|
||||
import logging
|
||||
from typing import List, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .editor_models import (
|
||||
ReconstructRequest,
|
||||
ReconstructResponse,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def reconstruct_document_logic(request: ReconstructRequest) -> ReconstructResponse:
|
||||
"""
|
||||
Reconstruct a document from a vocab session into Fabric.js canvas format.
|
||||
|
||||
This function:
|
||||
1. Loads the original PDF from the vocab session
|
||||
2. Runs OCR with position tracking
|
||||
3. Creates Fabric.js canvas JSON with positioned elements
|
||||
4. Maps extracted vocabulary to their positions
|
||||
|
||||
Returns ReconstructResponse ready to send to the client.
|
||||
"""
|
||||
from fastapi import HTTPException
|
||||
from vocab_worksheet_api import _sessions, convert_pdf_page_to_image
|
||||
|
||||
# Check if session exists
|
||||
if request.session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail=f"Session {request.session_id} not found")
|
||||
|
||||
session = _sessions[request.session_id]
|
||||
|
||||
if not session.get("pdf_data"):
|
||||
raise HTTPException(status_code=400, detail="Session has no PDF data")
|
||||
|
||||
pdf_data = session["pdf_data"]
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
|
||||
if request.page_number < 1 or request.page_number > page_count:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Page {request.page_number} not found. PDF has {page_count} pages."
|
||||
)
|
||||
|
||||
vocabulary = session.get("vocabulary", [])
|
||||
page_vocab = [v for v in vocabulary if v.get("source_page") == request.page_number]
|
||||
|
||||
logger.info(f"Reconstructing page {request.page_number} from session {request.session_id}")
|
||||
logger.info(f"Found {len(page_vocab)} vocabulary items for this page")
|
||||
|
||||
image_bytes = await convert_pdf_page_to_image(pdf_data, request.page_number)
|
||||
if not image_bytes:
|
||||
raise HTTPException(status_code=500, detail="Failed to convert PDF page to image")
|
||||
|
||||
from PIL import Image
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
img_width, img_height = img.size
|
||||
|
||||
from hybrid_vocab_extractor import run_paddle_ocr
|
||||
ocr_regions, raw_text = run_paddle_ocr(image_bytes)
|
||||
|
||||
logger.info(f"OCR found {len(ocr_regions)} text regions")
|
||||
|
||||
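# Target canvas: A4 at 96 DPI (794x1123 px, matching the editor canvas); the scale factors map OCR pixel coords into canvas coords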
A4_WIDTH = 794
|
||||
A4_HEIGHT = 1123
|
||||
scale_x = A4_WIDTH / img_width
|
||||
scale_y = A4_HEIGHT / img_height
|
||||
|
||||
fabric_objects = []
|
||||
|
||||
# 1. Add white background
|
||||
fabric_objects.append({
|
||||
"type": "rect", "left": 0, "top": 0,
|
||||
"width": A4_WIDTH, "height": A4_HEIGHT,
|
||||
"fill": "#ffffff", "selectable": False,
|
||||
"evented": False, "isBackground": True
|
||||
})
|
||||
|
||||
# 2. Group OCR regions by Y-coordinate to detect rows
|
||||
sorted_regions = sorted(ocr_regions, key=lambda r: (r.y1, r.x1))
|
||||
|
||||
# 3. Detect headers (larger text at top)
|
||||
headers = []
|
||||
for region in sorted_regions:
|
||||
height = region.y2 - region.y1
|
||||
if region.y1 < img_height * 0.15 and height > 30:
|
||||
headers.append(region)
|
||||
|
||||
# 4. Create text objects for each region
|
||||
vocab_matched = 0
|
||||
|
||||
for region in sorted_regions:
|
||||
left = int(region.x1 * scale_x)
|
||||
top = int(region.y1 * scale_y)
|
||||
|
||||
is_header = region in headers
|
||||
|
||||
region_height = region.y2 - region.y1
|
||||
base_font_size = max(10, min(32, int(region_height * scale_y * 0.8)))
|
||||
|
||||
if is_header:
|
||||
base_font_size = max(base_font_size, 24)
|
||||
|
||||
is_vocab = False
|
||||
vocab_match = None
|
||||
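# Mark a region as vocabulary if it contains the English or German form of any vocab item on this page (case-insensitive substring match)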
for v in page_vocab:
|
||||
if v.get("english", "").lower() in region.text.lower() or \
|
||||
v.get("german", "").lower() in region.text.lower():
|
||||
is_vocab = True
|
||||
vocab_match = v
|
||||
vocab_matched += 1
|
||||
break
|
||||
|
||||
text_obj = {
|
||||
"type": "i-text",
|
||||
"id": f"text_{uuid.uuid4().hex[:8]}",
|
||||
"left": left, "top": top,
|
||||
"text": region.text,
|
||||
"fontFamily": "Arial",
|
||||
"fontSize": base_font_size,
|
||||
"fontWeight": "bold" if is_header else "normal",
|
||||
"fill": "#000000",
|
||||
"originX": "left", "originY": "top",
|
||||
}
|
||||
|
||||
if is_vocab and vocab_match:
|
||||
text_obj["isVocabulary"] = True
|
||||
text_obj["vocabularyId"] = vocab_match.get("id")
|
||||
text_obj["english"] = vocab_match.get("english")
|
||||
text_obj["german"] = vocab_match.get("german")
|
||||
|
||||
fabric_objects.append(text_obj)
|
||||
|
||||
# 5. If include_images, detect and extract image regions
|
||||
if request.include_images:
|
||||
image_regions = await _detect_image_regions(image_bytes, ocr_regions, img_width, img_height)
|
||||
|
||||
for i, img_region in enumerate(image_regions):
|
||||
img_x1 = int(img_region["x1"])
|
||||
img_y1 = int(img_region["y1"])
|
||||
img_x2 = int(img_region["x2"])
|
||||
img_y2 = int(img_region["y2"])
|
||||
|
||||
cropped = img.crop((img_x1, img_y1, img_x2, img_y2))
|
||||
|
||||
buffer = io.BytesIO()
|
||||
cropped.save(buffer, format='PNG')
|
||||
buffer.seek(0)
|
||||
img_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
|
||||
|
||||
fabric_objects.append({
|
||||
"type": "image",
|
||||
"id": f"img_{uuid.uuid4().hex[:8]}",
|
||||
"left": int(img_x1 * scale_x),
|
||||
"top": int(img_y1 * scale_y),
|
||||
"width": int((img_x2 - img_x1) * scale_x),
|
||||
"height": int((img_y2 - img_y1) * scale_y),
|
||||
"src": img_base64,
|
||||
"scaleX": 1, "scaleY": 1,
|
||||
})
|
||||
|
||||
import json
|
||||
canvas_data = {
|
||||
"version": "6.0.0",
|
||||
"objects": fabric_objects,
|
||||
"background": "#ffffff"
|
||||
}
|
||||
|
||||
return ReconstructResponse(
|
||||
canvas_json=json.dumps(canvas_data),
|
||||
page_width=A4_WIDTH,
|
||||
page_height=A4_HEIGHT,
|
||||
elements_count=len(fabric_objects),
|
||||
vocabulary_matched=vocab_matched,
|
||||
message=f"Reconstructed page {request.page_number} with {len(fabric_objects)} elements, "
|
||||
f"{vocab_matched} vocabulary items matched"
|
||||
)
|
||||
|
||||
|
||||
async def _detect_image_regions(
|
||||
image_bytes: bytes,
|
||||
ocr_regions: list,
|
||||
img_width: int,
|
||||
img_height: int
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Detect image/graphic regions in the document.
|
||||
|
||||
Uses a simple approach:
|
||||
1. Find large gaps between text regions (potential image areas)
|
||||
2. Use edge detection to find bounded regions
|
||||
3. Filter out text areas
|
||||
"""
|
||||
from PIL import Image
|
||||
import cv2
|
||||
|
||||
try:
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
img_array = np.array(img.convert('L'))
|
||||
|
||||
text_mask = np.ones_like(img_array, dtype=bool)
|
||||
for region in ocr_regions:
|
||||
x1 = max(0, region.x1 - 5)
|
||||
y1 = max(0, region.y1 - 5)
|
||||
x2 = min(img_width, region.x2 + 5)
|
||||
y2 = min(img_height, region.y2 + 5)
|
||||
text_mask[y1:y2, x1:x2] = False
|
||||
|
||||
image_regions = []
|
||||
|
||||
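# Edge-detect the grayscale page, then zero out edges inside the padded OCR text boxes so only non-text graphics produce contours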
edges = cv2.Canny(img_array, 50, 150)
|
||||
edges[~text_mask] = 0
|
||||
|
||||
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
for contour in contours:
|
||||
x, y, w, h = cv2.boundingRect(contour)
|
||||
|
||||
if w > 50 and h > 50:
|
||||
if w < img_width * 0.9 and h < img_height * 0.9:
|
||||
region_content = img_array[y:y+h, x:x+w]
|
||||
variance = np.var(region_content)
|
||||
|
||||
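# Require some pixel variance so near-uniform (blank) areas are not treated as graphics; the 500 cutoff looks like a tuned heuristic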
if variance > 500:
|
||||
image_regions.append({
|
||||
"x1": x, "y1": y,
|
||||
"x2": x + w, "y2": y + h
|
||||
})
|
||||
|
||||
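# Greedy de-duplication: visit candidate regions largest-first and keep only those that do not overlap an already-kept region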
filtered_regions = []
|
||||
for region in sorted(image_regions, key=lambda r: (r["x2"]-r["x1"])*(r["y2"]-r["y1"]), reverse=True):
|
||||
overlaps = False
|
||||
for existing in filtered_regions:
|
||||
if not (region["x2"] < existing["x1"] or region["x1"] > existing["x2"] or
|
||||
region["y2"] < existing["y1"] or region["y1"] > existing["y2"]):
|
||||
overlaps = True
|
||||
break
|
||||
if not overlaps:
|
||||
filtered_regions.append(region)
|
||||
|
||||
logger.info(f"Detected {len(filtered_regions)} image regions")
|
||||
return filtered_regions[:10]
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Image region detection failed: {e}")
|
||||
return []
|
||||
@@ -0,0 +1,26 @@
|
||||
"""
|
||||
NRU Worksheet Generator — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
nru_models — data classes, entry separation
|
||||
nru_html — HTML generation
|
||||
nru_pdf — PDF generation
|
||||
|
||||
Per scanned page, we generate 2 worksheet pages.
|
||||
"""
|
||||
|
||||
# Models
|
||||
from .nru_models import ( # noqa: F401
|
||||
VocabEntry,
|
||||
SentenceEntry,
|
||||
separate_vocab_and_sentences,
|
||||
)
|
||||
|
||||
# HTML generation
|
||||
from .nru_html import ( # noqa: F401
|
||||
generate_nru_html,
|
||||
generate_nru_worksheet_html,
|
||||
)
|
||||
|
||||
# PDF generation
|
||||
from .nru_pdf import generate_nru_pdf # noqa: F401
|
||||
@@ -0,0 +1,466 @@
|
||||
"""
|
||||
NRU Worksheet HTML — HTML generation for vocabulary worksheets.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .nru_models import VocabEntry, SentenceEntry, separate_vocab_and_sentences
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def generate_nru_html(
|
||||
vocab_list: List[VocabEntry],
|
||||
sentence_list: List[SentenceEntry],
|
||||
page_number: int,
|
||||
title: str = "Vokabeltest",
|
||||
show_solutions: bool = False,
|
||||
line_height_px: int = 28
|
||||
) -> str:
|
||||
"""
|
||||
Generate HTML for NRU-format worksheet.
|
||||
|
||||
Returns HTML for 2 pages:
|
||||
- Page 1: Vocabulary table (3 columns)
|
||||
- Page 2: Sentence practice (full width)
|
||||
"""
|
||||
|
||||
# Filter by page
|
||||
page_vocab = [v for v in vocab_list if v.source_page == page_number]
|
||||
page_sentences = [s for s in sentence_list if s.source_page == page_number]
|
||||
|
||||
html = f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
@page {{
|
||||
size: A4;
|
||||
margin: 1.5cm 2cm;
|
||||
}}
|
||||
* {{
|
||||
box-sizing: border-box;
|
||||
}}
|
||||
body {{
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
font-size: 12pt;
|
||||
line-height: 1.4;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}}
|
||||
.page {{
|
||||
page-break-after: always;
|
||||
min-height: 100%;
|
||||
}}
|
||||
.page:last-child {{
|
||||
page-break-after: avoid;
|
||||
}}
|
||||
h1 {{
|
||||
font-size: 16pt;
|
||||
margin: 0 0 8px 0;
|
||||
text-align: center;
|
||||
}}
|
||||
.header {{
|
||||
margin-bottom: 15px;
|
||||
}}
|
||||
.name-line {{
|
||||
font-size: 11pt;
|
||||
margin-bottom: 10px;
|
||||
}}
|
||||
|
||||
/* Vocabulary Table - 3 columns */
|
||||
.vocab-table {{
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
table-layout: fixed;
|
||||
}}
|
||||
.vocab-table th {{
|
||||
background: #f0f0f0;
|
||||
border: 1px solid #333;
|
||||
padding: 6px 8px;
|
||||
font-weight: bold;
|
||||
font-size: 11pt;
|
||||
text-align: left;
|
||||
}}
|
||||
.vocab-table td {{
|
||||
border: 1px solid #333;
|
||||
padding: 4px 8px;
|
||||
height: {line_height_px}px;
|
||||
vertical-align: middle;
|
||||
}}
|
||||
.vocab-table .col-english {{ width: 35%; }}
|
||||
.vocab-table .col-german {{ width: 35%; }}
|
||||
.vocab-table .col-correction {{ width: 30%; }}
|
||||
.vocab-answer {{
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
}}
|
||||
|
||||
/* Sentence Table - full width */
|
||||
.sentence-table {{
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 15px;
|
||||
}}
|
||||
.sentence-table td {{
|
||||
border: 1px solid #333;
|
||||
padding: 6px 10px;
|
||||
}}
|
||||
.sentence-header {{
|
||||
background: #f5f5f5;
|
||||
font-weight: normal;
|
||||
min-height: 30px;
|
||||
}}
|
||||
.sentence-line {{
|
||||
height: {line_height_px + 4}px;
|
||||
}}
|
||||
.sentence-answer {{
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
font-size: 11pt;
|
||||
}}
|
||||
|
||||
.page-info {{
|
||||
font-size: 9pt;
|
||||
color: #666;
|
||||
text-align: right;
|
||||
margin-top: 10px;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
"""
|
||||
|
||||
# ========== PAGE 1: VOCABULARY TABLE ==========
|
||||
if page_vocab:
|
||||
html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Vokabeln (Seite {page_number})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
|
||||
<table class="vocab-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="col-english">Englisch</th>
|
||||
<th class="col-german">Deutsch</th>
|
||||
<th class="col-correction">Korrektur</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
"""
|
||||
for v in page_vocab:
|
||||
if show_solutions:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td class="vocab-answer">{v.german}</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
|
||||
html += """
|
||||
</tbody>
|
||||
</table>
|
||||
<div class="page-info">Vokabeln aus Unit</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# ========== PAGE 2: SENTENCE PRACTICE ==========
|
||||
if page_sentences:
|
||||
html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Lernsaetze (Seite {page_number})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
"""
|
||||
for s in page_sentences:
|
||||
html += f"""
|
||||
<table class="sentence-table">
|
||||
<tr>
|
||||
<td class="sentence-header">{s.german}</td>
|
||||
</tr>
|
||||
"""
|
||||
if show_solutions:
|
||||
html += f"""
|
||||
<tr>
|
||||
<td class="sentence-line sentence-answer">{s.english}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
html += """
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
html += """
|
||||
</table>
|
||||
"""
|
||||
|
||||
html += """
|
||||
<div class="page-info">Lernsaetze aus Unit</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
html += """
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
return html
|
||||
|
||||
|
||||
def generate_nru_worksheet_html(
|
||||
entries: List[Dict],
|
||||
title: str = "Vokabeltest",
|
||||
show_solutions: bool = False,
|
||||
specific_pages: Optional[List[int]] = None
|
||||
) -> str:
|
||||
"""
|
||||
Generate complete NRU worksheet HTML for all pages.
|
||||
|
||||
Args:
|
||||
entries: List of vocabulary entries with source_page
|
||||
title: Worksheet title
|
||||
show_solutions: Whether to show answers
|
||||
specific_pages: List of specific page numbers to include (1-indexed)
|
||||
|
||||
Returns:
|
||||
Complete HTML document
|
||||
"""
|
||||
# Separate into vocab and sentences
|
||||
vocab_list, sentence_list = separate_vocab_and_sentences(entries)
|
||||
|
||||
# Get unique page numbers
|
||||
all_pages = set()
|
||||
for v in vocab_list:
|
||||
all_pages.add(v.source_page)
|
||||
for s in sentence_list:
|
||||
all_pages.add(s.source_page)
|
||||
|
||||
# Filter to specific pages if requested
|
||||
if specific_pages:
|
||||
all_pages = all_pages.intersection(set(specific_pages))
|
||||
|
||||
pages_sorted = sorted(all_pages)
|
||||
|
||||
logger.info(f"Generating NRU worksheet for pages {pages_sorted}")
|
||||
logger.info(f"Total vocab: {len(vocab_list)}, Total sentences: {len(sentence_list)}")
|
||||
|
||||
# Generate HTML for each page
|
||||
combined_html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
@page {
|
||||
size: A4;
|
||||
margin: 1.5cm 2cm;
|
||||
}
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
body {
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
font-size: 12pt;
|
||||
line-height: 1.4;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
.page {
|
||||
page-break-after: always;
|
||||
min-height: 100%;
|
||||
}
|
||||
.page:last-child {
|
||||
page-break-after: avoid;
|
||||
}
|
||||
h1 {
|
||||
font-size: 16pt;
|
||||
margin: 0 0 8px 0;
|
||||
text-align: center;
|
||||
}
|
||||
.header {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
.name-line {
|
||||
font-size: 11pt;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
/* Vocabulary Table - 3 columns */
|
||||
.vocab-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
table-layout: fixed;
|
||||
}
|
||||
.vocab-table th {
|
||||
background: #f0f0f0;
|
||||
border: 1px solid #333;
|
||||
padding: 6px 8px;
|
||||
font-weight: bold;
|
||||
font-size: 11pt;
|
||||
text-align: left;
|
||||
}
|
||||
.vocab-table td {
|
||||
border: 1px solid #333;
|
||||
padding: 4px 8px;
|
||||
height: 28px;
|
||||
vertical-align: middle;
|
||||
}
|
||||
.vocab-table .col-english { width: 35%; }
|
||||
.vocab-table .col-german { width: 35%; }
|
||||
.vocab-table .col-correction { width: 30%; }
|
||||
.vocab-answer {
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
/* Sentence Table - full width */
|
||||
.sentence-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
.sentence-table td {
|
||||
border: 1px solid #333;
|
||||
padding: 6px 10px;
|
||||
}
|
||||
.sentence-header {
|
||||
background: #f5f5f5;
|
||||
font-weight: normal;
|
||||
min-height: 30px;
|
||||
}
|
||||
.sentence-line {
|
||||
height: 32px;
|
||||
}
|
||||
.sentence-answer {
|
||||
color: #0066cc;
|
||||
font-style: italic;
|
||||
font-size: 11pt;
|
||||
}
|
||||
|
||||
.page-info {
|
||||
font-size: 9pt;
|
||||
color: #666;
|
||||
text-align: right;
|
||||
margin-top: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
"""
|
||||
|
||||
for page_num in pages_sorted:
|
||||
page_vocab = [v for v in vocab_list if v.source_page == page_num]
|
||||
page_sentences = [s for s in sentence_list if s.source_page == page_num]
|
||||
|
||||
# PAGE 1: VOCABULARY TABLE
|
||||
if page_vocab:
|
||||
combined_html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Vokabeln (Seite {page_num})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
|
||||
<table class="vocab-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="col-english">Englisch</th>
|
||||
<th class="col-german">Deutsch</th>
|
||||
<th class="col-correction">Korrektur</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
"""
|
||||
for v in page_vocab:
|
||||
if show_solutions:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td class="vocab-answer">{v.german}</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td>{v.english}</td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
"""
|
||||
|
||||
combined_html += f"""
|
||||
</tbody>
|
||||
</table>
|
||||
<div class="page-info">{title} - Seite {page_num}</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# PAGE 2: SENTENCE PRACTICE
|
||||
if page_sentences:
|
||||
combined_html += f"""
|
||||
<div class="page">
|
||||
<div class="header">
|
||||
<h1>{title} - Lernsaetze (Seite {page_num})</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
</div>
|
||||
"""
|
||||
for s in page_sentences:
|
||||
combined_html += f"""
|
||||
<table class="sentence-table">
|
||||
<tr>
|
||||
<td class="sentence-header">{s.german}</td>
|
||||
</tr>
|
||||
"""
|
||||
if show_solutions:
|
||||
combined_html += f"""
|
||||
<tr>
|
||||
<td class="sentence-line sentence-answer">{s.english}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
else:
|
||||
combined_html += """
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="sentence-line"></td>
|
||||
</tr>
|
||||
"""
|
||||
combined_html += """
|
||||
</table>
|
||||
"""
|
||||
|
||||
combined_html += f"""
|
||||
<div class="page-info">{title} - Seite {page_num}</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
combined_html += """
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
return combined_html
|
||||
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
NRU Worksheet Models — data classes and entry separation logic.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class VocabEntry:
|
||||
english: str
|
||||
german: str
|
||||
source_page: int = 1
|
||||
|
||||
|
||||
@dataclass
|
||||
class SentenceEntry:
|
||||
german: str
|
||||
english: str # For solution sheet
|
||||
source_page: int = 1
|
||||
|
||||
|
||||
def separate_vocab_and_sentences(entries: List[Dict]) -> Tuple[List[VocabEntry], List[SentenceEntry]]:
|
||||
"""
|
||||
Separate vocabulary entries into single words/phrases and full sentences.
|
||||
|
||||
Sentences are identified by:
|
||||
- Ending with punctuation (. ! ?)
|
||||
- Being longer than 50 characters
|
||||
- Having more than five words with a capitalised word after the first
|
||||
"""
|
||||
vocab_list = []
|
||||
sentence_list = []
|
||||
|
||||
for entry in entries:
|
||||
english = entry.get("english", "").strip()
|
||||
german = entry.get("german", "").strip()
|
||||
source_page = entry.get("source_page", 1)
|
||||
|
||||
if not english or not german:
|
||||
continue
|
||||
|
||||
# Detect if this is a sentence
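# e.g. "They are playing football in the park." -> sentence; "football" -> vocab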
|
||||
is_sentence = (
|
||||
english.endswith('.') or
|
||||
english.endswith('!') or
|
||||
english.endswith('?') or
|
||||
len(english) > 50 or
|
||||
(len(english.split()) > 5 and any(w[0].isupper() for w in english.split()[1:] if w))
|
||||
)
|
||||
|
||||
if is_sentence:
|
||||
sentence_list.append(SentenceEntry(
|
||||
german=german,
|
||||
english=english,
|
||||
source_page=source_page
|
||||
))
|
||||
else:
|
||||
vocab_list.append(VocabEntry(
|
||||
english=english,
|
||||
german=german,
|
||||
source_page=source_page
|
||||
))
|
||||
|
||||
return vocab_list, sentence_list
|
||||
@@ -0,0 +1,31 @@
|
||||
"""
|
||||
NRU Worksheet PDF — PDF generation using weasyprint.
|
||||
|
||||
Extracted from nru_worksheet_generator.py for modularity.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from .nru_html import generate_nru_worksheet_html
|
||||
|
||||
|
||||
async def generate_nru_pdf(entries: List[Dict], title: str = "Vokabeltest", include_solutions: bool = True) -> Tuple[bytes, Optional[bytes]]:
|
||||
"""
|
||||
Generate NRU worksheet PDFs.
|
||||
|
||||
Returns:
|
||||
Tuple of (worksheet_pdf_bytes, solution_pdf_bytes); the solution part is None when include_solutions=False
|
||||
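Example (sketch; `entries` as produced by the vocab extraction session):
worksheet_pdf, solution_pdf = await generate_nru_pdf(entries, title="Vokabeltest")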
"""
|
||||
from weasyprint import HTML
|
||||
|
||||
# Generate worksheet HTML
|
||||
worksheet_html = generate_nru_worksheet_html(entries, title, show_solutions=False)
|
||||
worksheet_pdf = HTML(string=worksheet_html).write_pdf()
|
||||
|
||||
# Generate solution HTML
|
||||
solution_pdf = None
|
||||
if include_solutions:
|
||||
solution_html = generate_nru_worksheet_html(entries, title, show_solutions=True)
|
||||
solution_pdf = HTML(string=solution_html).write_pdf()
|
||||
|
||||
return worksheet_pdf, solution_pdf
|
||||
@@ -1,491 +1,4 @@
|
||||
"""
|
||||
Worksheet Cleanup API - Handwriting removal and layout reconstruction
|
||||
|
||||
Endpoints:
|
||||
- POST /api/v1/worksheet/detect-handwriting - Detects handwriting and returns a mask
|
||||
- POST /api/v1/worksheet/remove-handwriting - Removes handwriting from an image
|
||||
- POST /api/v1/worksheet/reconstruct - Reconstructs the layout as Fabric.js JSON
|
||||
- POST /api/v1/worksheet/cleanup-pipeline - Full pipeline (detection + removal + layout)
|
||||
|
||||
DATA PRIVACY: All processing runs locally on the Mac Mini.
|
||||
"""
|
||||
|
||||
import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from services.handwriting_detection import (
|
||||
detect_handwriting,
|
||||
detect_handwriting_regions,
|
||||
mask_to_png
|
||||
)
|
||||
from services.inpainting_service import (
|
||||
inpaint_image,
|
||||
remove_handwriting,
|
||||
InpaintingMethod,
|
||||
check_lama_available
|
||||
)
|
||||
from services.layout_reconstruction_service import (
|
||||
reconstruct_layout,
|
||||
layout_to_fabric_json,
|
||||
reconstruct_and_clean
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/worksheet", tags=["Worksheet Cleanup"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pydantic Models
|
||||
# =============================================================================
|
||||
|
||||
class DetectionResponse(BaseModel):
|
||||
has_handwriting: bool
|
||||
confidence: float
|
||||
handwriting_ratio: float
|
||||
detection_method: str
|
||||
mask_base64: Optional[str] = None
|
||||
|
||||
|
||||
class InpaintingResponse(BaseModel):
|
||||
success: bool
|
||||
method_used: str
|
||||
processing_time_ms: float
|
||||
image_base64: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class ReconstructionResponse(BaseModel):
|
||||
success: bool
|
||||
element_count: int
|
||||
page_width: int
|
||||
page_height: int
|
||||
fabric_json: dict
|
||||
table_count: int = 0
|
||||
|
||||
|
||||
class PipelineResponse(BaseModel):
|
||||
success: bool
|
||||
handwriting_detected: bool
|
||||
handwriting_removed: bool
|
||||
layout_reconstructed: bool
|
||||
cleaned_image_base64: Optional[str] = None
|
||||
fabric_json: Optional[dict] = None
|
||||
metadata: dict = {}
|
||||
|
||||
|
||||
class CapabilitiesResponse(BaseModel):
|
||||
opencv_available: bool = True
|
||||
lama_available: bool = False
|
||||
paddleocr_available: bool = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/capabilities")
|
||||
async def get_capabilities() -> CapabilitiesResponse:
|
||||
"""
|
||||
Get available cleanup capabilities on this server.
|
||||
"""
|
||||
# Check PaddleOCR
|
||||
paddleocr_available = False
|
||||
try:
|
||||
from hybrid_vocab_extractor import get_paddle_ocr
|
||||
ocr = get_paddle_ocr()
|
||||
paddleocr_available = ocr is not None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return CapabilitiesResponse(
|
||||
opencv_available=True,
|
||||
lama_available=check_lama_available(),
|
||||
paddleocr_available=paddleocr_available
|
||||
)
|
||||
|
||||
|
||||
@router.post("/detect-handwriting")
|
||||
async def detect_handwriting_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
return_mask: bool = Form(default=True),
|
||||
min_confidence: float = Form(default=0.3)
|
||||
) -> DetectionResponse:
|
||||
"""
|
||||
Detect handwriting in an image.
|
||||
|
||||
Args:
|
||||
image: Input image (PNG, JPG)
|
||||
return_mask: Whether to return the binary mask as base64
|
||||
min_confidence: Minimum confidence threshold
|
||||
|
||||
Returns:
|
||||
DetectionResponse with detection results and optional mask
|
||||
"""
|
||||
logger.info(f"Handwriting detection request: {image.filename}")
|
||||
|
||||
# Validate file type
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files (PNG, JPG) are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Detect handwriting
|
||||
result = detect_handwriting(image_bytes)
|
||||
|
||||
has_handwriting = (
|
||||
result.confidence >= min_confidence and
|
||||
result.handwriting_ratio > 0.005
|
||||
)
|
||||
|
||||
response = DetectionResponse(
|
||||
has_handwriting=has_handwriting,
|
||||
confidence=result.confidence,
|
||||
handwriting_ratio=result.handwriting_ratio,
|
||||
detection_method=result.detection_method
|
||||
)
|
||||
|
||||
if return_mask:
|
||||
mask_bytes = mask_to_png(result.mask)
|
||||
response.mask_base64 = base64.b64encode(mask_bytes).decode('utf-8')
|
||||
|
||||
logger.info(f"Detection complete: handwriting={has_handwriting}, "
|
||||
f"confidence={result.confidence:.2f}")
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Handwriting detection failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/detect-handwriting/mask")
|
||||
async def get_handwriting_mask(
|
||||
image: UploadFile = File(...)
|
||||
) -> StreamingResponse:
|
||||
"""
|
||||
Get handwriting detection mask as PNG image.
|
||||
|
||||
Returns binary mask where white (255) = handwriting.
|
||||
"""
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
result = detect_handwriting(image_bytes)
|
||||
mask_bytes = mask_to_png(result.mask)
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(mask_bytes),
|
||||
media_type="image/png",
|
||||
headers={
|
||||
"Content-Disposition": "attachment; filename=handwriting_mask.png"
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Mask generation failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/remove-handwriting")
|
||||
async def remove_handwriting_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
mask: Optional[UploadFile] = File(default=None),
|
||||
method: str = Form(default="auto"),
|
||||
return_base64: bool = Form(default=False)
|
||||
):
|
||||
"""
|
||||
Remove handwriting from an image.
|
||||
|
||||
Args:
|
||||
image: Input image with handwriting
|
||||
mask: Optional pre-computed mask (if not provided, auto-detected)
|
||||
method: Inpainting method (auto, opencv_telea, opencv_ns, lama)
|
||||
return_base64: If True, return image as base64, else as file
|
||||
|
||||
Returns:
|
||||
Cleaned image (as PNG file or base64 in JSON)
|
||||
"""
|
||||
logger.info(f"Remove handwriting request: {image.filename}, method={method}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Get mask if provided
|
||||
mask_array = None
|
||||
if mask is not None:
|
||||
mask_bytes = await mask.read()
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
mask_img = Image.open(io.BytesIO(mask_bytes))
|
||||
mask_array = np.array(mask_img)
|
||||
|
||||
# Select inpainting method
|
||||
inpainting_method = InpaintingMethod.AUTO
|
||||
if method == "opencv_telea":
|
||||
inpainting_method = InpaintingMethod.OPENCV_TELEA
|
||||
elif method == "opencv_ns":
|
||||
inpainting_method = InpaintingMethod.OPENCV_NS
|
||||
elif method == "lama":
|
||||
inpainting_method = InpaintingMethod.LAMA
|
||||
|
||||
# Remove handwriting
|
||||
cleaned_bytes, metadata = remove_handwriting(
|
||||
image_bytes,
|
||||
mask=mask_array,
|
||||
method=inpainting_method
|
||||
)
|
||||
|
||||
if return_base64:
|
||||
return JSONResponse({
|
||||
"success": True,
|
||||
"image_base64": base64.b64encode(cleaned_bytes).decode('utf-8'),
|
||||
"metadata": metadata
|
||||
})
|
||||
else:
|
||||
return StreamingResponse(
|
||||
io.BytesIO(cleaned_bytes),
|
||||
media_type="image/png",
|
||||
headers={
|
||||
"Content-Disposition": "attachment; filename=cleaned.png",
|
||||
"X-Method-Used": metadata.get("method_used", "unknown"),
|
||||
"X-Processing-Time-Ms": str(metadata.get("processing_time_ms", 0))
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Handwriting removal failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/reconstruct")
|
||||
async def reconstruct_layout_endpoint(
|
||||
image: UploadFile = File(...),
|
||||
clean_handwriting: bool = Form(default=True),
|
||||
detect_tables: bool = Form(default=True)
|
||||
) -> ReconstructionResponse:
|
||||
"""
|
||||
Reconstruct worksheet layout and generate Fabric.js JSON.
|
||||
|
||||
Args:
|
||||
image: Input image (can contain handwriting)
|
||||
clean_handwriting: Whether to remove handwriting first
|
||||
detect_tables: Whether to detect table structures
|
||||
|
||||
Returns:
|
||||
ReconstructionResponse with Fabric.js JSON
|
||||
"""
|
||||
logger.info(f"Layout reconstruction request: {image.filename}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Run reconstruction pipeline
|
||||
if clean_handwriting:
|
||||
cleaned_bytes, layout = reconstruct_and_clean(image_bytes)
|
||||
else:
|
||||
layout = reconstruct_layout(image_bytes, detect_tables=detect_tables)
|
||||
|
||||
return ReconstructionResponse(
|
||||
success=True,
|
||||
element_count=len(layout.elements),
|
||||
page_width=layout.page_width,
|
||||
page_height=layout.page_height,
|
||||
fabric_json=layout.fabric_json,
|
||||
table_count=len(layout.table_regions)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Layout reconstruction failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/cleanup-pipeline")
|
||||
async def full_cleanup_pipeline(
|
||||
image: UploadFile = File(...),
|
||||
remove_hw: bool = Form(default=True, alias="remove_handwriting"),
|
||||
reconstruct: bool = Form(default=True),
|
||||
inpainting_method: str = Form(default="auto")
|
||||
) -> PipelineResponse:
|
||||
"""
|
||||
Full cleanup pipeline: detect, remove handwriting, reconstruct layout.
|
||||
|
||||
This is the recommended endpoint for processing filled worksheets.
|
||||
|
||||
Args:
|
||||
image: Input image (scan/photo of filled worksheet)
|
||||
remove_handwriting: Whether to remove detected handwriting
|
||||
reconstruct: Whether to reconstruct layout as Fabric.js JSON
|
||||
inpainting_method: Method for inpainting (auto, opencv_telea, opencv_ns, lama)
|
||||
|
||||
Returns:
|
||||
PipelineResponse with cleaned image and Fabric.js JSON
|
||||
"""
|
||||
logger.info(f"Full cleanup pipeline: {image.filename}")
|
||||
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
metadata = {}
|
||||
|
||||
# Step 1: Detect handwriting
|
||||
detection = detect_handwriting(image_bytes)
|
||||
handwriting_detected = (
|
||||
detection.confidence >= 0.3 and
|
||||
detection.handwriting_ratio > 0.005
|
||||
)
|
||||
|
||||
metadata["detection"] = {
|
||||
"confidence": detection.confidence,
|
||||
"handwriting_ratio": detection.handwriting_ratio,
|
||||
"method": detection.detection_method
|
||||
}
|
||||
|
||||
# Step 2: Remove handwriting if requested and detected
|
||||
cleaned_bytes = image_bytes
|
||||
handwriting_removed = False
|
||||
|
||||
if remove_hw and handwriting_detected:
|
||||
method = InpaintingMethod.AUTO
|
||||
if inpainting_method == "opencv_telea":
|
||||
method = InpaintingMethod.OPENCV_TELEA
|
||||
elif inpainting_method == "opencv_ns":
|
||||
method = InpaintingMethod.OPENCV_NS
|
||||
elif inpainting_method == "lama":
|
||||
method = InpaintingMethod.LAMA
|
||||
|
||||
cleaned_bytes, inpaint_metadata = remove_handwriting(
|
||||
image_bytes,
|
||||
mask=detection.mask,
|
||||
method=method
|
||||
)
|
||||
handwriting_removed = inpaint_metadata.get("inpainting_performed", False)
|
||||
metadata["inpainting"] = inpaint_metadata
|
||||
|
||||
# Step 3: Reconstruct layout if requested
|
||||
fabric_json = None
|
||||
layout_reconstructed = False
|
||||
|
||||
if reconstruct:
|
||||
layout = reconstruct_layout(cleaned_bytes)
|
||||
fabric_json = layout.fabric_json
|
||||
layout_reconstructed = len(layout.elements) > 0
|
||||
metadata["layout"] = {
|
||||
"element_count": len(layout.elements),
|
||||
"table_count": len(layout.table_regions),
|
||||
"page_width": layout.page_width,
|
||||
"page_height": layout.page_height
|
||||
}
|
||||
|
||||
# Encode cleaned image as base64
|
||||
cleaned_base64 = base64.b64encode(cleaned_bytes).decode('utf-8')
|
||||
|
||||
logger.info(f"Pipeline complete: detected={handwriting_detected}, "
|
||||
f"removed={handwriting_removed}, layout={layout_reconstructed}")
|
||||
|
||||
return PipelineResponse(
|
||||
success=True,
|
||||
handwriting_detected=handwriting_detected,
|
||||
handwriting_removed=handwriting_removed,
|
||||
layout_reconstructed=layout_reconstructed,
|
||||
cleaned_image_base64=cleaned_base64,
|
||||
fabric_json=fabric_json,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cleanup pipeline failed: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/preview-cleanup")
|
||||
async def preview_cleanup(
|
||||
image: UploadFile = File(...)
|
||||
) -> JSONResponse:
|
||||
"""
|
||||
Quick preview of cleanup results without full processing.
|
||||
|
||||
Returns detection results and estimated processing time.
|
||||
"""
|
||||
content_type = image.content_type or ""
|
||||
if not content_type.startswith("image/"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Only image files are supported"
|
||||
)
|
||||
|
||||
try:
|
||||
image_bytes = await image.read()
|
||||
|
||||
# Quick detection only
|
||||
result = detect_handwriting_regions(image_bytes)
|
||||
|
||||
# Estimate processing time based on image size
|
||||
from PIL import Image
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
pixel_count = img.width * img.height
|
||||
|
||||
# Rough estimates: fixed base cost plus a per-megapixel term (heuristics, not measurements)
|
||||
est_detection_ms = 100 + (pixel_count / 1000000) * 200
|
||||
est_inpainting_ms = 500 + (pixel_count / 1000000) * 1000
|
||||
est_reconstruction_ms = 200 + (pixel_count / 1000000) * 300
|
||||
|
||||
return JSONResponse({
|
||||
"has_handwriting": result["has_handwriting"],
|
||||
"confidence": result["confidence"],
|
||||
"handwriting_ratio": result["handwriting_ratio"],
|
||||
"image_width": img.width,
|
||||
"image_height": img.height,
|
||||
"estimated_times_ms": {
|
||||
"detection": est_detection_ms,
|
||||
"inpainting": est_inpainting_ms if result["has_handwriting"] else 0,
|
||||
"reconstruction": est_reconstruction_ms,
|
||||
"total": est_detection_ms + (est_inpainting_ms if result["has_handwriting"] else 0) + est_reconstruction_ms
|
||||
},
|
||||
"capabilities": {
|
||||
"lama_available": check_lama_available()
|
||||
}
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Preview failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
# Backward-compat shim -- module moved to worksheet/cleanup_api.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("worksheet.cleanup_api")
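# Swapping this module's entry in sys.modules means existing imports of the old flat
# module name and the new worksheet.cleanup_api path resolve to the same module object.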
|
||||
|
||||
@@ -1,485 +1,4 @@
|
||||
"""
|
||||
Worksheet Editor AI — AI image generation and AI worksheet modification.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import base64
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import random
|
||||
from typing import List, Dict
|
||||
|
||||
import httpx
|
||||
|
||||
from worksheet_editor_models import (
|
||||
AIImageRequest,
|
||||
AIImageResponse,
|
||||
AIImageStyle,
|
||||
AIModifyRequest,
|
||||
AIModifyResponse,
|
||||
OLLAMA_URL,
|
||||
STYLE_PROMPTS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================
|
||||
# AI IMAGE GENERATION
|
||||
# =============================================
|
||||
|
||||
async def generate_ai_image_logic(request: AIImageRequest) -> AIImageResponse:
|
||||
"""
|
||||
Generate an AI image using Ollama with a text-to-image model.
|
||||
|
||||
Falls back to a placeholder if Ollama is not available.
|
||||
"""
|
||||
from fastapi import HTTPException
|
||||
|
||||
try:
|
||||
# Build enhanced prompt with style
|
||||
style_modifier = STYLE_PROMPTS.get(request.style, "")
|
||||
enhanced_prompt = f"{request.prompt}, {style_modifier}"
|
||||
|
||||
logger.info(f"Generating AI image: {enhanced_prompt[:100]}...")
|
||||
|
||||
# Check if Ollama is available
|
||||
async with httpx.AsyncClient(timeout=10.0) as check_client:
|
||||
try:
|
||||
health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
|
||||
if health_response.status_code != 200:
|
||||
raise HTTPException(status_code=503, detail="Ollama service not available")
|
||||
except httpx.ConnectError:
|
||||
logger.warning("Ollama not reachable, returning placeholder")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=300.0) as client:
|
||||
tags_response = await client.get(f"{OLLAMA_URL}/api/tags")
|
||||
available_models = [m.get("name", "") for m in tags_response.json().get("models", [])]
|
||||
|
||||
sd_model = None
|
||||
for model in available_models:
|
||||
if "stable" in model.lower() or "sd" in model.lower() or "diffusion" in model.lower():
|
||||
sd_model = model
|
||||
break
|
||||
|
||||
if not sd_model:
|
||||
logger.warning("No Stable Diffusion model found in Ollama")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
logger.info(f"SD model found: {sd_model}, but image generation API not implemented")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Image generation failed: {e}")
|
||||
return _generate_placeholder_image(request, enhanced_prompt)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"AI image generation error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
def _generate_placeholder_image(request: AIImageRequest, prompt: str) -> AIImageResponse:
|
||||
"""
|
||||
Generate a placeholder image when AI generation is not available.
|
||||
Creates a simple SVG-based placeholder with the prompt text.
|
||||
"""
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
width, height = request.width, request.height
|
||||
|
||||
style_colors = {
|
||||
AIImageStyle.REALISTIC: ("#2563eb", "#dbeafe"),
|
||||
AIImageStyle.CARTOON: ("#f97316", "#ffedd5"),
|
||||
AIImageStyle.SKETCH: ("#6b7280", "#f3f4f6"),
|
||||
AIImageStyle.CLIPART: ("#8b5cf6", "#ede9fe"),
|
||||
AIImageStyle.EDUCATIONAL: ("#059669", "#d1fae5"),
|
||||
}
|
||||
|
||||
fg_color, bg_color = style_colors.get(request.style, ("#6366f1", "#e0e7ff"))
|
||||
|
||||
img = Image.new('RGB', (width, height), bg_color)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
draw.rectangle([5, 5, width-6, height-6], outline=fg_color, width=3)
|
||||
|
||||
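# Placeholder glyph: a circle with simple strokes inside, drawn in the style's accent colour; the prompt text is rendered below it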
cx, cy = width // 2, height // 2 - 30
|
||||
draw.ellipse([cx-40, cy-40, cx+40, cy+40], outline=fg_color, width=3)
|
||||
draw.line([cx-20, cy-10, cx+20, cy-10], fill=fg_color, width=3)
|
||||
draw.line([cx, cy-10, cx, cy+20], fill=fg_color, width=3)
|
||||
|
||||
try:
|
||||
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
|
||||
except Exception:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
max_chars = 40
|
||||
lines = []
|
||||
words = prompt[:200].split()
|
||||
current_line = ""
|
||||
for word in words:
|
||||
if len(current_line) + len(word) + 1 <= max_chars:
|
||||
current_line += (" " + word if current_line else word)
|
||||
else:
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
current_line = word
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
|
||||
text_y = cy + 60
|
||||
for line in lines[:4]:
|
||||
bbox = draw.textbbox((0, 0), line, font=font)
|
||||
text_width = bbox[2] - bbox[0]
|
||||
draw.text((cx - text_width // 2, text_y), line, fill=fg_color, font=font)
|
||||
text_y += 20
|
||||
|
||||
badge_text = "KI-Bild (Platzhalter)"
|
||||
try:
|
||||
badge_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10)
|
||||
except Exception:
|
||||
badge_font = font
|
||||
draw.rectangle([10, height-30, 150, height-10], fill=fg_color)
|
||||
draw.text((15, height-27), badge_text, fill="white", font=badge_font)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
img.save(buffer, format='PNG')
|
||||
buffer.seek(0)
|
||||
|
||||
image_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
|
||||
|
||||
return AIImageResponse(
|
||||
image_base64=image_base64,
|
||||
prompt_used=prompt,
|
||||
error="AI image generation not available. Using placeholder."
|
||||
)
|
||||
|
||||
|
||||
# =============================================
|
||||
# AI WORKSHEET MODIFICATION
|
||||
# =============================================
|
||||
|
||||
async def modify_worksheet_with_ai_logic(request: AIModifyRequest) -> AIModifyResponse:
|
||||
"""
|
||||
Modify a worksheet using AI based on natural language prompt.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"AI modify request: {request.prompt[:100]}...")
|
||||
|
||||
try:
|
||||
canvas_data = json.loads(request.canvas_json)
|
||||
except json.JSONDecodeError:
|
||||
return AIModifyResponse(
|
||||
message="Fehler beim Parsen des Canvas",
|
||||
error="Invalid canvas JSON"
|
||||
)
|
||||
|
||||
system_prompt = """Du bist ein Assistent fuer die Bearbeitung von Arbeitsblaettern.
|
||||
Du erhaeltst den aktuellen Zustand eines Canvas im JSON-Format und eine Anweisung des Nutzers.
|
||||
Deine Aufgabe ist es, die gewuenschten Aenderungen am Canvas vorzunehmen.
|
||||
|
||||
Der Canvas verwendet Fabric.js. Hier sind die wichtigsten Objekttypen:
|
||||
- i-text: Interaktiver Text mit fontFamily, fontSize, fill, left, top
|
||||
- rect: Rechteck mit left, top, width, height, fill, stroke, strokeWidth
|
||||
- circle: Kreis mit left, top, radius, fill, stroke, strokeWidth
|
||||
- line: Linie mit x1, y1, x2, y2, stroke, strokeWidth
|
||||
|
||||
Das Canvas ist 794x1123 Pixel (A4 bei 96 DPI).
|
||||
|
||||
Antworte NUR mit einem JSON-Objekt in diesem Format:
|
||||
{
|
||||
"action": "modify" oder "add" oder "delete" oder "info",
|
||||
"objects": [...], // Neue/modifizierte Objekte (bei modify/add)
|
||||
"message": "Kurze Beschreibung der Aenderung"
|
||||
}
|
||||
|
||||
Wenn du Objekte hinzufuegst, generiere eindeutige IDs im Format "obj_<timestamp>_<random>".
|
||||
"""
|
||||
|
||||
user_prompt = f"""Aktueller Canvas-Zustand:
|
||||
```json
|
||||
{json.dumps(canvas_data, indent=2)[:5000]}
|
||||
```
|
||||
|
||||
Nutzer-Anweisung: {request.prompt}
|
||||
|
||||
Fuehre die Aenderung durch und antworte mit dem JSON-Objekt."""
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
response = await client.post(
|
||||
f"{OLLAMA_URL}/api/generate",
|
||||
json={
|
||||
"model": request.model,
|
||||
"prompt": user_prompt,
|
||||
"system": system_prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.3,
|
||||
"num_predict": 4096
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.warning(f"Ollama error: {response.status_code}, trying local fallback")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
|
||||
ai_response = response.json().get("response", "")
|
||||
|
||||
except httpx.ConnectError:
|
||||
logger.warning("Ollama not reachable")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("Ollama timeout, trying local fallback")
|
||||
return _handle_simple_modification(request.prompt, canvas_data)
|
||||
|
||||
try:
|
||||
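# Extract the outermost {...} from the model output, since models often wrap the JSON in prose or code fences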
json_start = ai_response.find('{')
|
||||
json_end = ai_response.rfind('}') + 1
|
||||
|
||||
if json_start == -1 or json_end <= json_start:
|
||||
logger.warning(f"No JSON found in AI response: {ai_response[:200]}")
|
||||
return AIModifyResponse(
|
||||
message="KI konnte die Anfrage nicht verarbeiten",
|
||||
error="No JSON in response"
|
||||
)
|
||||
|
||||
ai_json = json.loads(ai_response[json_start:json_end])
|
||||
action = ai_json.get("action", "info")
|
||||
message = ai_json.get("message", "Aenderungen angewendet")
|
||||
new_objects = ai_json.get("objects", [])
|
||||
|
||||
if action == "info":
|
||||
return AIModifyResponse(message=message)
|
||||
|
||||
if action == "add" and new_objects:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
existing_objects.extend(new_objects)
|
||||
canvas_data["objects"] = existing_objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
if action == "modify" and new_objects:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
new_ids = {obj.get("id") for obj in new_objects if obj.get("id")}
|
||||
kept_objects = [obj for obj in existing_objects if obj.get("id") not in new_ids]
|
||||
kept_objects.extend(new_objects)
|
||||
canvas_data["objects"] = kept_objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
if action == "delete":
|
||||
delete_ids = ai_json.get("delete_ids", [])
|
||||
if delete_ids:
|
||||
existing_objects = canvas_data.get("objects", [])
|
||||
canvas_data["objects"] = [obj for obj in existing_objects if obj.get("id") not in delete_ids]
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=message
|
||||
)
|
||||
|
||||
return AIModifyResponse(message=message)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse AI JSON: {e}")
|
||||
return AIModifyResponse(
|
||||
message="Fehler beim Verarbeiten der KI-Antwort",
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AI modify error: {e}")
|
||||
return AIModifyResponse(
|
||||
message="Ein unerwarteter Fehler ist aufgetreten",
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
|
||||
def _handle_simple_modification(prompt: str, canvas_data: dict) -> AIModifyResponse:
|
||||
"""
|
||||
Handle simple modifications locally when Ollama is not available.
|
||||
Supports basic commands like adding headings, lines, etc.
|
||||
"""
|
||||
prompt_lower = prompt.lower()
|
||||
objects = canvas_data.get("objects", [])
|
||||
|
||||
def generate_id():
|
||||
return f"obj_{int(time.time()*1000)}_{random.randint(1000, 9999)}"
|
||||
|
||||
# Add heading
|
||||
if "ueberschrift" in prompt_lower or "titel" in prompt_lower or "heading" in prompt_lower:
|
||||
text_match = re.search(r'"([^"]+)"', prompt)
|
||||
text = text_match.group(1) if text_match else "Ueberschrift"
|
||||
|
||||
new_text = {
|
||||
"type": "i-text", "id": generate_id(), "text": text,
|
||||
"left": 397, "top": 50, "originX": "center",
|
||||
"fontFamily": "Arial", "fontSize": 28, "fontWeight": "bold", "fill": "#000000"
|
||||
}
|
||||
objects.append(new_text)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"Ueberschrift '{text}' hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add lines for writing
|
||||
if "linie" in prompt_lower or "line" in prompt_lower or "schreib" in prompt_lower:
|
||||
num_match = re.search(r'(\d+)', prompt)
|
||||
num_lines = int(num_match.group(1)) if num_match else 5
|
||||
num_lines = min(num_lines, 20)
|
||||
|
||||
start_y = 150
|
||||
line_spacing = 40
|
||||
|
||||
for i in range(num_lines):
|
||||
new_line = {
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": 60, "y1": start_y + i * line_spacing,
|
||||
"x2": 734, "y2": start_y + i * line_spacing,
|
||||
"stroke": "#cccccc", "strokeWidth": 1
|
||||
}
|
||||
objects.append(new_line)
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{num_lines} Schreiblinien hinzugefuegt"
|
||||
)
|
||||
|
||||
# Make text bigger
|
||||
if "groesser" in prompt_lower or "bigger" in prompt_lower or "larger" in prompt_lower:
|
||||
modified = 0
|
||||
for obj in objects:
|
||||
if obj.get("type") in ["i-text", "text", "textbox"]:
|
||||
current_size = obj.get("fontSize", 16)
|
||||
obj["fontSize"] = int(current_size * 1.25)
|
||||
modified += 1
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
if modified > 0:
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{modified} Texte vergroessert"
|
||||
)
|
||||
|
||||
# Center elements
|
||||
if "zentrier" in prompt_lower or "center" in prompt_lower or "mitte" in prompt_lower:
|
||||
center_x = 397
|
||||
for obj in objects:
|
||||
if not obj.get("isGrid"):
|
||||
obj["left"] = center_x
|
||||
obj["originX"] = "center"
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message="Elemente zentriert"
|
||||
)
|
||||
|
||||
# Add numbering
|
||||
if "nummer" in prompt_lower or "nummerier" in prompt_lower or "1-10" in prompt_lower:
|
||||
range_match = re.search(r'(\d+)\s*[-bis]+\s*(\d+)', prompt)
|
||||
if range_match:
|
||||
start, end = int(range_match.group(1)), int(range_match.group(2))
|
||||
else:
|
||||
start, end = 1, 10
|
||||
|
||||
y = 100
|
||||
for i in range(start, min(end + 1, start + 20)):
|
||||
new_text = {
|
||||
"type": "i-text", "id": generate_id(), "text": f"{i}.",
|
||||
"left": 40, "top": y, "fontFamily": "Arial", "fontSize": 14, "fill": "#000000"
|
||||
}
|
||||
objects.append(new_text)
|
||||
y += 35
|
||||
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"Nummerierung {start}-{end} hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add rectangle/box
|
||||
if "rechteck" in prompt_lower or "box" in prompt_lower or "kasten" in prompt_lower:
|
||||
new_rect = {
|
||||
"type": "rect", "id": generate_id(),
|
||||
"left": 100, "top": 200, "width": 200, "height": 100,
|
||||
"fill": "transparent", "stroke": "#000000", "strokeWidth": 2
|
||||
}
|
||||
objects.append(new_rect)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message="Rechteck hinzugefuegt"
|
||||
)
|
||||
|
||||
# Add grid/raster
|
||||
if "raster" in prompt_lower or "grid" in prompt_lower or "tabelle" in prompt_lower:
|
||||
dim_match = re.search(r'(\d+)\s*(?:[x/\u00d7*]|mal|by)\s*(\d+)', prompt_lower)
|
||||
if dim_match:
|
||||
cols = int(dim_match.group(1))
|
||||
rows = int(dim_match.group(2))
|
||||
else:
|
||||
nums = re.findall(r'(\d+)', prompt)
|
||||
if len(nums) >= 2:
|
||||
cols, rows = int(nums[0]), int(nums[1])
|
||||
else:
|
||||
cols, rows = 3, 4
|
||||
|
||||
cols = min(max(1, cols), 10)
|
||||
rows = min(max(1, rows), 15)
|
||||
|
||||
canvas_width = 794
|
||||
canvas_height = 1123
|
||||
margin = 60
|
||||
available_width = canvas_width - 2 * margin
|
||||
available_height = canvas_height - 2 * margin - 80
|
||||
|
||||
cell_width = available_width / cols
|
||||
cell_height = min(available_height / rows, 80)
|
||||
|
||||
start_x = margin
|
||||
start_y = 120
|
||||
|
||||
grid_objects = []
|
||||
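# Draw (rows + 1) horizontal and (cols + 1) vertical lines; the isGrid flag lets later commands (e.g. centering) skip these lines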
for r in range(rows + 1):
|
||||
y = start_y + r * cell_height
|
||||
grid_objects.append({
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": start_x, "y1": y,
|
||||
"x2": start_x + cols * cell_width, "y2": y,
|
||||
"stroke": "#666666", "strokeWidth": 1, "isGrid": True
|
||||
})
|
||||
|
||||
for c in range(cols + 1):
|
||||
x = start_x + c * cell_width
|
||||
grid_objects.append({
|
||||
"type": "line", "id": generate_id(),
|
||||
"x1": x, "y1": start_y,
|
||||
"x2": x, "y2": start_y + rows * cell_height,
|
||||
"stroke": "#666666", "strokeWidth": 1, "isGrid": True
|
||||
})
|
||||
|
||||
objects.extend(grid_objects)
|
||||
canvas_data["objects"] = objects
|
||||
return AIModifyResponse(
|
||||
modified_canvas_json=json.dumps(canvas_data),
|
||||
message=f"{cols}x{rows} Raster hinzugefuegt ({cols} Spalten, {rows} Zeilen)"
|
||||
)
|
||||
|
||||
# Default: Ollama needed
|
||||
return AIModifyResponse(
|
||||
message="Diese Aenderung erfordert den KI-Service. Bitte stellen Sie sicher, dass Ollama laeuft.",
|
||||
error="Complex modification requires Ollama"
|
||||
)
|
||||
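
The branches above are a deterministic fallback: _handle_simple_modification pattern-matches German/English keywords (vergroessern, zentrieren, nummerieren, Raster, ...) and edits the Fabric.js JSON directly, only deferring to Ollama for anything it cannot match. The grid branch's dimension parsing can be mirrored in isolation like this (a standalone sketch, not the project's API):

```python
import re

def parse_grid_dims(prompt: str, default=(3, 4)) -> tuple:
    """Mirror of the dimension parsing above: '5x7 Raster' -> (5, 7), clamped to 1-10 cols / 1-15 rows."""
    m = re.search(r'(\d+)\s*[x/\u00d7\*mal by]\s*(\d+)', prompt.lower())
    if m:
        cols, rows = int(m.group(1)), int(m.group(2))
    else:
        nums = re.findall(r'(\d+)', prompt)
        cols, rows = (int(nums[0]), int(nums[1])) if len(nums) >= 2 else default
    return min(max(1, cols), 10), min(max(1, rows), 15)

print(parse_grid_dims("Bitte eine Tabelle 5x7 einfuegen"))  # (5, 7)
print(parse_grid_dims("ein Raster bitte"))                  # (3, 4) default
```
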
# Backward-compat shim -- module moved to worksheet/editor_ai.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.editor_ai")
|
||||
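
The shim works because the import system returns whatever is in sys.modules after the module body runs, so the legacy flat module name and the new package path resolve to the same module object. A quick check (a sketch; assumes backend/ is on sys.path):

```python
import worksheet_editor_ai as legacy     # executes the 4-line shim above
import worksheet.editor_ai as moved

assert legacy is moved                   # sys.modules["worksheet_editor_ai"] now IS worksheet.editor_ai
assert legacy.generate_ai_image_logic is moved.generate_ai_image_logic
```
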
@@ -1,388 +1,4 @@
|
||||
"""
|
||||
Worksheet Editor API - Backend Endpoints for Visual Worksheet Editor
|
||||
|
||||
Provides endpoints for:
|
||||
- AI Image generation via Ollama/Stable Diffusion
|
||||
- Worksheet Save/Load
|
||||
- PDF Export
|
||||
|
||||
Split modules:
|
||||
- worksheet_editor_models: Enums, Pydantic models, configuration
|
||||
- worksheet_editor_ai: AI image generation and AI worksheet modification
|
||||
- worksheet_editor_reconstruct: Document reconstruction from vocab sessions
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
import httpx
|
||||
|
||||
# Re-export everything from sub-modules for backward compatibility
|
||||
from worksheet_editor_models import ( # noqa: F401
|
||||
AIImageStyle,
|
||||
WorksheetStatus,
|
||||
AIImageRequest,
|
||||
AIImageResponse,
|
||||
PageData,
|
||||
PageFormat,
|
||||
WorksheetSaveRequest,
|
||||
WorksheetResponse,
|
||||
AIModifyRequest,
|
||||
AIModifyResponse,
|
||||
ReconstructRequest,
|
||||
ReconstructResponse,
|
||||
worksheets_db,
|
||||
OLLAMA_URL,
|
||||
SD_MODEL,
|
||||
WORKSHEET_STORAGE_DIR,
|
||||
STYLE_PROMPTS,
|
||||
REPORTLAB_AVAILABLE,
|
||||
)
|
||||
|
||||
from worksheet_editor_ai import ( # noqa: F401
|
||||
generate_ai_image_logic,
|
||||
_generate_placeholder_image,
|
||||
modify_worksheet_with_ai_logic,
|
||||
_handle_simple_modification,
|
||||
)
|
||||
|
||||
from worksheet_editor_reconstruct import ( # noqa: F401
|
||||
reconstruct_document_logic,
|
||||
_detect_image_regions,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================
|
||||
# ROUTER
|
||||
# =============================================
|
||||
|
||||
router = APIRouter(prefix="/api/v1/worksheet", tags=["Worksheet Editor"])
|
||||
|
||||
# =============================================
|
||||
# AI IMAGE GENERATION
|
||||
# =============================================
|
||||
|
||||
@router.post("/ai-image", response_model=AIImageResponse)
|
||||
async def generate_ai_image(request: AIImageRequest):
|
||||
"""
|
||||
Generate an AI image using Ollama with a text-to-image model.
|
||||
|
||||
Supported models:
|
||||
- stable-diffusion (via Ollama)
|
||||
- sd3.5-medium
|
||||
- llava (for image understanding, not generation)
|
||||
|
||||
Falls back to a placeholder if Ollama is not available.
|
||||
"""
|
||||
return await generate_ai_image_logic(request)
|
||||
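
A client call against the endpoint above might look like this (host and port are assumptions; the field names follow AIImageRequest/AIImageResponse):

```python
import httpx

payload = {"prompt": "ein Apfel auf einem Tisch", "style": "clipart", "width": 512, "height": 512}
resp = httpx.post("http://localhost:8000/api/v1/worksheet/ai-image", json=payload, timeout=120)
data = resp.json()
image_b64 = data["image_base64"]   # a placeholder image is returned when Ollama is unreachable
```
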
|
||||
|
||||
# =============================================
|
||||
# WORKSHEET SAVE/LOAD
|
||||
# =============================================
|
||||
|
||||
@router.post("/save", response_model=WorksheetResponse)
|
||||
async def save_worksheet(request: WorksheetSaveRequest):
|
||||
"""
|
||||
Save a worksheet document.
|
||||
|
||||
- If id is provided, updates existing worksheet
|
||||
- If id is not provided, creates new worksheet
|
||||
"""
|
||||
try:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
worksheet_id = request.id or f"ws_{uuid.uuid4().hex[:12]}"
|
||||
|
||||
worksheet = {
|
||||
"id": worksheet_id,
|
||||
"title": request.title,
|
||||
"description": request.description,
|
||||
"pages": [p.dict() for p in request.pages],
|
||||
"pageFormat": (request.pageFormat or PageFormat()).dict(),
|
||||
"createdAt": worksheets_db.get(worksheet_id, {}).get("createdAt", now),
|
||||
"updatedAt": now
|
||||
}
|
||||
|
||||
worksheets_db[worksheet_id] = worksheet
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump(worksheet, f, ensure_ascii=False, indent=2)
|
||||
|
||||
logger.info(f"Saved worksheet: {worksheet_id}")
|
||||
|
||||
return WorksheetResponse(**worksheet)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save worksheet: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to save: {str(e)}")
|
||||
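
For reference, a minimal save round trip could look like this (host and port assumed; the payload mirrors WorksheetSaveRequest):

```python
import json
import httpx

payload = {
    "title": "Wortschatz Unit 3",
    "pages": [{"id": "p1", "index": 0, "canvasJSON": json.dumps({"version": "6.0.0", "objects": []})}],
    "pageFormat": {"width": 210, "height": 297, "orientation": "portrait"},
}
resp = httpx.post("http://localhost:8000/api/v1/worksheet/save", json=payload)
worksheet_id = resp.json()["id"]   # e.g. "ws_" + 12 hex chars, generated by the endpoint above
```
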
|
||||
|
||||
@router.get("/{worksheet_id}", response_model=WorksheetResponse)
|
||||
async def get_worksheet(worksheet_id: str):
|
||||
"""Load a worksheet document by ID."""
|
||||
try:
|
||||
if worksheet_id in worksheets_db:
|
||||
return WorksheetResponse(**worksheets_db[worksheet_id])
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
worksheets_db[worksheet_id] = worksheet
|
||||
return WorksheetResponse(**worksheet)
|
||||
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load worksheet {worksheet_id}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/list/all")
|
||||
async def list_worksheets():
|
||||
"""List all available worksheets."""
|
||||
try:
|
||||
worksheets = []
|
||||
|
||||
for filename in os.listdir(WORKSHEET_STORAGE_DIR):
|
||||
if filename.endswith('.json'):
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, filename)
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
worksheets.append({
|
||||
"id": worksheet.get("id"),
|
||||
"title": worksheet.get("title"),
|
||||
"description": worksheet.get("description"),
|
||||
"pageCount": len(worksheet.get("pages", [])),
|
||||
"updatedAt": worksheet.get("updatedAt"),
|
||||
"createdAt": worksheet.get("createdAt")
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load {filename}: {e}")
|
||||
|
||||
worksheets.sort(key=lambda x: x.get("updatedAt", ""), reverse=True)
|
||||
|
||||
return {"worksheets": worksheets, "total": len(worksheets)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list worksheets: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/{worksheet_id}")
|
||||
async def delete_worksheet(worksheet_id: str):
|
||||
"""Delete a worksheet document."""
|
||||
try:
|
||||
if worksheet_id in worksheets_db:
|
||||
del worksheets_db[worksheet_id]
|
||||
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
os.remove(filepath)
|
||||
logger.info(f"Deleted worksheet: {worksheet_id}")
|
||||
return {"status": "deleted", "id": worksheet_id}
|
||||
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete worksheet {worksheet_id}: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================
|
||||
# PDF EXPORT
|
||||
# =============================================
|
||||
|
||||
@router.post("/{worksheet_id}/export-pdf")
|
||||
async def export_worksheet_pdf(worksheet_id: str):
|
||||
"""
|
||||
Export worksheet as PDF.
|
||||
|
||||
Note: This creates a basic PDF. For full canvas rendering,
|
||||
the frontend should use pdf-lib with canvas.toDataURL().
|
||||
"""
|
||||
if not REPORTLAB_AVAILABLE:
|
||||
raise HTTPException(status_code=501, detail="PDF export not available (reportlab not installed)")
|
||||
|
||||
try:
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
worksheet = worksheets_db.get(worksheet_id)
|
||||
if not worksheet:
|
||||
filepath = os.path.join(WORKSHEET_STORAGE_DIR, f"{worksheet_id}.json")
|
||||
if os.path.exists(filepath):
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
worksheet = json.load(f)
|
||||
else:
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
buffer = io.BytesIO()
|
||||
c = canvas.Canvas(buffer, pagesize=A4)
|
||||
|
||||
page_width, page_height = A4
|
||||
|
||||
for page_data in worksheet.get("pages", []):
|
||||
if page_data.get("index", 0) == 0:
|
||||
c.setFont("Helvetica-Bold", 18)
|
||||
c.drawString(50, page_height - 50, worksheet.get("title", "Arbeitsblatt"))
|
||||
c.setFont("Helvetica", 10)
|
||||
c.drawString(50, page_height - 70, f"Erstellt: {worksheet.get('createdAt', '')[:10]}")
|
||||
|
||||
canvas_json_str = page_data.get("canvasJSON", "{}")
|
||||
if canvas_json_str:
|
||||
try:
|
||||
canvas_data = json.loads(canvas_json_str)
|
||||
objects = canvas_data.get("objects", [])
|
||||
|
||||
for obj in objects:
|
||||
obj_type = obj.get("type", "")
|
||||
|
||||
if obj_type in ["text", "i-text", "textbox"]:
|
||||
text = obj.get("text", "")
|
||||
left = obj.get("left", 50)
|
||||
top = obj.get("top", 100)
|
||||
font_size = obj.get("fontSize", 12)
|
||||
|
||||
pdf_x = left * 0.75
|
||||
pdf_y = page_height - (top * 0.75)
|
||||
|
||||
c.setFont("Helvetica", min(font_size, 24))
|
||||
c.drawString(pdf_x, pdf_y, text[:100])
|
||||
|
||||
elif obj_type == "rect":
|
||||
left = obj.get("left", 0) * 0.75
|
||||
top = obj.get("top", 0) * 0.75
|
||||
width = obj.get("width", 50) * 0.75
|
||||
height = obj.get("height", 30) * 0.75
|
||||
c.rect(left, page_height - top - height, width, height)
|
||||
|
||||
elif obj_type == "circle":
|
||||
left = obj.get("left", 0) * 0.75
|
||||
top = obj.get("top", 0) * 0.75
|
||||
radius = obj.get("radius", 25) * 0.75
|
||||
c.circle(left + radius, page_height - top - radius, radius)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
c.showPage()
|
||||
|
||||
c.save()
|
||||
buffer.seek(0)
|
||||
|
||||
filename = f"{worksheet.get('title', 'worksheet').replace(' ', '_')}.pdf"
|
||||
|
||||
return StreamingResponse(
|
||||
buffer,
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f"attachment; filename={filename}"}
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"PDF export failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
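
The 0.75 factor above converts 96-dpi Fabric.js canvas pixels (A4 is about 794x1123 px) into 72-dpi PDF points (A4 is about 595x842 pt), and the subtraction from page_height flips the origin to ReportLab's bottom-left convention. As a standalone sketch:

```python
from reportlab.lib.pagesizes import A4

def canvas_px_to_pdf_pt(left: float, top: float) -> tuple:
    """96-dpi canvas pixels -> 72-dpi PDF points, y-axis flipped to a bottom-left origin."""
    page_width, page_height = A4           # 595.27 x 841.89 points
    return left * 0.75, page_height - top * 0.75

print(canvas_px_to_pdf_pt(100, 200))       # (75.0, ~691.89)
```
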
|
||||
|
||||
# =============================================
|
||||
# AI WORKSHEET MODIFICATION
|
||||
# =============================================
|
||||
|
||||
@router.post("/ai-modify", response_model=AIModifyResponse)
|
||||
async def modify_worksheet_with_ai(request: AIModifyRequest):
|
||||
"""
|
||||
Modify a worksheet using AI based on natural language prompt.
|
||||
|
||||
Uses Ollama with qwen2.5vl:32b to understand the canvas state
|
||||
and generate modifications based on the user's request.
|
||||
"""
|
||||
return await modify_worksheet_with_ai_logic(request)
|
||||
|
||||
|
||||
# =============================================
|
||||
# HEALTH CHECK
|
||||
# =============================================
|
||||
|
||||
@router.get("/health/check")
|
||||
async def health_check():
|
||||
"""Check worksheet editor API health and dependencies."""
|
||||
status = {
|
||||
"status": "healthy",
|
||||
"ollama": False,
|
||||
"storage": os.path.exists(WORKSHEET_STORAGE_DIR),
|
||||
"reportlab": REPORTLAB_AVAILABLE,
|
||||
"worksheets_count": len(worksheets_db)
|
||||
}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
response = await client.get(f"{OLLAMA_URL}/api/tags")
|
||||
status["ollama"] = response.status_code == 200
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return status
|
||||
|
||||
|
||||
# =============================================
|
||||
# DOCUMENT RECONSTRUCTION FROM VOCAB SESSION
|
||||
# =============================================
|
||||
|
||||
@router.post("/reconstruct-from-session", response_model=ReconstructResponse)
|
||||
async def reconstruct_document_from_session(request: ReconstructRequest):
|
||||
"""
|
||||
Reconstruct a document from a vocab session into Fabric.js canvas format.
|
||||
|
||||
Returns canvas JSON ready to load into the worksheet editor.
|
||||
"""
|
||||
try:
|
||||
return await reconstruct_document_logic(request)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Document reconstruction failed: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/sessions/available")
|
||||
async def get_available_sessions():
|
||||
"""Get list of available vocab sessions that can be reconstructed."""
|
||||
try:
|
||||
from vocab_worksheet_api import _sessions
|
||||
|
||||
available = []
|
||||
for session_id, session in _sessions.items():
|
||||
if session.get("pdf_data"):
|
||||
available.append({
|
||||
"id": session_id,
|
||||
"name": session.get("name", "Unnamed"),
|
||||
"description": session.get("description"),
|
||||
"vocabulary_count": len(session.get("vocabulary", [])),
|
||||
"page_count": session.get("pdf_page_count", 1),
|
||||
"status": session.get("status", "unknown"),
|
||||
"created_at": session.get("created_at", "").isoformat() if session.get("created_at") else None
|
||||
})
|
||||
|
||||
return {"sessions": available, "total": len(available)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list sessions: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
# Backward-compat shim -- module moved to worksheet/editor_api.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.editor_api")

@@ -1,133 +1,4 @@
"""
|
||||
Worksheet Editor Models — Enums, Pydantic models, and configuration.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Optional, List, Dict
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================
|
||||
# CONFIGURATION
|
||||
# =============================================
|
||||
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
SD_MODEL = os.getenv("SD_MODEL", "stable-diffusion") # or specific SD model
|
||||
WORKSHEET_STORAGE_DIR = os.getenv("WORKSHEET_STORAGE_DIR",
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)), "worksheet-storage"))
|
||||
|
||||
# Ensure storage directory exists
|
||||
os.makedirs(WORKSHEET_STORAGE_DIR, exist_ok=True)
|
||||
|
||||
# =============================================
|
||||
# ENUMS & MODELS
|
||||
# =============================================
|
||||
|
||||
class AIImageStyle(str, Enum):
|
||||
REALISTIC = "realistic"
|
||||
CARTOON = "cartoon"
|
||||
SKETCH = "sketch"
|
||||
CLIPART = "clipart"
|
||||
EDUCATIONAL = "educational"
|
||||
|
||||
class WorksheetStatus(str, Enum):
|
||||
DRAFT = "draft"
|
||||
PUBLISHED = "published"
|
||||
ARCHIVED = "archived"
|
||||
|
||||
# Style prompt modifiers
|
||||
STYLE_PROMPTS = {
|
||||
AIImageStyle.REALISTIC: "photorealistic, high detail, professional photography",
|
||||
AIImageStyle.CARTOON: "cartoon style, colorful, child-friendly, simple shapes",
|
||||
AIImageStyle.SKETCH: "pencil sketch, hand-drawn, black and white, artistic",
|
||||
AIImageStyle.CLIPART: "clipart style, flat design, simple, vector-like",
|
||||
AIImageStyle.EDUCATIONAL: "educational illustration, clear, informative, textbook style"
|
||||
}
|
||||
|
||||
# =============================================
|
||||
# REQUEST/RESPONSE MODELS
|
||||
# =============================================
|
||||
|
||||
class AIImageRequest(BaseModel):
|
||||
prompt: str = Field(..., min_length=3, max_length=500)
|
||||
style: AIImageStyle = AIImageStyle.EDUCATIONAL
|
||||
width: int = Field(512, ge=256, le=1024)
|
||||
height: int = Field(512, ge=256, le=1024)
|
||||
|
||||
class AIImageResponse(BaseModel):
|
||||
image_base64: str
|
||||
prompt_used: str
|
||||
error: Optional[str] = None
|
||||
|
||||
class PageData(BaseModel):
|
||||
id: str
|
||||
index: int
|
||||
canvasJSON: str
|
||||
|
||||
class PageFormat(BaseModel):
|
||||
width: float = 210
|
||||
height: float = 297
|
||||
orientation: str = "portrait"
|
||||
margins: Dict[str, float] = {"top": 15, "right": 15, "bottom": 15, "left": 15}
|
||||
|
||||
class WorksheetSaveRequest(BaseModel):
|
||||
id: Optional[str] = None
|
||||
title: str
|
||||
description: Optional[str] = None
|
||||
pages: List[PageData]
|
||||
pageFormat: Optional[PageFormat] = None
|
||||
|
||||
class WorksheetResponse(BaseModel):
|
||||
id: str
|
||||
title: str
|
||||
description: Optional[str]
|
||||
pages: List[PageData]
|
||||
pageFormat: PageFormat
|
||||
createdAt: str
|
||||
updatedAt: str
|
||||
|
||||
class AIModifyRequest(BaseModel):
|
||||
prompt: str = Field(..., min_length=3, max_length=1000)
|
||||
canvas_json: str
|
||||
model: str = "qwen2.5vl:32b"
|
||||
|
||||
class AIModifyResponse(BaseModel):
|
||||
modified_canvas_json: Optional[str] = None
|
||||
message: str
|
||||
error: Optional[str] = None
|
||||
|
||||
class ReconstructRequest(BaseModel):
|
||||
session_id: str
|
||||
page_number: int = 1
|
||||
include_images: bool = True
|
||||
regenerate_graphics: bool = False
|
||||
|
||||
class ReconstructResponse(BaseModel):
|
||||
canvas_json: str
|
||||
page_width: int
|
||||
page_height: int
|
||||
elements_count: int
|
||||
vocabulary_matched: int
|
||||
message: str
|
||||
error: Optional[str] = None
|
||||
|
||||
# =============================================
|
||||
# IN-MEMORY STORAGE (Development)
|
||||
# =============================================
|
||||
|
||||
worksheets_db: Dict[str, Dict] = {}
|
||||
|
||||
# PDF Generation availability
|
||||
try:
|
||||
from reportlab.lib import colors # noqa: F401
|
||||
from reportlab.lib.pagesizes import A4 # noqa: F401
|
||||
from reportlab.lib.units import mm # noqa: F401
|
||||
from reportlab.pdfgen import canvas # noqa: F401
|
||||
from reportlab.lib.styles import getSampleStyleSheet # noqa: F401
|
||||
REPORTLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
REPORTLAB_AVAILABLE = False
|
||||
# Backward-compat shim -- module moved to worksheet/editor_models.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.editor_models")

@@ -1,255 +1,4 @@
"""
|
||||
Worksheet Editor Reconstruct — Document reconstruction from vocab sessions.
|
||||
"""
|
||||
|
||||
import io
|
||||
import uuid
|
||||
import base64
|
||||
import logging
|
||||
from typing import List, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from worksheet_editor_models import (
|
||||
ReconstructRequest,
|
||||
ReconstructResponse,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def reconstruct_document_logic(request: ReconstructRequest) -> ReconstructResponse:
|
||||
"""
|
||||
Reconstruct a document from a vocab session into Fabric.js canvas format.
|
||||
|
||||
This function:
|
||||
1. Loads the original PDF from the vocab session
|
||||
2. Runs OCR with position tracking
|
||||
3. Creates Fabric.js canvas JSON with positioned elements
|
||||
4. Maps extracted vocabulary to their positions
|
||||
|
||||
Returns ReconstructResponse ready to send to the client.
|
||||
"""
|
||||
from fastapi import HTTPException
|
||||
from vocab_worksheet_api import _sessions, convert_pdf_page_to_image
|
||||
|
||||
# Check if session exists
|
||||
if request.session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail=f"Session {request.session_id} not found")
|
||||
|
||||
session = _sessions[request.session_id]
|
||||
|
||||
if not session.get("pdf_data"):
|
||||
raise HTTPException(status_code=400, detail="Session has no PDF data")
|
||||
|
||||
pdf_data = session["pdf_data"]
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
|
||||
if request.page_number < 1 or request.page_number > page_count:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Page {request.page_number} not found. PDF has {page_count} pages."
|
||||
)
|
||||
|
||||
vocabulary = session.get("vocabulary", [])
|
||||
page_vocab = [v for v in vocabulary if v.get("source_page") == request.page_number]
|
||||
|
||||
logger.info(f"Reconstructing page {request.page_number} from session {request.session_id}")
|
||||
logger.info(f"Found {len(page_vocab)} vocabulary items for this page")
|
||||
|
||||
image_bytes = await convert_pdf_page_to_image(pdf_data, request.page_number)
|
||||
if not image_bytes:
|
||||
raise HTTPException(status_code=500, detail="Failed to convert PDF page to image")
|
||||
|
||||
from PIL import Image
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
img_width, img_height = img.size
|
||||
|
||||
from hybrid_vocab_extractor import run_paddle_ocr
|
||||
ocr_regions, raw_text = run_paddle_ocr(image_bytes)
|
||||
|
||||
logger.info(f"OCR found {len(ocr_regions)} text regions")
|
||||
|
||||
A4_WIDTH = 794
|
||||
A4_HEIGHT = 1123
|
||||
scale_x = A4_WIDTH / img_width
|
||||
scale_y = A4_HEIGHT / img_height
|
||||
|
||||
fabric_objects = []
|
||||
|
||||
# 1. Add white background
|
||||
fabric_objects.append({
|
||||
"type": "rect", "left": 0, "top": 0,
|
||||
"width": A4_WIDTH, "height": A4_HEIGHT,
|
||||
"fill": "#ffffff", "selectable": False,
|
||||
"evented": False, "isBackground": True
|
||||
})
|
||||
|
||||
# 2. Group OCR regions by Y-coordinate to detect rows
|
||||
sorted_regions = sorted(ocr_regions, key=lambda r: (r.y1, r.x1))
|
||||
|
||||
# 3. Detect headers (larger text at top)
|
||||
headers = []
|
||||
for region in sorted_regions:
|
||||
height = region.y2 - region.y1
|
||||
if region.y1 < img_height * 0.15 and height > 30:
|
||||
headers.append(region)
|
||||
|
||||
# 4. Create text objects for each region
|
||||
vocab_matched = 0
|
||||
|
||||
for region in sorted_regions:
|
||||
left = int(region.x1 * scale_x)
|
||||
top = int(region.y1 * scale_y)
|
||||
|
||||
is_header = region in headers
|
||||
|
||||
region_height = region.y2 - region.y1
|
||||
base_font_size = max(10, min(32, int(region_height * scale_y * 0.8)))
|
||||
|
||||
if is_header:
|
||||
base_font_size = max(base_font_size, 24)
|
||||
|
||||
is_vocab = False
|
||||
vocab_match = None
|
||||
for v in page_vocab:
|
||||
if v.get("english", "").lower() in region.text.lower() or \
|
||||
v.get("german", "").lower() in region.text.lower():
|
||||
is_vocab = True
|
||||
vocab_match = v
|
||||
vocab_matched += 1
|
||||
break
|
||||
|
||||
text_obj = {
|
||||
"type": "i-text",
|
||||
"id": f"text_{uuid.uuid4().hex[:8]}",
|
||||
"left": left, "top": top,
|
||||
"text": region.text,
|
||||
"fontFamily": "Arial",
|
||||
"fontSize": base_font_size,
|
||||
"fontWeight": "bold" if is_header else "normal",
|
||||
"fill": "#000000",
|
||||
"originX": "left", "originY": "top",
|
||||
}
|
||||
|
||||
if is_vocab and vocab_match:
|
||||
text_obj["isVocabulary"] = True
|
||||
text_obj["vocabularyId"] = vocab_match.get("id")
|
||||
text_obj["english"] = vocab_match.get("english")
|
||||
text_obj["german"] = vocab_match.get("german")
|
||||
|
||||
fabric_objects.append(text_obj)
|
||||
|
||||
# 5. If include_images, detect and extract image regions
|
||||
if request.include_images:
|
||||
image_regions = await _detect_image_regions(image_bytes, ocr_regions, img_width, img_height)
|
||||
|
||||
for i, img_region in enumerate(image_regions):
|
||||
img_x1 = int(img_region["x1"])
|
||||
img_y1 = int(img_region["y1"])
|
||||
img_x2 = int(img_region["x2"])
|
||||
img_y2 = int(img_region["y2"])
|
||||
|
||||
cropped = img.crop((img_x1, img_y1, img_x2, img_y2))
|
||||
|
||||
buffer = io.BytesIO()
|
||||
cropped.save(buffer, format='PNG')
|
||||
buffer.seek(0)
|
||||
img_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
|
||||
|
||||
fabric_objects.append({
|
||||
"type": "image",
|
||||
"id": f"img_{uuid.uuid4().hex[:8]}",
|
||||
"left": int(img_x1 * scale_x),
|
||||
"top": int(img_y1 * scale_y),
|
||||
"width": int((img_x2 - img_x1) * scale_x),
|
||||
"height": int((img_y2 - img_y1) * scale_y),
|
||||
"src": img_base64,
|
||||
"scaleX": 1, "scaleY": 1,
|
||||
})
|
||||
|
||||
import json
|
||||
canvas_data = {
|
||||
"version": "6.0.0",
|
||||
"objects": fabric_objects,
|
||||
"background": "#ffffff"
|
||||
}
|
||||
|
||||
return ReconstructResponse(
|
||||
canvas_json=json.dumps(canvas_data),
|
||||
page_width=A4_WIDTH,
|
||||
page_height=A4_HEIGHT,
|
||||
elements_count=len(fabric_objects),
|
||||
vocabulary_matched=vocab_matched,
|
||||
message=f"Reconstructed page {request.page_number} with {len(fabric_objects)} elements, "
|
||||
f"{vocab_matched} vocabulary items matched"
|
||||
)
|
||||
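
The scaling above maps OCR pixel boxes from the rendered page image onto the fixed 794x1123 px A4 canvas; the font size is derived from the box height with a 0.8 fudge factor and clamped to 10-32 px. Worked numbers for a hypothetical 2480x3508 px scan (300-dpi A4):

```python
A4_WIDTH, A4_HEIGHT = 794, 1123
img_width, img_height = 2480, 3508                    # hypothetical 300-dpi scan
scale_x, scale_y = A4_WIDTH / img_width, A4_HEIGHT / img_height

box = {"x1": 620, "y1": 350, "x2": 1240, "y2": 420}   # one OCR region (made-up numbers)
left, top = int(box["x1"] * scale_x), int(box["y1"] * scale_y)
font_size = max(10, min(32, int((box["y2"] - box["y1"]) * scale_y * 0.8)))
print(left, top, font_size)                           # 198 112 17
```
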
|
||||
|
||||
async def _detect_image_regions(
|
||||
image_bytes: bytes,
|
||||
ocr_regions: list,
|
||||
img_width: int,
|
||||
img_height: int
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Detect image/graphic regions in the document.
|
||||
|
||||
Uses a simple approach:
|
||||
1. Find large gaps between text regions (potential image areas)
|
||||
2. Use edge detection to find bounded regions
|
||||
3. Filter out text areas
|
||||
"""
|
||||
from PIL import Image
|
||||
import cv2
|
||||
|
||||
try:
|
||||
img = Image.open(io.BytesIO(image_bytes))
|
||||
img_array = np.array(img.convert('L'))
|
||||
|
||||
text_mask = np.ones_like(img_array, dtype=bool)
|
||||
for region in ocr_regions:
|
||||
x1 = max(0, region.x1 - 5)
|
||||
y1 = max(0, region.y1 - 5)
|
||||
x2 = min(img_width, region.x2 + 5)
|
||||
y2 = min(img_height, region.y2 + 5)
|
||||
text_mask[y1:y2, x1:x2] = False
|
||||
|
||||
image_regions = []
|
||||
|
||||
edges = cv2.Canny(img_array, 50, 150)
|
||||
edges[~text_mask] = 0
|
||||
|
||||
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
for contour in contours:
|
||||
x, y, w, h = cv2.boundingRect(contour)
|
||||
|
||||
if w > 50 and h > 50:
|
||||
if w < img_width * 0.9 and h < img_height * 0.9:
|
||||
region_content = img_array[y:y+h, x:x+w]
|
||||
variance = np.var(region_content)
|
||||
|
||||
if variance > 500:
|
||||
image_regions.append({
|
||||
"x1": x, "y1": y,
|
||||
"x2": x + w, "y2": y + h
|
||||
})
|
||||
|
||||
filtered_regions = []
|
||||
for region in sorted(image_regions, key=lambda r: (r["x2"]-r["x1"])*(r["y2"]-r["y1"]), reverse=True):
|
||||
overlaps = False
|
||||
for existing in filtered_regions:
|
||||
if not (region["x2"] < existing["x1"] or region["x1"] > existing["x2"] or
|
||||
region["y2"] < existing["y1"] or region["y1"] > existing["y2"]):
|
||||
overlaps = True
|
||||
break
|
||||
if not overlaps:
|
||||
filtered_regions.append(region)
|
||||
|
||||
logger.info(f"Detected {len(filtered_regions)} image regions")
|
||||
return filtered_regions[:10]
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Image region detection failed: {e}")
|
||||
return []
|
||||
# Backward-compat shim -- module moved to worksheet/editor_reconstruct.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("worksheet.editor_reconstruct")

@@ -0,0 +1,6 @@
"""
|
||||
zeugnis package — certificate crawler, models, storage.
|
||||
|
||||
Backward-compatible re-exports: consumers can still use
|
||||
``from zeugnis_api import ...`` etc. via the shim files in backend/.
|
||||
"""
|
||||
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
zeugnis_api_sources — sources, seed URLs, initialization
|
||||
zeugnis_api_docs — documents, crawler, statistics, audit
|
||||
|
||||
FastAPI router for managing zeugnis sources, documents, and crawler operations.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_sources import router as _sources_router # noqa: F401
|
||||
from .api_docs import router as _docs_router # noqa: F401
|
||||
|
||||
# Composite router (used by main.py)
|
||||
router = APIRouter()
|
||||
router.include_router(_sources_router)
|
||||
router.include_router(_docs_router)
|
||||
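
main.py only needs this composite router; thanks to the backend/ shim the old flat import path keeps working. A sketch of the wiring:

```python
from fastapi import FastAPI
from zeugnis_api import router as zeugnis_router   # legacy path, redirected to the package by the shim

app = FastAPI()
app.include_router(zeugnis_router)                  # serves /api/v1/admin/zeugnis/...
```
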
@@ -0,0 +1,321 @@
|
||||
"""
|
||||
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
|
||||
|
||||
Extracted from zeugnis_api.py for modularity.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, List
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||
|
||||
from .models import (
|
||||
CrawlRequest, EventType,
|
||||
BUNDESLAENDER,
|
||||
generate_id, get_training_allowed, get_license_for_bundesland,
|
||||
)
|
||||
from .crawler import (
|
||||
start_crawler, stop_crawler, get_crawler_status,
|
||||
)
|
||||
from metrics_db import (
|
||||
get_zeugnis_documents, get_zeugnis_stats,
|
||||
log_zeugnis_event, get_pool,
|
||||
)
|
||||
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Documents Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/documents", response_model=List[dict])
|
||||
async def list_documents(
|
||||
bundesland: Optional[str] = None,
|
||||
limit: int = Query(100, le=500),
|
||||
offset: int = 0,
|
||||
):
|
||||
"""Get all zeugnis documents with optional filtering."""
|
||||
documents = await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
|
||||
return documents
|
||||
|
||||
|
||||
@router.get("/documents/{document_id}", response_model=dict)
|
||||
async def get_document(document_id: str):
|
||||
"""Get details for a specific document."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
doc = await conn.fetchrow(
|
||||
"""
|
||||
SELECT d.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
JOIN zeugnis_sources s ON u.source_id = s.id
|
||||
WHERE d.id = $1
|
||||
""",
|
||||
document_id
|
||||
)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Log view event
|
||||
await log_zeugnis_event(document_id, EventType.VIEWED.value)
|
||||
|
||||
return dict(doc)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/documents/{document_id}/versions", response_model=List[dict])
|
||||
async def get_document_versions(document_id: str):
|
||||
"""Get version history for a document."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT * FROM zeugnis_document_versions
|
||||
WHERE document_id = $1
|
||||
ORDER BY version DESC
|
||||
""",
|
||||
document_id
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Crawler Control Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/crawler/status", response_model=dict)
|
||||
async def crawler_status():
|
||||
"""Get current crawler status."""
|
||||
return get_crawler_status()
|
||||
|
||||
|
||||
@router.post("/crawler/start", response_model=dict)
|
||||
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
|
||||
"""Start the crawler."""
|
||||
success = await start_crawler(
|
||||
bundesland=request.bundesland,
|
||||
source_id=request.source_id,
|
||||
)
|
||||
if not success:
|
||||
raise HTTPException(status_code=409, detail="Crawler already running")
|
||||
return {"success": True, "message": "Crawler started"}
|
||||
|
||||
|
||||
@router.post("/crawler/stop", response_model=dict)
|
||||
async def stop_crawl():
|
||||
"""Stop the crawler."""
|
||||
success = await stop_crawler()
|
||||
if not success:
|
||||
raise HTTPException(status_code=409, detail="Crawler not running")
|
||||
return {"success": True, "message": "Crawler stopped"}
|
||||
|
||||
|
||||
@router.get("/crawler/queue", response_model=List[dict])
|
||||
async def get_queue():
|
||||
"""Get the crawler queue."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT q.*, s.bundesland, s.name as source_name
|
||||
FROM zeugnis_crawler_queue q
|
||||
JOIN zeugnis_sources s ON q.source_id = s.id
|
||||
ORDER BY q.priority DESC, q.created_at
|
||||
"""
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/crawler/queue", response_model=dict)
|
||||
async def add_to_queue(request: CrawlRequest):
|
||||
"""Add a source to the crawler queue."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
queue_id = generate_id()
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
# Get source ID if bundesland provided
|
||||
source_id = request.source_id
|
||||
if not source_id and request.bundesland:
|
||||
source = await conn.fetchrow(
|
||||
"SELECT id FROM zeugnis_sources WHERE bundesland = $1",
|
||||
request.bundesland
|
||||
)
|
||||
if source:
|
||||
source_id = source["id"]
|
||||
|
||||
if not source_id:
|
||||
raise HTTPException(status_code=400, detail="Source not found")
|
||||
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
|
||||
VALUES ($1, $2, $3, 'pending')
|
||||
""",
|
||||
queue_id, source_id, request.priority
|
||||
)
|
||||
return {"id": queue_id, "success": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Statistics Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/stats", response_model=dict)
|
||||
async def get_stats():
|
||||
"""Get zeugnis crawler statistics."""
|
||||
stats = await get_zeugnis_stats()
|
||||
return stats
|
||||
|
||||
|
||||
@router.get("/stats/bundesland", response_model=List[dict])
|
||||
async def get_bundesland_stats():
|
||||
"""Get statistics per Bundesland."""
|
||||
pool = await get_pool()
|
||||
|
||||
# Build stats from BUNDESLAENDER with DB data if available
|
||||
stats = []
|
||||
for code, info in BUNDESLAENDER.items():
|
||||
stat = {
|
||||
"bundesland": code,
|
||||
"name": info["name"],
|
||||
"training_allowed": get_training_allowed(code),
|
||||
"document_count": 0,
|
||||
"indexed_count": 0,
|
||||
"last_crawled": None,
|
||||
}
|
||||
|
||||
if pool:
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
row = await conn.fetchrow(
|
||||
"""
|
||||
SELECT
|
||||
COUNT(d.id) as doc_count,
|
||||
COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
|
||||
MAX(u.last_crawled) as last_crawled
|
||||
FROM zeugnis_sources s
|
||||
LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
|
||||
LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
|
||||
WHERE s.bundesland = $1
|
||||
GROUP BY s.id
|
||||
""",
|
||||
code
|
||||
)
|
||||
if row:
|
||||
stat["document_count"] = row["doc_count"] or 0
|
||||
stat["indexed_count"] = row["indexed_count"] or 0
|
||||
stat["last_crawled"] = row["last_crawled"].isoformat() if row["last_crawled"] else None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
stats.append(stat)
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Audit Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/audit/events", response_model=List[dict])
|
||||
async def get_audit_events(
|
||||
document_id: Optional[str] = None,
|
||||
event_type: Optional[str] = None,
|
||||
limit: int = Query(100, le=1000),
|
||||
days: int = Query(30, le=365),
|
||||
):
|
||||
"""Get audit events with optional filtering."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
async with pool.acquire() as conn:
|
||||
query = """
|
||||
SELECT * FROM zeugnis_usage_events
|
||||
WHERE created_at >= $1
|
||||
"""
|
||||
params = [since]
|
||||
|
||||
if document_id:
|
||||
query += " AND document_id = $2"
|
||||
params.append(document_id)
|
||||
if event_type:
|
||||
query += f" AND event_type = ${len(params) + 1}"
|
||||
params.append(event_type)
|
||||
|
||||
query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
|
||||
params.append(limit)
|
||||
|
||||
rows = await conn.fetch(query, *params)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
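
The $N placeholders are asyncpg-style positional parameters, so each optional filter above appends the next index based on the current parameter count. The same pattern in isolation:

```python
from datetime import datetime, timedelta

def build_audit_query(document_id=None, event_type=None, limit=100, days=30):
    """Sketch of the placeholder bookkeeping used by get_audit_events above."""
    query = "SELECT * FROM zeugnis_usage_events WHERE created_at >= $1"
    params = [datetime.now() - timedelta(days=days)]
    if document_id:
        params.append(document_id)
        query += f" AND document_id = ${len(params)}"
    if event_type:
        params.append(event_type)
        query += f" AND event_type = ${len(params)}"
    params.append(limit)
    query += f" ORDER BY created_at DESC LIMIT ${len(params)}"
    return query, params

q, _ = build_audit_query(event_type="viewed")
# ... WHERE created_at >= $1 AND event_type = $2 ORDER BY created_at DESC LIMIT $3
```
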
|
||||
|
||||
@router.get("/audit/export", response_model=dict)
|
||||
async def export_audit(
|
||||
days: int = Query(30, le=365),
|
||||
requested_by: str = Query(..., description="User requesting the export"),
|
||||
):
|
||||
"""Export audit data for GDPR compliance."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
since = datetime.now() - timedelta(days=days)
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT * FROM zeugnis_usage_events
|
||||
WHERE created_at >= $1
|
||||
ORDER BY created_at DESC
|
||||
""",
|
||||
since
|
||||
)
|
||||
|
||||
doc_count = await conn.fetchval(
|
||||
"SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
|
||||
since
|
||||
)
|
||||
|
||||
return {
|
||||
"export_date": datetime.now().isoformat(),
|
||||
"requested_by": requested_by,
|
||||
"events": [dict(r) for r in rows],
|
||||
"document_count": doc_count or 0,
|
||||
"date_range_start": since.isoformat(),
|
||||
"date_range_end": datetime.now().isoformat(),
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Zeugnis API Sources — source and seed URL management endpoints.
|
||||
|
||||
Extracted from zeugnis_api.py for modularity.
|
||||
"""
|
||||
|
||||
from typing import Optional, List
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .models import (
|
||||
ZeugnisSourceCreate, ZeugnisSourceVerify,
|
||||
SeedUrlCreate,
|
||||
LicenseType, DocType,
|
||||
BUNDESLAENDER,
|
||||
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
|
||||
)
|
||||
from metrics_db import (
|
||||
get_zeugnis_sources, upsert_zeugnis_source, get_pool,
|
||||
)
|
||||
|
||||
|
||||
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sources Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/sources", response_model=List[dict])
|
||||
async def list_sources():
|
||||
"""Get all zeugnis sources (Bundeslaender)."""
|
||||
sources = await get_zeugnis_sources()
|
||||
if not sources:
|
||||
# Return default sources if none exist
|
||||
return [
|
||||
{
|
||||
"id": None,
|
||||
"bundesland": code,
|
||||
"name": info["name"],
|
||||
"base_url": None,
|
||||
"license_type": str(get_license_for_bundesland(code).value),
|
||||
"training_allowed": get_training_allowed(code),
|
||||
"verified_by": None,
|
||||
"verified_at": None,
|
||||
"created_at": None,
|
||||
"updated_at": None,
|
||||
}
|
||||
for code, info in BUNDESLAENDER.items()
|
||||
]
|
||||
return sources
|
||||
|
||||
|
||||
@router.post("/sources", response_model=dict)
|
||||
async def create_source(source: ZeugnisSourceCreate):
|
||||
"""Create or update a zeugnis source."""
|
||||
source_id = generate_id()
|
||||
success = await upsert_zeugnis_source(
|
||||
id=source_id,
|
||||
bundesland=source.bundesland,
|
||||
name=source.name,
|
||||
license_type=source.license_type.value,
|
||||
training_allowed=source.training_allowed,
|
||||
base_url=source.base_url,
|
||||
)
|
||||
if not success:
|
||||
raise HTTPException(status_code=500, detail="Failed to create source")
|
||||
return {"id": source_id, "success": True}
|
||||
|
||||
|
||||
@router.put("/sources/{source_id}/verify", response_model=dict)
|
||||
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
|
||||
"""Verify a source's license status."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
UPDATE zeugnis_sources
|
||||
SET license_type = $2,
|
||||
training_allowed = $3,
|
||||
verified_by = $4,
|
||||
verified_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
""",
|
||||
source_id, verification.license_type.value,
|
||||
verification.training_allowed, verification.verified_by
|
||||
)
|
||||
return {"success": True, "source_id": source_id}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/sources/{bundesland}", response_model=dict)
|
||||
async def get_source_by_bundesland(bundesland: str):
|
||||
"""Get source details for a specific Bundesland."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
# Return default info
|
||||
if bundesland not in BUNDESLAENDER:
|
||||
raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
|
||||
return {
|
||||
"bundesland": bundesland,
|
||||
"name": get_bundesland_name(bundesland),
|
||||
"training_allowed": get_training_allowed(bundesland),
|
||||
"license_type": get_license_for_bundesland(bundesland).value,
|
||||
"document_count": 0,
|
||||
}
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
source = await conn.fetchrow(
|
||||
"SELECT * FROM zeugnis_sources WHERE bundesland = $1",
|
||||
bundesland
|
||||
)
|
||||
if source:
|
||||
doc_count = await conn.fetchval(
|
||||
"""
|
||||
SELECT COUNT(*) FROM zeugnis_documents d
|
||||
JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
|
||||
WHERE u.source_id = $1
|
||||
""",
|
||||
source["id"]
|
||||
)
|
||||
return {**dict(source), "document_count": doc_count or 0}
|
||||
|
||||
# Return default
|
||||
return {
|
||||
"bundesland": bundesland,
|
||||
"name": get_bundesland_name(bundesland),
|
||||
"training_allowed": get_training_allowed(bundesland),
|
||||
"license_type": get_license_for_bundesland(bundesland).value,
|
||||
"document_count": 0,
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Seed URLs Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/sources/{source_id}/urls", response_model=List[dict])
|
||||
async def list_seed_urls(source_id: str):
|
||||
"""Get all seed URLs for a source."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
return []
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at",
|
||||
source_id
|
||||
)
|
||||
return [dict(r) for r in rows]
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/sources/{source_id}/urls", response_model=dict)
|
||||
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
|
||||
"""Add a new seed URL to a source."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
url_id = generate_id()
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
|
||||
VALUES ($1, $2, $3, $4, 'pending')
|
||||
""",
|
||||
url_id, source_id, seed_url.url, seed_url.doc_type.value
|
||||
)
|
||||
return {"id": url_id, "success": True}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/urls/{url_id}", response_model=dict)
|
||||
async def delete_seed_url(url_id: str):
|
||||
"""Delete a seed URL."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
try:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"DELETE FROM zeugnis_seed_urls WHERE id = $1",
|
||||
url_id
|
||||
)
|
||||
return {"success": True}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Initialization Endpoint
|
||||
# =============================================================================
|
||||
|
||||
@router.post("/init", response_model=dict)
|
||||
async def initialize_sources():
|
||||
"""Initialize default sources from BUNDESLAENDER."""
|
||||
pool = await get_pool()
|
||||
if not pool:
|
||||
raise HTTPException(status_code=503, detail="Database not available")
|
||||
|
||||
created = 0
|
||||
try:
|
||||
for code, info in BUNDESLAENDER.items():
|
||||
source_id = generate_id()
|
||||
success = await upsert_zeugnis_source(
|
||||
id=source_id,
|
||||
bundesland=code,
|
||||
name=info["name"],
|
||||
license_type=get_license_for_bundesland(code).value,
|
||||
training_allowed=get_training_allowed(code),
|
||||
)
|
||||
if success:
|
||||
created += 1
|
||||
|
||||
return {"success": True, "sources_created": created}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Zeugnis Crawler - Start/stop/status control functions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from .worker import ZeugnisCrawler, get_crawler_state
|
||||
|
||||
|
||||
_crawler_instance: Optional[ZeugnisCrawler] = None
|
||||
_crawler_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool:
|
||||
"""Start the crawler."""
|
||||
global _crawler_instance, _crawler_task
|
||||
|
||||
state = get_crawler_state()
|
||||
|
||||
if state.is_running:
|
||||
return False
|
||||
|
||||
state.is_running = True
|
||||
state.documents_crawled_today = 0
|
||||
state.documents_indexed_today = 0
|
||||
state.errors_today = 0
|
||||
|
||||
_crawler_instance = ZeugnisCrawler()
|
||||
await _crawler_instance.init()
|
||||
|
||||
async def run_crawler():
|
||||
try:
|
||||
from metrics_db import get_pool
|
||||
pool = await get_pool()
|
||||
|
||||
if pool:
|
||||
async with pool.acquire() as conn:
|
||||
# Get sources to crawl
|
||||
if source_id:
|
||||
sources = await conn.fetch(
|
||||
"SELECT id, bundesland FROM zeugnis_sources WHERE id = $1",
|
||||
source_id
|
||||
)
|
||||
elif bundesland:
|
||||
sources = await conn.fetch(
|
||||
"SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1",
|
||||
bundesland
|
||||
)
|
||||
else:
|
||||
sources = await conn.fetch(
|
||||
"SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland"
|
||||
)
|
||||
|
||||
for source in sources:
|
||||
if not state.is_running:
|
||||
break
|
||||
await _crawler_instance.crawl_source(source["id"])
|
||||
|
||||
except Exception as e:
|
||||
print(f"Crawler error: {e}")
|
||||
|
||||
finally:
|
||||
state.is_running = False
|
||||
if _crawler_instance:
|
||||
await _crawler_instance.close()
|
||||
|
||||
_crawler_task = asyncio.create_task(run_crawler())
|
||||
return True
|
||||
|
||||
|
||||
async def stop_crawler() -> bool:
|
||||
"""Stop the crawler."""
|
||||
global _crawler_task
|
||||
|
||||
state = get_crawler_state()
|
||||
|
||||
if not state.is_running:
|
||||
return False
|
||||
|
||||
state.is_running = False
|
||||
|
||||
if _crawler_task:
|
||||
_crawler_task.cancel()
|
||||
try:
|
||||
await _crawler_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def get_crawler_status() -> Dict[str, Any]:
|
||||
"""Get current crawler status."""
|
||||
state = get_crawler_state()
|
||||
return {
|
||||
"is_running": state.is_running,
|
||||
"current_source": state.current_source_id,
|
||||
"current_bundesland": state.current_bundesland,
|
||||
"queue_length": len(state.queue),
|
||||
"documents_crawled_today": state.documents_crawled_today,
|
||||
"documents_indexed_today": state.documents_indexed_today,
|
||||
"errors_today": state.errors_today,
|
||||
"last_activity": state.last_activity.isoformat() if state.last_activity else None,
|
||||
}
|
||||
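
start_crawler spawns a single background asyncio.Task and flips the shared CrawlerState, so a second start returns False until stop_crawler cancels the task. A usage sketch (the import path is assumed):

```python
import asyncio
from zeugnis.crawler import start_crawler, stop_crawler, get_crawler_status

async def demo():
    started = await start_crawler(bundesland="ni")   # False if a crawl is already running
    print(started, get_crawler_status()["is_running"])
    await asyncio.sleep(5)                           # let it work for a moment
    await stop_crawler()                             # cancels the task, resets is_running

asyncio.run(demo())
```
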
@@ -0,0 +1,26 @@
"""
Zeugnis Rights-Aware Crawler

Barrel re-export: all public symbols for backward compatibility.
"""

from .text import (  # noqa: F401
    extract_text_from_pdf,
    extract_text_from_html,
    chunk_text,
    compute_hash,
)
from .storage import (  # noqa: F401
    generate_embeddings,
    upload_to_minio,
    index_in_qdrant,
)
from .worker import (  # noqa: F401
    CrawlerState,
    ZeugnisCrawler,
)
from .control import (  # noqa: F401
    start_crawler,
    stop_crawler,
    get_crawler_status,
)
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler - Data Models
|
||||
|
||||
Pydantic models for API requests/responses and internal data structures.
|
||||
Database schema is defined in metrics_db.py.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
import uuid
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
class LicenseType(str, Enum):
|
||||
"""License classification for training permission."""
|
||||
PUBLIC_DOMAIN = "public_domain" # Amtliche Werke (§5 UrhG)
|
||||
CC_BY = "cc_by" # Creative Commons Attribution
|
||||
CC_BY_SA = "cc_by_sa" # CC Attribution-ShareAlike
|
||||
CC_BY_NC = "cc_by_nc" # CC NonCommercial - NO TRAINING
|
||||
CC_BY_NC_SA = "cc_by_nc_sa" # CC NC-SA - NO TRAINING
|
||||
GOV_STATUTE_FREE_USE = "gov_statute" # Government statutes (gemeinfrei)
|
||||
ALL_RIGHTS_RESERVED = "all_rights" # Standard copyright - NO TRAINING
|
||||
UNKNOWN_REQUIRES_REVIEW = "unknown" # Needs manual review
|
||||
|
||||
|
||||
class CrawlStatus(str, Enum):
|
||||
"""Status of a crawl job or seed URL."""
|
||||
PENDING = "pending"
|
||||
RUNNING = "running"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
PAUSED = "paused"
|
||||
|
||||
|
||||
class DocType(str, Enum):
|
||||
"""Type of zeugnis document."""
|
||||
VERORDNUNG = "verordnung" # Official regulation
|
||||
HANDREICHUNG = "handreichung" # Implementation guide
|
||||
FORMULAR = "formular" # Form template
|
||||
ERLASS = "erlass" # Decree
|
||||
SCHULORDNUNG = "schulordnung" # School regulations
|
||||
SONSTIGES = "sonstiges" # Other
|
||||
|
||||
|
||||
class EventType(str, Enum):
|
||||
"""Audit event types."""
|
||||
CRAWLED = "crawled"
|
||||
INDEXED = "indexed"
|
||||
DOWNLOADED = "downloaded"
|
||||
VIEWED = "viewed"
|
||||
EXPORTED = "exported"
|
||||
TRAINED_ON = "trained_on"
|
||||
DELETED = "deleted"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Bundesland Definitions
|
||||
# =============================================================================
|
||||
|
||||
BUNDESLAENDER = {
|
||||
"bw": {"name": "Baden-Württemberg", "short": "BW"},
|
||||
"by": {"name": "Bayern", "short": "BY"},
|
||||
"be": {"name": "Berlin", "short": "BE"},
|
||||
"bb": {"name": "Brandenburg", "short": "BB"},
|
||||
"hb": {"name": "Bremen", "short": "HB"},
|
||||
"hh": {"name": "Hamburg", "short": "HH"},
|
||||
"he": {"name": "Hessen", "short": "HE"},
|
||||
"mv": {"name": "Mecklenburg-Vorpommern", "short": "MV"},
|
||||
"ni": {"name": "Niedersachsen", "short": "NI"},
|
||||
"nw": {"name": "Nordrhein-Westfalen", "short": "NW"},
|
||||
"rp": {"name": "Rheinland-Pfalz", "short": "RP"},
|
||||
"sl": {"name": "Saarland", "short": "SL"},
|
||||
"sn": {"name": "Sachsen", "short": "SN"},
|
||||
"st": {"name": "Sachsen-Anhalt", "short": "ST"},
|
||||
"sh": {"name": "Schleswig-Holstein", "short": "SH"},
|
||||
"th": {"name": "Thüringen", "short": "TH"},
|
||||
}
|
||||
|
||||
|
||||
# Training permission based on Word document analysis
TRAINING_PERMISSIONS = {
    "bw": True,   # Official work (amtliches Werk)
    "by": True,   # Official work
    "be": False,  # No licence
    "bb": False,  # No licence
    "hb": False,  # Restricted -> False for safety
    "hh": False,  # No licence
    "he": True,   # Official work
    "mv": False,  # Restricted -> False for safety
    "ni": True,   # Official work
    "nw": True,   # Official work
    "rp": True,   # Official work
    "sl": False,  # No licence
    "sn": True,   # Official work
    "st": False,  # Restricted -> False for safety
    "sh": True,   # Official work
    "th": True,   # Official work
}
|
||||
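
The helper functions imported by the API modules (get_training_allowed, get_bundesland_name, ...) are plain lookups over these tables; their definitions are not part of this hunk, but they presumably reduce to something like the following (a sketch, not the committed code):

```python
def get_training_allowed(bundesland: str) -> bool:
    # Default to False: unknown Bundesland codes must never be used for training.
    return TRAINING_PERMISSIONS.get(bundesland.lower(), False)

def get_bundesland_name(bundesland: str) -> str:
    return BUNDESLAENDER.get(bundesland.lower(), {}).get("name", bundesland)

print(get_training_allowed("ni"), get_training_allowed("be"))   # True False
```
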
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Sources
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisSourceBase(BaseModel):
|
||||
"""Base model for zeugnis source."""
|
||||
bundesland: str = Field(..., description="Bundesland code (e.g., 'ni', 'by')")
|
||||
name: str = Field(..., description="Full name of the source")
|
||||
base_url: Optional[str] = Field(None, description="Base URL for the source")
|
||||
license_type: LicenseType = Field(..., description="License classification")
|
||||
training_allowed: bool = Field(False, description="Whether AI training is permitted")
|
||||
|
||||
|
||||
class ZeugnisSourceCreate(ZeugnisSourceBase):
|
||||
"""Model for creating a new source."""
|
||||
pass
|
||||
|
||||
|
||||
class ZeugnisSource(ZeugnisSourceBase):
|
||||
"""Full source model with all fields."""
|
||||
id: str
|
||||
verified_by: Optional[str] = None
|
||||
verified_at: Optional[datetime] = None
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class ZeugnisSourceVerify(BaseModel):
|
||||
"""Model for verifying a source's license."""
|
||||
verified_by: str = Field(..., description="User ID who verified")
|
||||
license_type: LicenseType
|
||||
training_allowed: bool
|
||||
notes: Optional[str] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Seed URLs
|
||||
# =============================================================================
|
||||
|
||||
class SeedUrlBase(BaseModel):
|
||||
"""Base model for seed URL."""
|
||||
url: str = Field(..., description="URL to crawl")
|
||||
doc_type: DocType = Field(DocType.VERORDNUNG, description="Type of document")
|
||||
|
||||
|
||||
class SeedUrlCreate(SeedUrlBase):
|
||||
"""Model for creating a new seed URL."""
|
||||
source_id: str
|
||||
|
||||
|
||||
class SeedUrl(SeedUrlBase):
|
||||
"""Full seed URL model."""
|
||||
id: str
|
||||
source_id: str
|
||||
status: CrawlStatus = CrawlStatus.PENDING
|
||||
last_crawled: Optional[datetime] = None
|
||||
error_message: Optional[str] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Documents
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisDocumentBase(BaseModel):
|
||||
"""Base model for zeugnis document."""
|
||||
title: Optional[str] = None
|
||||
url: str
|
||||
content_type: Optional[str] = None
|
||||
file_size: Optional[int] = None
|
||||
|
||||
|
||||
class ZeugnisDocument(ZeugnisDocumentBase):
|
||||
"""Full document model."""
|
||||
id: str
|
||||
seed_url_id: str
|
||||
content_hash: Optional[str] = None
|
||||
minio_path: Optional[str] = None
|
||||
training_allowed: bool = False
|
||||
indexed_in_qdrant: bool = False
|
||||
bundesland: Optional[str] = None
|
||||
source_name: Optional[str] = None
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class ZeugnisDocumentVersion(BaseModel):
|
||||
"""Document version for history tracking."""
|
||||
id: str
|
||||
document_id: str
|
||||
version: int
|
||||
content_hash: str
|
||||
minio_path: Optional[str] = None
|
||||
change_summary: Optional[str] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Crawler
|
||||
# =============================================================================
|
||||
|
||||
class CrawlerStatus(BaseModel):
|
||||
"""Current status of the crawler."""
|
||||
is_running: bool = False
|
||||
current_source: Optional[str] = None
|
||||
current_bundesland: Optional[str] = None
|
||||
queue_length: int = 0
|
||||
documents_crawled_today: int = 0
|
||||
documents_indexed_today: int = 0
|
||||
last_activity: Optional[datetime] = None
|
||||
errors_today: int = 0
|
||||
|
||||
|
||||
class CrawlQueueItem(BaseModel):
|
||||
"""Item in the crawl queue."""
|
||||
id: str
|
||||
source_id: str
|
||||
bundesland: str
|
||||
source_name: str
|
||||
priority: int = 5
|
||||
status: CrawlStatus = CrawlStatus.PENDING
|
||||
started_at: Optional[datetime] = None
|
||||
completed_at: Optional[datetime] = None
|
||||
documents_found: int = 0
|
||||
documents_indexed: int = 0
|
||||
error_count: int = 0
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class CrawlRequest(BaseModel):
|
||||
"""Request to start a crawl."""
|
||||
bundesland: Optional[str] = Field(None, description="Specific Bundesland to crawl")
|
||||
source_id: Optional[str] = Field(None, description="Specific source ID to crawl")
|
||||
priority: int = Field(5, ge=1, le=10, description="Priority (1=lowest, 10=highest)")
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
"""Result of a crawl operation."""
|
||||
source_id: str
|
||||
bundesland: str
|
||||
documents_found: int
|
||||
documents_indexed: int
|
||||
documents_skipped: int
|
||||
errors: List[str]
|
||||
duration_seconds: float
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Statistics
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisStats(BaseModel):
|
||||
"""Statistics for the zeugnis crawler."""
|
||||
total_sources: int = 0
|
||||
total_documents: int = 0
|
||||
indexed_documents: int = 0
|
||||
training_allowed_documents: int = 0
|
||||
active_crawls: int = 0
|
||||
per_bundesland: List[Dict[str, Any]] = []
|
||||
|
||||
|
||||
class BundeslandStats(BaseModel):
|
||||
"""Statistics per Bundesland."""
|
||||
bundesland: str
|
||||
name: str
|
||||
training_allowed: bool
|
||||
document_count: int
|
||||
indexed_count: int
|
||||
last_crawled: Optional[datetime] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Audit
|
||||
# =============================================================================
|
||||
|
||||
class UsageEvent(BaseModel):
|
||||
"""Usage event for audit trail."""
|
||||
id: str
|
||||
document_id: str
|
||||
event_type: EventType
|
||||
user_id: Optional[str] = None
|
||||
details: Optional[Dict[str, Any]] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class AuditExport(BaseModel):
|
||||
"""GDPR-compliant audit export."""
|
||||
export_date: datetime
|
||||
requested_by: str
|
||||
events: List[UsageEvent]
|
||||
document_count: int
|
||||
date_range_start: datetime
|
||||
date_range_end: datetime
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def generate_id() -> str:
|
||||
"""Generate a new UUID."""
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
def get_training_allowed(bundesland: str) -> bool:
|
||||
"""Get training permission for a Bundesland."""
|
||||
return TRAINING_PERMISSIONS.get(bundesland.lower(), False)
|
||||
|
||||
|
||||
def get_bundesland_name(code: str) -> str:
|
||||
"""Get full Bundesland name from code."""
|
||||
info = BUNDESLAENDER.get(code.lower(), {})
|
||||
return info.get("name", code)
|
||||
|
||||
|
||||
def get_license_for_bundesland(bundesland: str) -> LicenseType:
|
||||
"""Get appropriate license type for a Bundesland."""
|
||||
if TRAINING_PERMISSIONS.get(bundesland.lower(), False):
|
||||
return LicenseType.GOV_STATUTE_FREE_USE
|
||||
return LicenseType.UNKNOWN_REQUIRES_REVIEW
|
||||
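For review purposes, a minimal usage sketch of the helpers above, showing how a crawler entry for one Bundesland could derive its license fields. The variable names (code, source_payload) are illustrative only and not part of this commit.

# Hypothetical illustration only, not shipped code:
code = "ni"
source_payload = {
    "bundesland": code,
    "name": get_bundesland_name(code),                  # "Niedersachsen"
    "license_type": get_license_for_bundesland(code),   # LicenseType.GOV_STATUTE_FREE_USE
    "training_allowed": get_training_allowed(code),     # True, per TRAINING_PERMISSIONS
}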
@@ -0,0 +1,415 @@
"""
Zeugnis Seed Data - Initial URLs from Word Document

Contains seed URLs for all 16 German federal states (Bundesländer)
based on the "Bundesland URL Zeugnisse.docx" document.

Training permissions:
- Ja: Amtliches Werk (§5 UrhG) - training allowed
- Nein: Keine Lizenz angegeben - training NOT allowed
- Eingeschränkt: Treated as NOT allowed for safety
"""

from typing import Dict, List, Any

# Seed data structure: bundesland -> list of seed URLs
SEED_DATA: Dict[str, Dict[str, Any]] = {
    "bw": {
        "name": "Baden-Württemberg",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.landesrecht-bw.de",
        "urls": [
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cru/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGBWpP5&doc.part=X&doc.price=0.0&doc.hl=1",
                "doc_type": "verordnung",
                "title": "Schulgesetz BW - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cs9/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-NotenBildVBW2016rahmen&doc.part=X&doc.price=0.0",
                "doc_type": "verordnung",
                "title": "Notenbildungsverordnung"
            }
        ]
    },
    "by": {
        "name": "Bayern",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-bayern.de",
        "urls": [
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BaySchO2016",
                "doc_type": "schulordnung",
                "title": "Bayerische Schulordnung"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayGSO",
                "doc_type": "schulordnung",
                "title": "Grundschulordnung Bayern"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayVSO",
                "doc_type": "schulordnung",
                "title": "Volksschulordnung Bayern"
            }
        ]
    },
    "be": {
        "name": "Berlin",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://gesetze.berlin.de",
        "urls": [
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SchulGBEpP58",
                "doc_type": "verordnung",
                "title": "Berliner Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SekIVBE2010rahmen",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I-Verordnung"
            }
        ]
    },
    "bb": {
        "name": "Brandenburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://bravors.brandenburg.de",
        "urls": [
            {
                "url": "https://bravors.brandenburg.de/verordnungen/vvzeugnis",
                "doc_type": "verordnung",
                "title": "Verwaltungsvorschriften Zeugnisse"
            },
            {
                "url": "https://bravors.brandenburg.de/verordnungen/gostv",
                "doc_type": "verordnung",
                "title": "GOST-Verordnung Brandenburg"
            }
        ]
    },
    "hb": {
        "name": "Bremen",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.transparenz.bremen.de",
        "urls": [
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/bremisches-schulgesetz-bremschg-vom-28-juni-2005-121009",
                "doc_type": "verordnung",
                "title": "Bremisches Schulgesetz"
            },
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/verordnung-ueber-die-sekundarstufe-i-der-oberschule-vom-20-juni-2017-130380",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I Verordnung Bremen"
            }
        ]
    },
    "hh": {
        "name": "Hamburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://www.landesrecht-hamburg.de",
        "urls": [
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-SchulGHA2009pP44",
                "doc_type": "verordnung",
                "title": "Hamburgisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-AusglLeistVHA2011rahmen",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung"
            }
        ]
    },
    "he": {
        "name": "Hessen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.rv.hessenrecht.hessen.de",
        "urls": [
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-SchulGHE2017pP73",
                "doc_type": "verordnung",
                "title": "Hessisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-VOBGM11HE2011rahmen",
                "doc_type": "verordnung",
                "title": "Verordnung zur Gestaltung des Schulverhältnisses"
            }
        ]
    },
    "mv": {
        "name": "Mecklenburg-Vorpommern",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht-mv.de",
        "urls": [
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-SchulGMV2010pP63",
                "doc_type": "verordnung",
                "title": "Schulgesetz MV - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-ZeugnVMVrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung MV"
            }
        ]
    },
    "ni": {
        "name": "Niedersachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.nds-voris.de",
        "urls": [
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGNDpP59",
                "doc_type": "verordnung",
                "title": "Niedersächsisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ErgZeugnErlNDrahmen",
                "doc_type": "erlass",
                "title": "Ergänzende Bestimmungen für Zeugnisse"
            },
            {
                "url": "https://www.mk.niedersachsen.de/startseite/schule/unsere_schulen/allgemein_bildende_schulen/zeugnisse_versetzungen/zeugnisse-und-versetzungen-6351.html",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NI"
            }
        ]
    },
    "nw": {
        "name": "Nordrhein-Westfalen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://recht.nrw.de",
        "urls": [
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000521",
                "doc_type": "verordnung",
                "title": "Schulgesetz NRW"
            },
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000525",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung Sek I"
            },
            {
                "url": "https://www.schulministerium.nrw/zeugnisse",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NRW"
            }
        ]
    },
    "rp": {
        "name": "Rheinland-Pfalz",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.rlp.de",
        "urls": [
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-SchulGRPpP61",
                "doc_type": "verordnung",
                "title": "Schulgesetz RP - Zeugnisse"
            },
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-ZeugnVRPrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung RP"
            }
        ]
    },
    "sl": {
        "name": "Saarland",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://recht.saarland.de",
        "urls": [
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-SchulOGSLrahmen",
                "doc_type": "schulordnung",
                "title": "Schulordnungsgesetz Saarland"
            },
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-ZeugnVSL2014rahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung Saarland"
            }
        ]
    },
    "sn": {
        "name": "Sachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.revosax.sachsen.de",
        "urls": [
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/4192-Schulgesetz-fuer-den-Freistaat-Sachsen",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen"
            },
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/13500-Schulordnung-Gymnasien-Abiturpruefung",
                "doc_type": "schulordnung",
                "title": "Schulordnung Gymnasien Sachsen"
            }
        ]
    },
    "st": {
        "name": "Sachsen-Anhalt",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht.sachsen-anhalt.de",
        "urls": [
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-SchulGSTpP27",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen-Anhalt"
            },
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-VersetzVST2017rahmen",
                "doc_type": "verordnung",
                "title": "Versetzungsverordnung ST"
            }
        ]
    },
    "sh": {
        "name": "Schleswig-Holstein",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-rechtsprechung.sh.juris.de",
        "urls": [
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGSHpP22",
                "doc_type": "verordnung",
                "title": "Schulgesetz SH - Zeugnisse"
            },
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ZeugnVSHrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung SH"
            }
        ]
    },
    "th": {
        "name": "Thüringen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.thueringen.de",
        "urls": [
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulGTHpP58",
                "doc_type": "verordnung",
                "title": "Thüringer Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulOTH2018rahmen",
                "doc_type": "schulordnung",
                "title": "Thüringer Schulordnung"
            }
        ]
    }
}


async def populate_seed_data():
    """Populate database with seed data."""
    from metrics_db import get_pool, upsert_zeugnis_source
    from zeugnis_models import generate_id

    pool = await get_pool()
    if not pool:
        print("Database not available")
        return False

    try:
        async with pool.acquire() as conn:
            for bundesland, data in SEED_DATA.items():
                # Create or update source
                source_id = generate_id()
                await upsert_zeugnis_source(
                    id=source_id,
                    bundesland=bundesland,
                    name=data["name"],
                    license_type=data["license"],
                    training_allowed=data["training_allowed"],
                    base_url=data.get("base_url"),
                )

                # Get the actual source ID (might be existing)
                existing = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    bundesland
                )
                if existing:
                    source_id = existing["id"]

                # Add seed URLs
                for url_data in data.get("urls", []):
                    url_id = generate_id()
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                        VALUES ($1, $2, $3, $4, 'pending')
                        ON CONFLICT DO NOTHING
                        """,
                        url_id, source_id, url_data["url"], url_data["doc_type"]
                    )

                print(f"Populated {bundesland}: {len(data.get('urls', []))} URLs")

        print("Seed data population complete!")
        return True

    except Exception as e:
        print(f"Failed to populate seed data: {e}")
        return False


def get_training_summary() -> Dict[str, Any]:
    """Get summary of training permissions."""
    allowed = []
    not_allowed = []

    for bundesland, data in SEED_DATA.items():
        name = data["name"]
        if data["training_allowed"]:
            allowed.append(f"{name} ({bundesland})")
        else:
            not_allowed.append(f"{name} ({bundesland})")

    return {
        "training_allowed": sorted(allowed),
        "training_not_allowed": sorted(not_allowed),
        "total_allowed": len(allowed),
        "total_not_allowed": len(not_allowed),
    }


if __name__ == "__main__":
    import asyncio

    print("=" * 60)
    print("Zeugnis Seed Data Summary")
    print("=" * 60)

    summary = get_training_summary()
    print(f"\nTraining ALLOWED ({summary['total_allowed']} Bundesländer):")
    for bl in summary["training_allowed"]:
        print(f"  ✓ {bl}")

    print(f"\nTraining NOT ALLOWED ({summary['total_not_allowed']} Bundesländer):")
    for bl in summary["training_not_allowed"]:
        print(f"  ✗ {bl}")

    print("\n" + "=" * 60)
    print("To populate database, run:")
    print("  python -c 'import asyncio; from zeugnis_seed_data import populate_seed_data; asyncio.run(populate_seed_data())'")
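For reviewers: a quick ad-hoc consistency check between this seed file and TRAINING_PERMISSIONS in zeugnis_models. This is a hedged sketch, assuming both modules resolve under the same import paths they use themselves; it is not part of the shipped package.

# Hypothetical review snippet, not shipped code:
from zeugnis_models import TRAINING_PERMISSIONS
from zeugnis_seed_data import SEED_DATA

mismatches = [
    code for code, data in SEED_DATA.items()
    if data["training_allowed"] != TRAINING_PERMISSIONS.get(code, False)
]
print("Bundesländer whose seed flag disagrees with TRAINING_PERMISSIONS:", mismatches or "none")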
@@ -0,0 +1,180 @@
"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""

import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any


# =============================================================================
# Configuration
# =============================================================================

QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

ZEUGNIS_COLLECTION = "bp_zeugnis"


# =============================================================================
# Embedding Generation
# =============================================================================

_embedding_model = None


def get_embedding_model():
    """Get or initialize embedding model."""
    global _embedding_model
    if _embedding_model is None and EMBEDDING_BACKEND == "local":
        try:
            from sentence_transformers import SentenceTransformer
            _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
            print("Loaded local embedding model: all-MiniLM-L6-v2")
        except ImportError:
            print("Warning: sentence-transformers not installed")
    return _embedding_model


async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of texts."""
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model:
            embeddings = model.encode(texts, show_progress_bar=False)
            return [emb.tolist() for emb in embeddings]
        return []

    elif EMBEDDING_BACKEND == "openai":
        import openai
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []

        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]

    return []


# =============================================================================
# MinIO Storage
# =============================================================================

async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload document to MinIO."""
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )

        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None


# =============================================================================
# Qdrant Indexing
# =============================================================================

async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks in Qdrant."""
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Ensure collection exists
        collections = client.get_collections().collections
        if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
            vector_size = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=vector_size,
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # Create points
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point_id = str(uuid.uuid4())
            points.append(PointStruct(
                id=point_id,
                vector=embedding,
                payload={
                    "document_id": doc_id,
                    "chunk_index": i,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                }
            ))

        # Upsert
        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )

        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0
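How the three coroutines above are intended to compose for a single document, as a hedged sketch: the chunks, metadata, and the _index_one_document name are placeholders, and the real caller lives in the crawler entry point, which is not part of this hunk.

# Hypothetical driver, assuming text was already extracted and chunked elsewhere:
import asyncio

async def _index_one_document(doc_id, pdf_bytes, chunks):
    embeddings = await generate_embeddings(chunks)                  # local or OpenAI backend
    minio_path = await upload_to_minio(pdf_bytes, bundesland="ni",
                                       filename=f"{doc_id}.pdf")    # landes-daten/ni/zeugnis/<year>/...
    indexed = await index_in_qdrant(
        doc_id, chunks, embeddings,
        metadata={"bundesland": "ni", "doc_type": "verordnung", "training_allowed": True},
    )
    print(f"stored at {minio_path}, {indexed} chunks indexed")

# asyncio.run(_index_one_document("doc-1", b"%PDF-...", ["chunk one", "chunk two"]))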
@@ -0,0 +1,110 @@
"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""

import hashlib
from typing import List

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def extract_text_from_pdf(content: bytes) -> str:
    """Extract text from PDF bytes."""
    try:
        from PyPDF2 import PdfReader
        import io

        reader = PdfReader(io.BytesIO(content))
        text_parts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)
        return "\n\n".join(text_parts)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""


def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Extract text from HTML bytes."""
    try:
        from bs4 import BeautifulSoup

        html = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(html, "html.parser")

        # Remove script and style elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Get text
        text = soup.get_text(separator="\n", strip=True)

        # Clean up whitespace
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n".join(lines)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks."""
    if not text:
        return []

    chunks = []
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        if len(text) <= chunk_size:
            return [text] if text.strip() else []

        if sep_index >= len(separators):
            # Force split at chunk_size
            result = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result

        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""

        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part

        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])

        return result

    chunks = split_recursive(text)

    # Add overlap
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                # Add end of previous chunk
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped

    return chunks


def compute_hash(content: bytes) -> str:
    """Compute SHA-256 hash of content."""
    return hashlib.sha256(content).hexdigest()
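End to end, the utilities in this file feed the embedding and indexing path above. A brief sketch for reviewers, with an illustrative local file path; not shipped code.

# Hypothetical usage of the extraction helpers:
with open("zeugnis.pdf", "rb") as fh:   # illustrative path
    raw = fh.read()

text = extract_text_from_pdf(raw)       # returns "" if PyPDF2 fails
chunks = chunk_text(text)               # ~1000-char chunks with 200-char overlap
digest = compute_hash(raw)              # SHA-256 hex digest, e.g. for the content_hash field
print(len(chunks), digest[:12])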
Some files were not shown because too many files have changed in this diff.