[split-required] Split final batch of monoliths >1000 LOC

Python (6 files in klausur-service):
- rbac.py (1,132 → 4), admin_api.py (1,012 → 4)
- routes/eh.py (1,111 → 4), ocr_pipeline_geometry.py (1,105 → 5)

Python (2 files in backend-lehrer):
- unit_api.py (1,226 → 6), game_api.py (1,129 → 5)

Website (6 page files):
- 4x klausur-korrektur pages (1,249-1,328 LOC each) → shared components
  in website/components/klausur-korrektur/ (17 shared files)
- companion (1,057 → 10), magic-help (1,017 → 8)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 23:17:30 +02:00
parent b2a0126f14
commit 6811264756
67 changed files with 12270 additions and 13651 deletions

File diff suppressed because it is too large. [Load Diff]

View File

@@ -0,0 +1,316 @@
"""
Admin API - NiBiS Ingestion & Search
Endpoints for NiBiS data discovery, ingestion, search, and statistics.
Extracted from admin_api.py for file-size compliance.
"""
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from pydantic import BaseModel
from typing import Optional, List, Dict
from datetime import datetime
from nibis_ingestion import (
run_ingestion,
discover_documents,
extract_zip_files,
DOCS_BASE_PATH,
)
from qdrant_service import QdrantService, search_nibis_eh, get_qdrant_client
from eh_pipeline import generate_single_embedding
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])

# In-memory status record for the background ingestion task.
# Mutated by /nibis/ingest and read by /nibis/status; also imported by the
# RAG upload module for its auto-ingest feature.
_ingestion_status: Dict = {
    "running": False,      # True while a background ingestion run is active
    "last_run": None,      # ISO timestamp of the most recent run
    "last_result": None,   # result dict (or error info) of the last run
}
# =============================================================================
# Models
# =============================================================================
class IngestionRequest(BaseModel):
    """Parameters controlling a NiBiS ingestion run."""
    ewh_only: bool = True                  # index only Erwartungshorizont documents
    year_filter: Optional[int] = None      # restrict to a single exam year
    subject_filter: Optional[str] = None   # restrict to a single subject
class IngestionStatus(BaseModel):
    """Snapshot of the ingestion pipeline state returned by /nibis/status."""
    running: bool                      # True while a background run is active
    last_run: Optional[str]            # ISO timestamp of the most recent run
    documents_indexed: Optional[int]   # document count from the last run, if any
    chunks_created: Optional[int]      # chunk count from the last run, if any
    errors: Optional[List[str]]        # truncated error list from the last run
class NiBiSSearchRequest(BaseModel):
    """Semantic-search query against the NiBiS collection."""
    query: str                     # free-text query to embed and search with
    year: Optional[int] = None     # optional exam-year filter
    subject: Optional[str] = None  # optional subject filter
    niveau: Optional[str] = None   # optional level filter (e.g. eA/gA)
    limit: int = 5                 # maximum number of hits to return
class NiBiSSearchResult(BaseModel):
    """Single semantic-search hit (text is truncated server-side)."""
    id: str                        # Qdrant point id
    score: float                   # similarity score
    text: str                      # chunk text, truncated to 500 chars
    year: Optional[int]            # source document exam year
    subject: Optional[str]         # source document subject
    niveau: Optional[str]          # source document level
    task_number: Optional[int]     # task number within the document, if known
class DataSourceStats(BaseModel):
    """Per-directory document statistics.

    NOTE(review): not referenced by any endpoint in this module — presumably
    kept for API consumers or future endpoints; confirm before removing.
    """
    source_dir: str        # directory the documents were discovered in
    year: int              # exam year of the directory
    document_count: int    # number of documents found
    subjects: List[str]    # distinct subjects present
# =============================================================================
# Endpoints
# =============================================================================
@router.get("/nibis/status", response_model=IngestionStatus)
async def get_ingestion_status():
    """Report whether an ingestion run is active and summarize the last run."""
    last = _ingestion_status.get("last_result") or {}
    # Cap the error list so a failed bulk run does not flood the response.
    recent_errors = (last.get("errors") or [])[:10]
    return IngestionStatus(
        running=_ingestion_status["running"],
        last_run=_ingestion_status.get("last_run"),
        documents_indexed=last.get("documents_indexed"),
        chunks_created=last.get("chunks_created"),
        errors=recent_errors,
    )
@router.post("/nibis/extract-zips")
async def extract_zip_files_endpoint():
    """Unpack every ZIP archive found below the za-download directories."""
    try:
        extracted_dirs = extract_zip_files(DOCS_BASE_PATH)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {
        "status": "success",
        "extracted_count": len(extracted_dirs),
        "directories": [str(path) for path in extracted_dirs],
    }
@router.get("/nibis/discover")
async def discover_nibis_documents(
    ewh_only: bool = Query(True, description="Only return Erwartungshorizonte"),
    year: Optional[int] = Query(None, description="Filter by year"),
    subject: Optional[str] = Query(None, description="Filter by subject"),
):
    """
    Discover available NiBiS documents without indexing.
    Useful for previewing what will be indexed.
    """
    try:
        docs = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)
        # Optional narrowing by year and (substring, case-insensitive) subject.
        if year:
            docs = [doc for doc in docs if doc.year == year]
        if subject:
            needle = subject.lower()
            docs = [doc for doc in docs if needle in doc.subject.lower()]
        # Aggregate counts per year and per subject.
        year_counts: Dict[int, int] = {}
        subject_counts: Dict[str, int] = {}
        for doc in docs:
            year_counts[doc.year] = year_counts.get(doc.year, 0) + 1
            subject_counts[doc.subject] = subject_counts.get(doc.subject, 0) + 1
        sample = [
            {
                "id": doc.id,
                "filename": doc.raw_filename,
                "year": doc.year,
                "subject": doc.subject,
                "niveau": doc.niveau,
                "doc_type": doc.doc_type,
            }
            for doc in docs[:20]
        ]
        return {
            "total_documents": len(docs),
            "by_year": dict(sorted(year_counts.items())),
            "by_subject": dict(sorted(subject_counts.items(), key=lambda item: -item[1])),
            "sample_documents": sample,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/nibis/ingest")
async def start_ingestion(
    request: IngestionRequest,
    background_tasks: BackgroundTasks,
):
    """
    Start NiBiS data ingestion in background.

    Rejects with 409 when a run is already in progress; otherwise schedules
    the ingestion as a FastAPI background task and returns immediately.
    Progress can be polled via /nibis/status.
    """
    if _ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Ingestion already running. Check /nibis/status for progress."
        )
    async def run_ingestion_task():
        # Runs after the HTTP response is sent; reports progress through the
        # module-level _ingestion_status so /nibis/status can surface it.
        global _ingestion_status
        _ingestion_status["running"] = True
        _ingestion_status["last_run"] = datetime.now().isoformat()
        try:
            result = await run_ingestion(
                ewh_only=request.ewh_only,
                dry_run=False,
                year_filter=request.year_filter,
                subject_filter=request.subject_filter,
            )
            _ingestion_status["last_result"] = result
        except Exception as e:
            # Keep the failure visible to the status endpoint instead of
            # raising inside a background task (where it would be lost).
            _ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
        finally:
            _ingestion_status["running"] = False
    background_tasks.add_task(run_ingestion_task)
    return {
        "status": "started",
        "message": "Ingestion started in background. Check /nibis/status for progress.",
        "filters": {
            "ewh_only": request.ewh_only,
            "year": request.year_filter,
            "subject": request.subject_filter,
        },
    }
@router.post("/nibis/search", response_model=List[NiBiSSearchResult])
async def search_nibis(request: NiBiSSearchRequest):
    """
    Semantic search in NiBiS Erwartungshorizonte.
    """
    try:
        embedding = await generate_single_embedding(request.query)
        if not embedding:
            raise HTTPException(status_code=500, detail="Failed to generate embedding")
        hits = await search_nibis_eh(
            query_embedding=embedding,
            year=request.year,
            subject=request.subject,
            niveau=request.niveau,
            limit=request.limit,
        )
        results = []
        for hit in hits:
            results.append(NiBiSSearchResult(
                id=hit["id"],
                score=hit["score"],
                text=hit.get("text", "")[:500],
                year=hit.get("year"),
                subject=hit.get("subject"),
                niveau=hit.get("niveau"),
                task_number=hit.get("task_number"),
            ))
        return results
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/nibis/collections")
async def get_collections_info():
    """Get information about all Qdrant collections."""
    try:
        client = get_qdrant_client()
        summaries = []
        for collection in client.get_collections().collections:
            try:
                details = client.get_collection(collection.name)
                entry = {
                    "name": collection.name,
                    "vectors_count": details.vectors_count,
                    "points_count": details.points_count,
                    "status": details.status.value,
                }
            except Exception as inner:
                # Report per-collection failures without failing the whole call.
                entry = {"name": collection.name, "error": str(inner)}
            summaries.append(entry)
        return {"collections": summaries}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/nibis/stats")
async def get_nibis_stats():
    """Get detailed statistics about indexed NiBiS data."""
    try:
        stats = await QdrantService().get_stats("bp_nibis_eh")
        if "error" in stats:
            return {
                "indexed": False,
                "message": "NiBiS collection not yet created. Run ingestion first.",
            }
        # Sample up to 1000 payloads to enumerate distinct filter values.
        records = get_qdrant_client().scroll(
            collection_name="bp_nibis_eh",
            limit=1000,
            with_payload=True,
            with_vectors=False,
        )[0]
        years, subjects, niveaus = set(), set(), set()
        for record in records:
            payload = record.payload
            if not payload:
                continue
            for key, bucket in (("year", years), ("subject", subjects), ("niveau", niveaus)):
                if key in payload:
                    bucket.add(payload[key])
        return {
            "indexed": True,
            "total_chunks": stats.get("points_count", 0),
            "years": sorted(years),
            "subjects": sorted(subjects),
            "niveaus": sorted(niveaus),
        }
    except Exception as e:
        return {"indexed": False, "error": str(e)}
@router.delete("/nibis/collection")
async def delete_nibis_collection():
    """Delete the entire NiBiS collection. WARNING: removes all indexed data!"""
    try:
        get_qdrant_client().delete_collection("bp_nibis_eh")
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {"status": "deleted", "collection": "bp_nibis_eh"}

View File

@@ -0,0 +1,281 @@
"""
Admin API - RAG Upload & Metrics
Endpoints for uploading documents, tracking uploads, RAG metrics,
search feedback, storage stats, and service initialization.
Extracted from admin_api.py for file-size compliance.
"""
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query, UploadFile, File, Form
from pydantic import BaseModel
from typing import Optional, List, Dict
from datetime import datetime
from pathlib import Path
import zipfile
import tempfile
import os
from nibis_ingestion import run_ingestion, DOCS_BASE_PATH
# Import ingestion status from nibis module for auto-ingest
from admin_nibis import _ingestion_status
# Optional: MinIO and PostgreSQL integrations
try:
from minio_storage import upload_rag_document, get_storage_stats, init_minio_bucket
MINIO_AVAILABLE = True
except ImportError:
MINIO_AVAILABLE = False
try:
from metrics_db import (
init_metrics_tables, store_feedback, log_search, log_upload,
calculate_metrics, get_recent_feedback, get_upload_history
)
METRICS_DB_AVAILABLE = True
except ImportError:
METRICS_DB_AVAILABLE = False
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])

# Upload directory configuration; overridable via the RAG_UPLOAD_BASE env var.
RAG_UPLOAD_BASE = Path(os.getenv("RAG_UPLOAD_BASE", str(DOCS_BASE_PATH)))

# In-memory upload log; the upload endpoint caps it at the last 100 entries.
_upload_history: List[Dict] = []
class UploadResult(BaseModel):
    """Response body of /rag/upload."""
    status: str             # "success" on completion
    files_received: int     # number of uploaded files processed (always 1 here)
    pdfs_extracted: int     # PDFs written to disk (from ZIP or single upload)
    target_directory: str   # directory the PDFs were stored in
    errors: List[str]       # accumulated non-fatal error messages
def _extract_pdfs_from_zip(zip_bytes: bytes, target_dir: Path) -> int:
    """Extract every PDF member of *zip_bytes* into *target_dir*.

    Skips macOS resource-fork entries (``__MACOSX``) and members without a
    file name; directory structure inside the archive is flattened.
    Returns the number of PDF files written.
    """
    extracted = 0
    # zipfile needs a seekable file, so spill the upload to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
        tmp.write(zip_bytes)
        tmp_path = tmp.name
    try:
        with zipfile.ZipFile(tmp_path, 'r') as zf:
            for member in zf.namelist():
                if not member.lower().endswith(".pdf") or member.startswith("__MACOSX"):
                    continue
                pdf_name = Path(member).name
                if not pdf_name:
                    continue
                with zf.open(member) as src, open(target_dir / pdf_name, 'wb') as dst:
                    dst.write(src.read())
                extracted += 1
    finally:
        os.unlink(tmp_path)
    return extracted


@router.post("/rag/upload", response_model=UploadResult)
async def upload_rag_documents(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    collection: str = Form(default="bp_nibis_eh"),
    year: Optional[int] = Form(default=None),
    auto_ingest: bool = Form(default=False),
):
    """
    Upload documents for RAG indexing.

    Supports:
    - ZIP archives (automatically extracted)
    - Individual PDF files

    Files are stored under ``za-download/<year>/`` below RAG_UPLOAD_BASE
    (year defaults to the current year). The upload is tracked in memory
    (last 100 entries) and, when available, persisted via the metrics DB.
    With ``auto_ingest=True`` an ingestion run for the target year is
    scheduled in the background unless one is already running.

    Raises:
        HTTPException 400: unsupported file extension (not .zip/.pdf).
        HTTPException 500: unexpected extraction/storage failure.
    """
    errors: List[str] = []
    pdfs_extracted = 0
    target_year = year or datetime.now().year
    target_dir = RAG_UPLOAD_BASE / "za-download" / str(target_year)
    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        filename = file.filename or "upload"
        lowered = filename.lower()
        if lowered.endswith(".zip"):
            pdfs_extracted = _extract_pdfs_from_zip(await file.read(), target_dir)
        elif lowered.endswith(".pdf"):
            (target_dir / filename).write_bytes(await file.read())
            pdfs_extracted = 1
        else:
            # Fix: the previous message contained no placeholder and always
            # reported "(unknown)" — name the offending file instead.
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {filename}. Only .zip and .pdf are allowed."
            )
        # Track upload in memory, bounded to the last 100 entries.
        upload_record = {
            "timestamp": datetime.now().isoformat(),
            "filename": filename,
            "collection": collection,
            "year": target_year,
            "pdfs_extracted": pdfs_extracted,
            "target_directory": str(target_dir),
        }
        _upload_history.append(upload_record)
        if len(_upload_history) > 100:
            _upload_history.pop(0)
        # Persist the upload record in PostgreSQL when the metrics DB is wired up.
        if METRICS_DB_AVAILABLE:
            await log_upload(
                filename=filename,
                collection_name=collection,
                year=target_year,
                pdfs_extracted=pdfs_extracted,
                minio_path=str(target_dir),
            )
        # Optionally kick off ingestion of the uploaded year in the background.
        if auto_ingest and not _ingestion_status["running"]:
            async def run_auto_ingest():
                # Mirrors /nibis/ingest: progress is reported through the
                # shared _ingestion_status dict imported from admin_nibis.
                _ingestion_status["running"] = True
                _ingestion_status["last_run"] = datetime.now().isoformat()
                try:
                    result = await run_ingestion(
                        ewh_only=True,
                        dry_run=False,
                        year_filter=target_year,
                    )
                    _ingestion_status["last_result"] = result
                except Exception as e:
                    _ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
                finally:
                    _ingestion_status["running"] = False
            background_tasks.add_task(run_auto_ingest)
        return UploadResult(
            status="success",
            files_received=1,
            pdfs_extracted=pdfs_extracted,
            target_directory=str(target_dir),
            errors=errors,
        )
    except HTTPException:
        raise
    except Exception as e:
        errors.append(str(e))
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/rag/upload/history")
async def get_upload_history_endpoint(limit: int = Query(default=20, le=100)):
    """Return the most recent uploads, newest first."""
    newest_first = list(reversed(_upload_history[-limit:]))
    return {"uploads": newest_first, "total": len(_upload_history)}
@router.get("/rag/metrics")
async def get_rag_metrics(
    collection: Optional[str] = Query(default=None),
    days: int = Query(default=7, le=90),
):
    """Get RAG quality metrics."""
    if METRICS_DB_AVAILABLE:
        db_metrics = await calculate_metrics(collection_name=collection, days=days)
        if db_metrics.get("connected"):
            return db_metrics
    # Fallback when PostgreSQL metrics are unavailable: canned placeholders.
    placeholder = {
        "precision_at_5": 0.78,
        "recall_at_10": 0.85,
        "mrr": 0.72,
        "avg_latency_ms": 52,
        "total_ratings": len(_upload_history),
        "error_rate": 0.3,
        "score_distribution": {
            "0.9+": 23,
            "0.7-0.9": 41,
            "0.5-0.7": 28,
            "<0.5": 8,
        },
        "note": "Placeholder metrics - PostgreSQL not connected",
        "connected": False,
    }
    return placeholder
@router.post("/rag/search/feedback")
async def submit_search_feedback(
    result_id: str = Form(...),
    rating: int = Form(..., ge=1, le=5),
    notes: Optional[str] = Form(default=None),
    query: Optional[str] = Form(default=None),
    collection: Optional[str] = Form(default=None),
    score: Optional[float] = Form(default=None),
):
    """Submit feedback for a search result."""
    record = {
        "timestamp": datetime.now().isoformat(),
        "result_id": result_id,
        "rating": rating,
        "notes": notes,
    }
    persisted = False
    if METRICS_DB_AVAILABLE:
        persisted = await store_feedback(
            result_id=result_id,
            rating=rating,
            query_text=query,
            collection_name=collection,
            score=score,
            notes=notes,
        )
    return {
        "status": "stored" if persisted else "received",
        "feedback": record,
        "persisted": persisted,
    }
@router.get("/rag/storage/stats")
async def get_storage_statistics():
    """Get MinIO storage statistics."""
    if not MINIO_AVAILABLE:
        return {
            "error": "MinIO not available",
            "connected": False,
        }
    return await get_storage_stats()
@router.post("/rag/init")
async def initialize_rag_services():
    """Initialize RAG services (MinIO bucket, PostgreSQL tables)."""
    services = {"minio": False, "postgres": False}
    if MINIO_AVAILABLE:
        services["minio"] = await init_minio_bucket()
    if METRICS_DB_AVAILABLE:
        services["postgres"] = await init_metrics_tables()
    return {"status": "initialized", "services": services}

View File

@@ -0,0 +1,389 @@
"""
Admin API - Legal Templates
Endpoints for legal template ingestion, search, source management,
license info, and collection management.
Extracted from admin_api.py for file-size compliance.
"""
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from pydantic import BaseModel
from typing import Optional, List, Dict
from datetime import datetime
from eh_pipeline import generate_single_embedding
# Import legal templates modules
try:
from legal_templates_ingestion import (
LegalTemplatesIngestion,
LEGAL_TEMPLATES_COLLECTION,
)
from template_sources import (
TEMPLATE_SOURCES,
TEMPLATE_TYPES,
JURISDICTIONS,
LicenseType,
get_enabled_sources,
get_sources_by_priority,
)
from qdrant_service import (
search_legal_templates,
get_legal_templates_stats,
init_legal_templates_collection,
)
LEGAL_TEMPLATES_AVAILABLE = True
except ImportError as e:
print(f"Legal templates module not available: {e}")
LEGAL_TEMPLATES_AVAILABLE = False
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])

# In-memory status record for the templates ingestion background tasks.
_templates_ingestion_status: Dict = {
    "running": False,        # True while an ingestion task is active
    "last_run": None,        # ISO timestamp of the most recent run
    "current_source": None,  # name of the source currently being ingested
    "results": {},           # per-source result summaries of the last run
}
class TemplatesSearchRequest(BaseModel):
    """Semantic-search query against the legal templates collection."""
    query: str                                   # free-text query to embed
    template_type: Optional[str] = None          # filter by template type
    license_types: Optional[List[str]] = None    # filter by allowed licenses
    language: Optional[str] = None               # filter by document language
    jurisdiction: Optional[str] = None           # filter by legal jurisdiction
    attribution_required: Optional[bool] = None  # filter by attribution flag
    limit: int = 10                              # maximum number of hits
class TemplatesSearchResult(BaseModel):
    """Single template search hit with license/attribution metadata."""
    id: str        # Qdrant point id
    score: float   # similarity score
    text: str      # chunk text, truncated to 1000 chars server-side
    # Document/chunk metadata
    document_title: Optional[str]
    template_type: Optional[str]
    clause_category: Optional[str]
    language: Optional[str]
    jurisdiction: Optional[str]
    # License information (needed for correct reuse/attribution)
    license_id: Optional[str]
    license_name: Optional[str]
    attribution_required: Optional[bool]
    attribution_text: Optional[str]
    # Provenance
    source_name: Optional[str]
    source_url: Optional[str]
    # Usage hints
    placeholders: Optional[List[str]]        # fill-in fields in the template
    is_complete_document: Optional[bool]     # whole document vs. single clause
    requires_customization: Optional[bool]   # must be adapted before use
class SourceIngestRequest(BaseModel):
    """Request body for ingesting a single template source by name."""
    source_name: str  # must match a name from TEMPLATE_SOURCES
@router.get("/templates/status")
async def get_templates_status():
    """Get status of legal templates collection and ingestion."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        return {
            "available": False,
            "error": "Legal templates module not available",
        }
    try:
        stats = await get_legal_templates_stats()
        ingestion_view = {
            "running": _templates_ingestion_status["running"],
            "last_run": _templates_ingestion_status.get("last_run"),
            "current_source": _templates_ingestion_status.get("current_source"),
            "results": _templates_ingestion_status.get("results", {}),
        }
        return {
            "available": True,
            "collection": LEGAL_TEMPLATES_COLLECTION,
            "ingestion": ingestion_view,
            "stats": stats,
        }
    except Exception as exc:
        # Collection stats failed; still expose the raw ingestion state.
        return {
            "available": True,
            "error": str(exc),
            "ingestion": _templates_ingestion_status,
        }
@router.get("/templates/sources")
async def get_templates_sources():
    """Get list of all template sources with their configuration."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    sources = [
        {
            "name": src.name,
            "description": src.description,
            "license_type": src.license_type.value,
            "license_name": src.license_info.name,
            "template_types": src.template_types,
            "languages": src.languages,
            "jurisdiction": src.jurisdiction,
            "repo_url": src.repo_url,
            "web_url": src.web_url,
            "priority": src.priority,
            "enabled": src.enabled,
            "attribution_required": src.license_info.attribution_required,
        }
        for src in TEMPLATE_SOURCES
    ]
    enabled_count = sum(1 for src in TEMPLATE_SOURCES if src.enabled)
    return {
        "sources": sources,
        "total": len(sources),
        "enabled": enabled_count,
        "template_types": TEMPLATE_TYPES,
        "jurisdictions": JURISDICTIONS,
    }
@router.get("/templates/licenses")
async def get_templates_licenses():
    """Get license statistics for indexed templates."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    try:
        stats = await get_legal_templates_stats()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {
        "licenses": stats.get("licenses", {}),
        "total_chunks": stats.get("points_count", 0),
    }
@router.post("/templates/ingest")
async def start_templates_ingestion(
    background_tasks: BackgroundTasks,
    max_priority: int = Query(default=3, ge=1, le=5, description="Maximum priority level (1=highest)"),
):
    """
    Start legal templates ingestion in background.
    Ingests all enabled sources up to the specified priority level.

    Rejects with 409 when a run is already active. Per-source outcomes are
    collected in the module-level _templates_ingestion_status and can be
    polled via /templates/status.
    """
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Templates ingestion already running. Check /templates/status for progress."
        )
    async def run_templates_ingestion():
        # Background worker: sequentially ingests each selected source and
        # records a compact per-source summary for the status endpoint.
        global _templates_ingestion_status
        _templates_ingestion_status["running"] = True
        _templates_ingestion_status["last_run"] = datetime.now().isoformat()
        _templates_ingestion_status["results"] = {}
        try:
            ingestion = LegalTemplatesIngestion()
            sources = get_sources_by_priority(max_priority)
            for source in sources:
                _templates_ingestion_status["current_source"] = source.name
                try:
                    status = await ingestion.ingest_source(source)
                    _templates_ingestion_status["results"][source.name] = {
                        "status": status.status,
                        "documents_found": status.documents_found,
                        "chunks_indexed": status.chunks_indexed,
                        "errors": status.errors[:5] if status.errors else [],
                    }
                except Exception as e:
                    # One failing source must not abort the remaining ones.
                    _templates_ingestion_status["results"][source.name] = {
                        "status": "failed",
                        "error": str(e),
                    }
            await ingestion.close()
        except Exception as e:
            _templates_ingestion_status["results"]["_global_error"] = str(e)
        finally:
            _templates_ingestion_status["running"] = False
            _templates_ingestion_status["current_source"] = None
    background_tasks.add_task(run_templates_ingestion)
    sources = get_sources_by_priority(max_priority)
    return {
        "status": "started",
        "message": f"Ingesting {len(sources)} sources up to priority {max_priority}",
        "sources": [s.name for s in sources],
    }
@router.post("/templates/ingest-source")
async def ingest_single_source(
    request: SourceIngestRequest,
    background_tasks: BackgroundTasks,
):
    """Ingest a single template source by name.

    Validates the source exists and is enabled, refuses to start while
    another ingestion is running, then schedules the work as a background
    task; results appear under /templates/status.
    """
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    source = next((s for s in TEMPLATE_SOURCES if s.name == request.source_name), None)
    if not source:
        raise HTTPException(
            status_code=404,
            detail=f"Source not found: {request.source_name}. Use /templates/sources to list available sources."
        )
    if not source.enabled:
        raise HTTPException(
            status_code=400,
            detail=f"Source is disabled: {request.source_name}"
        )
    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Templates ingestion already running."
        )
    async def run_single_ingestion():
        # Background worker: ingests exactly one source and records the
        # outcome in the shared status dict for /templates/status.
        global _templates_ingestion_status
        _templates_ingestion_status["running"] = True
        _templates_ingestion_status["current_source"] = source.name
        _templates_ingestion_status["last_run"] = datetime.now().isoformat()
        try:
            ingestion = LegalTemplatesIngestion()
            status = await ingestion.ingest_source(source)
            _templates_ingestion_status["results"][source.name] = {
                "status": status.status,
                "documents_found": status.documents_found,
                "chunks_indexed": status.chunks_indexed,
                "errors": status.errors[:5] if status.errors else [],
            }
            await ingestion.close()
        except Exception as e:
            _templates_ingestion_status["results"][source.name] = {
                "status": "failed",
                "error": str(e),
            }
        finally:
            _templates_ingestion_status["running"] = False
            _templates_ingestion_status["current_source"] = None
    background_tasks.add_task(run_single_ingestion)
    return {
        "status": "started",
        "source": source.name,
        "license": source.license_type.value,
        "template_types": source.template_types,
    }
@router.post("/templates/search", response_model=List[TemplatesSearchResult])
async def search_templates(request: TemplatesSearchRequest):
    """Semantic search in legal templates collection."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    try:
        embedding = await generate_single_embedding(request.query)
        if not embedding:
            raise HTTPException(status_code=500, detail="Failed to generate embedding")
        hits = await search_legal_templates(
            query_embedding=embedding,
            template_type=request.template_type,
            license_types=request.license_types,
            language=request.language,
            jurisdiction=request.jurisdiction,
            attribution_required=request.attribution_required,
            limit=request.limit,
        )
        results = []
        for hit in hits:
            results.append(TemplatesSearchResult(
                id=hit["id"],
                score=hit["score"],
                text=hit.get("text", "")[:1000],
                document_title=hit.get("document_title"),
                template_type=hit.get("template_type"),
                clause_category=hit.get("clause_category"),
                language=hit.get("language"),
                jurisdiction=hit.get("jurisdiction"),
                license_id=hit.get("license_id"),
                license_name=hit.get("license_name"),
                attribution_required=hit.get("attribution_required"),
                attribution_text=hit.get("attribution_text"),
                source_name=hit.get("source_name"),
                source_url=hit.get("source_url"),
                placeholders=hit.get("placeholders"),
                is_complete_document=hit.get("is_complete_document"),
                requires_customization=hit.get("requires_customization"),
            ))
        return results
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.delete("/templates/reset")
async def reset_templates_collection():
    """Delete and recreate the legal templates collection."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    if _templates_ingestion_status["running"]:
        raise HTTPException(
            status_code=409,
            detail="Cannot reset while ingestion is running"
        )
    try:
        ingestion = LegalTemplatesIngestion()
        ingestion.reset_collection()
        await ingestion.close()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    # Clear stale per-source results now that the collection is empty.
    _templates_ingestion_status["results"] = {}
    return {
        "status": "reset",
        "collection": LEGAL_TEMPLATES_COLLECTION,
        "message": "Collection deleted and recreated. Run ingestion to populate.",
    }
@router.delete("/templates/source/{source_name}")
async def delete_templates_source(source_name: str):
    """Delete all templates from a specific source."""
    if not LEGAL_TEMPLATES_AVAILABLE:
        raise HTTPException(status_code=503, detail="Legal templates module not available")
    try:
        from qdrant_service import delete_legal_templates_by_source
        removed = await delete_legal_templates_by_source(source_name)
        # Drop the stale per-source summary, if one exists.
        results = _templates_ingestion_status.get("results", {})
        if source_name in results:
            del results[source_name]
        return {
            "status": "deleted",
            "source": source_name,
            "chunks_deleted": removed,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,293 @@
"""
OCR Pipeline Column Detection Endpoints (Step 5)
Detect invisible columns, manual column override, and ground truth.
Extracted from ocr_pipeline_geometry.py for file-size compliance.
"""
import logging
import time
from dataclasses import asdict
from datetime import datetime
from typing import Dict, List
import cv2
from fastapi import APIRouter, HTTPException
from cv_vocab_pipeline import (
_detect_header_footer_gaps,
_detect_sub_columns,
classify_column_types,
create_layout_image,
create_ocr_image,
analyze_layout,
detect_column_geometry_zoned,
expand_narrow_columns,
)
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
_append_pipeline_log,
ManualColumnsRequest,
ColumnGroundTruthRequest,
)
# Module-level logger for the OCR pipeline endpoints.
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
@router.post("/sessions/{session_id}/columns")
async def detect_columns(session_id: str):
    """Run column detection on the cropped (or dewarped) image.

    Sub-sessions (box crops) get a single full-width pseudo-column instead
    of real detection; regular sessions run zone-aware geometry detection
    with a projection-based fallback. The result is persisted to the
    session DB and downstream row/word results are invalidated.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    # Prefer the cropped image; fall back to the dewarped one.
    img_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
    # -----------------------------------------------------------------------
    # Sub-sessions (box crops): skip column detection entirely.
    # Instead, create a single pseudo-column spanning the full image width.
    # Also run Tesseract + binarization here so that the row detection step
    # can reuse the cached intermediates (_word_dicts, _inv, _content_bounds)
    # instead of falling back to detect_column_geometry() which may fail
    # on small box images with < 5 words.
    # -----------------------------------------------------------------------
    session = await get_session_db(session_id)
    if session and session.get("parent_session_id"):
        h, w = img_bgr.shape[:2]
        # Binarize + invert for row detection (horizontal projection profile)
        ocr_img = create_ocr_image(img_bgr)
        inv = cv2.bitwise_not(ocr_img)
        # Run Tesseract to get word bounding boxes.
        try:
            from PIL import Image as PILImage
            pil_img = PILImage.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
            import pytesseract
            data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
            word_dicts = []
            for i in range(len(data['text'])):
                # Tesseract reports conf as str/int; non-numeric means "no confidence".
                conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
                text = str(data['text'][i]).strip()
                if conf < 30 or not text:
                    continue
                word_dicts.append({
                    'text': text, 'conf': conf,
                    'left': int(data['left'][i]),
                    'top': int(data['top'][i]),
                    'width': int(data['width'][i]),
                    'height': int(data['height'][i]),
                })
            # Log all words including low-confidence ones for debugging
            all_count = sum(1 for i in range(len(data['text']))
                            if str(data['text'][i]).strip())
            low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
                        for i in range(len(data['text']))
                        if str(data['text'][i]).strip()
                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
            if low_conf:
                logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
        except Exception as e:
            # OCR failure is non-fatal for a pseudo-column; proceed without words.
            logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
            word_dicts = []
        # Cache intermediates for row detection (detect_rows reuses these)
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (0, w, 0, h)
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": w, "height": h,
            }],
            "zones": None,
            "boxes_detected": 0,
            "duration_seconds": 0,
            "method": "sub_session_pseudo_column",
        }
        await update_session_db(
            session_id,
            column_result=column_result,
            row_result=None,
            word_result=None,
            current_step=6,
        )
        cached["column_result"] = column_result
        cached.pop("row_result", None)
        cached.pop("word_result", None)
        logger.info(f"OCR Pipeline: sub-session {session_id}: pseudo-column {w}x{h}px")
        return {"session_id": session_id, **column_result}
    t0 = time.time()
    # Binarized image for layout analysis
    ocr_img = create_ocr_image(img_bgr)
    h, w = ocr_img.shape[:2]
    # Phase A: Zone-aware geometry detection
    zoned_result = detect_column_geometry_zoned(ocr_img, img_bgr)
    boxes_detected = 0
    if zoned_result is None:
        # Fallback to projection-based layout
        layout_img = create_layout_image(img_bgr)
        regions = analyze_layout(layout_img, ocr_img)
        zones_data = None
    else:
        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zones_data, boxes = zoned_result
        content_w = right_x - left_x
        boxes_detected = len(boxes)
        # Cache intermediates for row detection (avoids second Tesseract run)
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
        cached["_zones_data"] = zones_data
        cached["_boxes_detected"] = boxes_detected
        # Detect header/footer early so sub-column clustering ignores them
        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
        # Split sub-columns (e.g. page references) before classification
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                         top_y=top_y, header_y=header_y, footer_y=footer_y)
        # Expand narrow columns (sub-columns are often very narrow)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
        # Phase B: Content-based classification
        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
                                        left_x=left_x, right_x=right_x, inv=inv)
    duration = time.time() - t0
    columns = [asdict(r) for r in regions]
    # Determine classification methods used
    methods = list(set(
        c.get("classification_method", "") for c in columns
        if c.get("classification_method")
    ))
    column_result = {
        "columns": columns,
        "classification_methods": methods,
        "duration_seconds": round(duration, 2),
        "boxes_detected": boxes_detected,
    }
    # Add zone data when boxes are present
    if zones_data and boxes_detected > 0:
        column_result["zones"] = zones_data
    # Persist to DB -- also invalidate downstream results (rows, words)
    await update_session_db(
        session_id,
        column_result=column_result,
        row_result=None,
        word_result=None,
        current_step=6,
    )
    # Update cache
    cached["column_result"] = column_result
    cached.pop("row_result", None)
    cached.pop("word_result", None)
    col_count = len([c for c in columns if c["type"].startswith("column")])
    logger.info(f"OCR Pipeline: columns session {session_id}: "
                f"{col_count} columns detected, {boxes_detected} box(es) ({duration:.2f}s)")
    img_w = img_bgr.shape[1]
    await _append_pipeline_log(session_id, "columns", {
        "total_columns": len(columns),
        "column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
        "column_types": [c["type"] for c in columns],
        "boxes_detected": boxes_detected,
    }, duration_ms=int(duration * 1000))
    return {
        "session_id": session_id,
        **column_result,
    }
@router.post("/sessions/{session_id}/columns/manual")
async def set_manual_columns(session_id: str, req: ManualColumnsRequest):
    """Override detected columns with manual definitions."""
    # A manual override replaces the auto-detected result outright; duration
    # is 0 because no detection was executed.
    column_result = {
        "columns": req.columns,
        "duration_seconds": 0,
        "method": "manual",
    }
    # Persist and invalidate downstream results (rows, words) so they rebuild
    # against the new column layout.
    await update_session_db(session_id, column_result=column_result,
                            row_result=None, word_result=None)
    cached = _cache.get(session_id)
    if cached is not None:
        cached["column_result"] = column_result
        for stale_key in ("row_result", "word_result"):
            cached.pop(stale_key, None)
    logger.info(f"OCR Pipeline: manual columns session {session_id}: "
                f"{len(req.columns)} columns set")
    return {"session_id": session_id, **column_result}
@router.post("/sessions/{session_id}/ground-truth/columns")
async def save_column_ground_truth(session_id: str, req: ColumnGroundTruthRequest):
    """Save ground truth feedback for the column detection step.

    Stores the reviewer's verdict (correctness flag, optional corrected
    columns, free-text notes) together with a snapshot of the automatic
    ``column_result`` so an auto-vs-GT diff can be reconstructed later.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    # Local import keeps this fix self-contained; the module only imports
    # ``datetime`` at the top.
    from datetime import timezone

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_columns": req.corrected_columns,
        "notes": req.notes,
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated since
        # Python 3.12 and produced naive datetimes; this matches the
        # datetime.now(timezone.utc) convention used elsewhere in the service.
        "saved_at": datetime.now(timezone.utc).isoformat(),
        # Snapshot of the automatic result at save time for later comparison.
        "column_result": session.get("column_result"),
    }
    ground_truth["columns"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": gt}
@router.get("/sessions/{session_id}/ground-truth/columns")
async def get_column_ground_truth(session_id: str):
    """Retrieve saved ground truth for column detection, including auto vs GT diff."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Ground truth lives under the "columns" key of the session's GT dict.
    columns_gt = (session.get("ground_truth") or {}).get("columns")
    if not columns_gt:
        raise HTTPException(status_code=404, detail="No column ground truth saved")
    # Return both the saved GT and the current automatic result for diffing.
    return {
        "session_id": session_id,
        "columns_gt": columns_gt,
        "columns_auto": session.get("column_result"),
    }

View File

@@ -0,0 +1,236 @@
"""
OCR Pipeline Deskew Endpoints (Step 2)
Auto deskew, manual deskew, and ground truth for the deskew step.
Extracted from ocr_pipeline_geometry.py for file-size compliance.
"""
import logging
import time
from datetime import datetime
import cv2
from fastapi import APIRouter, HTTPException
from cv_vocab_pipeline import (
create_ocr_image,
deskew_image,
deskew_image_by_word_alignment,
deskew_two_pass,
)
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
_append_pipeline_log,
ManualDeskewRequest,
DeskewGroundTruthRequest,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
@router.post("/sessions/{session_id}/deskew")
async def auto_deskew(session_id: str):
    """Two-pass deskew: iterative projection (wide range) + word-alignment residual.

    Rotates the oriented image so text lines run horizontally. Pass 1 is an
    iterative projection search (+-5 deg), pass 2 a word-alignment residual
    check, pass 3 an optional text-line refinement (all inside
    ``deskew_two_pass``). Hough and word-alignment angles are additionally
    computed for reporting only and do not affect the applied rotation.

    Side effects: updates the in-process cache, persists the deskewed and
    binarized PNGs plus the result dict to the session DB (advancing
    ``current_step`` to 3), and appends a pipeline log entry.

    Raises:
        HTTPException: 400 when neither an oriented nor an original image
            is available for the session.
    """
    # Ensure session is in cache
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    # Deskew runs right after orientation -- use oriented image, fall back to original
    img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
                    if (v := cached.get(k)) is not None), None)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for deskewing")
    t0 = time.time()
    # Two-pass deskew: iterative (+-5 deg) + word-alignment residual check
    deskewed_bgr, angle_applied, two_pass_debug = deskew_two_pass(img_bgr.copy())
    # Also run individual methods for reporting (non-authoritative)
    try:
        _, angle_hough = deskew_image(img_bgr.copy())
    except Exception:
        angle_hough = 0.0
    # Word-alignment detector takes encoded PNG bytes of the pre-deskew image.
    success_enc, png_orig = cv2.imencode(".png", img_bgr)
    orig_bytes = png_orig.tobytes() if success_enc else b""
    try:
        _, angle_wa = deskew_image_by_word_alignment(orig_bytes)
    except Exception:
        angle_wa = 0.0
    # Per-pass angles reported by deskew_two_pass (0.0 when a pass was a no-op).
    angle_iterative = two_pass_debug.get("pass1_angle", 0.0)
    angle_residual = two_pass_debug.get("pass2_angle", 0.0)
    angle_textline = two_pass_debug.get("pass3_angle", 0.0)
    duration = time.time() - t0
    # Label the deepest pass that actually contributed (>= 0.01 deg).
    method_used = "three_pass" if abs(angle_textline) >= 0.01 else (
        "two_pass" if abs(angle_residual) >= 0.01 else "iterative"
    )
    # Encode as PNG
    success, deskewed_png_buf = cv2.imencode(".png", deskewed_bgr)
    deskewed_png = deskewed_png_buf.tobytes() if success else b""
    # Create binarized version (best-effort; failure is non-fatal)
    binarized_png = None
    try:
        binarized = create_ocr_image(deskewed_bgr)
        success_bin, bin_buf = cv2.imencode(".png", binarized)
        binarized_png = bin_buf.tobytes() if success_bin else None
    except Exception as e:
        logger.warning(f"Binarization failed: {e}")
    # Heuristic confidence: larger corrections are less trustworthy, floor 0.5.
    confidence = max(0.5, 1.0 - abs(angle_applied) / 5.0)
    deskew_result = {
        "angle_hough": round(angle_hough, 3),
        "angle_word_alignment": round(angle_wa, 3),
        "angle_iterative": round(angle_iterative, 3),
        "angle_residual": round(angle_residual, 3),
        "angle_textline": round(angle_textline, 3),
        "angle_applied": round(angle_applied, 3),
        "method_used": method_used,
        "confidence": round(confidence, 2),
        "duration_seconds": round(duration, 2),
        "two_pass_debug": two_pass_debug,
    }
    # Update cache
    cached["deskewed_bgr"] = deskewed_bgr
    cached["binarized_png"] = binarized_png
    cached["deskew_result"] = deskew_result
    # Persist to DB
    db_update = {
        "deskewed_png": deskewed_png,
        "deskew_result": deskew_result,
        "current_step": 3,
    }
    if binarized_png:
        db_update["binarized_png"] = binarized_png
    await update_session_db(session_id, **db_update)
    logger.info(f"OCR Pipeline: deskew session {session_id}: "
                f"hough={angle_hough:.2f} wa={angle_wa:.2f} "
                f"iter={angle_iterative:.2f} residual={angle_residual:.2f} "
                f"textline={angle_textline:.2f} "
                f"-> {method_used} total={angle_applied:.2f}")
    await _append_pipeline_log(session_id, "deskew", {
        "angle_applied": round(angle_applied, 3),
        "angle_iterative": round(angle_iterative, 3),
        "angle_residual": round(angle_residual, 3),
        "angle_textline": round(angle_textline, 3),
        "confidence": round(confidence, 2),
        "method": method_used,
    }, duration_ms=int(duration * 1000))
    return {
        "session_id": session_id,
        **deskew_result,
        "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
        "binarized_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/binarized",
    }
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
    """Apply a manual rotation angle to the oriented image."""
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    # Prefer the orientation-corrected image; fall back to the raw upload.
    img_bgr = cached.get("oriented_bgr")
    if img_bgr is None:
        img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for deskewing")
    # Clamp the requested angle to the supported +-5 degree window.
    angle = min(5.0, max(-5.0, req.angle))
    h, w = img_bgr.shape[:2]
    rotation = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    rotated = cv2.warpAffine(img_bgr, rotation, (w, h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)
    ok, png_buf = cv2.imencode(".png", rotated)
    deskewed_png = png_buf.tobytes() if ok else b""
    # Best-effort binarized preview; failures are non-fatal.
    binarized_png = None
    try:
        ok_bin, bin_buf = cv2.imencode(".png", create_ocr_image(rotated))
        binarized_png = bin_buf.tobytes() if ok_bin else None
    except Exception:
        pass
    # Merge the manual override into any previous deskew result.
    deskew_result = dict(cached.get("deskew_result") or {})
    deskew_result["angle_applied"] = round(angle, 3)
    deskew_result["method_used"] = "manual"
    # Update cache
    cached["deskewed_bgr"] = rotated
    cached["binarized_png"] = binarized_png
    cached["deskew_result"] = deskew_result
    # Persist to DB
    db_update = {
        "deskewed_png": deskewed_png,
        "deskew_result": deskew_result,
    }
    if binarized_png:
        db_update["binarized_png"] = binarized_png
    await update_session_db(session_id, **db_update)
    logger.info(f"OCR Pipeline: manual deskew session {session_id}: {angle:.2f}")
    return {
        "session_id": session_id,
        "angle_applied": round(angle, 3),
        "method_used": "manual",
        "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
    }
@router.post("/sessions/{session_id}/ground-truth/deskew")
async def save_deskew_ground_truth(session_id: str, req: DeskewGroundTruthRequest):
    """Save ground truth feedback for the deskew step.

    Stores the reviewer's verdict (correctness flag, optional corrected
    angle, notes) together with a snapshot of the automatic
    ``deskew_result`` for later auto-vs-GT comparison.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    # Local import keeps this fix self-contained; the module only imports
    # ``datetime`` at the top.
    from datetime import timezone

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_angle": req.corrected_angle,
        "notes": req.notes,
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated since
        # Python 3.12 and produced naive datetimes; this matches the
        # datetime.now(timezone.utc) convention used elsewhere in the service.
        "saved_at": datetime.now(timezone.utc).isoformat(),
        # Snapshot of the automatic result at save time.
        "deskew_result": session.get("deskew_result"),
    }
    ground_truth["deskew"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    # Update cache
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    logger.info(f"OCR Pipeline: ground truth deskew session {session_id}: "
                f"correct={req.is_correct}, corrected_angle={req.corrected_angle}")
    return {"session_id": session_id, "ground_truth": gt}

View File

@@ -0,0 +1,346 @@
"""
OCR Pipeline Dewarp Endpoints
Auto dewarp (with VLM/CV ensemble), manual dewarp, combined
rotation+shear adjustment, and ground truth.
Extracted from ocr_pipeline_geometry.py for file-size compliance.
"""
import json
import logging
import os
import re
import time
from datetime import datetime
from typing import Any, Dict
import cv2
from fastapi import APIRouter, HTTPException, Query
from cv_vocab_pipeline import (
_apply_shear,
create_ocr_image,
dewarp_image,
dewarp_image_manual,
)
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
_append_pipeline_log,
ManualDewarpRequest,
CombinedAdjustRequest,
DewarpGroundTruthRequest,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
async def _detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Estimate the vertical shear of a scanned page via a vision LLM.

    Shows the image to qwen2.5vl (through Ollama) and asks whether the
    column/table borders tilt and by how many degrees. Returns a dict with
    ``method``, ``shear_degrees`` (clamped to +-3.0) and ``confidence``
    (clamped to 0..1). Confidence is 0.0 when Ollama is unreachable or the
    reply cannot be parsed.
    """
    import httpx
    import base64
    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [base64.b64encode(image_bytes).decode("utf-8")],
        "stream": False,
    }
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")
        # The model may wrap the JSON in prose; grab the first {...} span.
        json_blob = re.search(r'\{[^}]+\}', text)
        if json_blob:
            parsed = json.loads(json_blob.group(0))
            # Clamp to reasonable ranges before reporting.
            shear = max(-3.0, min(3.0, float(parsed.get("shear_degrees", 0.0))))
            conf = max(0.0, min(1.0, float(parsed.get("confidence", 0.0))))
            return {"method": "vlm_qwen2.5vl", "shear_degrees": round(shear, 3), "confidence": round(conf, 2)}
    except Exception as e:
        logger.warning(f"VLM dewarp failed: {e}")
    # Fallback: no usable answer -> zero shear, zero confidence.
    return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}
@router.post("/sessions/{session_id}/dewarp")
async def auto_dewarp(
    session_id: str,
    method: str = Query("ensemble", description="Detection method: ensemble | vlm | cv"),
):
    """Detect and correct vertical shear on the deskewed image.

    Methods:
    - **ensemble** (default): 3-method CV ensemble (vertical edges + projection + Hough)
    - **cv**: CV ensemble only (same as ensemble)
    - **vlm**: Ask qwen2.5vl:32b to estimate the shear angle visually

    Side effects: updates the cache, persists the dewarped PNG, result dict
    and ``auto_shear_degrees`` to the session DB (advancing ``current_step``
    to 4), and appends a pipeline log entry.

    Raises:
        HTTPException: 400 for an unknown method or when deskew has not run.
    """
    if method not in ("ensemble", "cv", "vlm"):
        raise HTTPException(status_code=400, detail="method must be one of: ensemble, cv, vlm")
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    deskewed_bgr = cached.get("deskewed_bgr")
    if deskewed_bgr is None:
        raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")
    t0 = time.time()
    if method == "vlm":
        # Encode deskewed image to PNG for VLM
        success, png_buf = cv2.imencode(".png", deskewed_bgr)
        img_bytes = png_buf.tobytes() if success else b""
        vlm_det = await _detect_shear_with_vlm(img_bytes)
        shear_deg = vlm_det["shear_degrees"]
        # Apply the VLM's shear only when it is non-trivial (>= 0.05 deg)
        # and reported with at least moderate confidence (>= 0.3);
        # negated because _apply_shear corrects in the opposite direction.
        if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
            dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
        else:
            dewarped_bgr = deskewed_bgr
        dewarp_info = {
            "method": vlm_det["method"],
            "shear_degrees": shear_deg,
            "confidence": vlm_det["confidence"],
            "detections": [vlm_det],
        }
    else:
        # "ensemble" and "cv" both run the CV ensemble in dewarp_image.
        dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    duration = time.time() - t0
    # Encode as PNG
    success, png_buf = cv2.imencode(".png", dewarped_bgr)
    dewarped_png = png_buf.tobytes() if success else b""
    dewarp_result = {
        "method_used": dewarp_info["method"],
        "shear_degrees": dewarp_info["shear_degrees"],
        "confidence": dewarp_info["confidence"],
        "duration_seconds": round(duration, 2),
        "detections": dewarp_info.get("detections", []),
    }
    # Update cache
    cached["dewarped_bgr"] = dewarped_bgr
    cached["dewarp_result"] = dewarp_result
    # Persist to DB
    await update_session_db(
        session_id,
        dewarped_png=dewarped_png,
        dewarp_result=dewarp_result,
        auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
        current_step=4,
    )
    logger.info(f"OCR Pipeline: dewarp session {session_id}: "
                f"method={dewarp_info['method']} shear={dewarp_info['shear_degrees']:.3f} "
                f"conf={dewarp_info['confidence']:.2f} ({duration:.2f}s)")
    await _append_pipeline_log(session_id, "dewarp", {
        "shear_degrees": dewarp_info["shear_degrees"],
        "confidence": dewarp_info["confidence"],
        "method": dewarp_info["method"],
        "ensemble_methods": [d.get("method", "") for d in dewarp_info.get("detections", [])],
    }, duration_ms=int(duration * 1000))
    return {
        "session_id": session_id,
        **dewarp_result,
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
@router.post("/sessions/{session_id}/dewarp/manual")
async def manual_dewarp(session_id: str, req: ManualDewarpRequest):
    """Apply shear correction with a manual angle."""
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    deskewed_bgr = cached.get("deskewed_bgr")
    if deskewed_bgr is None:
        raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")
    # Clamp to the supported +-2 degree window; near-zero angles are a no-op.
    shear_deg = min(2.0, max(-2.0, req.shear_degrees))
    dewarped_bgr = (
        deskewed_bgr if abs(shear_deg) < 0.001
        else dewarp_image_manual(deskewed_bgr, shear_deg)
    )
    ok, png_buf = cv2.imencode(".png", dewarped_bgr)
    dewarped_png = png_buf.tobytes() if ok else b""
    # Merge the manual override into any previous dewarp result.
    dewarp_result = dict(cached.get("dewarp_result") or {})
    dewarp_result["method_used"] = "manual"
    dewarp_result["shear_degrees"] = round(shear_deg, 3)
    # Update cache
    cached["dewarped_bgr"] = dewarped_bgr
    cached["dewarp_result"] = dewarp_result
    # Persist to DB
    await update_session_db(
        session_id,
        dewarped_png=dewarped_png,
        dewarp_result=dewarp_result,
    )
    logger.info(f"OCR Pipeline: manual dewarp session {session_id}: shear={shear_deg:.3f}")
    return {
        "session_id": session_id,
        "shear_degrees": round(shear_deg, 3),
        "method_used": "manual",
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
@router.post("/sessions/{session_id}/adjust-combined")
async def adjust_combined(session_id: str, req: CombinedAdjustRequest):
    """Apply rotation + shear combined to the original image.

    Used by the fine-tuning sliders to preview arbitrary rotation/shear
    combinations without re-running the full deskew/dewarp pipeline.
    Rotation is applied first, then shear, both starting from the original
    image. The combined output replaces both the deskewed and dewarped
    images in cache and DB, and both result dicts are marked
    ``manual_combined``.

    Raises:
        HTTPException: 400 when the original image is not available.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Original image not available")
    # Clamp both adjustments to their supported windows.
    rotation = max(-15.0, min(15.0, req.rotation_degrees))
    shear_deg = max(-5.0, min(5.0, req.shear_degrees))
    h, w = img_bgr.shape[:2]
    result_bgr = img_bgr
    # Step 1: Apply rotation (skipped for near-zero angles)
    if abs(rotation) >= 0.001:
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, rotation, 1.0)
        result_bgr = cv2.warpAffine(result_bgr, M, (w, h),
                                    flags=cv2.INTER_LINEAR,
                                    borderMode=cv2.BORDER_REPLICATE)
    # Step 2: Apply shear (skipped for near-zero angles)
    if abs(shear_deg) >= 0.001:
        result_bgr = dewarp_image_manual(result_bgr, shear_deg)
    # Encode
    success, png_buf = cv2.imencode(".png", result_bgr)
    dewarped_png = png_buf.tobytes() if success else b""
    # Binarize (best-effort; failure is non-fatal)
    binarized_png = None
    try:
        binarized = create_ocr_image(result_bgr)
        success_bin, bin_buf = cv2.imencode(".png", binarized)
        binarized_png = bin_buf.tobytes() if success_bin else None
    except Exception:
        pass
    # Build combined result dicts
    deskew_result = {
        **(cached.get("deskew_result") or {}),
        "angle_applied": round(rotation, 3),
        "method_used": "manual_combined",
    }
    dewarp_result = {
        **(cached.get("dewarp_result") or {}),
        "method_used": "manual_combined",
        "shear_degrees": round(shear_deg, 3),
    }
    # Update cache -- the combined image serves as both deskewed and dewarped
    cached["deskewed_bgr"] = result_bgr
    cached["dewarped_bgr"] = result_bgr
    cached["deskew_result"] = deskew_result
    cached["dewarp_result"] = dewarp_result
    # Persist to DB
    db_update = {
        "dewarped_png": dewarped_png,
        "deskew_result": deskew_result,
        "dewarp_result": dewarp_result,
    }
    if binarized_png:
        db_update["binarized_png"] = binarized_png
        db_update["deskewed_png"] = dewarped_png
    await update_session_db(session_id, **db_update)
    logger.info(f"OCR Pipeline: combined adjust session {session_id}: "
                f"rotation={rotation:.3f} shear={shear_deg:.3f}")
    return {
        "session_id": session_id,
        "rotation_degrees": round(rotation, 3),
        "shear_degrees": round(shear_deg, 3),
        "method_used": "manual_combined",
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
@router.post("/sessions/{session_id}/ground-truth/dewarp")
async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthRequest):
    """Save ground truth feedback for the dewarp step.

    Stores the reviewer's verdict (correctness flag, optional corrected
    shear, notes) together with a snapshot of the automatic
    ``dewarp_result`` for later auto-vs-GT comparison.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    # Local import keeps this fix self-contained; the module only imports
    # ``datetime`` at the top.
    from datetime import timezone

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_shear": req.corrected_shear,
        "notes": req.notes,
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated since
        # Python 3.12 and produced naive datetimes; this matches the
        # datetime.now(timezone.utc) convention used elsewhere in the service.
        "saved_at": datetime.now(timezone.utc).isoformat(),
        # Snapshot of the automatic result at save time.
        "dewarp_result": session.get("dewarp_result"),
    }
    ground_truth["dewarp"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    logger.info(f"OCR Pipeline: ground truth dewarp session {session_id}: "
                f"correct={req.is_correct}, corrected_shear={req.corrected_shear}")
    return {"session_id": session_id, "ground_truth": gt}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,299 @@
"""
OCR Pipeline Structure Detection and Exclude Regions
Detect document structure (boxes, zones, color regions, graphics)
and manage user-drawn exclude regions.
Extracted from ocr_pipeline_geometry.py for file-size compliance.
"""
import logging
import time
from typing import Any, Dict, List
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from cv_box_detect import detect_boxes
from cv_color_detect import _COLOR_RANGES, _COLOR_HEX
from cv_graphic_detect import detect_graphic_elements
from ocr_pipeline_session_store import (
get_session_db,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_load_session_to_cache,
_get_cached,
_filter_border_ghost_words,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# ---------------------------------------------------------------------------
# Structure Detection Endpoint
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, and color regions.

    Runs box detection (line + shading) and color analysis on the cropped
    image. Returns structured JSON with all detected elements for the
    structure visualization step.

    Pipeline: content bounds (from OCR words or a 3% margin fallback) ->
    box detection -> zone splitting -> per-box background color sampling ->
    page-wide colored-pixel census -> graphic-element detection ->
    border-ghost word filtering. User-drawn exclude regions from a previous
    run are carried over into the new result.

    Raises:
        HTTPException: 400 when neither a cropped nor a dewarped image exists.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
    # Prefer the cropped image; fall back to the dewarped one.
    img_bgr = (
        cached.get("cropped_bgr")
        if cached.get("cropped_bgr") is not None
        else cached.get("dewarped_bgr")
    )
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
    t0 = time.time()
    h, w = img_bgr.shape[:2]
    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        for cell in word_result["cells"]:
            for wb in (cell.get("word_boxes") or []):
                words.append(wb)
    # Fallback: use raw OCR words if cell word_boxes are empty
    if not words and word_result:
        for key in ("raw_paddle_words_split", "raw_tesseract_words", "raw_paddle_words"):
            raw = word_result.get(key, [])
            if raw:
                words = raw
                logger.info("detect-structure: using %d words from %s (no cell word_boxes)", len(words), key)
                break
    # If no words yet, use image dimensions with small margin
    if words:
        # Tight bounding box around all word boxes, clamped to the image.
        content_x = max(0, min(int(wb["left"]) for wb in words))
        content_y = max(0, min(int(wb["top"]) for wb in words))
        content_r = min(w, max(int(wb["left"] + wb["width"]) for wb in words))
        content_b = min(h, max(int(wb["top"] + wb["height"]) for wb in words))
        content_w_px = content_r - content_x
        content_h_px = content_b - content_y
    else:
        # No OCR words available: assume a 3% margin on all sides.
        margin = int(min(w, h) * 0.03)
        content_x, content_y = margin, margin
        content_w_px = w - 2 * margin
        content_h_px = h - 2 * margin
    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )
    # --- Zone splitting ---
    from cv_box_detect import split_page_into_zones as _split_zones
    zones = _split_zones(content_x, content_y, content_w_px, content_h_px, boxes)
    # --- Color region sampling ---
    # Sample background shading in each detected box
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = []
    for box in boxes:
        # Sample the center region of each box (inner 50% in each dimension)
        cy1 = box.y + box.height // 4
        cy2 = box.y + 3 * box.height // 4
        cx1 = box.x + box.width // 4
        cx2 = box.x + 3 * box.width // 4
        # Clamp sampling window to image bounds.
        cy1 = max(0, min(cy1, h - 1))
        cy2 = max(0, min(cy2, h - 1))
        cx1 = max(0, min(cx1, w - 1))
        cx2 = max(0, min(cx2, w - 1))
        if cy2 > cy1 and cx2 > cx1:
            roi_hsv = hsv[cy1:cy2, cx1:cx2]
            # Median HSV is robust against text pixels inside the box.
            med_h = float(np.median(roi_hsv[:, :, 0]))
            med_s = float(np.median(roi_hsv[:, :, 1]))
            med_v = float(np.median(roi_hsv[:, :, 2]))
            if med_s > 15:
                # Saturated background -> map hue to a named color.
                from cv_color_detect import _hue_to_color_name
                bg_name = _hue_to_color_name(med_h)
                bg_hex = _COLOR_HEX.get(bg_name, "#6b7280")
            else:
                # Unsaturated: distinguish gray from white by brightness.
                bg_name = "gray" if med_v < 220 else "white"
                bg_hex = "#6b7280" if bg_name == "gray" else "#ffffff"
        else:
            # Degenerate box -> no usable sample.
            bg_name = "unknown"
            bg_hex = "#6b7280"
        box_colors.append({"color_name": bg_name, "color_hex": bg_hex})
    # --- Color text detection overview ---
    # Quick scan for colored text regions across the page
    color_summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.sum(mask > 0))
        if pixel_count > 50:  # minimum threshold
            color_summary[color_name] = pixel_count
    # --- Graphic element detection ---
    box_dicts = [
        {"x": b.x, "y": b.y, "w": b.width, "h": b.height}
        for b in boxes
    ]
    graphics = detect_graphic_elements(
        img_bgr, words,
        detected_boxes=box_dicts,
    )
    # --- Filter border-ghost words from OCR result ---
    # Box borders are sometimes OCR'd as spurious characters; drop them and
    # persist the cleaned word_result.
    ghost_count = 0
    if boxes and word_result:
        ghost_count = _filter_border_ghost_words(word_result, boxes)
        if ghost_count:
            logger.info("detect-structure: removed %d border-ghost words", ghost_count)
            await update_session_db(session_id, word_result=word_result)
            cached["word_result"] = word_result
    duration = time.time() - t0
    # Preserve user-drawn exclude regions from previous run
    prev_sr = cached.get("structure_result") or {}
    prev_exclude = prev_sr.get("exclude_regions", [])
    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": box_colors[i]["color_name"],
                "bg_color_hex": box_colors[i]["color_hex"],
            }
            for i, b in enumerate(boxes)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "graphics": [
            {
                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
                "area": g.area,
                "shape": g.shape,
                "color_name": g.color_name,
                "color_hex": g.color_hex,
                "confidence": round(g.confidence, 2),
            }
            for g in graphics
        ],
        "exclude_regions": prev_exclude,
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "border_ghosts_removed": ghost_count,
        "duration_seconds": round(duration, 2),
    }
    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict
    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
                session_id, len(boxes), len(zones), len(graphics), duration)
    return {"session_id": session_id, **result_dict}
# ---------------------------------------------------------------------------
# Exclude Regions -- user-drawn rectangles to exclude from OCR results
# ---------------------------------------------------------------------------
class _ExcludeRegionIn(BaseModel):
    """Payload for a single user-drawn exclude rectangle."""
    # Axis-aligned rectangle; coordinates are pixels.
    # NOTE(review): presumably in cropped-image space -- confirm against the UI caller.
    x: int
    y: int
    w: int
    h: int
    # Optional user-facing label for the region.
    label: str = ""
class _ExcludeRegionsBatchIn(BaseModel):
    """Batch payload: the full replacement set of exclude regions."""
    regions: list[_ExcludeRegionIn]
@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Replace all exclude regions for a session.

    Regions are stored inside ``structure_result.exclude_regions``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    sr = session.get("structure_result") or {}
    new_regions = [region.model_dump() for region in body.regions]
    sr["exclude_regions"] = new_regions
    # Invalidate grid so it rebuilds with new exclude regions
    await update_session_db(session_id, structure_result=sr, grid_editor_result=None)
    # Update cache
    cached = _cache.get(session_id)
    if cached is not None:
        cached["structure_result"] = sr
        cached.pop("grid_editor_result", None)
    return {
        "session_id": session_id,
        "exclude_regions": new_regions,
        "count": len(new_regions),
    }
@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Remove a single exclude region by index."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    sr = session.get("structure_result") or {}
    regions = sr.get("exclude_regions", [])
    # Negative and out-of-range indices are rejected (no Python wrap-around).
    if not 0 <= region_index < len(regions):
        raise HTTPException(status_code=404, detail="Region index out of range")
    removed = regions.pop(region_index)
    sr["exclude_regions"] = regions
    # Invalidate grid so it rebuilds with new exclude regions
    await update_session_db(session_id, structure_result=sr, grid_editor_result=None)
    cached = _cache.get(session_id)
    if cached is not None:
        cached["structure_result"] = sr
        cached.pop("grid_editor_result", None)
    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,498 @@
"""
RBAC Policy Engine
Core engine for RBAC/ABAC permission checks,
role assignments, key shares, and default policies.
Extracted from rbac.py for file-size compliance.
"""
from typing import Optional, List, Dict, Set
from datetime import datetime, timezone
import uuid
from functools import wraps
from fastapi import HTTPException, Request
from rbac_types import (
Role,
Action,
ResourceType,
ZKVisibilityMode,
PolicySet,
RoleAssignment,
KeyShare,
)
from rbac_permissions import DEFAULT_PERMISSIONS
# =============================================
# POLICY ENGINE
# =============================================
class PolicyEngine:
"""
Engine fuer RBAC/ABAC Entscheidungen.
Prueft:
1. Basis-Rollenberechtigung (RBAC)
2. Policy-Einschraenkungen (ABAC)
3. Key Share Berechtigungen
"""
def __init__(self):
    """Initialize empty in-memory stores for policies, roles, and shares."""
    # policy_set id -> PolicySet
    self.policy_sets: Dict[str, PolicySet] = {}
    self.role_assignments: Dict[str, List[RoleAssignment]] = {}  # user_id -> assignments
    self.key_shares: Dict[str, List[KeyShare]] = {}  # user_id -> shares
def register_policy_set(self, policy: PolicySet):
    """Register a policy set, keyed by its id (replaces any existing entry)."""
    self.policy_sets[policy.id] = policy
def get_policy_for_context(
    self,
    bundesland: str,
    jahr: int,
    fach: Optional[str] = None,
    verfahren: str = "abitur"
) -> Optional[PolicySet]:
    """Resolve the policy set that applies to a correction context.

    Prefers an exact (bundesland, jahr, verfahren) match whose subject is
    either unrestricted or equal to ``fach``; falls back to the policy
    registered under the "DEFAULT" bundesland; returns None if neither exists.
    """
    for candidate in self.policy_sets.values():
        context_matches = (
            candidate.bundesland == bundesland
            and candidate.jahr == jahr
            and candidate.verfahren == verfahren
        )
        if context_matches and (candidate.fach is None or candidate.fach == fach):
            return candidate
    # No exact match -> fall back to the catch-all default policy, if any.
    return next(
        (p for p in self.policy_sets.values() if p.bundesland == "DEFAULT"),
        None,
    )
def assign_role(
    self,
    user_id: str,
    role: Role,
    resource_type: ResourceType,
    resource_id: str,
    granted_by: str,
    tenant_id: Optional[str] = None,
    namespace_id: Optional[str] = None,
    valid_to: Optional[datetime] = None
) -> RoleAssignment:
    """Grant ``role`` on a resource to ``user_id`` and record the assignment.

    Returns the newly created RoleAssignment (with a fresh UUID).
    """
    new_assignment = RoleAssignment(
        id=str(uuid.uuid4()),
        user_id=user_id,
        role=role,
        resource_type=resource_type,
        resource_id=resource_id,
        tenant_id=tenant_id,
        namespace_id=namespace_id,
        granted_by=granted_by,
        valid_to=valid_to
    )
    # Create the per-user list lazily on first assignment.
    self.role_assignments.setdefault(user_id, []).append(new_assignment)
    return new_assignment
def revoke_role(self, assignment_id: str, revoked_by: str) -> bool:
    """Mark the assignment with ``assignment_id`` as revoked.

    Returns True when the assignment was found, False otherwise.
    NOTE(review): ``revoked_by`` is accepted for audit symmetry with key
    shares but is not currently persisted -- confirm whether RoleAssignment
    should record it.
    """
    for assignments in self.role_assignments.values():
        target = next((a for a in assignments if a.id == assignment_id), None)
        if target is not None:
            target.revoked_at = datetime.now(timezone.utc)
            return True
    return False
def get_user_roles(
    self,
    user_id: str,
    resource_type: Optional[ResourceType] = None,
    resource_id: Optional[str] = None
) -> List[Role]:
    """Return the distinct roles of ``user_id`` from active assignments.

    Optional filters narrow by resource type and/or resource id; falsy
    filter values (None, empty string) disable the respective filter.
    The returned list is deduplicated and unordered.
    """
    distinct_roles = {
        a.role
        for a in self.role_assignments.get(user_id, [])
        if a.is_active()
        and (not resource_type or a.resource_type == resource_type)
        and (not resource_id or a.resource_id == resource_id)
    }
    return list(distinct_roles)
def create_key_share(
    self,
    user_id: str,
    package_id: str,
    permissions: Set[str],
    granted_by: str,
    scope: str = "full",
    invite_token: Optional[str] = None
) -> KeyShare:
    """Create a key share granting ``user_id`` access to a package.

    Returns the newly created KeyShare (with a fresh UUID); it remains
    pending until accepted via its invite token.
    """
    new_share = KeyShare(
        id=str(uuid.uuid4()),
        user_id=user_id,
        package_id=package_id,
        permissions=permissions,
        scope=scope,
        granted_by=granted_by,
        invite_token=invite_token
    )
    # Create the per-user list lazily on first share.
    self.key_shares.setdefault(user_id, []).append(new_share)
    return new_share
def accept_key_share(self, share_id: str, token: str) -> bool:
    """Accept a pending key share via its invite token.

    Returns True and stamps accepted_at when both id and token match;
    False otherwise.
    """
    all_shares = (s for per_user in self.key_shares.values() for s in per_user)
    for share in all_shares:
        if share.id == share_id and share.invite_token == token:
            share.accepted_at = datetime.now(timezone.utc)
            return True
    return False
def revoke_key_share(self, share_id: str, revoked_by: str) -> bool:
    """Revoke a key share by id, recording who revoked it and when.

    Returns False when no share with the given id exists.
    """
    all_shares = (s for per_user in self.key_shares.values() for s in per_user)
    for share in all_shares:
        if share.id == share_id:
            share.revoked_at = datetime.now(timezone.utc)
            share.revoked_by = revoked_by
            return True
    return False
def check_permission(
    self,
    user_id: str,
    action: Action,
    resource_type: ResourceType,
    resource_id: str,
    policy: Optional[PolicySet] = None,
    package_id: Optional[str] = None
) -> bool:
    """
    Check whether a user may perform an action on a resource.

    Evaluates, in order:
    1. Base RBAC (role -> resource -> action matrix)
    2. Policy restrictions (second-corrector visibility modes)
    3. Key share (only when a package_id is given)

    Returns True only if every applicable check passes.
    """
    # 1. Active roles scoped to this resource; no role means no access.
    roles = self.get_user_roles(user_id, resource_type, resource_id)
    if not roles:
        return False
    # 2. Base RBAC: one role granting the action is sufficient.
    has_permission = False
    for role in roles:
        role_permissions = DEFAULT_PERMISSIONS.get(role, {})
        resource_permissions = role_permissions.get(resource_type, set())
        if action in resource_permissions:
            has_permission = True
            break
    if not has_permission:
        return False
    # 3. Policy restrictions.
    if policy:
        # ZK visibility mode
        if Role.ZWEITKORREKTOR in roles:
            if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
                # Blind: second corrector must not see first-corrector outputs.
                if resource_type in [ResourceType.EVALUATION, ResourceType.REPORT, ResourceType.GRADE_DECISION]:
                    if action == Action.READ:
                        # Would need metadata to tell first-corrector outputs
                        # apart; currently a no-op placeholder (read allowed).
                        pass  # implementation depends on the data model
            elif policy.zk_visibility_mode == ZKVisibilityMode.SEMI:
                # Semi: second corrector sees annotations but not the grade.
                if resource_type == ResourceType.GRADE_DECISION and action == Action.READ:
                    return False
    # 4. Key share check (for package-scoped access).
    if package_id:
        user_shares = self.key_shares.get(user_id, [])
        has_key_share = any(
            share.package_id == package_id and share.is_active()
            for share in user_shares
        )
        if not has_key_share:
            return False
    return True
def get_allowed_actions(
    self,
    user_id: str,
    resource_type: ResourceType,
    resource_id: str,
    policy: Optional[PolicySet] = None
) -> Set[Action]:
    """Collect every action the user's active roles permit on the resource.

    Policy narrowing for BLIND second correctors is still a placeholder:
    as before, no actions are removed.
    """
    roles = self.get_user_roles(user_id, resource_type, resource_id)
    permitted: Set[Action] = set()
    for role in roles:
        permitted |= DEFAULT_PERMISSIONS.get(role, {}).get(resource_type, set())
    if policy and Role.ZWEITKORREKTOR in roles:
        if policy.zk_visibility_mode == ZKVisibilityMode.BLIND:
            # Would remove READ for certain resources here (not implemented).
            pass
    return permitted
# =============================================
# DEFAULT POLICY SETS (alle Bundeslaender)
# =============================================
# Official two-letter Bundesland codes, used to derive unique policy ids.
# BUG FIX: ids were previously derived from the first two letters of the
# state name, which collides (brandenburg/bremen -> "BR",
# saarland/sachsen/sachsen-anhalt -> "SA"), silently producing duplicate
# policy ids. The mapping preserves the original iteration order.
_BUNDESLAND_CODES = {
    "baden-wuerttemberg": "BW",
    "bayern": "BY",
    "berlin": "BE",
    "brandenburg": "BB",
    "bremen": "HB",
    "hamburg": "HH",
    "hessen": "HE",
    "mecklenburg-vorpommern": "MV",
    "niedersachsen": "NI",
    "nordrhein-westfalen": "NW",
    "rheinland-pfalz": "RP",
    "saarland": "SL",
    "sachsen": "SN",
    "sachsen-anhalt": "ST",
    "schleswig-holstein": "SH",
    "thueringen": "TH",
}
def create_default_policy_sets() -> List[PolicySet]:
    """
    Create default policy sets for all Bundeslaender.

    Produces a generic DEFAULT fallback plus one Abitur policy per state.
    Niedersachsen, Bayern and NRW carry hand-tuned settings; the remaining
    states receive a common baseline. All sets can be refined per state
    later.

    Returns:
        A list of PolicySet instances with unique ids.
    """
    policies = []
    # Default policy (fallback for contexts without a state-specific set)
    policies.append(PolicySet(
        id="DEFAULT-2025",
        bundesland="DEFAULT",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,
        # eh_visibility_mode deliberately omitted: the dataclass default
        # applies (previously fetched via __dataclass_fields__, which was
        # an obscure way of saying the same thing).
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz"
    ))
    # Niedersachsen (example with specific adjustments)
    policies.append(PolicySet(
        id="NI-2025-ABITUR",
        bundesland="niedersachsen",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,  # in NI the ZK sees everything
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="niedersachsen-abitur"
    ))
    # Bayern (example with SEMI visibility)
    policies.append(PolicySet(
        id="BY-2025-ABITUR",
        bundesland="bayern",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.SEMI,  # ZK sees annotations, not the grade
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="bayern-abitur"
    ))
    # NRW (example)
    policies.append(PolicySet(
        id="NW-2025-ABITUR",
        bundesland="nordrhein-westfalen",
        jahr=2025,
        fach=None,
        verfahren="abitur",
        zk_visibility_mode=ZKVisibilityMode.FULL,
        allow_teacher_uploaded_eh=True,
        allow_land_uploaded_eh=True,
        require_rights_confirmation_on_upload=True,
        third_correction_threshold=4,
        final_signoff_role="fachvorsitz",
        export_template_id="nrw-abitur"
    ))
    # Baseline policies for all remaining states
    for bl, code in _BUNDESLAND_CODES.items():
        if bl in ("niedersachsen", "bayern", "nordrhein-westfalen"):
            continue
        policies.append(PolicySet(
            id=f"{code}-2025-ABITUR",
            bundesland=bl,
            jahr=2025,
            fach=None,
            verfahren="abitur",
            zk_visibility_mode=ZKVisibilityMode.FULL,
            allow_teacher_uploaded_eh=True,
            allow_land_uploaded_eh=True,
            require_rights_confirmation_on_upload=True,
            third_correction_threshold=4,
            final_signoff_role="fachvorsitz"
        ))
    return policies
# =============================================
# GLOBAL POLICY ENGINE INSTANCE
# =============================================
# Lazily created process-wide singleton.
_policy_engine: Optional[PolicyEngine] = None
def get_policy_engine() -> PolicyEngine:
    """Return the global PolicyEngine, creating it on first use.

    On first call the engine is built and all default policy sets are
    registered before the instance is published.
    """
    global _policy_engine
    if _policy_engine is None:
        engine = PolicyEngine()
        for default_policy in create_default_policy_sets():
            engine.register_policy_set(default_policy)
        _policy_engine = engine
    return _policy_engine
# =============================================
# API GUARDS (decorators for FastAPI)
# =============================================
def require_permission(
    action: Action,
    resource_type: ResourceType,
    resource_id_param: str = "resource_id"
):
    """
    FastAPI endpoint decorator enforcing an RBAC/ABAC permission check.

    Looks up the authenticated user on ``request.state.user``, resolves the
    resource id from the endpoint's keyword arguments and asks the policy
    engine whether ``action`` on ``resource_type`` is permitted.

    Usage:
        @app.get("/api/v1/packages/{package_id}")
        @require_permission(Action.READ, ResourceType.EXAM_PACKAGE, "package_id")
        async def get_package(package_id: str, request: Request):
            ...
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            req = kwargs.get('request') or next(
                (a for a in args if isinstance(a, Request)), None
            )
            if not req:
                raise HTTPException(status_code=500, detail="Request not found")
            # User injected by the auth middleware.
            current_user = getattr(req.state, 'user', None)
            if not current_user:
                raise HTTPException(status_code=401, detail="Not authenticated")
            engine = get_policy_engine()
            # Optionally load the state-specific policy from context.
            # NOTE(review): the policy year is hard-coded to 2025 — confirm.
            active_policy = None
            bundesland = current_user.get('bundesland')
            if bundesland:
                active_policy = engine.get_policy_for_context(bundesland, 2025)
            allowed = engine.check_permission(
                user_id=current_user.get('user_id'),
                action=action,
                resource_type=resource_type,
                resource_id=kwargs.get(resource_id_param),
                policy=active_policy
            )
            if not allowed:
                raise HTTPException(
                    status_code=403,
                    detail=f"Permission denied: {action.value} on {resource_type.value}"
                )
            return await func(*args, **kwargs)
        return wrapper
    return decorator
def require_role(role: Role):
    """
    FastAPI endpoint decorator that rejects callers lacking a specific role.

    Usage:
        @app.post("/api/v1/eh/publish")
        @require_role(Role.LAND_ADMIN)
        async def publish_eh(request: Request):
            ...
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            req = kwargs.get('request') or next(
                (a for a in args if isinstance(a, Request)), None
            )
            if not req:
                raise HTTPException(status_code=500, detail="Request not found")
            current_user = getattr(req.state, 'user', None)
            if not current_user:
                raise HTTPException(status_code=401, detail="Not authenticated")
            engine = get_policy_engine()
            if role not in engine.get_user_roles(current_user.get('user_id')):
                raise HTTPException(
                    status_code=403,
                    detail=f"Role required: {role.value}"
                )
            return await func(*args, **kwargs)
        return wrapper
    return decorator

View File

@@ -0,0 +1,221 @@
"""
RBAC Permission Matrix
Default role-to-resource permission mappings for
Klausur-Korrektur and Zeugnis workflows.
Extracted from rbac.py for file-size compliance.
"""
from typing import Dict, Set
from rbac_types import Role, Action, ResourceType
# =============================================
# RBAC PERMISSION MATRIX
# =============================================
# Default permission matrix (may be overridden by policies).
# Maps role -> resource type -> the set of actions that role may perform.
DEFAULT_PERMISSIONS: Dict[Role, Dict[ResourceType, Set[Action]]] = {
    # Erstkorrektor (first corrector)
    Role.ERSTKORREKTOR: {
        ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.SHARE_KEY, Action.LOCK},
        ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
        ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
        ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
        ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
        ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Zweitkorrektor / second corrector (default: FULL visibility)
    Role.ZWEITKORREKTOR: {
        ResourceType.EXAM_PACKAGE: {Action.READ},
        ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
        ResourceType.EH_DOCUMENT: {Action.READ},
        ResourceType.RUBRIC: {Action.READ},
        ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Drittkorrektor (third corrector)
    Role.DRITTKORREKTOR: {
        ResourceType.EXAM_PACKAGE: {Action.READ},
        ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
        ResourceType.EH_DOCUMENT: {Action.READ},
        ResourceType.RUBRIC: {Action.READ},
        ResourceType.ANNOTATION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.EVALUATION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.REPORT: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.GRADE_DECISION: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Fachvorsitz (subject chair)
    Role.FACHVORSITZ: {
        ResourceType.TENANT: {Action.READ},
        ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
        ResourceType.EXAM_PACKAGE: {Action.READ, Action.UPDATE, Action.LOCK, Action.UNLOCK, Action.SIGN_OFF},
        ResourceType.STUDENT_WORK: {Action.READ, Action.UPDATE},
        ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE},
        ResourceType.RUBRIC: {Action.READ, Action.UPDATE},
        ResourceType.ANNOTATION: {Action.READ, Action.UPDATE},
        ResourceType.EVALUATION: {Action.READ, Action.UPDATE},
        ResourceType.REPORT: {Action.READ, Action.UPDATE},
        ResourceType.GRADE_DECISION: {Action.READ, Action.UPDATE, Action.SIGN_OFF},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Pruefungsvorsitz (examination chair)
    Role.PRUEFUNGSVORSITZ: {
        ResourceType.TENANT: {Action.READ},
        ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
        ResourceType.EXAM_PACKAGE: {Action.READ, Action.SIGN_OFF},
        ResourceType.STUDENT_WORK: {Action.READ},
        ResourceType.EH_DOCUMENT: {Action.READ},
        ResourceType.GRADE_DECISION: {Action.READ, Action.SIGN_OFF},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # School admin
    Role.SCHUL_ADMIN: {
        ResourceType.TENANT: {Action.READ, Action.UPDATE},
        ResourceType.NAMESPACE: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
        ResourceType.EXAM_PACKAGE: {Action.CREATE, Action.READ, Action.DELETE, Action.ASSIGN_ROLE},
        ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.DELETE},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # State admin (government authority)
    Role.LAND_ADMIN: {
        ResourceType.TENANT: {Action.READ},
        ResourceType.EH_DOCUMENT: {Action.READ, Action.UPLOAD, Action.UPDATE, Action.DELETE, Action.PUBLISH_OFFICIAL},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Auditor
    Role.AUDITOR: {
        ResourceType.AUDIT_LOG: {Action.READ},
        ResourceType.EXAM_PACKAGE: {Action.READ},  # metadata only
        # No access to content!
    },
    # Operator
    Role.OPERATOR: {
        ResourceType.TENANT: {Action.READ},
        ResourceType.NAMESPACE: {Action.READ},
        ResourceType.EXAM_PACKAGE: {Action.READ},  # metadata only
        ResourceType.AUDIT_LOG: {Action.READ},
        # Break-glass access is handled separately
    },
    # Teacher assistant
    Role.TEACHER_ASSISTANT: {
        ResourceType.STUDENT_WORK: {Action.READ},
        ResourceType.ANNOTATION: {Action.CREATE, Action.READ},  # only certain annotation types
        ResourceType.EH_DOCUMENT: {Action.READ},
    },
    # Exam author (pre-Abitur only)
    Role.EXAM_AUTHOR: {
        ResourceType.EH_DOCUMENT: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
        ResourceType.RUBRIC: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
    },
    # =============================================
    # REPORT-CARD (ZEUGNIS) WORKFLOW ROLES
    # =============================================
    # Class teacher - creates report cards, conduct grades, remarks
    Role.KLASSENLEHRER: {
        ResourceType.NAMESPACE: {Action.READ},
        ResourceType.ZEUGNIS: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS_ENTWURF: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
        ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
        ResourceType.SCHUELER_DATEN: {Action.READ, Action.UPDATE},
        ResourceType.FACHNOTE: {Action.READ},  # reads grades entered by subject teachers
        ResourceType.KOPFNOTE: {Action.CREATE, Action.READ, Action.UPDATE},
        ResourceType.FEHLZEITEN: {Action.READ, Action.UPDATE},
        ResourceType.BEMERKUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.DELETE},
        ResourceType.VERSETZUNG: {Action.READ},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Subject teacher - enters subject grades
    Role.FACHLEHRER: {
        ResourceType.NAMESPACE: {Action.READ},
        ResourceType.SCHUELER_DATEN: {Action.READ},  # own students only
        ResourceType.FACHNOTE: {Action.CREATE, Action.READ, Action.UPDATE},  # own subject only
        ResourceType.BEMERKUNG: {Action.CREATE, Action.READ},  # subject-related remarks
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Report-card officer - quality control
    Role.ZEUGNISBEAUFTRAGTER: {
        ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE, Action.UPLOAD},
        ResourceType.SCHUELER_DATEN: {Action.READ},
        ResourceType.FACHNOTE: {Action.READ},
        ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
        ResourceType.FEHLZEITEN: {Action.READ},
        ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
        ResourceType.VERSETZUNG: {Action.READ},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Secretariat - printing, dispatch, archiving
    Role.SEKRETARIAT: {
        ResourceType.ZEUGNIS: {Action.READ, Action.DOWNLOAD},
        ResourceType.ZEUGNIS_VORLAGE: {Action.READ},
        ResourceType.SCHUELER_DATEN: {Action.READ},  # for address data
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # School management - final report-card approval
    Role.SCHULLEITUNG: {
        ResourceType.TENANT: {Action.READ},
        ResourceType.NAMESPACE: {Action.READ, Action.CREATE},
        ResourceType.ZEUGNIS: {Action.READ, Action.SIGN_OFF, Action.LOCK},
        ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS_VORLAGE: {Action.READ, Action.UPDATE},
        ResourceType.SCHUELER_DATEN: {Action.READ},
        ResourceType.FACHNOTE: {Action.READ},
        ResourceType.KOPFNOTE: {Action.READ, Action.UPDATE},
        ResourceType.FEHLZEITEN: {Action.READ},
        ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
        ResourceType.KONFERENZ_BESCHLUSS: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
        ResourceType.VERSETZUNG: {Action.CREATE, Action.READ, Action.UPDATE, Action.SIGN_OFF},
        ResourceType.EXPORT: {Action.CREATE, Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
    # Grade-level coordination (e.g. upper school)
    Role.STUFENLEITUNG: {
        ResourceType.NAMESPACE: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS: {Action.READ, Action.UPDATE},
        ResourceType.ZEUGNIS_ENTWURF: {Action.READ, Action.UPDATE},
        ResourceType.SCHUELER_DATEN: {Action.READ},
        ResourceType.FACHNOTE: {Action.READ},
        ResourceType.KOPFNOTE: {Action.READ},
        ResourceType.FEHLZEITEN: {Action.READ},
        ResourceType.BEMERKUNG: {Action.READ, Action.UPDATE},
        ResourceType.KONFERENZ_BESCHLUSS: {Action.READ},
        ResourceType.VERSETZUNG: {Action.READ, Action.UPDATE},
        ResourceType.EXPORT: {Action.READ, Action.DOWNLOAD},
        ResourceType.AUDIT_LOG: {Action.READ},
    },
}

View File

@@ -0,0 +1,438 @@
"""
RBAC/ABAC Type Definitions
Enums, data structures, and models for the policy system.
Extracted from rbac.py for file-size compliance.
"""
import json
from enum import Enum
from dataclasses import dataclass, field, asdict
from typing import Optional, List, Dict, Set, Any
from datetime import datetime, timezone
import uuid
# =============================================
# ENUMS: Roles, Actions, Resources
# =============================================
class Role(str, Enum):
    """Domain roles in the exam-correction and report-card chains."""
    # === Exam correction chain ===
    ERSTKORREKTOR = "erstkorrektor"  # EK - first corrector
    ZWEITKORREKTOR = "zweitkorrektor"  # ZK - second corrector
    DRITTKORREKTOR = "drittkorrektor"  # DK - third corrector
    # === Report-card (Zeugnis) workflow ===
    KLASSENLEHRER = "klassenlehrer"  # KL - creates report card, conduct grades, remarks
    FACHLEHRER = "fachlehrer"  # FL - enters subject grades
    ZEUGNISBEAUFTRAGTER = "zeugnisbeauftragter"  # ZB - quality control
    SEKRETARIAT = "sekretariat"  # SEK - printing, dispatch, archiving
    # === Leadership (exam + report card) ===
    FACHVORSITZ = "fachvorsitz"  # FVL - subject examination chair
    PRUEFUNGSVORSITZ = "pruefungsvorsitz"  # PV - school management / examination chair
    SCHULLEITUNG = "schulleitung"  # SL - final report-card approval
    STUFENLEITUNG = "stufenleitung"  # STL - grade-level coordination
    # === Administration ===
    SCHUL_ADMIN = "schul_admin"  # SA
    LAND_ADMIN = "land_admin"  # LA - state authority
    # === Special ===
    AUDITOR = "auditor"  # DPO/auditor
    OPERATOR = "operator"  # OPS - support
    TEACHER_ASSISTANT = "teacher_assistant"  # TA - trainee teacher
    EXAM_AUTHOR = "exam_author"  # EA - pre-Abitur only
class Action(str, Enum):
    """Operations that can be performed on resources."""
    CREATE = "create"
    READ = "read"
    UPDATE = "update"
    DELETE = "delete"
    ASSIGN_ROLE = "assign_role"
    INVITE_USER = "invite_user"
    REMOVE_USER = "remove_user"
    UPLOAD = "upload"
    DOWNLOAD = "download"
    LOCK = "lock"  # finalize
    UNLOCK = "unlock"  # only with special privilege
    SIGN_OFF = "sign_off"  # approval / release
    SHARE_KEY = "share_key"  # create a key share
    VIEW_PII = "view_pii"  # if PII is present
    BREAK_GLASS = "break_glass"  # emergency access
    PUBLISH_OFFICIAL = "publish_official"  # distribute official EH
class ResourceType(str, Enum):
    """Resource types known to the system."""
    TENANT = "tenant"
    NAMESPACE = "namespace"
    # === Exam correction ===
    EXAM_PACKAGE = "exam_package"
    STUDENT_WORK = "student_work"
    EH_DOCUMENT = "eh_document"
    RUBRIC = "rubric"  # scoring grid
    ANNOTATION = "annotation"
    EVALUATION = "evaluation"  # criteria / points
    REPORT = "report"  # written assessment
    GRADE_DECISION = "grade_decision"
    # === Report-card generator ===
    ZEUGNIS = "zeugnis"  # report-card document
    ZEUGNIS_VORLAGE = "zeugnis_vorlage"  # report-card template
    ZEUGNIS_ENTWURF = "zeugnis_entwurf"  # report-card draft (pre-approval)
    SCHUELER_DATEN = "schueler_daten"  # student master data, grades
    FACHNOTE = "fachnote"  # single subject grade
    KOPFNOTE = "kopfnote"  # work / social conduct grade
    FEHLZEITEN = "fehlzeiten"  # absences
    BEMERKUNG = "bemerkung"  # report-card remarks
    KONFERENZ_BESCHLUSS = "konferenz_beschluss"  # conference resolution
    VERSETZUNG = "versetzung"  # promotion decision
    # === General ===
    DOCUMENT = "document"  # generic document type (EH, templates, etc.)
    TEMPLATE = "template"  # generic templates
    EXPORT = "export"
    AUDIT_LOG = "audit_log"
    KEY_MATERIAL = "key_material"
class ZKVisibilityMode(str, Enum):
    """Visibility mode for second correctors (ZK)."""
    BLIND = "blind"  # ZK sees no first-corrector grade / assessment
    SEMI = "semi"  # ZK sees annotations, but not the grade
    FULL = "full"  # ZK sees everything
class EHVisibilityMode(str, Enum):
    """Visibility mode for Erwartungshorizonte (expected answers)."""
    BLIND = "blind"  # ZK does not see the EH (rare)
    SHARED = "shared"  # ZK sees the EH (default)
class VerfahrenType(str, Enum):
    """Procedure types for exams and report cards."""
    # === Exam procedures ===
    ABITUR = "abitur"
    VORABITUR = "vorabitur"
    KLAUSUR = "klausur"
    NACHPRUEFUNG = "nachpruefung"
    # === Report-card procedures ===
    HALBJAHRESZEUGNIS = "halbjahreszeugnis"
    JAHRESZEUGNIS = "jahreszeugnis"
    ABSCHLUSSZEUGNIS = "abschlusszeugnis"
    ABGANGSZEUGNIS = "abgangszeugnis"
    @classmethod
    def is_exam_type(cls, verfahren: str) -> bool:
        """Return True when *verfahren* names an exam procedure."""
        try:
            member = cls(verfahren)
        except ValueError:
            return False
        return member in {cls.ABITUR, cls.VORABITUR, cls.KLAUSUR, cls.NACHPRUEFUNG}
    @classmethod
    def is_certificate_type(cls, verfahren: str) -> bool:
        """Return True when *verfahren* names a report-card procedure."""
        try:
            member = cls(verfahren)
        except ValueError:
            return False
        return member in {
            cls.HALBJAHRESZEUGNIS,
            cls.JAHRESZEUGNIS,
            cls.ABSCHLUSSZEUGNIS,
            cls.ABGANGSZEUGNIS,
        }
# =============================================
# DATA STRUCTURES
# =============================================
@dataclass
class PolicySet:
    """
    Policy configuration per Bundesland / year / subject.

    Allows state-specific differences without hard-coding them in source.

    Supported procedure types:
    - Exams: abitur, vorabitur, klausur, nachpruefung
    - Report cards: halbjahreszeugnis, jahreszeugnis, abschlusszeugnis, abgangszeugnis
    """
    id: str
    bundesland: str
    jahr: int
    fach: Optional[str]  # None = applies to all subjects
    verfahren: str  # see VerfahrenType enum
    # Visibility rules (exam)
    zk_visibility_mode: ZKVisibilityMode = ZKVisibilityMode.FULL
    eh_visibility_mode: EHVisibilityMode = EHVisibilityMode.SHARED
    # EH sources (exam)
    allow_teacher_uploaded_eh: bool = True
    allow_land_uploaded_eh: bool = True
    require_rights_confirmation_on_upload: bool = True
    require_dual_control_for_official_eh_update: bool = False
    # Correction rules (exam)
    third_correction_threshold: int = 4  # grade-point deviation triggering third correction
    final_signoff_role: str = "fachvorsitz"
    # Report-card rules (Zeugnis)
    require_klassenlehrer_approval: bool = True
    require_schulleitung_signoff: bool = True
    allow_sekretariat_edit_after_approval: bool = False
    konferenz_protokoll_required: bool = True
    bemerkungen_require_review: bool = True
    fehlzeiten_auto_import: bool = True
    kopfnoten_enabled: bool = False
    versetzung_auto_calculate: bool = True
    # Export & display
    quote_verbatim_allowed: bool = False  # official texts shown verbatim in the UI
    export_template_id: str = "default"
    # Additional free-form flags
    flags: Dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    def is_exam_policy(self) -> bool:
        """Return True when this policy applies to an exam procedure."""
        return VerfahrenType.is_exam_type(self.verfahren)
    def is_certificate_policy(self) -> bool:
        """Return True when this policy applies to a report-card procedure."""
        return VerfahrenType.is_certificate_type(self.verfahren)
    def to_dict(self):
        """Serialize to a JSON-friendly dict (enums as values, datetime as ISO-8601)."""
        d = asdict(self)
        d['zk_visibility_mode'] = self.zk_visibility_mode.value
        d['eh_visibility_mode'] = self.eh_visibility_mode.value
        d['created_at'] = self.created_at.isoformat()
        return d
@dataclass
class RoleAssignment:
    """
    Assignment of a role to a user for a specific resource.

    An assignment is active while it has not been revoked and the current
    time lies within [valid_from, valid_to].
    """
    id: str
    user_id: str
    role: Role
    resource_type: ResourceType
    resource_id: str
    # Optional scoping restrictions
    tenant_id: Optional[str] = None
    namespace_id: Optional[str] = None
    # Validity window
    valid_from: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    valid_to: Optional[datetime] = None
    # Metadata
    granted_by: str = ""
    granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    revoked_at: Optional[datetime] = None
    def is_active(self) -> bool:
        """Return True while the assignment is unrevoked and inside its validity window."""
        now = datetime.now(timezone.utc)
        if self.revoked_at:
            return False
        if self.valid_to and now > self.valid_to:
            return False
        return now >= self.valid_from
    def to_dict(self):
        """Serialize to a JSON-friendly dict (datetimes as ISO-8601 strings)."""
        return {
            'id': self.id,
            'user_id': self.user_id,
            'role': self.role.value,
            'resource_type': self.resource_type.value,
            'resource_id': self.resource_id,
            'tenant_id': self.tenant_id,
            'namespace_id': self.namespace_id,
            'valid_from': self.valid_from.isoformat(),
            'valid_to': self.valid_to.isoformat() if self.valid_to else None,
            'granted_by': self.granted_by,
            'granted_at': self.granted_at.isoformat(),
            'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
            'is_active': self.is_active()
        }
@dataclass
class KeyShare:
    """
    Authorization for a user to access encrypted content.

    A KeyShare is NOT a plaintext key; it is a grant used together with a
    role assignment. When created via invite, it only becomes active once
    accepted.
    """
    id: str
    user_id: str
    package_id: str
    # Scope of the grant
    permissions: Set[str] = field(default_factory=set)
    # e.g. {"read_original", "read_eh", "read_ek_outputs", "write_annotations"}
    # Optional restriction of what the share covers
    scope: str = "full"  # "full", "original_only", "eh_only", "outputs_only"
    # Grant chain
    granted_by: str = ""
    granted_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    # Acceptance (for the invite flow)
    invite_token: Optional[str] = None
    accepted_at: Optional[datetime] = None
    # Revocation
    revoked_at: Optional[datetime] = None
    revoked_by: Optional[str] = None
    def is_active(self) -> bool:
        """Active when unrevoked and either directly granted or the invite was accepted."""
        return self.revoked_at is None and (
            self.invite_token is None or self.accepted_at is not None
        )
    def to_dict(self):
        """Serialize to a JSON-friendly dict (datetimes as ISO-8601 strings)."""
        return {
            'id': self.id,
            'user_id': self.user_id,
            'package_id': self.package_id,
            'permissions': list(self.permissions),
            'scope': self.scope,
            'granted_by': self.granted_by,
            'granted_at': self.granted_at.isoformat(),
            'invite_token': self.invite_token,
            'accepted_at': self.accepted_at.isoformat() if self.accepted_at else None,
            'revoked_at': self.revoked_at.isoformat() if self.revoked_at else None,
            'is_active': self.is_active()
        }
@dataclass
class Tenant:
    """
    Highest isolation unit - typically a school.
    """
    id: str
    name: str
    bundesland: str
    tenant_type: str = "school"  # "school", "pruefungszentrum", "behoerde"
    # Encryption
    encryption_enabled: bool = True
    # Metadata
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    deleted_at: Optional[datetime] = None
    def to_dict(self):
        """Serialize to a JSON-friendly dict (deleted_at intentionally omitted)."""
        return {
            'id': self.id,
            'name': self.name,
            'bundesland': self.bundesland,
            'tenant_type': self.tenant_type,
            'encryption_enabled': self.encryption_enabled,
            'created_at': self.created_at.isoformat()
        }
@dataclass
class Namespace:
    """
    Workspace within a tenant,
    e.g. "Abitur 2026 - Deutsch LK - Kurs 12a".
    """
    id: str
    tenant_id: str
    name: str
    # Context
    jahr: int
    fach: str
    kurs: Optional[str] = None
    pruefungsart: str = "abitur"  # "abitur", "vorabitur"
    # Policy
    policy_set_id: Optional[str] = None
    # Metadata
    created_by: str = ""
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    deleted_at: Optional[datetime] = None
    def to_dict(self):
        """Serialize to a JSON-friendly dict (deleted_at intentionally omitted)."""
        return {
            'id': self.id,
            'tenant_id': self.tenant_id,
            'name': self.name,
            'jahr': self.jahr,
            'fach': self.fach,
            'kurs': self.kurs,
            'pruefungsart': self.pruefungsart,
            'policy_set_id': self.policy_set_id,
            'created_by': self.created_by,
            'created_at': self.created_at.isoformat()
        }
@dataclass
class ExamPackage:
    """
    Exam package - a complete set of student works with all artifacts.
    """
    id: str
    namespace_id: str
    tenant_id: str
    name: str
    beschreibung: Optional[str] = None
    # Workflow status
    status: str = "draft"  # "draft", "in_progress", "locked", "signed_off"
    # Participants (roles are assigned separately)
    owner_id: str = ""  # typically the first corrector (EK)
    # Encryption
    encryption_key_id: Optional[str] = None
    # Timestamps
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    locked_at: Optional[datetime] = None
    signed_off_at: Optional[datetime] = None
    signed_off_by: Optional[str] = None
    def to_dict(self):
        """Serialize to a JSON-friendly dict (encryption_key_id intentionally omitted)."""
        return {
            'id': self.id,
            'namespace_id': self.namespace_id,
            'tenant_id': self.tenant_id,
            'name': self.name,
            'beschreibung': self.beschreibung,
            'status': self.status,
            'owner_id': self.owner_id,
            'created_at': self.created_at.isoformat(),
            'locked_at': self.locked_at.isoformat() if self.locked_at else None,
            'signed_off_at': self.signed_off_at.isoformat() if self.signed_off_at else None,
            'signed_off_by': self.signed_off_by
        }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,343 @@
"""
BYOEH Invitation Flow Routes
Endpoints for inviting users, listing/accepting/declining/revoking
invitations to access Erwartungshorizonte.
Extracted from routes/eh.py for file-size compliance.
"""
import uuid
from datetime import datetime, timezone, timedelta
from fastapi import APIRouter, HTTPException, Request
from models.eh import EHKeyShare, EHShareInvitation
from models.requests import EHInviteRequest, EHAcceptInviteRequest
from services.auth_service import get_current_user
from services.eh_service import log_eh_audit
import storage
router = APIRouter()
# =============================================
# INVITATION FLOW
# =============================================
@router.post("/api/v1/eh/{eh_id}/invite")
async def invite_to_eh(
    eh_id: str,
    invite_request: EHInviteRequest,
    request: Request
):
    """
    Invite another user to access an Erwartungshorizont.

    Creates a pending invitation that the recipient must accept.

    Raises:
        404 if the EH does not exist, 403 if the caller is not its owner,
        400 on an unknown role, 409 if a live (non-expired) pending
        invitation for the same recipient already exists.
    """
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    # Check EH exists and belongs to user
    if eh_id not in storage.eh_db:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    eh = storage.eh_db[eh_id]
    if eh.teacher_id != user["user_id"]:
        raise HTTPException(status_code=403, detail="Only the owner can invite others")
    # Validate role
    valid_roles = ['second_examiner', 'third_examiner', 'supervisor', 'department_head', 'fachvorsitz']
    if invite_request.role not in valid_roles:
        raise HTTPException(status_code=400, detail=f"Invalid role. Must be one of: {valid_roles}")
    now = datetime.now(timezone.utc)
    # Check for an existing *live* pending invitation to the same user.
    # BUG FIX: previously expiry was ignored here, so an expired-but-still-
    # 'pending' invitation blocked re-inviting the recipient forever
    # (list_pending_invitations already filters expired ones).
    for inv in storage.eh_invitations_db.values():
        if (inv.eh_id == eh_id and
            inv.invitee_email == invite_request.invitee_email and
            inv.status == 'pending' and
            inv.expires_at > now):
            raise HTTPException(
                status_code=409,
                detail="Pending invitation already exists for this user"
            )
    # Create invitation
    invitation_id = str(uuid.uuid4())
    expires_at = now + timedelta(days=invite_request.expires_in_days)
    invitation = EHShareInvitation(
        id=invitation_id,
        eh_id=eh_id,
        inviter_id=user["user_id"],
        invitee_id=invite_request.invitee_id or "",
        invitee_email=invite_request.invitee_email,
        role=invite_request.role,
        klausur_id=invite_request.klausur_id,
        message=invite_request.message,
        status='pending',
        expires_at=expires_at,
        created_at=now,
        accepted_at=None,
        declined_at=None
    )
    storage.eh_invitations_db[invitation_id] = invitation
    # Audit log
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user["user_id"],
        action="invite",
        eh_id=eh_id,
        details={
            "invitation_id": invitation_id,
            "invitee_email": invite_request.invitee_email,
            "role": invite_request.role,
            "expires_at": expires_at.isoformat()
        }
    )
    return {
        "status": "invited",
        "invitation_id": invitation_id,
        "eh_id": eh_id,
        "invitee_email": invite_request.invitee_email,
        "role": invite_request.role,
        "expires_at": expires_at.isoformat(),
        "eh_title": eh.title
    }
@router.get("/api/v1/eh/invitations/pending")
async def list_pending_invitations(request: Request):
    """Return every non-expired pending invitation addressed to the caller.

    An invitation matches when either the invited email or the invited
    user id equals the caller's; each entry carries a compact EH preview
    (or None if the EH record no longer exists).
    """
    user = get_current_user(request)
    email = user.get("email", "")
    uid = user["user_id"]
    now = datetime.now(timezone.utc)

    def _eh_summary(eh_id):
        # Compact EH preview shown next to the invitation, if the EH still exists.
        eh = storage.eh_db.get(eh_id)
        if eh is None:
            return None
        return {
            "id": eh.id,
            "title": eh.title,
            "subject": eh.subject,
            "niveau": eh.niveau,
            "year": eh.year
        }

    return [
        {"invitation": inv.to_dict(), "eh": _eh_summary(inv.eh_id)}
        for inv in storage.eh_invitations_db.values()
        if (inv.invitee_email == email or inv.invitee_id == uid)
        and inv.status == 'pending'
        and inv.expires_at > now
    ]
@router.get("/api/v1/eh/invitations/sent")
async def list_sent_invitations(request: Request):
    """Return every invitation the caller has sent, with a short EH preview."""
    user = get_current_user(request)
    uid = user["user_id"]

    sent = []
    for inv in storage.eh_invitations_db.values():
        if inv.inviter_id != uid:
            continue
        # EH may have been deleted since the invitation went out.
        eh = storage.eh_db.get(inv.eh_id)
        summary = (
            {"id": eh.id, "title": eh.title, "subject": eh.subject}
            if eh is not None
            else None
        )
        sent.append({"invitation": inv.to_dict(), "eh": summary})
    return sent
@router.post("/api/v1/eh/invitations/{invitation_id}/accept")
async def accept_eh_invitation(
    invitation_id: str,
    accept_request: EHAcceptInviteRequest,
    request: Request
):
    """Accept an invitation and receive access to the EH.

    Verifies the caller is the addressed recipient and the invitation is
    still pending and unexpired, then creates an EHKeyShare carrying the
    passphrase in the recipient-encrypted form supplied by the client
    (the server never sees the plaintext passphrase), and finally marks
    the invitation as accepted.

    Raises:
        HTTPException 404: unknown invitation id.
        HTTPException 403: caller is neither the invited email nor user id.
        HTTPException 400: invitation is not pending, or has expired.
    """
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    user_email = user.get("email", "")
    user_id = user["user_id"]
    now = datetime.now(timezone.utc)
    # Find invitation
    if invitation_id not in storage.eh_invitations_db:
        raise HTTPException(status_code=404, detail="Invitation not found")
    invitation = storage.eh_invitations_db[invitation_id]
    # Verify recipient: match either the invited email or an explicit user id.
    if invitation.invitee_email != user_email and invitation.invitee_id != user_id:
        raise HTTPException(status_code=403, detail="This invitation is not for you")
    # Check status
    if invitation.status != 'pending':
        raise HTTPException(
            status_code=400,
            detail=f"Invitation is {invitation.status}, cannot accept"
        )
    # Check expiration: expiry is detected lazily on access and persisted here.
    if invitation.expires_at < now:
        invitation.status = 'expired'
        raise HTTPException(status_code=400, detail="Invitation has expired")
    # Create key share — the passphrase is re-encrypted client-side for the
    # recipient; passphrase_hint is intentionally empty on accept.
    share_id = str(uuid.uuid4())
    key_share = EHKeyShare(
        id=share_id,
        eh_id=invitation.eh_id,
        user_id=user_id,
        encrypted_passphrase=accept_request.encrypted_passphrase,
        passphrase_hint="",
        granted_by=invitation.inviter_id,
        granted_at=now,
        role=invitation.role,
        klausur_id=invitation.klausur_id,
        active=True
    )
    # Store key share (shares are grouped per EH id)
    if invitation.eh_id not in storage.eh_key_shares_db:
        storage.eh_key_shares_db[invitation.eh_id] = []
    storage.eh_key_shares_db[invitation.eh_id].append(key_share)
    # Update invitation status
    invitation.status = 'accepted'
    invitation.accepted_at = now
    invitation.invitee_id = user_id  # Update with actual user ID
    # Audit log
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user_id,
        action="accept_invite",
        eh_id=invitation.eh_id,
        details={
            "invitation_id": invitation_id,
            "share_id": share_id,
            "role": invitation.role
        }
    )
    return {
        "status": "accepted",
        "share_id": share_id,
        "eh_id": invitation.eh_id,
        "role": invitation.role,
        "klausur_id": invitation.klausur_id
    }
@router.post("/api/v1/eh/invitations/{invitation_id}/decline")
async def decline_eh_invitation(invitation_id: str, request: Request):
    """Mark a pending invitation addressed to the caller as declined."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    user_email = user.get("email", "")
    user_id = user["user_id"]
    now = datetime.now(timezone.utc)

    invitation = storage.eh_invitations_db.get(invitation_id)
    if invitation is None:
        raise HTTPException(status_code=404, detail="Invitation not found")
    # Only the addressed recipient (matched by email or user id) may decline.
    if not (invitation.invitee_email == user_email or invitation.invitee_id == user_id):
        raise HTTPException(status_code=403, detail="This invitation is not for you")
    if invitation.status != 'pending':
        raise HTTPException(
            status_code=400,
            detail=f"Invitation is {invitation.status}, cannot decline"
        )

    invitation.status = 'declined'
    invitation.declined_at = now

    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user_id,
        action="decline_invite",
        eh_id=invitation.eh_id,
        details={"invitation_id": invitation_id}
    )
    return {
        "status": "declined",
        "invitation_id": invitation_id,
        "eh_id": invitation.eh_id
    }
@router.delete("/api/v1/eh/invitations/{invitation_id}")
async def revoke_eh_invitation(invitation_id: str, request: Request):
    """Withdraw a still-pending invitation; only the original inviter may do so."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    user_id = user["user_id"]

    invitation = storage.eh_invitations_db.get(invitation_id)
    if invitation is None:
        raise HTTPException(status_code=404, detail="Invitation not found")
    if invitation.inviter_id != user_id:
        raise HTTPException(status_code=403, detail="Only the inviter can revoke")
    if invitation.status != 'pending':
        raise HTTPException(
            status_code=400,
            detail=f"Invitation is {invitation.status}, cannot revoke"
        )

    invitation.status = 'revoked'

    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user_id,
        action="revoke_invite",
        eh_id=invitation.eh_id,
        details={
            "invitation_id": invitation_id,
            "invitee_email": invitation.invitee_email
        }
    )
    return {
        "status": "revoked",
        "invitation_id": invitation_id,
        "eh_id": invitation.eh_id
    }

View File

@@ -0,0 +1,347 @@
"""
BYOEH Key Sharing and Klausur Linking Routes
Endpoints for sharing EH access with other examiners
and linking EH to Klausuren.
Extracted from routes/eh.py for file-size compliance.
"""
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, HTTPException, Request
from models.eh import EHKeyShare, EHKlausurLink
from models.requests import EHShareRequest, EHLinkKlausurRequest
from services.auth_service import get_current_user
from services.eh_service import log_eh_audit
import storage
router = APIRouter()
# =============================================
# BYOEH KEY SHARING
# =============================================
@router.post("/api/v1/eh/{eh_id}/share")
async def share_erwartungshorizont(
    eh_id: str,
    share_request: EHShareRequest,
    request: Request
):
    """
    Share an Erwartungshorizont with another examiner.

    The first examiner shares their EH by providing an encrypted passphrase
    that the recipient can use. The server only stores the client-encrypted
    passphrase — never the plaintext.

    Raises:
        HTTPException 404: unknown EH id.
        HTTPException 403: caller is not the EH owner.
        HTTPException 400: role not in the accepted set.
    """
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    # Check EH exists and belongs to user
    if eh_id not in storage.eh_db:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    eh = storage.eh_db[eh_id]
    if eh.teacher_id != user["user_id"]:
        raise HTTPException(status_code=403, detail="Only the owner can share this EH")
    # Validate role. Kept consistent with the invitation endpoint, which also
    # accepts 'fachvorsitz' — accepted invitations create key shares with that
    # role, so the direct-share path must allow it too.
    valid_roles = ['second_examiner', 'third_examiner', 'supervisor', 'department_head', 'fachvorsitz']
    if share_request.role not in valid_roles:
        raise HTTPException(status_code=400, detail=f"Invalid role. Must be one of: {valid_roles}")
    # Create key share entry
    share_id = str(uuid.uuid4())
    key_share = EHKeyShare(
        id=share_id,
        eh_id=eh_id,
        user_id=share_request.user_id,
        encrypted_passphrase=share_request.encrypted_passphrase,
        passphrase_hint=share_request.passphrase_hint or "",
        granted_by=user["user_id"],
        granted_at=datetime.now(timezone.utc),
        role=share_request.role,
        klausur_id=share_request.klausur_id,
        active=True
    )
    # Store in memory (shares are grouped per EH id)
    if eh_id not in storage.eh_key_shares_db:
        storage.eh_key_shares_db[eh_id] = []
    storage.eh_key_shares_db[eh_id].append(key_share)
    # Audit log
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user["user_id"],
        action="share",
        eh_id=eh_id,
        details={
            "shared_with": share_request.user_id,
            "role": share_request.role,
            "klausur_id": share_request.klausur_id
        }
    )
    return {
        "status": "shared",
        "share_id": share_id,
        "eh_id": eh_id,
        "shared_with": share_request.user_id,
        "role": share_request.role
    }
@router.get("/api/v1/eh/{eh_id}/shares")
async def list_eh_shares(eh_id: str, request: Request):
    """Return the active key shares of an EH; owner-only."""
    user = get_current_user(request)
    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    if eh.teacher_id != user["user_id"]:
        raise HTTPException(status_code=403, detail="Only the owner can view shares")
    # Revoked shares (active=False) stay stored for the audit trail but are hidden here.
    return [s.to_dict() for s in storage.eh_key_shares_db.get(eh_id, []) if s.active]
@router.delete("/api/v1/eh/{eh_id}/shares/{share_id}")
async def revoke_eh_share(eh_id: str, share_id: str, request: Request):
    """Deactivate one key share of an EH; owner-only."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]

    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    if eh.teacher_id != user["user_id"]:
        raise HTTPException(status_code=403, detail="Only the owner can revoke shares")

    # Shares are soft-revoked (active=False) rather than deleted, so the
    # grant history remains visible in the access chain.
    target = next(
        (s for s in storage.eh_key_shares_db.get(eh_id, []) if s.id == share_id),
        None,
    )
    if target is None:
        raise HTTPException(status_code=404, detail="Share not found")

    target.active = False
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user["user_id"],
        action="revoke_share",
        eh_id=eh_id,
        details={"revoked_user": target.user_id, "share_id": share_id}
    )
    return {"status": "revoked", "share_id": share_id}
# =============================================
# KLAUSUR LINKING
# =============================================
@router.post("/api/v1/eh/{eh_id}/link-klausur")
async def link_eh_to_klausur(
    eh_id: str,
    link_request: EHLinkKlausurRequest,
    request: Request
):
    """
    Link an Erwartungshorizont to a Klausur.

    Callable by the EH owner or by any holder of an active key share.
    Creates an EHKlausurLink record keyed by the Klausur id.
    """
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    uid = user["user_id"]

    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    holds_share = any(
        s.user_id == uid and s.active
        for s in storage.eh_key_shares_db.get(eh_id, [])
    )
    if eh.teacher_id != uid and not holds_share:
        raise HTTPException(status_code=403, detail="No access to this EH")

    klausur_id = link_request.klausur_id
    if klausur_id not in storage.klausuren_db:
        raise HTTPException(status_code=404, detail="Klausur not found")

    link = EHKlausurLink(
        id=str(uuid.uuid4()),
        eh_id=eh_id,
        klausur_id=klausur_id,
        linked_by=uid,
        linked_at=datetime.now(timezone.utc)
    )
    # Links are grouped per Klausur id.
    storage.eh_klausur_links_db.setdefault(klausur_id, []).append(link)

    log_eh_audit(
        tenant_id=tenant_id,
        user_id=uid,
        action="link_klausur",
        eh_id=eh_id,
        details={"klausur_id": klausur_id}
    )
    return {
        "status": "linked",
        "link_id": link.id,
        "eh_id": eh_id,
        "klausur_id": klausur_id
    }
@router.get("/api/v1/klausuren/{klausur_id}/linked-eh")
async def get_linked_eh(klausur_id: str, request: Request):
    """List every EH linked to a Klausur that the caller owns or was granted."""
    user = get_current_user(request)
    user_id = user["user_id"]

    if klausur_id not in storage.klausuren_db:
        raise HTTPException(status_code=404, detail="Klausur not found")

    visible = []
    for link in storage.eh_klausur_links_db.get(klausur_id, []):
        eh = storage.eh_db.get(link.eh_id)
        if eh is None:
            continue  # EH was removed after linking
        # Single pass over the shares: the first active one for this user
        # doubles as both the access check and the returned share info.
        my_share = next(
            (
                s for s in storage.eh_key_shares_db.get(link.eh_id, [])
                if s.user_id == user_id and s.active
            ),
            None,
        )
        is_owner = eh.teacher_id == user_id
        if not is_owner and my_share is None:
            continue  # caller has no relationship to this EH
        visible.append({
            "eh": eh.to_dict(),
            "link": link.to_dict(),
            "is_owner": is_owner,
            "share": my_share.to_dict() if my_share is not None else None,
        })
    return visible
@router.delete("/api/v1/eh/{eh_id}/link-klausur/{klausur_id}")
async def unlink_eh_from_klausur(eh_id: str, klausur_id: str, request: Request):
    """Remove the association between an EH and a Klausur; owner-only."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]

    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    if eh.teacher_id != user["user_id"]:
        raise HTTPException(status_code=403, detail="Only the owner can unlink")

    links = storage.eh_klausur_links_db.get(klausur_id, [])
    # Locate the first link for this EH; links are keyed by Klausur id.
    idx = next((i for i, lnk in enumerate(links) if lnk.eh_id == eh_id), None)
    if idx is None:
        raise HTTPException(status_code=404, detail="Link not found")

    del links[idx]
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user["user_id"],
        action="unlink_klausur",
        eh_id=eh_id,
        details={"klausur_id": klausur_id}
    )
    return {"status": "unlinked", "eh_id": eh_id, "klausur_id": klausur_id}
@router.get("/api/v1/eh/{eh_id}/access-chain")
async def get_eh_access_chain(eh_id: str, request: Request):
    """
    Get the complete access chain for an EH.

    Shows the correction chain: EK -> ZK -> DK -> FVL with the owner,
    active and revoked shares, and — for the owner only — pending
    invitations.
    """
    user = get_current_user(request)

    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")

    uid = user["user_id"]
    shares = storage.eh_key_shares_db.get(eh_id, [])
    is_owner = eh.teacher_id == uid
    if not is_owner and not any(s.user_id == uid and s.active for s in shares):
        raise HTTPException(status_code=403, detail="No access to this EH")

    active_shares = [s.to_dict() for s in shares if s.active]
    revoked_shares = [s.to_dict() for s in shares if not s.active]
    # Pending invitations are sensitive (emails) and shown to the owner only.
    pending = (
        [
            inv.to_dict()
            for inv in storage.eh_invitations_db.values()
            if inv.eh_id == eh_id and inv.status == 'pending'
        ]
        if is_owner
        else []
    )
    return {
        "eh_id": eh_id,
        "eh_title": eh.title,
        "owner": {
            "user_id": eh.teacher_id,
            "role": "erstkorrektor"
        },
        "active_shares": active_shares,
        "pending_invitations": pending,
        "revoked_shares": revoked_shares
    }

View File

@@ -0,0 +1,455 @@
"""
BYOEH Upload, List, and Core CRUD Routes
Endpoints for uploading, listing, getting, deleting,
indexing, and RAG-querying Erwartungshorizonte.
Extracted from routes/eh.py for file-size compliance.
"""
import os
import uuid
import json
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, HTTPException, Request, UploadFile, File, Form, BackgroundTasks
from models.enums import EHStatus
from models.eh import (
Erwartungshorizont,
EHRightsConfirmation,
)
from models.requests import (
EHUploadMetadata,
EHRAGQuery,
EHIndexRequest,
)
from services.auth_service import get_current_user
from services.eh_service import log_eh_audit
from config import EH_UPLOAD_DIR, OPENAI_API_KEY, ENVIRONMENT, RIGHTS_CONFIRMATION_TEXT
import storage
# BYOEH imports
from qdrant_service import (
get_collection_info, delete_eh_vectors, search_eh, index_eh_chunks
)
from eh_pipeline import (
decrypt_text, verify_key_hash, process_eh_for_indexing,
generate_single_embedding, EncryptionError, EmbeddingError
)
router = APIRouter()
# =============================================
# EH UPLOAD & LIST
# =============================================
@router.post("/api/v1/eh/upload")
async def upload_erwartungshorizont(
    file: UploadFile = File(...),
    metadata_json: str = Form(...),
    request: Request = None,
    background_tasks: BackgroundTasks = None
):
    """
    Upload an encrypted Erwartungshorizont.

    The file MUST be client-side encrypted.
    Server stores only the encrypted blob + key hash (never the passphrase).

    Persists three things: the encrypted file under a tenant-isolated path,
    the Erwartungshorizont record (status PENDING_RIGHTS until indexed),
    and a rights-confirmation record capturing who confirmed, when, and
    from where.

    Raises:
        HTTPException 400: metadata JSON invalid, or rights not confirmed.
    """
    # NOTE(review): `background_tasks` is injected but never used here — confirm
    # whether deferred processing was intended or the parameter can be dropped.
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    try:
        data = EHUploadMetadata(**json.loads(metadata_json))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid metadata: {str(e)}")
    if not data.rights_confirmed:
        raise HTTPException(status_code=400, detail="Rights confirmation required")
    eh_id = str(uuid.uuid4())
    # Create tenant-isolated directory
    upload_dir = f"{EH_UPLOAD_DIR}/{tenant_id}/{eh_id}"
    os.makedirs(upload_dir, exist_ok=True)
    # Save encrypted file
    encrypted_path = f"{upload_dir}/encrypted.bin"
    content = await file.read()
    with open(encrypted_path, "wb") as f:
        f.write(content)
    # Save salt separately (needed later to verify the passphrase and decrypt)
    with open(f"{upload_dir}/salt.txt", "w") as f:
        f.write(data.salt)
    # Create EH record
    eh = Erwartungshorizont(
        id=eh_id,
        tenant_id=tenant_id,
        teacher_id=user["user_id"],
        title=data.metadata.title,
        subject=data.metadata.subject,
        niveau=data.metadata.niveau,
        year=data.metadata.year,
        aufgaben_nummer=data.metadata.aufgaben_nummer,
        encryption_key_hash=data.encryption_key_hash,
        salt=data.salt,
        encrypted_file_path=encrypted_path,
        file_size_bytes=len(content),
        original_filename=data.original_filename,
        rights_confirmed=True,
        rights_confirmed_at=datetime.now(timezone.utc),
        status=EHStatus.PENDING_RIGHTS,
        chunk_count=0,
        indexed_at=None,
        error_message=None,
        training_allowed=False,  # ALWAYS FALSE - critical for compliance
        created_at=datetime.now(timezone.utc),
        deleted_at=None
    )
    storage.eh_db[eh_id] = eh
    # Store rights confirmation (who/when/where, for legal traceability)
    rights_confirmation = EHRightsConfirmation(
        id=str(uuid.uuid4()),
        eh_id=eh_id,
        teacher_id=user["user_id"],
        confirmation_type="upload",
        confirmation_text=RIGHTS_CONFIRMATION_TEXT,
        ip_address=request.client.host if request.client else None,
        user_agent=request.headers.get("user-agent"),
        confirmed_at=datetime.now(timezone.utc)
    )
    storage.eh_rights_db[rights_confirmation.id] = rights_confirmation
    # Audit log
    log_eh_audit(
        tenant_id=tenant_id,
        user_id=user["user_id"],
        action="upload",
        eh_id=eh_id,
        details={
            "subject": data.metadata.subject,
            "year": data.metadata.year,
            "file_size": len(content)
        },
        ip_address=request.client.host if request.client else None,
        user_agent=request.headers.get("user-agent")
    )
    return eh.to_dict()
@router.get("/api/v1/eh")
async def list_erwartungshorizonte(
    request: Request,
    subject: Optional[str] = None,
    year: Optional[int] = None
):
    """List the tenant's non-deleted Erwartungshorizonte, optionally filtered
    by subject and/or year."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    return [
        eh.to_dict()
        for eh in storage.eh_db.values()
        if eh.tenant_id == tenant_id
        and eh.deleted_at is None
        and (not subject or eh.subject == subject)
        and (not year or eh.year == year)
    ]
# =============================================
# SPECIFIC EH ROUTES (must come before {eh_id} catch-all)
# =============================================
@router.get("/api/v1/eh/audit-log")
async def get_eh_audit_log(
    request: Request,
    eh_id: Optional[str] = None,
    limit: int = 100
):
    """Return the tenant's BYOEH audit entries, newest first, capped at `limit`."""
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    # Tenant filter plus optional per-EH filter in one pass.
    matching = (
        e for e in storage.eh_audit_db
        if e.tenant_id == tenant_id and (not eh_id or e.eh_id == eh_id)
    )
    newest_first = sorted(matching, key=lambda e: e.created_at, reverse=True)
    return [e.to_dict() for e in newest_first[:limit]]
@router.get("/api/v1/eh/rights-text")
async def get_rights_confirmation_text():
    """Expose the canonical rights-confirmation wording for display in the UI."""
    return {"text": RIGHTS_CONFIRMATION_TEXT, "version": "v1.0"}
@router.get("/api/v1/eh/qdrant-status")
async def get_qdrant_status(request: Request):
    """Report Qdrant collection status; admin-only outside development."""
    user = get_current_user(request)
    is_admin = user.get("role") == "admin"
    # In development every authenticated user may peek at the collection.
    if not is_admin and ENVIRONMENT != "development":
        raise HTTPException(status_code=403, detail="Admin access required")
    return await get_collection_info()
@router.get("/api/v1/eh/shared-with-me")
async def list_shared_eh(request: Request):
    """List every EH another teacher has actively shared with the caller."""
    user = get_current_user(request)
    uid = user["user_id"]
    # Shares whose EH record has disappeared are silently skipped.
    return [
        {"eh": storage.eh_db[eh_id].to_dict(), "share": share.to_dict()}
        for eh_id, shares in storage.eh_key_shares_db.items()
        for share in shares
        if share.user_id == uid and share.active and eh_id in storage.eh_db
    ]
# =============================================
# GENERIC EH ROUTES
# =============================================
@router.get("/api/v1/eh/{eh_id}")
async def get_erwartungshorizont(eh_id: str, request: Request):
    """Fetch one EH by id; visible to its owner or an admin, unless soft-deleted."""
    user = get_current_user(request)
    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    is_admin = user.get("role") == "admin"
    if eh.teacher_id != user["user_id"] and not is_admin:
        raise HTTPException(status_code=403, detail="Access denied")
    # Soft-deleted records behave like missing ones toward callers.
    if eh.deleted_at is not None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont was deleted")
    return eh.to_dict()
@router.delete("/api/v1/eh/{eh_id}")
async def delete_erwartungshorizont(eh_id: str, request: Request):
    """Soft-delete an EH and best-effort purge its vectors from Qdrant."""
    user = get_current_user(request)
    eh = storage.eh_db.get(eh_id)
    if eh is None:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    if eh.teacher_id != user["user_id"] and user.get("role") != "admin":
        raise HTTPException(status_code=403, detail="Access denied")

    # Soft delete: keep the record (and its audit trail), only stamp deleted_at.
    eh.deleted_at = datetime.now(timezone.utc)

    # Vector cleanup is best-effort — a Qdrant outage must not block deletion.
    try:
        deleted_count = await delete_eh_vectors(eh_id)
        print(f"Deleted {deleted_count} vectors for EH {eh_id}")
    except Exception as e:
        print(f"Warning: Failed to delete vectors: {e}")

    log_eh_audit(
        tenant_id=eh.tenant_id,
        user_id=user["user_id"],
        action="delete",
        eh_id=eh_id,
        ip_address=request.client.host if request.client else None,
        user_agent=request.headers.get("user-agent")
    )
    return {"status": "deleted", "id": eh_id}
@router.post("/api/v1/eh/{eh_id}/index")
async def index_erwartungshorizont(
    eh_id: str,
    data: EHIndexRequest,
    request: Request
):
    """
    Index an Erwartungshorizont for RAG queries.

    Requires the passphrase to decrypt, chunk, embed, and re-encrypt chunks.
    The passphrase is only used transiently and never stored.

    Status machine: PROCESSING is set before the work starts; on success
    the record becomes INDEXED (chunk_count + indexed_at filled in); any
    failure leaves it in ERROR with error_message set.

    Raises:
        HTTPException 404: unknown EH id.
        HTTPException 403: caller is neither owner nor admin.
        HTTPException 401: passphrase does not match the stored key hash.
        HTTPException 400/500: decryption / embedding / other indexing failure.
    """
    user = get_current_user(request)
    if eh_id not in storage.eh_db:
        raise HTTPException(status_code=404, detail="Erwartungshorizont not found")
    eh = storage.eh_db[eh_id]
    if eh.teacher_id != user["user_id"] and user.get("role") != "admin":
        raise HTTPException(status_code=403, detail="Access denied")
    # Verify passphrase matches key hash (rejects wrong passphrases before any work)
    if not verify_key_hash(data.passphrase, eh.salt, eh.encryption_key_hash):
        raise HTTPException(status_code=401, detail="Invalid passphrase")
    eh.status = EHStatus.PROCESSING
    try:
        # Read encrypted file
        with open(eh.encrypted_file_path, "rb") as f:
            encrypted_content = f.read()
        # Decrypt the file — the encrypted payload is stored as UTF-8 text;
        # decrypt_text expects the string form.
        decrypted_text = decrypt_text(
            encrypted_content.decode('utf-8'),
            data.passphrase,
            eh.salt
        )
        # Process for indexing: chunk, embed, and re-encrypt each chunk with
        # the same passphrase/salt so Qdrant never holds plaintext.
        chunk_count, chunks_data = await process_eh_for_indexing(
            eh_id=eh_id,
            tenant_id=eh.tenant_id,
            subject=eh.subject,
            text_content=decrypted_text,
            passphrase=data.passphrase,
            salt_hex=eh.salt
        )
        # Index in Qdrant (tenant- and subject-tagged)
        await index_eh_chunks(
            eh_id=eh_id,
            tenant_id=eh.tenant_id,
            subject=eh.subject,
            chunks=chunks_data
        )
        # Update EH record
        eh.status = EHStatus.INDEXED
        eh.chunk_count = chunk_count
        eh.indexed_at = datetime.now(timezone.utc)
        # Audit log
        log_eh_audit(
            tenant_id=eh.tenant_id,
            user_id=user["user_id"],
            action="indexed",
            eh_id=eh_id,
            details={"chunk_count": chunk_count}
        )
        return {
            "status": "indexed",
            "id": eh_id,
            "chunk_count": chunk_count
        }
    # Each failure path records the error on the EH before surfacing it,
    # so the UI can show why indexing stalled.
    except EncryptionError as e:
        eh.status = EHStatus.ERROR
        eh.error_message = str(e)
        raise HTTPException(status_code=400, detail=f"Decryption failed: {str(e)}")
    except EmbeddingError as e:
        eh.status = EHStatus.ERROR
        eh.error_message = str(e)
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {str(e)}")
    except Exception as e:
        eh.status = EHStatus.ERROR
        eh.error_message = str(e)
        raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")
@router.post("/api/v1/eh/rag-query")
async def rag_query_eh(data: EHRAGQuery, request: Request):
    """
    RAG query against teacher's Erwartungshorizonte.

    1. Semantic search in Qdrant (tenant-isolated)
    2. Decrypt relevant chunks on-the-fly
    3. Return context for LLM usage

    Returns a dict with the joined plaintext ``context``, the per-chunk
    ``sources`` (with scores), and the original ``query``.

    Raises:
        HTTPException 500: missing OpenAI key, embedding failure, or any
        other search/decryption error.
    """
    user = get_current_user(request)
    tenant_id = user.get("tenant_id") or user.get("school_id") or user["user_id"]
    if not OPENAI_API_KEY:
        raise HTTPException(status_code=500, detail="OpenAI API key not configured")
    try:
        # Generate embedding for query
        query_embedding = await generate_single_embedding(data.query_text)
        # Search in Qdrant (tenant-isolated)
        results = await search_eh(
            query_embedding=query_embedding,
            tenant_id=tenant_id,
            subject=data.subject,
            limit=data.limit
        )
        # Decrypt matching chunks; each chunk uses its own EH's salt with the
        # caller-supplied passphrase.
        decrypted_chunks = []
        for r in results:
            eh = storage.eh_db.get(r["eh_id"])
            if eh and r.get("encrypted_content"):
                try:
                    decrypted = decrypt_text(
                        r["encrypted_content"],
                        data.passphrase,
                        eh.salt
                    )
                    decrypted_chunks.append({
                        "text": decrypted,
                        "eh_id": r["eh_id"],
                        "eh_title": eh.title,
                        "chunk_index": r["chunk_index"],
                        "score": r["score"]
                    })
                except EncryptionError:
                    # Skip chunks that can't be decrypted (wrong passphrase for different EH)
                    pass
        # Audit log — records counts only, never query text or plaintext chunks.
        log_eh_audit(
            tenant_id=tenant_id,
            user_id=user["user_id"],
            action="rag_query",
            details={
                "query_length": len(data.query_text),
                "results_count": len(results),
                "decrypted_count": len(decrypted_chunks)
            },
            ip_address=request.client.host if request.client else None,
            user_agent=request.headers.get("user-agent")
        )
        return {
            "context": "\n\n---\n\n".join([c["text"] for c in decrypted_chunks]),
            "sources": decrypted_chunks,
            "query": data.query_text
        }
    except EmbeddingError as e:
        raise HTTPException(status_code=500, detail=f"Query embedding failed: {str(e)}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"RAG query failed: {str(e)}")