Files
breakpilot-lehrer/klausur-service/backend/zeugnis_models.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

341 lines
10 KiB
Python

"""
Zeugnis Rights-Aware Crawler - Data Models
Pydantic models for API requests/responses and internal data structures.
Database schema is defined in metrics_db.py.
"""
from datetime import datetime
from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
import uuid
# =============================================================================
# Enums
# =============================================================================
class LicenseType(str, Enum):
    """License category of a source, used when deciding on training permission.

    Members subclass ``str``, so each one compares equal to its raw value.
    Categories marked "no training" below are flagged as not usable for
    training.
    """

    # Official works (§5 UrhG, German Copyright Act)
    PUBLIC_DOMAIN = "public_domain"
    # Creative Commons Attribution
    CC_BY = "cc_by"
    # Creative Commons Attribution-ShareAlike
    CC_BY_SA = "cc_by_sa"
    # Creative Commons NonCommercial - no training
    CC_BY_NC = "cc_by_nc"
    # Creative Commons NonCommercial-ShareAlike - no training
    CC_BY_NC_SA = "cc_by_nc_sa"
    # Government statutes (free of copyright)
    GOV_STATUTE_FREE_USE = "gov_statute"
    # Standard copyright - no training
    ALL_RIGHTS_RESERVED = "all_rights"
    # License not determined yet; needs manual review
    UNKNOWN_REQUIRES_REVIEW = "unknown"
class CrawlStatus(str, Enum):
    """Lifecycle state shared by crawl jobs and seed URLs.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
class DocType(str, Enum):
    """Category of a zeugnis-related document.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    # Official regulation
    VERORDNUNG = "verordnung"
    # Implementation guide
    HANDREICHUNG = "handreichung"
    # Form template
    FORMULAR = "formular"
    # Decree
    ERLASS = "erlass"
    # School regulations
    SCHULORDNUNG = "schulordnung"
    # Anything else
    SONSTIGES = "sonstiges"
class EventType(str, Enum):
    """Kinds of events recorded in the audit trail.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
# =============================================================================
# Bundesland Definitions
# =============================================================================
# Registry of the German federal states, keyed by lowercase two-letter code.
# Every entry holds the full state name plus the uppercase form of its code.
BUNDESLAENDER = {
    code: {"name": full_name, "short": code.upper()}
    for code, full_name in (
        ("bw", "Baden-Württemberg"),
        ("by", "Bayern"),
        ("be", "Berlin"),
        ("bb", "Brandenburg"),
        ("hb", "Bremen"),
        ("hh", "Hamburg"),
        ("he", "Hessen"),
        ("mv", "Mecklenburg-Vorpommern"),
        ("ni", "Niedersachsen"),
        ("nw", "Nordrhein-Westfalen"),
        ("rp", "Rheinland-Pfalz"),
        ("sl", "Saarland"),
        ("sn", "Sachsen"),
        ("st", "Sachsen-Anhalt"),
        ("sh", "Schleswig-Holstein"),
        ("th", "Thüringen"),
    )
}
# Per-Bundesland training permission, based on Word document analysis.
# Keys are the lowercase Bundesland codes; True means training is permitted.
# States classified as "restricted" are recorded as False to stay on the safe side.
TRAINING_PERMISSIONS = {
    "bw": True,   # official work
    "by": True,   # official work
    "be": False,  # no license
    "bb": False,  # no license
    "hb": False,  # restricted -> False for safety
    "hh": False,  # no license
    "he": True,   # official work
    "mv": False,  # restricted -> False for safety
    "ni": True,   # official work
    "nw": True,   # official work
    "rp": True,   # official work
    "sl": False,  # no license
    "sn": True,   # official work
    "st": False,  # restricted -> False for safety
    "sh": True,   # official work
    "th": True,   # official work
}
# =============================================================================
# API Models - Sources
# =============================================================================
class ZeugnisSourceBase(BaseModel):
    """Fields common to every representation of a zeugnis source."""

    bundesland: str = Field(
        default=...,
        description="Bundesland code (e.g., 'ni', 'by')",
    )
    name: str = Field(
        default=...,
        description="Full name of the source",
    )
    base_url: Optional[str] = Field(
        default=None,
        description="Base URL for the source",
    )
    license_type: LicenseType = Field(
        default=...,
        description="License classification",
    )
    # Defaults to False, so training permission has to be granted explicitly.
    training_allowed: bool = Field(
        default=False,
        description="Whether AI training is permitted",
    )
class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Payload for creating a source; adds nothing to the base fields."""
class ZeugnisSource(ZeugnisSourceBase):
    """Complete source record: base fields plus identity, verification and timestamps."""

    id: str

    # Verification metadata; both default to None.
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None

    created_at: datetime
    updated_at: datetime

    class Config:
        # NOTE(review): `from_attributes` is the Pydantic v2 option for building a
        # model from an object's attributes; v2 prefers `model_config` over a
        # class-based Config - confirm against the installed Pydantic version.
        from_attributes = True
class ZeugnisSourceVerify(BaseModel):
    """Payload for recording a license verification of a source."""

    verified_by: str = Field(
        default=...,
        description="User ID who verified",
    )
    # Both verdict fields are required; neither has a default.
    license_type: LicenseType
    training_allowed: bool
    # Optional free-text remarks.
    notes: Optional[str] = None
# =============================================================================
# API Models - Seed URLs
# =============================================================================
class SeedUrlBase(BaseModel):
    """Fields common to every representation of a seed URL."""

    url: str = Field(
        default=...,
        description="URL to crawl",
    )
    # Falls back to DocType.VERORDNUNG when the caller does not specify a type.
    doc_type: DocType = Field(
        default=DocType.VERORDNUNG,
        description="Type of document",
    )
class SeedUrlCreate(SeedUrlBase):
    """Payload for registering a seed URL under an existing source."""

    # Identifier of the source this seed URL belongs to (required).
    source_id: str
class SeedUrl(SeedUrlBase):
    """Complete seed URL record including crawl state."""

    id: str
    source_id: str

    # Crawl bookkeeping: starts as PENDING with no crawl time and no error.
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None
    error_message: Optional[str] = None

    created_at: datetime

    class Config:
        # NOTE(review): `from_attributes` is the Pydantic v2 option for building a
        # model from an object's attributes - confirm the installed version.
        from_attributes = True
# =============================================================================
# API Models - Documents
# =============================================================================
class ZeugnisDocumentBase(BaseModel):
    """Fields common to every representation of a zeugnis document."""

    title: Optional[str] = None
    # The only required field of the base model.
    url: str
    content_type: Optional[str] = None
    # NOTE(review): unit is not stated in this module - presumably bytes; confirm.
    file_size: Optional[int] = None
class ZeugnisDocument(ZeugnisDocumentBase):
    """Complete document record including storage, rights and indexing flags."""

    id: str
    seed_url_id: str

    # Storage details.
    # NOTE(review): hash algorithm and path layout are not visible here;
    # `minio_path` presumably points at an object in MinIO - confirm.
    content_hash: Optional[str] = None
    minio_path: Optional[str] = None

    # Both flags default to False: not cleared for training, not yet indexed.
    training_allowed: bool = False
    indexed_in_qdrant: bool = False

    # Optional context; how these are populated is not visible in this module.
    bundesland: Optional[str] = None
    source_name: Optional[str] = None

    created_at: datetime
    updated_at: datetime

    class Config:
        # NOTE(review): `from_attributes` is the Pydantic v2 option for building a
        # model from an object's attributes - confirm the installed version.
        from_attributes = True
class ZeugnisDocumentVersion(BaseModel):
    """One entry in a document's version history."""

    id: str
    document_id: str
    version: int
    # Required here, unlike the optional hash on ZeugnisDocument.
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None
    created_at: datetime

    class Config:
        # NOTE(review): `from_attributes` is the Pydantic v2 option for building a
        # model from an object's attributes - confirm the installed version.
        from_attributes = True
# =============================================================================
# API Models - Crawler
# =============================================================================
class CrawlerStatus(BaseModel):
    """Snapshot of the crawler's current state.

    Every field has a default, so an instance built without arguments
    describes an idle crawler with all counters at zero.
    """

    is_running: bool = False

    # What is being processed right now; None when nothing is set.
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None

    queue_length: int = 0

    # Counters for the current day.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0

    last_activity: Optional[datetime] = None
    errors_today: int = 0
class CrawlQueueItem(BaseModel):
    """Single entry of the crawl queue."""

    id: str
    source_id: str
    bundesland: str
    source_name: str

    # NOTE(review): unlike CrawlRequest.priority, no 1..10 bounds are enforced here.
    priority: int = 5
    status: CrawlStatus = CrawlStatus.PENDING

    # Timing; both stay None until set.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Progress counters, all starting at zero.
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0

    created_at: datetime
class CrawlRequest(BaseModel):
    """Request body for starting a crawl.

    Both selectors are optional; how a request without either is handled
    is decided by the consumer of this model, not here.
    """

    bundesland: Optional[str] = Field(
        default=None,
        description="Specific Bundesland to crawl",
    )
    source_id: Optional[str] = Field(
        default=None,
        description="Specific source ID to crawl",
    )
    # Validated to the inclusive range 1..10; defaults to 5.
    priority: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Priority (1=lowest, 10=highest)",
    )
class CrawlResult(BaseModel):
    """Outcome of one crawl operation; every field is required."""

    source_id: str
    bundesland: str

    # Document tallies for the run.
    documents_found: int
    documents_indexed: int
    documents_skipped: int

    # Error messages collected during the run.
    errors: List[str]
    duration_seconds: float
# =============================================================================
# API Models - Statistics
# =============================================================================
class ZeugnisStats(BaseModel):
    """Aggregate statistics for the zeugnis crawler; all counters default to zero."""

    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0

    # Per-Bundesland breakdown as free-form dicts.
    # NOTE(review): literal list default - Pydantic is documented to copy field
    # defaults per instance, but `Field(default_factory=list)` would make that
    # explicit; confirm before changing, as it may alter the generated schema.
    per_bundesland: List[Dict[str, Any]] = []
class BundeslandStats(BaseModel):
    """Statistics for a single Bundesland."""

    # Bundesland code and name.
    bundesland: str
    name: str

    training_allowed: bool

    # Document tallies for this Bundesland.
    document_count: int
    indexed_count: int

    last_crawled: Optional[datetime] = None
# =============================================================================
# API Models - Audit
# =============================================================================
class UsageEvent(BaseModel):
    """Single audit-trail entry describing one use of a document."""

    id: str
    document_id: str
    event_type: EventType

    # Acting user; may be absent.
    user_id: Optional[str] = None
    # Optional free-form payload with additional event details.
    details: Optional[Dict[str, Any]] = None

    created_at: datetime

    class Config:
        # NOTE(review): `from_attributes` is the Pydantic v2 option for building a
        # model from an object's attributes - confirm the installed version.
        from_attributes = True
class AuditExport(BaseModel):
    """Audit export bundle, intended for GDPR-compliant reporting.

    Holds the exported usage events together with the export metadata;
    every field is required.
    """

    # When the export was produced and who asked for it.
    export_date: datetime
    requested_by: str

    events: List[UsageEvent]
    document_count: int

    # Date range of the export.
    date_range_start: datetime
    date_range_end: datetime
# =============================================================================
# Helper Functions
# =============================================================================
def generate_id() -> str:
    """Return a freshly generated random UUID (version 4) in string form."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
def get_training_allowed(bundesland: Optional[str]) -> bool:
    """Return whether training is permitted for a Bundesland code.

    Args:
        bundesland: Bundesland code in any letter case (e.g. "ni", "BY").
            ``None`` and the empty string are accepted, because several
            models in this module declare their Bundesland as optional.

    Returns:
        The entry of ``TRAINING_PERMISSIONS`` for the lower-cased code, or
        ``False`` when the code is missing, empty or unknown.
    """
    # Fail closed: without a code there is nothing to look up, so deny
    # training instead of raising AttributeError on ``None.lower()``.
    if not bundesland:
        return False
    return TRAINING_PERMISSIONS.get(bundesland.lower(), False)
def get_bundesland_name(code: str) -> str:
    """Resolve a Bundesland code to its full name.

    The lookup is case-insensitive. When the code is not registered in
    ``BUNDESLAENDER``, the code is returned unchanged.
    """
    entry = BUNDESLAENDER.get(code.lower())
    if entry is None:
        return code
    return entry.get("name", code)
def get_license_for_bundesland(bundesland: Optional[str]) -> LicenseType:
    """Return the default license classification for a Bundesland.

    Args:
        bundesland: Bundesland code in any letter case; ``None`` or an
            empty string is treated like an unknown code.

    Returns:
        ``LicenseType.GOV_STATUTE_FREE_USE`` when training is permitted for
        the Bundesland, otherwise ``LicenseType.UNKNOWN_REQUIRES_REVIEW``.
    """
    # Delegate to get_training_allowed so both helpers share a single
    # permission lookup and cannot drift apart. The truthiness guard keeps a
    # missing code on the "requires review" path instead of raising.
    if bundesland and get_training_allowed(bundesland):
        return LicenseType.GOV_STATUTE_FREE_USE
    return LicenseType.UNKNOWN_REQUIRES_REVIEW