fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
340
klausur-service/backend/zeugnis_models.py
Normal file
340
klausur-service/backend/zeugnis_models.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Zeugnis Rights-Aware Crawler - Data Models
|
||||
|
||||
Pydantic models for API requests/responses and internal data structures.
|
||||
Database schema is defined in metrics_db.py.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
import uuid
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
class LicenseType(str, Enum):
    """Licence categories used to decide whether a source may be trained on.

    Members annotated "no training" below are the ones the original author
    marked as not permitting training use.
    """

    # Official works ("Amtliche Werke", §5 UrhG).
    PUBLIC_DOMAIN = "public_domain"

    # Creative Commons Attribution.
    CC_BY = "cc_by"

    # Creative Commons Attribution-ShareAlike.
    CC_BY_SA = "cc_by_sa"

    # Creative Commons NonCommercial -- no training.
    CC_BY_NC = "cc_by_nc"

    # Creative Commons NonCommercial-ShareAlike -- no training.
    CC_BY_NC_SA = "cc_by_nc_sa"

    # Government statutes ("gemeinfrei", i.e. free of copyright).
    GOV_STATUTE_FREE_USE = "gov_statute"

    # Standard copyright -- no training.
    ALL_RIGHTS_RESERVED = "all_rights"

    # Licence not determined yet; needs manual review.
    UNKNOWN_REQUIRES_REVIEW = "unknown"
|
||||
|
||||
|
||||
class CrawlStatus(str, Enum):
    """Lifecycle states shared by crawl jobs and seed URLs.

    Each member's value is its own name in lower case.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
|
||||
|
||||
|
||||
class DocType(str, Enum):
    """Document categories a zeugnis document can be filed under."""

    # Official regulation.
    VERORDNUNG = "verordnung"

    # Implementation guide.
    HANDREICHUNG = "handreichung"

    # Form template.
    FORMULAR = "formular"

    # Decree.
    ERLASS = "erlass"

    # School regulations.
    SCHULORDNUNG = "schulordnung"

    # Anything not covered by the categories above.
    SONSTIGES = "sonstiges"
|
||||
|
||||
|
||||
class EventType(str, Enum):
    """Kinds of events recorded in the audit trail.

    Each member's value is its own name in lower case.
    """

    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Bundesland Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Lookup table: lower-case Bundesland code -> {"name": display name,
# "short": upper-case code}.  The short form of every entry is simply the
# code upper-cased, so the table is derived from (code, name) pairs.
# Insertion order follows the order of the pairs below.
BUNDESLAENDER: Dict[str, Dict[str, str]] = {
    code: {"name": name, "short": code.upper()}
    for code, name in (
        ("bw", "Baden-Württemberg"),
        ("by", "Bayern"),
        ("be", "Berlin"),
        ("bb", "Brandenburg"),
        ("hb", "Bremen"),
        ("hh", "Hamburg"),
        ("he", "Hessen"),
        ("mv", "Mecklenburg-Vorpommern"),
        ("ni", "Niedersachsen"),
        ("nw", "Nordrhein-Westfalen"),
        ("rp", "Rheinland-Pfalz"),
        ("sl", "Saarland"),
        ("sn", "Sachsen"),
        ("st", "Sachsen-Anhalt"),
        ("sh", "Schleswig-Holstein"),
        ("th", "Thüringen"),
    )
}
|
||||
|
||||
|
||||
# Per-Bundesland training permission, keyed by lower-case Bundesland code.
# According to the original author's note these values come from a Word
# document analysis.  The trailing comment on each entry records the reason:
#   "official work" -> training permitted
#   "no license"    -> training not permitted
#   "restricted"    -> treated as not permitted, to stay on the safe side
TRAINING_PERMISSIONS: Dict[str, bool] = {
    "bw": True,   # official work
    "by": True,   # official work
    "be": False,  # no license
    "bb": False,  # no license
    "hb": False,  # restricted -> False for safety
    "hh": False,  # no license
    "he": True,   # official work
    "mv": False,  # restricted -> False for safety
    "ni": True,   # official work
    "nw": True,   # official work
    "rp": True,   # official work
    "sl": False,  # no license
    "sn": True,   # official work
    "st": False,  # restricted -> False for safety
    "sh": True,   # official work
    "th": True,   # official work
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Sources
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisSourceBase(BaseModel):
    """Fields shared by all zeugnis source payloads.

    ``bundesland``, ``name`` and ``license_type`` are required.
    ``base_url`` defaults to ``None`` and ``training_allowed`` to ``False``.
    """

    bundesland: str = Field(
        ...,
        description="Bundesland code (e.g., 'ni', 'by')",
    )
    name: str = Field(
        ...,
        description="Full name of the source",
    )
    base_url: Optional[str] = Field(
        None,
        description="Base URL for the source",
    )
    license_type: LicenseType = Field(
        ...,
        description="License classification",
    )
    # Off unless the caller explicitly enables it.
    training_allowed: bool = Field(
        False,
        description="Whether AI training is permitted",
    )
|
||||
|
||||
|
||||
class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Payload for creating a source; adds no fields to ``ZeugnisSourceBase``."""
|
||||
|
||||
|
||||
class ZeugnisSource(ZeugnisSourceBase):
    """Complete source record: the base fields plus identity and timestamps.

    ``id``, ``created_at`` and ``updated_at`` are required; the two
    verification fields default to ``None``.
    """

    id: str

    # Verification metadata; both stay ``None`` unless supplied.
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None

    # Record timestamps.
    created_at: datetime
    updated_at: datetime

    class Config:
        # pydantic option; presumably set so instances can be built from
        # attribute-bearing objects such as DB rows -- confirm against callers.
        from_attributes = True
|
||||
|
||||
|
||||
class ZeugnisSourceVerify(BaseModel):
    """Payload submitted when a source's license is verified.

    ``verified_by``, ``license_type`` and ``training_allowed`` are required;
    ``notes`` is optional and defaults to ``None``.
    """

    verified_by: str = Field(
        ...,
        description="User ID who verified",
    )
    license_type: LicenseType
    training_allowed: bool
    notes: Optional[str] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Seed URLs
|
||||
# =============================================================================
|
||||
|
||||
class SeedUrlBase(BaseModel):
    """Fields shared by all seed URL payloads.

    ``url`` is required; ``doc_type`` falls back to ``DocType.VERORDNUNG``.
    """

    url: str = Field(
        ...,
        description="URL to crawl",
    )
    doc_type: DocType = Field(
        DocType.VERORDNUNG,
        description="Type of document",
    )
|
||||
|
||||
|
||||
class SeedUrlCreate(SeedUrlBase):
    """Payload for creating a seed URL: the base fields plus ``source_id``."""

    # Required; identifies the source this seed URL is attached to.
    source_id: str
|
||||
|
||||
|
||||
class SeedUrl(SeedUrlBase):
    """Complete seed URL record: base fields plus identity and crawl state.

    ``id``, ``source_id`` and ``created_at`` are required.  ``status``
    starts as ``CrawlStatus.PENDING``; ``last_crawled`` and
    ``error_message`` default to ``None``.
    """

    id: str
    source_id: str

    # Crawl state.
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None
    error_message: Optional[str] = None

    created_at: datetime

    class Config:
        # pydantic option; presumably set so instances can be built from
        # attribute-bearing objects such as DB rows -- confirm against callers.
        from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Documents
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisDocumentBase(BaseModel):
    """Fields shared by all zeugnis document payloads.

    Only ``url`` is required; ``title``, ``content_type`` and ``file_size``
    default to ``None``.
    """

    title: Optional[str] = None
    url: str
    content_type: Optional[str] = None
    file_size: Optional[int] = None
|
||||
|
||||
|
||||
class ZeugnisDocument(ZeugnisDocumentBase):
    """Complete document record: base fields plus storage and rights state.

    ``id``, ``seed_url_id``, ``created_at`` and ``updated_at`` are required.
    Both boolean flags default to ``False`` and every optional string
    defaults to ``None``.
    """

    id: str
    seed_url_id: str

    # Storage-related fields; unset until supplied.
    content_hash: Optional[str] = None
    minio_path: Optional[str] = None

    # Flags are off by default.
    training_allowed: bool = False
    indexed_in_qdrant: bool = False

    # Optional source context.
    bundesland: Optional[str] = None
    source_name: Optional[str] = None

    # Record timestamps.
    created_at: datetime
    updated_at: datetime

    class Config:
        # pydantic option; presumably set so instances can be built from
        # attribute-bearing objects such as DB rows -- confirm against callers.
        from_attributes = True
|
||||
|
||||
|
||||
class ZeugnisDocumentVersion(BaseModel):
    """One entry in a document's version history.

    ``id``, ``document_id``, ``version``, ``content_hash`` and
    ``created_at`` are required; ``minio_path`` and ``change_summary``
    default to ``None``.
    """

    id: str
    document_id: str
    version: int
    content_hash: str

    # Optional details about this version.
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None

    created_at: datetime

    class Config:
        # pydantic option; presumably set so instances can be built from
        # attribute-bearing objects such as DB rows -- confirm against callers.
        from_attributes = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Crawler
|
||||
# =============================================================================
|
||||
|
||||
class CrawlerStatus(BaseModel):
    """Snapshot of the crawler's current state.

    Every field has a default, so an empty instance describes an idle
    crawler: not running, nothing queued, all counters at zero.
    """

    is_running: bool = False

    # What is being crawled right now, if anything.
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None

    queue_length: int = 0

    # Counters whose names indicate a per-day scope.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0

    last_activity: Optional[datetime] = None
    errors_today: int = 0
|
||||
|
||||
|
||||
class CrawlQueueItem(BaseModel):
    """One entry in the crawl queue.

    ``id``, ``source_id``, ``bundesland``, ``source_name`` and
    ``created_at`` are required.  ``priority`` defaults to 5, ``status`` to
    ``CrawlStatus.PENDING``, the timestamps to ``None`` and the counters
    to 0.
    """

    id: str
    source_id: str
    bundesland: str
    source_name: str

    # Scheduling state.  Unlike ``CrawlRequest.priority`` this field carries
    # no range constraint.
    priority: int = 5
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Progress counters.
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0

    created_at: datetime
|
||||
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request body for starting a crawl.

    Both selectors are optional and default to ``None``.  ``priority``
    defaults to 5 and is constrained to the inclusive range 1..10.
    """

    bundesland: Optional[str] = Field(
        None,
        description="Specific Bundesland to crawl",
    )
    source_id: Optional[str] = Field(
        None,
        description="Specific source ID to crawl",
    )
    priority: int = Field(
        5,
        ge=1,
        le=10,
        description="Priority (1=lowest, 10=highest)",
    )
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
    """Outcome of a single crawl operation.  All fields are required."""

    # What was crawled.
    source_id: str
    bundesland: str

    # Document counters.
    documents_found: int
    documents_indexed: int
    documents_skipped: int

    # Error messages as strings.
    errors: List[str]

    duration_seconds: float
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Statistics
|
||||
# =============================================================================
|
||||
|
||||
class ZeugnisStats(BaseModel):
    """Aggregate statistics for the zeugnis crawler.

    Every counter defaults to 0 and ``per_bundesland`` to an empty list.
    """

    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0

    # NOTE(review): a literal ``[]`` default; pydantic is documented to copy
    # mutable defaults per instance -- confirm for the pinned pydantic version.
    per_bundesland: List[Dict[str, Any]] = []
|
||||
|
||||
|
||||
class BundeslandStats(BaseModel):
    """Statistics for a single Bundesland.

    All fields are required except ``last_crawled``, which defaults to
    ``None``.
    """

    bundesland: str
    name: str
    training_allowed: bool

    # Document counters.
    document_count: int
    indexed_count: int

    last_crawled: Optional[datetime] = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Models - Audit
|
||||
# =============================================================================
|
||||
|
||||
class UsageEvent(BaseModel):
    """One audit-trail entry describing an event on a document.

    ``id``, ``document_id``, ``event_type`` and ``created_at`` are required;
    ``user_id`` and ``details`` default to ``None``.
    """

    id: str
    document_id: str
    event_type: EventType

    # Optional context for the event.
    user_id: Optional[str] = None
    details: Optional[Dict[str, Any]] = None

    created_at: datetime

    class Config:
        # pydantic option; presumably set so instances can be built from
        # attribute-bearing objects such as DB rows -- confirm against callers.
        from_attributes = True
|
||||
|
||||
|
||||
class AuditExport(BaseModel):
    """Audit export payload (described by the original author as GDPR-compliant).

    All fields are required.
    """

    # Export metadata.
    export_date: datetime
    requested_by: str

    # Exported content.
    events: List[UsageEvent]
    document_count: int

    # Date range of the export.
    date_range_start: datetime
    date_range_end: datetime
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def generate_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    new_id = uuid.uuid4()
    return str(new_id)
|
||||
|
||||
|
||||
def get_training_allowed(bundesland: str) -> bool:
    """Return whether training is permitted for a Bundesland code.

    The code is lower-cased before it is looked up in
    ``TRAINING_PERMISSIONS``; codes missing from that table yield ``False``.
    """
    code = bundesland.lower()
    if code in TRAINING_PERMISSIONS:
        return TRAINING_PERMISSIONS[code]
    return False
|
||||
|
||||
|
||||
def get_bundesland_name(code: str) -> str:
    """Return the full Bundesland name for a code.

    The code is lower-cased for the lookup in ``BUNDESLAENDER``.  If the
    code is unknown, or its entry has no ``"name"`` key, the code is
    returned exactly as it was passed in.
    """
    entry = BUNDESLAENDER.get(code.lower())
    if entry is None:
        return code
    return entry.get("name", code)
|
||||
|
||||
|
||||
def get_license_for_bundesland(bundesland: str) -> LicenseType:
    """Return the license type assigned to a Bundesland code.

    The code is lower-cased and looked up in ``TRAINING_PERMISSIONS``.  A
    truthy entry yields ``LicenseType.GOV_STATUTE_FREE_USE``; a falsy or
    missing entry yields ``LicenseType.UNKNOWN_REQUIRES_REVIEW``.
    """
    permitted = TRAINING_PERMISSIONS.get(bundesland.lower(), False)
    return (
        LicenseType.GOV_STATUTE_FREE_USE
        if permitted
        else LicenseType.UNKNOWN_REQUIRES_REVIEW
    )
|
||||
Reference in New Issue
Block a user