Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
341 lines
10 KiB
Python
341 lines
10 KiB
Python
"""
|
|
Zeugnis Rights-Aware Crawler - Data Models
|
|
|
|
Pydantic models for API requests/responses and internal data structures.
|
|
Database schema is defined in metrics_db.py.
|
|
"""
|
|
|
|
import uuid
from datetime import datetime
from enum import Enum
from typing import Optional, List, Dict, Any

from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
# =============================================================================
|
|
# Enums
|
|
# =============================================================================
|
|
|
|
class LicenseType(str, Enum):
    """Licence categories used to decide whether a source may be trained on.

    Members are ``str`` subclasses, so they compare equal to their raw
    string value. Entries marked "no training" must not be used as
    training data.
    """

    # --- permissive licences -------------------------------------------
    PUBLIC_DOMAIN = "public_domain"  # official works (§5 UrhG)
    CC_BY = "cc_by"  # Creative Commons Attribution
    CC_BY_SA = "cc_by_sa"  # Creative Commons Attribution-ShareAlike

    # --- non-commercial licences: no training --------------------------
    CC_BY_NC = "cc_by_nc"  # Creative Commons NonCommercial
    CC_BY_NC_SA = "cc_by_nc_sa"  # Creative Commons NonCommercial-ShareAlike

    # --- government statutes (public domain) ---------------------------
    GOV_STATUTE_FREE_USE = "gov_statute"

    # --- standard copyright: no training -------------------------------
    ALL_RIGHTS_RESERVED = "all_rights"

    # --- licence not determined yet; needs manual review ---------------
    UNKNOWN_REQUIRES_REVIEW = "unknown"
|
|
|
|
|
|
class CrawlStatus(str, Enum):
    """Lifecycle states shared by crawl jobs and seed URLs.

    Members are ``str`` subclasses and compare equal to their raw value.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
|
|
|
|
|
|
class DocType(str, Enum):
    """Categories of zeugnis documents.

    The values are the lower-cased German terms; the comments give the
    English meaning.
    """

    VERORDNUNG = "verordnung"  # official regulation
    HANDREICHUNG = "handreichung"  # implementation guide
    FORMULAR = "formular"  # form template
    ERLASS = "erlass"  # decree
    SCHULORDNUNG = "schulordnung"  # school regulations
    SONSTIGES = "sonstiges"  # anything else
|
|
|
|
|
|
class EventType(str, Enum):
    """Kinds of events recorded in the audit trail.

    Members are ``str`` subclasses and compare equal to their raw value.
    """

    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
|
|
|
|
|
|
# =============================================================================
|
|
# Bundesland Definitions
|
|
# =============================================================================
|
|
|
|
# Lookup table: lower-case Bundesland code -> {"name": full name, "short": code}.
# Every short form is the upper-cased key, so it is derived rather than
# repeated; the insertion order of the pairs below is preserved.
BUNDESLAENDER = {
    code: {"name": full_name, "short": code.upper()}
    for code, full_name in (
        ("bw", "Baden-Württemberg"),
        ("by", "Bayern"),
        ("be", "Berlin"),
        ("bb", "Brandenburg"),
        ("hb", "Bremen"),
        ("hh", "Hamburg"),
        ("he", "Hessen"),
        ("mv", "Mecklenburg-Vorpommern"),
        ("ni", "Niedersachsen"),
        ("nw", "Nordrhein-Westfalen"),
        ("rp", "Rheinland-Pfalz"),
        ("sl", "Saarland"),
        ("sn", "Sachsen"),
        ("st", "Sachsen-Anhalt"),
        ("sh", "Schleswig-Holstein"),
        ("th", "Thüringen"),
    )
}
|
|
|
|
|
|
# Per-Bundesland training permission, keyed by lower-case Bundesland code.
# The values come from an analysis of a Word document. Codes whose licence
# situation is only "restricted" are deliberately set to False for safety.
TRAINING_PERMISSIONS = {
    "bw": True,   # official work
    "by": True,   # official work
    "be": False,  # no license
    "bb": False,  # no license
    "hb": False,  # restricted -> False for safety
    "hh": False,  # no license
    "he": True,   # official work
    "mv": False,  # restricted -> False for safety
    "ni": True,   # official work
    "nw": True,   # official work
    "rp": True,   # official work
    "sl": False,  # no license
    "sn": True,   # official work
    "st": False,  # restricted -> False for safety
    "sh": True,   # official work
    "th": True,   # official work
}
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Sources
|
|
# =============================================================================
|
|
|
|
class ZeugnisSourceBase(BaseModel):
    """Fields common to every representation of a zeugnis source.

    ``bundesland``, ``name`` and ``license_type`` are required;
    ``base_url`` defaults to ``None`` and ``training_allowed`` to ``False``.
    """

    bundesland: str = Field(
        default=...,
        description="Bundesland code (e.g., 'ni', 'by')",
    )
    name: str = Field(
        default=...,
        description="Full name of the source",
    )
    base_url: Optional[str] = Field(
        default=None,
        description="Base URL for the source",
    )
    license_type: LicenseType = Field(
        default=...,
        description="License classification",
    )
    # Training is opt-in: unless stated otherwise it is not permitted.
    training_allowed: bool = Field(
        default=False,
        description="Whether AI training is permitted",
    )
|
|
|
|
|
|
class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Payload for creating a source.

    Adds nothing to ``ZeugnisSourceBase``; it exists as a distinct type
    for the create operation.
    """
|
|
|
|
|
|
class ZeugnisSource(ZeugnisSourceBase):
    """Complete source record: the base fields plus identity and bookkeeping.

    ``id``, ``created_at`` and ``updated_at`` are required. The two
    verification fields default to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes`` lets ``model_validate``
    # read field values from object attributes (e.g. ORM/row objects).
    # This replaces the nested ``class Config``, which Pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: str

    # Who verified the source and when; both are None when not recorded.
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None

    created_at: datetime
    updated_at: datetime
|
|
|
|
|
|
class ZeugnisSourceVerify(BaseModel):
    """Payload submitted when a source's licence is verified.

    Everything except ``notes`` is required.
    """

    verified_by: str = Field(default=..., description="User ID who verified")

    # Licence classification and training decision stated by the verifier.
    license_type: LicenseType
    training_allowed: bool

    # Optional free-text remarks.
    notes: Optional[str] = None
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Seed URLs
|
|
# =============================================================================
|
|
|
|
class SeedUrlBase(BaseModel):
    """Fields common to every representation of a seed URL.

    ``url`` is required; ``doc_type`` falls back to ``DocType.VERORDNUNG``.
    """

    url: str = Field(
        default=...,
        description="URL to crawl",
    )
    doc_type: DocType = Field(
        default=DocType.VERORDNUNG,
        description="Type of document",
    )
|
|
|
|
|
|
class SeedUrlCreate(SeedUrlBase):
    """Payload for creating a seed URL; adds the required ``source_id``."""

    # Identifier of the source the new seed URL is attached to.
    source_id: str
|
|
|
|
|
|
class SeedUrl(SeedUrlBase):
    """Complete seed URL record: the base fields plus identity and crawl state.

    ``id``, ``source_id`` and ``created_at`` are required. ``status``
    starts as ``CrawlStatus.PENDING`` and the remaining fields default
    to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes`` lets ``model_validate``
    # read field values from object attributes (e.g. ORM/row objects).
    # This replaces the nested ``class Config``, which Pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: str
    source_id: str

    # Crawl state of this URL.
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None
    error_message: Optional[str] = None

    created_at: datetime
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Documents
|
|
# =============================================================================
|
|
|
|
class ZeugnisDocumentBase(BaseModel):
    """Fields common to every representation of a zeugnis document.

    Only ``url`` is required; the descriptive fields default to ``None``.
    """

    url: str

    # Optional descriptive metadata.
    title: Optional[str] = None
    content_type: Optional[str] = None
    file_size: Optional[int] = None  # presumably in bytes - TODO confirm
|
|
|
|
|
|
class ZeugnisDocument(ZeugnisDocumentBase):
    """Complete document record: the base fields plus storage and rights state.

    ``id``, ``seed_url_id``, ``created_at`` and ``updated_at`` are
    required. Both boolean flags default to ``False``; the remaining
    fields default to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes`` lets ``model_validate``
    # read field values from object attributes (e.g. ORM/row objects).
    # This replaces the nested ``class Config``, which Pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: str
    seed_url_id: str

    # Storage details; the names suggest a content digest and a MinIO
    # object path - TODO confirm against the crawler code.
    content_hash: Optional[str] = None
    minio_path: Optional[str] = None

    # Both flags are False until explicitly set.
    training_allowed: bool = False
    indexed_in_qdrant: bool = False

    # Optional information about the originating source.
    bundesland: Optional[str] = None
    source_name: Optional[str] = None

    created_at: datetime
    updated_at: datetime
|
|
|
|
|
|
class ZeugnisDocumentVersion(BaseModel):
    """One entry in a document's version history.

    ``id``, ``document_id``, ``version``, ``content_hash`` and
    ``created_at`` are required; the other fields default to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes`` lets ``model_validate``
    # read field values from object attributes (e.g. ORM/row objects).
    # This replaces the nested ``class Config``, which Pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: str
    document_id: str
    version: int

    # Unlike ``ZeugnisDocument.content_hash`` this one is required.
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None

    created_at: datetime
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Crawler
|
|
# =============================================================================
|
|
|
|
class CrawlerStatus(BaseModel):
    """Snapshot of what the crawler is currently doing.

    Every field has a default, so an instance built without arguments
    describes a crawler that is not running, with zeroed counters.
    """

    is_running: bool = False

    # What is being processed right now, if anything.
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None

    queue_length: int = 0

    # Counters labelled "today" by their names.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0

    last_activity: Optional[datetime] = None
|
|
|
|
|
|
class CrawlQueueItem(BaseModel):
    """One entry in the crawl queue.

    The identifying fields and ``created_at`` are required. ``priority``
    defaults to 5, ``status`` to ``CrawlStatus.PENDING``, the counters
    to 0 and the timestamps to ``None``.
    """

    # Identification.
    id: str
    source_id: str
    bundesland: str
    source_name: str

    # Scheduling.
    priority: int = 5
    status: CrawlStatus = CrawlStatus.PENDING

    # Timing.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Progress counters.
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0

    created_at: datetime
|
|
|
|
|
|
class CrawlRequest(BaseModel):
    """Parameters for starting a crawl.

    Both selectors are optional. ``priority`` is validated to lie in
    the range 1-10 and defaults to 5.
    """

    bundesland: Optional[str] = Field(
        default=None,
        description="Specific Bundesland to crawl",
    )
    source_id: Optional[str] = Field(
        default=None,
        description="Specific source ID to crawl",
    )
    priority: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Priority (1=lowest, 10=highest)",
    )
|
|
|
|
|
|
class CrawlResult(BaseModel):
    """Outcome of one crawl operation; every field is required."""

    # What was crawled.
    source_id: str
    bundesland: str

    # Document tallies.
    documents_found: int
    documents_indexed: int
    documents_skipped: int

    # Error messages collected during the run.
    errors: List[str]

    duration_seconds: float
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Statistics
|
|
# =============================================================================
|
|
|
|
class ZeugnisStats(BaseModel):
    """Aggregate figures for the zeugnis crawler.

    All counters default to 0 and ``per_bundesland`` to an empty list.
    """

    # Overall totals.
    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0

    # Per-Bundesland breakdown as plain dictionaries.
    # A literal list default is safe here: Pydantic copies mutable
    # defaults for each instance.
    per_bundesland: List[Dict[str, Any]] = []
|
|
|
|
|
|
class BundeslandStats(BaseModel):
    """Figures for a single Bundesland.

    Everything except ``last_crawled`` is required.
    """

    # Which Bundesland the figures belong to.
    bundesland: str
    name: str

    training_allowed: bool

    # Document tallies.
    document_count: int
    indexed_count: int

    last_crawled: Optional[datetime] = None
|
|
|
|
|
|
# =============================================================================
|
|
# API Models - Audit
|
|
# =============================================================================
|
|
|
|
class UsageEvent(BaseModel):
    """A single entry in the audit trail.

    ``id``, ``document_id``, ``event_type`` and ``created_at`` are
    required; ``user_id`` and ``details`` default to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes`` lets ``model_validate``
    # read field values from object attributes (e.g. ORM/row objects).
    # This replaces the nested ``class Config``, which Pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: str
    document_id: str
    event_type: EventType

    # Optional user reference and free-form extra data.
    user_id: Optional[str] = None
    details: Optional[Dict[str, Any]] = None

    created_at: datetime
|
|
|
|
|
|
class AuditExport(BaseModel):
    """Audit export bundle intended for GDPR-compliant reporting.

    Every field is required.
    """

    # Who requested the export and when it was produced.
    requested_by: str
    export_date: datetime

    # Period the export refers to.
    date_range_start: datetime
    date_range_end: datetime

    # Exported usage events and the accompanying document tally.
    events: List[UsageEvent]
    document_count: int
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
def generate_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
|
|
|
|
|
|
def get_training_allowed(bundesland: Optional[str]) -> bool:
    """Return whether training is permitted for a Bundesland code.

    The lookup is case-insensitive and fails closed: an unknown code,
    ``None`` or any non-string value yields ``False``.

    Args:
        bundesland: Bundesland code such as ``"ni"`` or ``"BY"``.

    Returns:
        The entry from ``TRAINING_PERMISSIONS``, or ``False`` when there
        is no usable code.
    """
    # Models in this module declare ``bundesland`` as Optional, so None
    # can reach this helper. Deny instead of raising AttributeError.
    if not isinstance(bundesland, str):
        return False
    return TRAINING_PERMISSIONS.get(bundesland.lower(), False)
|
|
|
|
|
|
def get_bundesland_name(code: str) -> str:
    """Translate a Bundesland code into its full name.

    The lookup is case-insensitive. When the code is unknown, or its
    entry has no ``"name"``, the code is returned unchanged.
    """
    key = code.lower()
    if key not in BUNDESLAENDER:
        return code
    return BUNDESLAENDER[key].get("name", code)
|
|
|
|
|
|
def get_license_for_bundesland(bundesland: Optional[str]) -> LicenseType:
    """Return the licence classification to assume for a Bundesland.

    A Bundesland whose training permission is granted maps to
    ``LicenseType.GOV_STATUTE_FREE_USE``. Everything else, including an
    unknown code, ``None`` or a non-string value, maps to
    ``LicenseType.UNKNOWN_REQUIRES_REVIEW``.

    Args:
        bundesland: Bundesland code such as ``"ni"`` or ``"BY"``.

    Returns:
        The licence type derived from the training permission.
    """
    # Fail closed: with no usable code the licence has to be reviewed
    # manually rather than the call raising AttributeError.
    if not isinstance(bundesland, str):
        return LicenseType.UNKNOWN_REQUIRES_REVIEW

    # Reuse the permission helper so both functions stay in agreement.
    if get_training_allowed(bundesland):
        return LicenseType.GOV_STATUTE_FREE_USE
    return LicenseType.UNKNOWN_REQUIRES_REVIEW
|