""" Zeugnis Rights-Aware Crawler - Data Models Pydantic models for API requests/responses and internal data structures. Database schema is defined in metrics_db.py. """ from datetime import datetime from enum import Enum from typing import Optional, List, Dict, Any from pydantic import BaseModel, Field import uuid # ============================================================================= # Enums # ============================================================================= class LicenseType(str, Enum): """License classification for training permission.""" PUBLIC_DOMAIN = "public_domain" # Amtliche Werke (§5 UrhG) CC_BY = "cc_by" # Creative Commons Attribution CC_BY_SA = "cc_by_sa" # CC Attribution-ShareAlike CC_BY_NC = "cc_by_nc" # CC NonCommercial - NO TRAINING CC_BY_NC_SA = "cc_by_nc_sa" # CC NC-SA - NO TRAINING GOV_STATUTE_FREE_USE = "gov_statute" # Government statutes (gemeinfrei) ALL_RIGHTS_RESERVED = "all_rights" # Standard copyright - NO TRAINING UNKNOWN_REQUIRES_REVIEW = "unknown" # Needs manual review class CrawlStatus(str, Enum): """Status of a crawl job or seed URL.""" PENDING = "pending" RUNNING = "running" COMPLETED = "completed" FAILED = "failed" PAUSED = "paused" class DocType(str, Enum): """Type of zeugnis document.""" VERORDNUNG = "verordnung" # Official regulation HANDREICHUNG = "handreichung" # Implementation guide FORMULAR = "formular" # Form template ERLASS = "erlass" # Decree SCHULORDNUNG = "schulordnung" # School regulations SONSTIGES = "sonstiges" # Other class EventType(str, Enum): """Audit event types.""" CRAWLED = "crawled" INDEXED = "indexed" DOWNLOADED = "downloaded" VIEWED = "viewed" EXPORTED = "exported" TRAINED_ON = "trained_on" DELETED = "deleted" # ============================================================================= # Bundesland Definitions # ============================================================================= BUNDESLAENDER = { "bw": {"name": "Baden-Württemberg", "short": "BW"}, "by": {"name": "Bayern", 
"short": "BY"}, "be": {"name": "Berlin", "short": "BE"}, "bb": {"name": "Brandenburg", "short": "BB"}, "hb": {"name": "Bremen", "short": "HB"}, "hh": {"name": "Hamburg", "short": "HH"}, "he": {"name": "Hessen", "short": "HE"}, "mv": {"name": "Mecklenburg-Vorpommern", "short": "MV"}, "ni": {"name": "Niedersachsen", "short": "NI"}, "nw": {"name": "Nordrhein-Westfalen", "short": "NW"}, "rp": {"name": "Rheinland-Pfalz", "short": "RP"}, "sl": {"name": "Saarland", "short": "SL"}, "sn": {"name": "Sachsen", "short": "SN"}, "st": {"name": "Sachsen-Anhalt", "short": "ST"}, "sh": {"name": "Schleswig-Holstein", "short": "SH"}, "th": {"name": "Thüringen", "short": "TH"}, } # Training permission based on Word document analysis TRAINING_PERMISSIONS = { "bw": True, # Amtliches Werk "by": True, # Amtliches Werk "be": False, # Keine Lizenz "bb": False, # Keine Lizenz "hb": False, # Eingeschränkt -> False for safety "hh": False, # Keine Lizenz "he": True, # Amtliches Werk "mv": False, # Eingeschränkt -> False for safety "ni": True, # Amtliches Werk "nw": True, # Amtliches Werk "rp": True, # Amtliches Werk "sl": False, # Keine Lizenz "sn": True, # Amtliches Werk "st": False, # Eingeschränkt -> False for safety "sh": True, # Amtliches Werk "th": True, # Amtliches Werk } # ============================================================================= # API Models - Sources # ============================================================================= class ZeugnisSourceBase(BaseModel): """Base model for zeugnis source.""" bundesland: str = Field(..., description="Bundesland code (e.g., 'ni', 'by')") name: str = Field(..., description="Full name of the source") base_url: Optional[str] = Field(None, description="Base URL for the source") license_type: LicenseType = Field(..., description="License classification") training_allowed: bool = Field(False, description="Whether AI training is permitted") class ZeugnisSourceCreate(ZeugnisSourceBase): """Model for creating a new source.""" pass 
class ZeugnisSource(ZeugnisSourceBase):
    """Full source model: the base fields plus identity, verification and timestamps."""

    id: str
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Pydantic setting; presumably lets the model be built from
        # attribute-bearing objects such as DB rows - verify against callers.
        from_attributes = True


class ZeugnisSourceVerify(BaseModel):
    """Payload for verifying a source's license."""

    verified_by: str = Field(..., description="User ID who verified")
    license_type: LicenseType
    training_allowed: bool
    notes: Optional[str] = None


# =============================================================================
# API Models - Seed URLs
# =============================================================================

class SeedUrlBase(BaseModel):
    """Fields shared by every seed URL model."""

    url: str = Field(..., description="URL to crawl")
    doc_type: DocType = Field(DocType.VERORDNUNG, description="Type of document")


class SeedUrlCreate(SeedUrlBase):
    """Payload for creating a new seed URL under an existing source."""

    source_id: str


class SeedUrl(SeedUrlBase):
    """Full seed URL model including crawl state."""

    id: str
    source_id: str
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None
    error_message: Optional[str] = None
    created_at: datetime

    class Config:
        from_attributes = True


# =============================================================================
# API Models - Documents
# =============================================================================

class ZeugnisDocumentBase(BaseModel):
    """Fields shared by every zeugnis document model."""

    title: Optional[str] = None
    url: str
    content_type: Optional[str] = None
    file_size: Optional[int] = None


class ZeugnisDocument(ZeugnisDocumentBase):
    """Full document model including storage, indexing and rights flags."""

    id: str
    seed_url_id: str
    content_hash: Optional[str] = None
    minio_path: Optional[str] = None
    # Both flags default to False, so a document is neither trainable nor
    # marked as indexed unless explicitly set.
    training_allowed: bool = False
    indexed_in_qdrant: bool = False
    bundesland: Optional[str] = None
    source_name: Optional[str] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        from_attributes = True


class ZeugnisDocumentVersion(BaseModel):
    """Document version for history tracking."""

    id: str
    document_id: str
    version: int
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None
    created_at: datetime

    class Config:
        from_attributes = True


# =============================================================================
# API Models - Crawler
# =============================================================================

class CrawlerStatus(BaseModel):
    """Current status of the crawler; every field has an idle default."""

    is_running: bool = False
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None
    queue_length: int = 0
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    last_activity: Optional[datetime] = None
    errors_today: int = 0


class CrawlQueueItem(BaseModel):
    """Item in the crawl queue."""

    id: str
    source_id: str
    bundesland: str
    source_name: str
    priority: int = 5
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0
    created_at: datetime


class CrawlRequest(BaseModel):
    """Request to start a crawl, optionally narrowed to a Bundesland or source."""

    bundesland: Optional[str] = Field(None, description="Specific Bundesland to crawl")
    source_id: Optional[str] = Field(None, description="Specific source ID to crawl")
    # Validated to the inclusive range 1..10.
    priority: int = Field(5, ge=1, le=10, description="Priority (1=lowest, 10=highest)")


class CrawlResult(BaseModel):
    """Result of a crawl operation."""

    source_id: str
    bundesland: str
    documents_found: int
    documents_indexed: int
    documents_skipped: int
    errors: List[str]
    duration_seconds: float


# =============================================================================
# API Models - Statistics
# =============================================================================

class ZeugnisStats(BaseModel):
    """Aggregate statistics for the zeugnis crawler."""

    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0
    # default_factory builds a fresh list per instance instead of relying on
    # a shared mutable literal as the declared default.
    per_bundesland: List[Dict[str, Any]] = Field(default_factory=list)


class BundeslandStats(BaseModel):
    """Statistics for a single Bundesland."""

    bundesland: str
    name: str
    training_allowed: bool
    document_count: int
    indexed_count: int
    last_crawled: Optional[datetime] = None


# =============================================================================
# API Models - Audit
# =============================================================================

class UsageEvent(BaseModel):
    """Usage event for the audit trail."""

    id: str
    document_id: str
    event_type: EventType
    user_id: Optional[str] = None
    details: Optional[Dict[str, Any]] = None
    created_at: datetime

    class Config:
        from_attributes = True


class AuditExport(BaseModel):
    """Audit export bundle (intended for GDPR-compliant reporting)."""

    export_date: datetime
    requested_by: str
    events: List[UsageEvent]
    document_count: int
    date_range_start: datetime
    date_range_end: datetime


# =============================================================================
# Helper Functions
# =============================================================================

def generate_id() -> str:
    """Return a new random UUID (version 4) as a string."""
    return str(uuid.uuid4())


def get_training_allowed(bundesland: Optional[str]) -> bool:
    """Return whether training is permitted for a Bundesland.

    Args:
        bundesland: Bundesland code; matched case-insensitively.

    Returns:
        The entry from ``TRAINING_PERMISSIONS``, or ``False`` when the code is
        missing, empty or unknown.
    """
    # Several models declare ``bundesland`` as Optional, so a missing code is
    # answered with the restrictive default instead of an AttributeError.
    if not bundesland:
        return False
    return TRAINING_PERMISSIONS.get(bundesland.lower(), False)


def get_bundesland_name(code: Optional[str]) -> str:
    """Return the full Bundesland name for a code.

    Args:
        code: Bundesland code; matched case-insensitively.

    Returns:
        The full name from ``BUNDESLAENDER``; the code itself when it is not a
        known Bundesland; an empty string when no code is given.
    """
    if not code:
        return ""
    info = BUNDESLAENDER.get(code.lower(), {})
    return info.get("name", code)


def get_license_for_bundesland(bundesland: Optional[str]) -> LicenseType:
    """Return the default license type for a Bundesland.

    Args:
        bundesland: Bundesland code; matched case-insensitively.

    Returns:
        ``LicenseType.GOV_STATUTE_FREE_USE`` when training is permitted for
        the Bundesland, otherwise ``LicenseType.UNKNOWN_REQUIRES_REVIEW``.
    """
    # Reuse the permission helper so both functions agree on the lookup rules.
    if get_training_allowed(bundesland):
        return LicenseType.GOV_STATUTE_FREE_USE
    return LicenseType.UNKNOWN_REQUIRES_REVIEW