# breakpilot-lehrer/klausur-service/backend/zeugnis_models.py

"""
Zeugnis Rights-Aware Crawler - Data Models

Pydantic models for API requests/responses and internal data structures.
Database schema is defined in metrics_db.py.
"""

import uuid
from datetime import datetime
from enum import Enum
from typing import Optional, List, Dict, Any

from pydantic import BaseModel, ConfigDict, Field


# =============================================================================
# Enums
# =============================================================================

class LicenseType(str, Enum):
    """License classification for training permission.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (e.g. ``LicenseType.CC_BY == "cc_by"``). Members marked
    "NO TRAINING" below identify licenses that do not permit training use.
    """
    PUBLIC_DOMAIN = "public_domain"           # Official works ("Amtliche Werke", §5 UrhG)
    CC_BY = "cc_by"                           # Creative Commons Attribution
    CC_BY_SA = "cc_by_sa"                     # CC Attribution-ShareAlike
    CC_BY_NC = "cc_by_nc"                     # CC NonCommercial - NO TRAINING
    CC_BY_NC_SA = "cc_by_nc_sa"               # CC NonCommercial-ShareAlike - NO TRAINING
    GOV_STATUTE_FREE_USE = "gov_statute"      # Government statutes (public domain, "gemeinfrei")
    ALL_RIGHTS_RESERVED = "all_rights"        # Standard copyright - NO TRAINING
    UNKNOWN_REQUIRES_REVIEW = "unknown"       # Needs manual review


class CrawlStatus(str, Enum):
    """Status of a crawl job or seed URL.

    Used as the type of the ``status`` field on ``SeedUrl`` and
    ``CrawlQueueItem``, both of which default to ``PENDING``. Members also
    subclass ``str`` and compare equal to their raw string value.
    """
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"


class DocType(str, Enum):
    """Type of zeugnis document.

    Member names are the German document categories; the trailing comments
    give the English meaning. ``SeedUrlBase.doc_type`` defaults to
    ``VERORDNUNG``.
    """
    VERORDNUNG = "verordnung"           # Official regulation
    HANDREICHUNG = "handreichung"       # Implementation guide
    FORMULAR = "formular"               # Form template
    ERLASS = "erlass"                   # Decree
    SCHULORDNUNG = "schulordnung"       # School regulations
    SONSTIGES = "sonstiges"             # Other


class EventType(str, Enum):
    """Audit event types.

    Used as the type of ``UsageEvent.event_type``. Members also subclass
    ``str`` and compare equal to their raw string value.
    """
    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"


# =============================================================================
# Bundesland Definitions
# =============================================================================

# Lookup table: lowercase two-letter Bundesland code -> {"name", "short"}.
# "short" is always the uppercase form of the code, so it is derived from the
# key instead of being spelled out for every entry.
BUNDESLAENDER = {
    code: {"name": full_name, "short": code.upper()}
    for code, full_name in (
        ("bw", "Baden-Württemberg"),
        ("by", "Bayern"),
        ("be", "Berlin"),
        ("bb", "Brandenburg"),
        ("hb", "Bremen"),
        ("hh", "Hamburg"),
        ("he", "Hessen"),
        ("mv", "Mecklenburg-Vorpommern"),
        ("ni", "Niedersachsen"),
        ("nw", "Nordrhein-Westfalen"),
        ("rp", "Rheinland-Pfalz"),
        ("sl", "Saarland"),
        ("sn", "Sachsen"),
        ("st", "Sachsen-Anhalt"),
        ("sh", "Schleswig-Holstein"),
        ("th", "Thüringen"),
    )
}


# Training permission per Bundesland code, based on Word document analysis.
# The keys are the same 16 codes used in BUNDESLAENDER. Entries whose status
# is only "restricted" are deliberately recorded as False.
TRAINING_PERMISSIONS = {
    "bw": True,   # Official work (Amtliches Werk)
    "by": True,   # Official work (Amtliches Werk)
    "be": False,  # No license
    "bb": False,  # No license
    "hb": False,  # Restricted -> False for safety
    "hh": False,  # No license
    "he": True,   # Official work (Amtliches Werk)
    "mv": False,  # Restricted -> False for safety
    "ni": True,   # Official work (Amtliches Werk)
    "nw": True,   # Official work (Amtliches Werk)
    "rp": True,   # Official work (Amtliches Werk)
    "sl": False,  # No license
    "sn": True,   # Official work (Amtliches Werk)
    "st": False,  # Restricted -> False for safety
    "sh": True,   # Official work (Amtliches Werk)
    "th": True,   # Official work (Amtliches Werk)
}


# =============================================================================
# API Models - Sources
# =============================================================================

class ZeugnisSourceBase(BaseModel):
    """Fields shared by the zeugnis source models.

    ``bundesland``, ``name`` and ``license_type`` are required. ``base_url``
    defaults to ``None`` and ``training_allowed`` defaults to ``False``.
    """

    bundesland: str = Field(
        default=...,
        description="Bundesland code (e.g., 'ni', 'by')",
    )
    name: str = Field(
        default=...,
        description="Full name of the source",
    )
    base_url: Optional[str] = Field(
        default=None,
        description="Base URL for the source",
    )
    license_type: LicenseType = Field(
        default=...,
        description="License classification",
    )
    training_allowed: bool = Field(
        default=False,
        description="Whether AI training is permitted",
    )


class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Model for creating a new source.

    Inherits every field from ``ZeugnisSourceBase`` and adds none of its own.
    """


class ZeugnisSource(ZeugnisSourceBase):
    """Full source model with all fields.

    Extends ``ZeugnisSourceBase`` with the record id, verification details
    and timestamps.

    Attributes:
        id: Identifier of the source (required).
        verified_by: Defaults to ``None``; presumably the user ID of the
            verifier, as in ``ZeugnisSourceVerify`` -- confirm.
        verified_at: Defaults to ``None``.
        created_at: Creation timestamp (required).
        updated_at: Last-update timestamp (required).
    """

    # Pydantic v2 configuration. ``from_attributes=True`` allows validating
    # the model from an object's attributes (e.g. a database row object)
    # rather than only from a dict. This replaces the class-based ``Config``,
    # which is deprecated in Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: str
    verified_by: Optional[str] = None
    verified_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime


class ZeugnisSourceVerify(BaseModel):
    """Model for verifying a source's license.

    ``verified_by``, ``license_type`` and ``training_allowed`` are required;
    ``notes`` is optional and defaults to ``None``.
    """

    verified_by: str = Field(
        default=...,
        description="User ID who verified",
    )
    license_type: LicenseType
    training_allowed: bool
    notes: Optional[str] = None


# =============================================================================
# API Models - Seed URLs
# =============================================================================

class SeedUrlBase(BaseModel):
    """Fields shared by the seed URL models.

    ``url`` is required; ``doc_type`` defaults to ``DocType.VERORDNUNG``.
    """

    url: str = Field(
        default=...,
        description="URL to crawl",
    )
    doc_type: DocType = Field(
        default=DocType.VERORDNUNG,
        description="Type of document",
    )


class SeedUrlCreate(SeedUrlBase):
    """Model for creating a new seed URL.

    Adds the required ``source_id`` to the fields inherited from
    ``SeedUrlBase`` (``url`` and ``doc_type``).
    """
    # presumably the id of the ZeugnisSource this URL belongs to -- confirm
    source_id: str


class SeedUrl(SeedUrlBase):
    """Full seed URL model.

    Extends ``SeedUrlBase`` with identifiers, crawl state and a creation
    timestamp.

    Attributes:
        id: Identifier of the seed URL (required).
        source_id: Identifier of the owning source (required).
        status: Crawl status; defaults to ``CrawlStatus.PENDING``.
        last_crawled: Defaults to ``None``.
        error_message: Defaults to ``None``.
        created_at: Creation timestamp (required).
    """

    # Pydantic v2 configuration. ``from_attributes=True`` allows validating
    # the model from an object's attributes rather than only from a dict.
    # This replaces the class-based ``Config``, which is deprecated in
    # Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: str
    source_id: str
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None
    error_message: Optional[str] = None
    created_at: datetime


# =============================================================================
# API Models - Documents
# =============================================================================

class ZeugnisDocumentBase(BaseModel):
    """Base model for zeugnis document.

    Only ``url`` is required; the remaining fields default to ``None``.
    """
    title: Optional[str] = None
    url: str
    content_type: Optional[str] = None  # presumably a MIME type -- confirm
    file_size: Optional[int] = None     # presumably in bytes -- confirm


class ZeugnisDocument(ZeugnisDocumentBase):
    """Full document model.

    Extends ``ZeugnisDocumentBase`` with identifiers, storage and indexing
    state, and timestamps.

    Attributes:
        id: Identifier of the document (required).
        seed_url_id: Identifier of the seed URL (required).
        content_hash: Defaults to ``None``.
        minio_path: Defaults to ``None``; presumably the object path in
            MinIO storage -- confirm.
        training_allowed: Defaults to ``False``.
        indexed_in_qdrant: Defaults to ``False``; presumably whether the
            document has been indexed in Qdrant -- confirm.
        bundesland: Defaults to ``None``.
        source_name: Defaults to ``None``.
        created_at: Creation timestamp (required).
        updated_at: Last-update timestamp (required).
    """

    # Pydantic v2 configuration. ``from_attributes=True`` allows validating
    # the model from an object's attributes rather than only from a dict.
    # This replaces the class-based ``Config``, which is deprecated in
    # Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: str
    seed_url_id: str
    content_hash: Optional[str] = None
    minio_path: Optional[str] = None
    training_allowed: bool = False
    indexed_in_qdrant: bool = False
    bundesland: Optional[str] = None
    source_name: Optional[str] = None
    created_at: datetime
    updated_at: datetime


class ZeugnisDocumentVersion(BaseModel):
    """Document version for history tracking.

    Attributes:
        id: Identifier of this version record (required).
        document_id: Identifier of the document (required).
        version: Integer version number (required).
        content_hash: Hash of the content (required).
        minio_path: Defaults to ``None``; presumably the object path in
            MinIO storage -- confirm.
        change_summary: Defaults to ``None``.
        created_at: Creation timestamp (required).
    """

    # Pydantic v2 configuration. ``from_attributes=True`` allows validating
    # the model from an object's attributes rather than only from a dict.
    # This replaces the class-based ``Config``, which is deprecated in
    # Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: str
    document_id: str
    version: int
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None
    created_at: datetime


# =============================================================================
# API Models - Crawler
# =============================================================================

class CrawlerStatus(BaseModel):
    """Current status of the crawler.

    Every field has a default, so the model can be created with no
    arguments: ``is_running`` defaults to ``False``, the counters to ``0``
    and the optional fields to ``None``.
    """
    is_running: bool = False
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None  # presumably a Bundesland code -- confirm
    queue_length: int = 0
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    last_activity: Optional[datetime] = None
    errors_today: int = 0


class CrawlQueueItem(BaseModel):
    """Item in the crawl queue.

    ``id``, ``source_id``, ``bundesland``, ``source_name`` and ``created_at``
    are required. ``status`` defaults to ``CrawlStatus.PENDING``, the
    counters to ``0`` and the start/completion timestamps to ``None``.
    """
    id: str
    source_id: str
    bundesland: str
    source_name: str
    # NOTE(review): defaults to 5 like CrawlRequest.priority, but no 1..10
    # bound is declared on this field.
    priority: int = 5
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0
    created_at: datetime


class CrawlRequest(BaseModel):
    """Request to start a crawl.

    ``bundesland`` and ``source_id`` are optional and default to ``None``.
    ``priority`` defaults to 5 and is constrained to the range 1..10.
    """

    bundesland: Optional[str] = Field(
        default=None,
        description="Specific Bundesland to crawl",
    )
    source_id: Optional[str] = Field(
        default=None,
        description="Specific source ID to crawl",
    )
    priority: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Priority (1=lowest, 10=highest)",
    )


class CrawlResult(BaseModel):
    """Result of a crawl operation.

    All fields are required; none declares a default.
    """
    source_id: str
    bundesland: str
    documents_found: int
    documents_indexed: int
    documents_skipped: int
    errors: List[str]        # error messages as plain strings
    duration_seconds: float


# =============================================================================
# API Models - Statistics
# =============================================================================

class ZeugnisStats(BaseModel):
    """Statistics for the zeugnis crawler.

    Every field has a default: the counters default to ``0`` and
    ``per_bundesland`` to an empty list.
    """
    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0
    # NOTE(review): untyped dicts; entries presumably mirror the fields of
    # BundeslandStats -- confirm against the code that fills this list.
    per_bundesland: List[Dict[str, Any]] = []


class BundeslandStats(BaseModel):
    """Statistics per Bundesland.

    All fields are required except ``last_crawled``, which defaults to
    ``None``.
    """
    bundesland: str  # presumably the Bundesland code -- confirm
    name: str        # presumably the full Bundesland name -- confirm
    training_allowed: bool
    document_count: int
    indexed_count: int
    last_crawled: Optional[datetime] = None


# =============================================================================
# API Models - Audit
# =============================================================================

class UsageEvent(BaseModel):
    """Usage event for audit trail.

    Attributes:
        id: Identifier of the event (required).
        document_id: Identifier of the document (required).
        event_type: One of the ``EventType`` members (required).
        user_id: Defaults to ``None``.
        details: Free-form dictionary; defaults to ``None``.
        created_at: Timestamp of the event record (required).
    """

    # Pydantic v2 configuration. ``from_attributes=True`` allows validating
    # the model from an object's attributes rather than only from a dict.
    # This replaces the class-based ``Config``, which is deprecated in
    # Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: str
    document_id: str
    event_type: EventType
    user_id: Optional[str] = None
    details: Optional[Dict[str, Any]] = None
    created_at: datetime


class AuditExport(BaseModel):
    """GDPR-compliant audit export.

    Bundles a list of ``UsageEvent`` records with export metadata. All
    fields are required.
    """
    export_date: datetime
    requested_by: str
    events: List[UsageEvent]
    document_count: int
    date_range_start: datetime
    date_range_end: datetime


# =============================================================================
# Helper Functions
# =============================================================================

def generate_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)


def get_training_allowed(bundesland: Optional[str]) -> bool:
    """Get training permission for a Bundesland.

    Args:
        bundesland: Bundesland code; matched case-insensitively against
            ``TRAINING_PERMISSIONS``. May be ``None``.

    Returns:
        ``True`` only when the code is listed as permitted. Unknown codes,
        ``None`` and non-string values all yield ``False``.
    """
    # Fail closed: several models in this module carry an optional
    # bundesland, and a missing value must deny training, not raise.
    if not isinstance(bundesland, str):
        return False
    return TRAINING_PERMISSIONS.get(bundesland.lower(), False)


def get_bundesland_name(code: str) -> str:
    """Return the full Bundesland name for a code.

    The lookup is case-insensitive. When the code is not in
    ``BUNDESLAENDER``, the code is returned unchanged.
    """
    key = code.lower()
    try:
        return BUNDESLAENDER[key]["name"]
    except KeyError:
        # Unknown code (or an entry without a "name"): echo the input back.
        return code


def get_license_for_bundesland(bundesland: Optional[str]) -> LicenseType:
    """Get appropriate license type for a Bundesland.

    Args:
        bundesland: Bundesland code; matched case-insensitively against
            ``TRAINING_PERMISSIONS``. May be ``None``.

    Returns:
        ``LicenseType.GOV_STATUTE_FREE_USE`` when training is permitted for
        the code, otherwise ``LicenseType.UNKNOWN_REQUIRES_REVIEW``. Unknown
        codes, ``None`` and non-string values fall into the latter case.
    """
    # Fail closed on a missing or non-string code instead of raising on
    # ``.lower()``: anything unidentifiable must go to manual review.
    if not isinstance(bundesland, str):
        return LicenseType.UNKNOWN_REQUIRES_REVIEW
    if TRAINING_PERMISSIONS.get(bundesland.lower(), False):
        return LicenseType.GOV_STATUTE_FREE_USE
    return LicenseType.UNKNOWN_REQUIRES_REVIEW