Restructure: Move 52 files into 7 domain packages
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s

korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 22:10:48 +02:00
parent 0504d22b8e
commit 165c493d1e
111 changed files with 11859 additions and 11609 deletions

View File

@@ -0,0 +1,6 @@
"""
zeugnis package — certificate crawler, models, storage.
Backward-compatible re-exports: consumers can still use
``from zeugnis_api import ...`` etc. via the shim files in backend/.
"""

View File

@@ -0,0 +1,19 @@
"""
Zeugnis Rights-Aware Crawler — barrel re-export.
All implementation split into:
zeugnis_api_sources — sources, seed URLs, initialization
zeugnis_api_docs — documents, crawler, statistics, audit
FastAPI router for managing zeugnis sources, documents, and crawler operations.
"""
from fastapi import APIRouter
from .api_sources import router as _src_router  # noqa: F401
from .api_docs import router as _doc_router  # noqa: F401

# Aggregate router exposed to main.py. Source-management routes are
# registered before document/crawler routes, matching the split modules.
router = APIRouter()
router.include_router(_src_router)
router.include_router(_doc_router)

View File

@@ -0,0 +1,321 @@
"""
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from .models import (
CrawlRequest, EventType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_license_for_bundesland,
)
from .crawler import (
start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
get_zeugnis_documents, get_zeugnis_stats,
log_zeugnis_event, get_pool,
)
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Documents Endpoints
# =============================================================================
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland, paginated."""
    return await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Fetch one document joined with its source metadata; logs a VIEWED audit event.

    Raises 503 without a DB pool, 404 when the id is unknown, 500 on query errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            record = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id
            )
            if record is None:
                raise HTTPException(status_code=404, detail="Document not found")
            # Record the access for the audit trail before returning.
            await log_zeugnis_event(document_id, EventType.VIEWED.value)
            return dict(record)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """List all stored versions of a document, newest version first."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            versions = await conn.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
        return [dict(v) for v in versions]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Crawler Control Endpoints
# =============================================================================
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report the crawler's current runtime state (flags, counters, queue size)."""
    status = get_crawler_status()
    return status
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler for the requested Bundesland/source; 409 if already running."""
    # NOTE(review): `background_tasks` is injected by FastAPI but unused here —
    # the crawler schedules its own asyncio task internally; confirm it can be dropped.
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop a running crawler; 409 when no crawl is in progress."""
    stopped = await stop_crawler()
    if not stopped:
        raise HTTPException(status_code=409, detail="Crawler not running")
    return {"success": True, "message": "Crawler stopped"}
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return queued crawl jobs joined with source info; empty list when no DB."""
    pool = await get_pool()
    if not pool:
        return []
    try:
        async with pool.acquire() as conn:
            items = await conn.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
        return [dict(item) for item in items]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a crawl job for an explicit source id or a Bundesland's source.

    Raises 503 without a DB, 400 when no source can be resolved, 500 otherwise.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    queue_id = generate_id()
    try:
        async with pool.acquire() as conn:
            target = request.source_id
            # Resolve the source from the Bundesland when no explicit id was given.
            if not target and request.bundesland:
                row = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                if row:
                    target = row["id"]
            if not target:
                raise HTTPException(status_code=400, detail="Source not found")
            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                queue_id, target, request.priority
            )
        return {"id": queue_id, "success": True}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Statistics Endpoints
# =============================================================================
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return aggregate zeugnis crawler statistics from the metrics store."""
    return await get_zeugnis_stats()
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return per-Bundesland document/index counts.

    Always yields one entry per Bundesland from the static tables; DB-derived
    counters are filled in best-effort and left at their defaults when the
    database is unavailable or a query fails (preserving the original
    silent-fallback behavior).

    Fix: the original acquired a new pool connection inside the 16-iteration
    loop; a single connection is now acquired once for all queries.
    """
    pool = await get_pool()
    # Start every entry from static defaults so the response always covers
    # all Bundeslaender even without a database.
    stats: List[dict] = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats
    try:
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        stat["bundesland"]
                    )
                    if row:
                        stat["document_count"] = row["doc_count"] or 0
                        stat["indexed_count"] = row["indexed_count"] or 0
                        stat["last_crawled"] = (
                            row["last_crawled"].isoformat() if row["last_crawled"] else None
                        )
                except Exception:
                    # Best-effort: keep static defaults for this Bundesland.
                    continue
    except Exception:
        # Connection acquisition failed — fall back to static defaults entirely.
        pass
    return stats
# =============================================================================
# Audit Endpoints
# =============================================================================
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return audit events from the last ``days``, optionally filtered.

    Filters on document id and/or event type may be combined; results are
    newest-first and capped at ``limit``. Returns an empty list when no DB
    pool is available; raises 500 on query errors.

    Fix (consistency): the document_id filter previously hard-coded the
    ``$2`` placeholder while event_type computed its index; both now use the
    computed ``${len(params) + 1}`` form — identical behavior, one convention.
    """
    pool = await get_pool()
    if not pool:
        return []
    try:
        since = datetime.now() - timedelta(days=days)
        query = """
            SELECT * FROM zeugnis_usage_events
            WHERE created_at >= $1
        """
        params: list = [since]
        # Placeholders are numbered dynamically so the filters compose in any
        # combination ($2, $3, ... in the order the params are appended).
        if document_id:
            query += f" AND document_id = ${len(params) + 1}"
            params.append(document_id)
        if event_type:
            query += f" AND event_type = ${len(params) + 1}"
            params.append(event_type)
        query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
        params.append(limit)
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export all usage events from the last ``days`` for GDPR compliance."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        since = datetime.now() - timedelta(days=days)
        async with pool.acquire() as conn:
            events = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )
            distinct_docs = await conn.fetchval(
                "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
                since
            )
        return {
            "export_date": datetime.now().isoformat(),
            "requested_by": requested_by,
            "events": [dict(e) for e in events],
            "document_count": distinct_docs or 0,
            "date_range_start": since.isoformat(),
            "date_range_end": datetime.now().isoformat(),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,232 @@
"""
Zeugnis API Sources — source and seed URL management endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from typing import Optional, List
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from .models import (
ZeugnisSourceCreate, ZeugnisSourceVerify,
SeedUrlCreate,
LicenseType, DocType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
)
from metrics_db import (
get_zeugnis_sources, upsert_zeugnis_source, get_pool,
)
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Sources Endpoints
# =============================================================================
@router.get("/sources", response_model=List[dict])
async def list_sources():
    """List all zeugnis sources; synthesize defaults when none are persisted."""
    sources = await get_zeugnis_sources()
    if sources:
        return sources
    # No persisted sources yet: build placeholder entries for every Bundesland.
    defaults = []
    for code, info in BUNDESLAENDER.items():
        defaults.append({
            "id": None,
            "bundesland": code,
            "name": info["name"],
            "base_url": None,
            "license_type": str(get_license_for_bundesland(code).value),
            "training_allowed": get_training_allowed(code),
            "verified_by": None,
            "verified_at": None,
            "created_at": None,
            "updated_at": None,
        })
    return defaults
@router.post("/sources", response_model=dict)
async def create_source(source: ZeugnisSourceCreate):
    """Create (or update, via upsert) a zeugnis source; 500 on storage failure."""
    new_id = generate_id()
    stored = await upsert_zeugnis_source(
        id=new_id,
        bundesland=source.bundesland,
        name=source.name,
        license_type=source.license_type.value,
        training_allowed=source.training_allowed,
        base_url=source.base_url,
    )
    if not stored:
        raise HTTPException(status_code=500, detail="Failed to create source")
    return {"id": new_id, "success": True}
@router.put("/sources/{source_id}/verify", response_model=dict)
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
    """Record a manual license verification for a source (stamps verified_at)."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE zeugnis_sources
                SET license_type = $2,
                    training_allowed = $3,
                    verified_by = $4,
                    verified_at = NOW(),
                    updated_at = NOW()
                WHERE id = $1
                """,
                source_id,
                verification.license_type.value,
                verification.training_allowed,
                verification.verified_by,
            )
        return {"success": True, "source_id": source_id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/sources/{bundesland}", response_model=dict)
async def get_source_by_bundesland(bundesland: str):
    """Get source details for a specific Bundesland.

    Returns the stored row (plus a document count) when the DB has one,
    otherwise a static default payload derived from the BUNDESLAENDER tables.

    Fix: the default payload dict was duplicated verbatim in two branches;
    it is now built by one local helper.
    """
    def _default_payload() -> dict:
        # Static fallback built from the module-level lookup tables (no DB row).
        return {
            "bundesland": bundesland,
            "name": get_bundesland_name(bundesland),
            "training_allowed": get_training_allowed(bundesland),
            "license_type": get_license_for_bundesland(bundesland).value,
            "document_count": 0,
        }

    pool = await get_pool()
    if not pool:
        # NOTE(review): only this no-DB path 404s on unknown codes; the DB path
        # below returns a default payload instead — confirm the asymmetry is wanted.
        if bundesland not in BUNDESLAENDER:
            raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
        return _default_payload()
    try:
        async with pool.acquire() as conn:
            source = await conn.fetchrow(
                "SELECT * FROM zeugnis_sources WHERE bundesland = $1",
                bundesland
            )
            if source:
                doc_count = await conn.fetchval(
                    """
                    SELECT COUNT(*) FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    WHERE u.source_id = $1
                    """,
                    source["id"]
                )
                return {**dict(source), "document_count": doc_count or 0}
        # No stored row: fall back to defaults.
        return _default_payload()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Seed URLs Endpoints
# =============================================================================
@router.get("/sources/{source_id}/urls", response_model=List[dict])
async def list_seed_urls(source_id: str):
    """List a source's seed URLs in creation order; empty list when no DB."""
    pool = await get_pool()
    if not pool:
        return []
    try:
        async with pool.acquire() as conn:
            urls = await conn.fetch(
                "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at",
                source_id
            )
        return [dict(u) for u in urls]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/sources/{source_id}/urls", response_model=dict)
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
    """Attach a new seed URL (status 'pending') to the given source."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    new_id = generate_id()
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                VALUES ($1, $2, $3, $4, 'pending')
                """,
                new_id, source_id, seed_url.url, seed_url.doc_type.value
            )
        return {"id": new_id, "success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.delete("/urls/{url_id}", response_model=dict)
async def delete_seed_url(url_id: str):
    """Delete a seed URL by id; reports success even when no row matched."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                "DELETE FROM zeugnis_seed_urls WHERE id = $1",
                url_id
            )
        return {"success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Initialization Endpoint
# =============================================================================
@router.post("/init", response_model=dict)
async def initialize_sources():
    """Seed the sources table with defaults for every Bundesland."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    created = 0
    try:
        for code, info in BUNDESLAENDER.items():
            # NOTE(review): a fresh id is generated on every call — assumes the
            # upsert deduplicates on bundesland rather than id; confirm in metrics_db.
            if await upsert_zeugnis_source(
                id=generate_id(),
                bundesland=code,
                name=info["name"],
                license_type=get_license_for_bundesland(code).value,
                training_allowed=get_training_allowed(code),
            ):
                created += 1
        return {"success": True, "sources_created": created}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,105 @@
"""
Zeugnis Crawler - Start/stop/status control functions.
"""
import asyncio
from typing import Optional, Dict, Any
from .worker import ZeugnisCrawler, get_crawler_state
_crawler_instance: Optional[ZeugnisCrawler] = None
_crawler_task: Optional[asyncio.Task] = None
async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool:
    """Start the crawler as a background asyncio task.

    Args:
        bundesland: restrict the run to one Bundesland's source.
        source_id: restrict the run to one explicit source id (takes precedence).

    Returns:
        False when a run is already in progress; True once the task is scheduled.

    Fix: previously, if ``_crawler_instance.init()`` raised, ``state.is_running``
    stayed True forever and the crawler could never be started again. The flag
    is now reset and the original exception re-raised to the caller.
    """
    global _crawler_instance, _crawler_task
    state = get_crawler_state()
    if state.is_running:
        return False
    state.is_running = True
    state.documents_crawled_today = 0
    state.documents_indexed_today = 0
    state.errors_today = 0
    _crawler_instance = ZeugnisCrawler()
    try:
        await _crawler_instance.init()
    except Exception:
        # Undo the running flag so a later start attempt is not blocked.
        state.is_running = False
        _crawler_instance = None
        raise

    async def run_crawler():
        try:
            from metrics_db import get_pool
            pool = await get_pool()
            if pool:
                async with pool.acquire() as conn:
                    # Pick the sources to crawl: explicit id > bundesland > all.
                    if source_id:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE id = $1",
                            source_id
                        )
                    elif bundesland:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1",
                            bundesland
                        )
                    else:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland"
                        )
                    for source in sources:
                        if not state.is_running:
                            break  # stop_crawler() cleared the flag
                        await _crawler_instance.crawl_source(source["id"])
        except Exception as e:
            print(f"Crawler error: {e}")
        finally:
            state.is_running = False
            if _crawler_instance:
                await _crawler_instance.close()

    _crawler_task = asyncio.create_task(run_crawler())
    return True
async def stop_crawler() -> bool:
    """Request a crawler stop and wait for the background task to wind down.

    Returns False when no crawl is running, True after the task has finished
    (or its cancellation has been observed).
    """
    global _crawler_task
    state = get_crawler_state()
    if not state.is_running:
        return False
    # Clear the flag first so the worker loop exits between sources,
    # then cancel the task outright and await its termination.
    state.is_running = False
    task = _crawler_task
    if task is not None:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    return True
def get_crawler_status() -> Dict[str, Any]:
    """Snapshot the shared crawler state as a JSON-serializable dict."""
    state = get_crawler_state()
    last_seen = state.last_activity
    return {
        "is_running": state.is_running,
        "current_source": state.current_source_id,
        "current_bundesland": state.current_bundesland,
        "queue_length": len(state.queue),
        "documents_crawled_today": state.documents_crawled_today,
        "documents_indexed_today": state.documents_indexed_today,
        "errors_today": state.errors_today,
        "last_activity": last_seen.isoformat() if last_seen else None,
    }

View File

@@ -0,0 +1,26 @@
"""
Zeugnis Rights-Aware Crawler
Barrel re-export: all public symbols for backward compatibility.
"""
from .text import ( # noqa: F401
extract_text_from_pdf,
extract_text_from_html,
chunk_text,
compute_hash,
)
from .storage import ( # noqa: F401
generate_embeddings,
upload_to_minio,
index_in_qdrant,
)
from .worker import ( # noqa: F401
CrawlerState,
ZeugnisCrawler,
)
from .control import ( # noqa: F401
start_crawler,
stop_crawler,
get_crawler_status,
)

View File

@@ -0,0 +1,340 @@
"""
Zeugnis Rights-Aware Crawler - Data Models
Pydantic models for API requests/responses and internal data structures.
Database schema is defined in metrics_db.py.
"""
from datetime import datetime
from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
import uuid
# =============================================================================
# Enums
# =============================================================================
class LicenseType(str, Enum):
    """License classification for training permission.

    Values commented "NO TRAINING" must not be used for model training;
    the per-Bundesland mapping lives in TRAINING_PERMISSIONS below.
    """
    PUBLIC_DOMAIN = "public_domain"        # Amtliche Werke (§5 UrhG)
    CC_BY = "cc_by"                        # Creative Commons Attribution
    CC_BY_SA = "cc_by_sa"                  # CC Attribution-ShareAlike
    CC_BY_NC = "cc_by_nc"                  # CC NonCommercial - NO TRAINING
    CC_BY_NC_SA = "cc_by_nc_sa"            # CC NC-SA - NO TRAINING
    GOV_STATUTE_FREE_USE = "gov_statute"   # Government statutes (gemeinfrei)
    ALL_RIGHTS_RESERVED = "all_rights"     # Standard copyright - NO TRAINING
    UNKNOWN_REQUIRES_REVIEW = "unknown"    # Needs manual review
class CrawlStatus(str, Enum):
    """Status of a crawl job or seed URL."""
    PENDING = "pending"      # queued / created, not yet processed
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
class DocType(str, Enum):
    """Type of zeugnis document (German administrative categories)."""
    VERORDNUNG = "verordnung"        # Official regulation
    HANDREICHUNG = "handreichung"    # Implementation guide
    FORMULAR = "formular"            # Form template
    ERLASS = "erlass"                # Decree
    SCHULORDNUNG = "schulordnung"    # School regulations
    SONSTIGES = "sonstiges"          # Other
class EventType(str, Enum):
    """Audit event types recorded in the zeugnis_usage_events table."""
    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"        # logged when a document detail is fetched via the API
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
# =============================================================================
# Bundesland Definitions
# =============================================================================
# All 16 German federal states, keyed by lowercase two-letter code.
BUNDESLAENDER = {
    "bw": {"name": "Baden-Württemberg", "short": "BW"},
    "by": {"name": "Bayern", "short": "BY"},
    "be": {"name": "Berlin", "short": "BE"},
    "bb": {"name": "Brandenburg", "short": "BB"},
    "hb": {"name": "Bremen", "short": "HB"},
    "hh": {"name": "Hamburg", "short": "HH"},
    "he": {"name": "Hessen", "short": "HE"},
    "mv": {"name": "Mecklenburg-Vorpommern", "short": "MV"},
    "ni": {"name": "Niedersachsen", "short": "NI"},
    "nw": {"name": "Nordrhein-Westfalen", "short": "NW"},
    "rp": {"name": "Rheinland-Pfalz", "short": "RP"},
    "sl": {"name": "Saarland", "short": "SL"},
    "sn": {"name": "Sachsen", "short": "SN"},
    "st": {"name": "Sachsen-Anhalt", "short": "ST"},
    "sh": {"name": "Schleswig-Holstein", "short": "SH"},
    "th": {"name": "Thüringen", "short": "TH"},
}
# Training permission based on Word document analysis
# (presumably "Bundesland URL Zeugnisse.docx" — see the seed data module).
# True  => Amtliches Werk (§5 UrhG official work) — training allowed.
# False => no license stated, or "Eingeschränkt" treated as not allowed for safety.
TRAINING_PERMISSIONS = {
    "bw": True,   # Amtliches Werk
    "by": True,   # Amtliches Werk
    "be": False,  # Keine Lizenz
    "bb": False,  # Keine Lizenz
    "hb": False,  # Eingeschränkt -> False for safety
    "hh": False,  # Keine Lizenz
    "he": True,   # Amtliches Werk
    "mv": False,  # Eingeschränkt -> False for safety
    "ni": True,   # Amtliches Werk
    "nw": True,   # Amtliches Werk
    "rp": True,   # Amtliches Werk
    "sl": False,  # Keine Lizenz
    "sn": True,   # Amtliches Werk
    "st": False,  # Eingeschränkt -> False for safety
    "sh": True,   # Amtliches Werk
    "th": True,   # Amtliches Werk
}
# =============================================================================
# API Models - Sources
# =============================================================================
class ZeugnisSourceBase(BaseModel):
    """Base model for a zeugnis source (fields shared by create/read models)."""
    bundesland: str = Field(..., description="Bundesland code (e.g., 'ni', 'by')")
    name: str = Field(..., description="Full name of the source")
    base_url: Optional[str] = Field(None, description="Base URL for the source")
    license_type: LicenseType = Field(..., description="License classification")
    training_allowed: bool = Field(False, description="Whether AI training is permitted")
class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Model for creating a new source (no fields beyond the base)."""
    pass
class ZeugnisSource(ZeugnisSourceBase):
    """Full source model with all fields as stored in the database."""
    id: str
    verified_by: Optional[str] = None      # reviewer's user id, set on verification
    verified_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class ZeugnisSourceVerify(BaseModel):
    """Payload for recording a manual license verification of a source."""
    verified_by: str = Field(..., description="User ID who verified")
    license_type: LicenseType
    training_allowed: bool
    notes: Optional[str] = None  # free-form reviewer remarks
# =============================================================================
# API Models - Seed URLs
# =============================================================================
class SeedUrlBase(BaseModel):
    """Base model for a crawler seed URL."""
    url: str = Field(..., description="URL to crawl")
    doc_type: DocType = Field(DocType.VERORDNUNG, description="Type of document")
class SeedUrlCreate(SeedUrlBase):
    """Model for creating a new seed URL under an existing source."""
    source_id: str
class SeedUrl(SeedUrlBase):
    """Full seed URL model as stored in zeugnis_seed_urls."""
    id: str
    source_id: str
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None  # None until first crawl
    error_message: Optional[str] = None      # last crawl error, if any
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
# =============================================================================
# API Models - Documents
# =============================================================================
class ZeugnisDocumentBase(BaseModel):
    """Base model for a crawled zeugnis document."""
    title: Optional[str] = None
    url: str
    content_type: Optional[str] = None  # MIME type as reported by the server
    file_size: Optional[int] = None     # size in bytes — TODO confirm unit
class ZeugnisDocument(ZeugnisDocumentBase):
    """Full document model, including storage/index state and source metadata."""
    id: str
    seed_url_id: str
    content_hash: Optional[str] = None   # used for change/version detection
    minio_path: Optional[str] = None     # object-storage location of the raw file
    training_allowed: bool = False
    indexed_in_qdrant: bool = False
    bundesland: Optional[str] = None     # denormalized from the joined source
    source_name: Optional[str] = None    # denormalized from the joined source
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class ZeugnisDocumentVersion(BaseModel):
    """Snapshot of a document revision, for version-history tracking."""
    id: str
    document_id: str
    version: int                          # monotonically listed, newest first in the API
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
# =============================================================================
# API Models - Crawler
# =============================================================================
class CrawlerStatus(BaseModel):
    """Current status of the crawler (mirrors the dict from get_crawler_status)."""
    is_running: bool = False
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None
    queue_length: int = 0
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    last_activity: Optional[datetime] = None
    errors_today: int = 0
class CrawlQueueItem(BaseModel):
    """One entry in the zeugnis_crawler_queue table."""
    id: str
    source_id: str
    bundesland: str
    source_name: str
    priority: int = 5                         # 1 = lowest, 10 = highest (see CrawlRequest)
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0
    created_at: datetime
class CrawlRequest(BaseModel):
    """Request to start a crawl; source_id takes precedence over bundesland."""
    bundesland: Optional[str] = Field(None, description="Specific Bundesland to crawl")
    source_id: Optional[str] = Field(None, description="Specific source ID to crawl")
    priority: int = Field(5, ge=1, le=10, description="Priority (1=lowest, 10=highest)")
class CrawlResult(BaseModel):
    """Outcome summary of a single crawl operation."""
    source_id: str
    bundesland: str
    documents_found: int
    documents_indexed: int
    documents_skipped: int
    errors: List[str]           # human-readable error messages collected during the run
    duration_seconds: float
# =============================================================================
# API Models - Statistics
# =============================================================================
class ZeugnisStats(BaseModel):
    """Aggregate statistics for the zeugnis crawler."""
    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0
    # NOTE: mutable default is safe here — pydantic copies field defaults per instance.
    per_bundesland: List[Dict[str, Any]] = []
class BundeslandStats(BaseModel):
    """Per-Bundesland document/index statistics."""
    bundesland: str            # two-letter code (see BUNDESLAENDER)
    name: str
    training_allowed: bool
    document_count: int
    indexed_count: int
    last_crawled: Optional[datetime] = None
# =============================================================================
# API Models - Audit
# =============================================================================
class UsageEvent(BaseModel):
    """One audit-trail entry from zeugnis_usage_events."""
    id: str
    document_id: str
    event_type: EventType
    user_id: Optional[str] = None            # None for system-generated events
    details: Optional[Dict[str, Any]] = None
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class AuditExport(BaseModel):
    """GDPR-compliant audit export (shape of the /audit/export response)."""
    export_date: datetime
    requested_by: str
    events: List[UsageEvent]
    document_count: int        # distinct documents referenced in the export window
    date_range_start: datetime
    date_range_end: datetime
# =============================================================================
# Helper Functions
# =============================================================================
def generate_id() -> str:
    """Return a fresh random UUID4 in canonical 36-character string form."""
    return f"{uuid.uuid4()}"
def get_training_allowed(bundesland: str) -> bool:
    """Return the Bundesland's training permission; unknown codes default to False."""
    key = bundesland.lower()
    return TRAINING_PERMISSIONS.get(key, False)
def get_bundesland_name(code: str) -> str:
    """Resolve a Bundesland code to its full name, echoing the code when unknown."""
    return BUNDESLAENDER.get(code.lower(), {}).get("name", code)
def get_license_for_bundesland(bundesland: str) -> LicenseType:
    """Map a Bundesland's training permission onto a license class.

    Training-allowed states are treated as free-use government statutes;
    everything else requires manual license review.
    """
    allowed = TRAINING_PERMISSIONS.get(bundesland.lower(), False)
    return LicenseType.GOV_STATUTE_FREE_USE if allowed else LicenseType.UNKNOWN_REQUIRES_REVIEW

View File

@@ -0,0 +1,415 @@
"""
Zeugnis Seed Data - Initial URLs from Word Document
Contains seed URLs for all 16 German federal states (Bundesländer)
based on the "Bundesland URL Zeugnisse.docx" document.
Training permissions:
- Ja: Amtliches Werk (§5 UrhG) - training allowed
- Nein: Keine Lizenz angegeben - training NOT allowed
- Eingeschränkt: Treated as NOT allowed for safety
"""
from typing import Dict, List, Any
# Seed data structure: bundesland -> list of seed URLs
# Schema per entry:
#   name             — full Bundesland name
#   license          — "gov_statute" (amtliches Werk, §5 UrhG) or "unknown"
#   training_allowed — bool; "Eingeschränkt" states are False for safety
#   base_url         — portal root of the state's legal-document site
#   urls             — list of {url, doc_type, title} seed documents
SEED_DATA: Dict[str, Dict[str, Any]] = {
    "bw": {
        "name": "Baden-Württemberg",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.landesrecht-bw.de",
        "urls": [
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cru/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGBWpP5&doc.part=X&doc.price=0.0&doc.hl=1",
                "doc_type": "verordnung",
                "title": "Schulgesetz BW - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cs9/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-NotenBildVBW2016rahmen&doc.part=X&doc.price=0.0",
                "doc_type": "verordnung",
                "title": "Notenbildungsverordnung"
            }
        ]
    },
    "by": {
        "name": "Bayern",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-bayern.de",
        "urls": [
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BaySchO2016",
                "doc_type": "schulordnung",
                "title": "Bayerische Schulordnung"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayGSO",
                "doc_type": "schulordnung",
                "title": "Grundschulordnung Bayern"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayVSO",
                "doc_type": "schulordnung",
                "title": "Volksschulordnung Bayern"
            }
        ]
    },
    "be": {
        "name": "Berlin",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://gesetze.berlin.de",
        "urls": [
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SchulGBEpP58",
                "doc_type": "verordnung",
                "title": "Berliner Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SekIVBE2010rahmen",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I-Verordnung"
            }
        ]
    },
    "bb": {
        "name": "Brandenburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://bravors.brandenburg.de",
        "urls": [
            {
                "url": "https://bravors.brandenburg.de/verordnungen/vvzeugnis",
                "doc_type": "verordnung",
                "title": "Verwaltungsvorschriften Zeugnisse"
            },
            {
                "url": "https://bravors.brandenburg.de/verordnungen/gostv",
                "doc_type": "verordnung",
                "title": "GOST-Verordnung Brandenburg"
            }
        ]
    },
    "hb": {
        "name": "Bremen",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.transparenz.bremen.de",
        "urls": [
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/bremisches-schulgesetz-bremschg-vom-28-juni-2005-121009",
                "doc_type": "verordnung",
                "title": "Bremisches Schulgesetz"
            },
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/verordnung-ueber-die-sekundarstufe-i-der-oberschule-vom-20-juni-2017-130380",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I Verordnung Bremen"
            }
        ]
    },
    "hh": {
        "name": "Hamburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://www.landesrecht-hamburg.de",
        "urls": [
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-SchulGHA2009pP44",
                "doc_type": "verordnung",
                "title": "Hamburgisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-AusglLeistVHA2011rahmen",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung"
            }
        ]
    },
    "he": {
        "name": "Hessen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.rv.hessenrecht.hessen.de",
        "urls": [
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-SchulGHE2017pP73",
                "doc_type": "verordnung",
                "title": "Hessisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-VOBGM11HE2011rahmen",
                "doc_type": "verordnung",
                "title": "Verordnung zur Gestaltung des Schulverhältnisses"
            }
        ]
    },
    "mv": {
        "name": "Mecklenburg-Vorpommern",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht-mv.de",
        "urls": [
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-SchulGMV2010pP63",
                "doc_type": "verordnung",
                "title": "Schulgesetz MV - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-ZeugnVMVrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung MV"
            }
        ]
    },
    "ni": {
        "name": "Niedersachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.nds-voris.de",
        "urls": [
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGNDpP59",
                "doc_type": "verordnung",
                "title": "Niedersächsisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ErgZeugnErlNDrahmen",
                "doc_type": "erlass",
                "title": "Ergänzende Bestimmungen für Zeugnisse"
            },
            {
                "url": "https://www.mk.niedersachsen.de/startseite/schule/unsere_schulen/allgemein_bildende_schulen/zeugnisse_versetzungen/zeugnisse-und-versetzungen-6351.html",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NI"
            }
        ]
    },
    "nw": {
        "name": "Nordrhein-Westfalen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://recht.nrw.de",
        "urls": [
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000521",
                "doc_type": "verordnung",
                "title": "Schulgesetz NRW"
            },
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000525",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung Sek I"
            },
            {
                "url": "https://www.schulministerium.nrw/zeugnisse",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NRW"
            }
        ]
    },
    "rp": {
        "name": "Rheinland-Pfalz",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.rlp.de",
        "urls": [
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-SchulGRPpP61",
                "doc_type": "verordnung",
                "title": "Schulgesetz RP - Zeugnisse"
            },
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-ZeugnVRPrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung RP"
            }
        ]
    },
    "sl": {
        "name": "Saarland",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://recht.saarland.de",
        "urls": [
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-SchulOGSLrahmen",
                "doc_type": "schulordnung",
                "title": "Schulordnungsgesetz Saarland"
            },
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-ZeugnVSL2014rahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung Saarland"
            }
        ]
    },
    "sn": {
        "name": "Sachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.revosax.sachsen.de",
        "urls": [
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/4192-Schulgesetz-fuer-den-Freistaat-Sachsen",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen"
            },
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/13500-Schulordnung-Gymnasien-Abiturpruefung",
                "doc_type": "schulordnung",
                "title": "Schulordnung Gymnasien Sachsen"
            }
        ]
    },
    "st": {
        "name": "Sachsen-Anhalt",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht.sachsen-anhalt.de",
        "urls": [
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-SchulGSTpP27",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen-Anhalt"
            },
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-VersetzVST2017rahmen",
                "doc_type": "verordnung",
                "title": "Versetzungsverordnung ST"
            }
        ]
    },
    "sh": {
        "name": "Schleswig-Holstein",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-rechtsprechung.sh.juris.de",
        "urls": [
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGSHpP22",
                "doc_type": "verordnung",
                "title": "Schulgesetz SH - Zeugnisse"
            },
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ZeugnVSHrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung SH"
            }
        ]
    },
    "th": {
        "name": "Thüringen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.thueringen.de",
        "urls": [
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulGTHpP58",
                "doc_type": "verordnung",
                "title": "Thüringer Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulOTH2018rahmen",
                "doc_type": "schulordnung",
                "title": "Thüringer Schulordnung"
            }
        ]
    }
}
async def populate_seed_data():
    """Insert or refresh all SEED_DATA sources and their seed URLs.

    Returns True on success, False when the database pool is unavailable
    or an error occurred. Safe to re-run: sources are upserted and URL
    inserts use ON CONFLICT DO NOTHING.
    """
    from metrics_db import get_pool, upsert_zeugnis_source
    from zeugnis_models import generate_id
    pool = await get_pool()
    if not pool:
        print("Database not available")
        return False
    try:
        async with pool.acquire() as conn:
            for bundesland, data in SEED_DATA.items():
                # Upsert keeps existing rows; the fresh id is only used
                # when the source does not exist yet.
                source_id = generate_id()
                await upsert_zeugnis_source(
                    id=source_id,
                    bundesland=bundesland,
                    name=data["name"],
                    license_type=data["license"],
                    training_allowed=data["training_allowed"],
                    base_url=data.get("base_url"),
                )
                # The source may already have existed — use its real id.
                row = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    bundesland
                )
                if row:
                    source_id = row["id"]
                urls = data.get("urls", [])
                for url_data in urls:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                        VALUES ($1, $2, $3, $4, 'pending')
                        ON CONFLICT DO NOTHING
                        """,
                        generate_id(), source_id, url_data["url"], url_data["doc_type"]
                    )
                print(f"Populated {bundesland}: {len(urls)} URLs")
        print("Seed data population complete!")
        return True
    except Exception as e:
        print(f"Failed to populate seed data: {e}")
        return False
def get_training_summary() -> Dict[str, List[str]]:
    """Summarize which Bundesländer allow AI training on their documents.

    Returns a dict with sorted display labels ("Name (code)") for both
    groups plus the respective counts.
    """
    buckets = {True: [], False: []}
    for code, entry in SEED_DATA.items():
        label = f"{entry['name']} ({code})"
        buckets[bool(entry["training_allowed"])].append(label)
    return {
        "training_allowed": sorted(buckets[True]),
        "training_not_allowed": sorted(buckets[False]),
        "total_allowed": len(buckets[True]),
        "total_not_allowed": len(buckets[False]),
    }
if __name__ == "__main__":
    # CLI helper: print which Bundesländer permit AI training. The database
    # is intentionally NOT populated here — see the hint printed below.
    # (Removed an unused `import asyncio`; this block never awaits anything.)
    print("=" * 60)
    print("Zeugnis Seed Data Summary")
    print("=" * 60)
    summary = get_training_summary()
    print(f"\nTraining ALLOWED ({summary['total_allowed']} Bundesländer):")
    for bl in summary["training_allowed"]:
        print(bl)
    print(f"\nTraining NOT ALLOWED ({summary['total_not_allowed']} Bundesländer):")
    for bl in summary["training_not_allowed"]:
        print(bl)
    print("\n" + "=" * 60)
    print("To populate database, run:")
    print(" python -c 'import asyncio; from zeugnis_seed_data import populate_seed_data; asyncio.run(populate_seed_data())'")

View File

@@ -0,0 +1,180 @@
"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""
import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
# =============================================================================
# Configuration
# =============================================================================
# Qdrant vector-database endpoint used for semantic indexing.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# MinIO object storage (raw document archive); defaults are dev credentials.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# "local" = sentence-transformers on this host; "openai" = remote API.
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
# Qdrant collection holding zeugnis document chunks.
ZEUGNIS_COLLECTION = "bp_zeugnis"
# =============================================================================
# Embedding Generation
# =============================================================================
# Lazily-created SentenceTransformer singleton (local backend only).
_embedding_model = None


def get_embedding_model():
    """Return the shared local embedding model, loading it on first use.

    Only loads when EMBEDDING_BACKEND == "local". Returns None when the
    backend is remote or sentence-transformers is not installed.
    """
    global _embedding_model
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")
    return _embedding_model
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured backend.

    Returns one vector per input text. Returns [] when the input is
    empty, the backend is unavailable, or it is misconfigured.
    """
    if not texts:
        return []
    if EMBEDDING_BACKEND == "openai":
        import openai
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []
        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]
    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model is None:
            return []
        vectors = model.encode(texts, show_progress_bar=False)
        return [vec.tolist() for vec in vectors]
    return []
# =============================================================================
# MinIO Storage
# =============================================================================
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a raw document to MinIO.

    Stores *content* under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>`` (the year
    defaults to the current year) and returns the object name, or None
    on any failure (missing client library, connection error, ...).
    """
    try:
        from minio import Minio
        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )
        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)
        # Build path. BUG FIX: the filename was previously dropped from the
        # object name (a literal placeholder was baked in), so every upload
        # within the same bundesland/year overwrote the same object.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"
        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )
        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None
# =============================================================================
# Qdrant Indexing
# =============================================================================
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Write chunk vectors for one document into the zeugnis collection.

    Creates the collection on first use (cosine distance, dimensionality
    taken from the first embedding, default 384). Returns the number of
    points written; 0 on failure or empty input.
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct
        client = QdrantClient(url=QDRANT_URL)
        # Create the collection if this is the first ever indexing run.
        existing = {c.name for c in client.get_collections().collections}
        if ZEUGNIS_COLLECTION not in existing:
            dim = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=dim,
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")
        points = []
        for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)):
            payload = {
                "document_id": doc_id,
                "chunk_index": idx,
                "chunk_text": chunk[:500],  # short preview only, not full text
                "bundesland": metadata.get("bundesland"),
                "doc_type": metadata.get("doc_type"),
                "title": metadata.get("title"),
                "source_url": metadata.get("url"),
                "training_allowed": metadata.get("training_allowed", False),
                "indexed_at": datetime.now().isoformat(),
            }
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload=payload,
            ))
        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )
        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0

View File

@@ -0,0 +1,110 @@
"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""
import hashlib
from typing import List
# Target chunk length (characters) for RAG indexing.
CHUNK_SIZE = 1000
# Characters of the previous chunk prepended to the next one for context.
CHUNK_OVERLAP = 200
def extract_text_from_pdf(content: bytes) -> str:
    """Return the plain text of a PDF given as raw bytes.

    Page texts are joined with blank lines. Returns "" when PyPDF2 is
    missing or the document cannot be parsed.
    """
    try:
        import io
        from PyPDF2 import PdfReader
        reader = PdfReader(io.BytesIO(content))
        extracted = [page.extract_text() for page in reader.pages]
        return "\n\n".join(part for part in extracted if part)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Return the visible text of an HTML page given as raw bytes.

    Script/style/nav/header/footer elements are dropped and the result
    contains one non-empty line per text line. Returns "" on parse errors
    or when BeautifulSoup is unavailable.
    """
    try:
        from bs4 import BeautifulSoup
        markup = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(markup, "html.parser")
        # Strip boilerplate / non-content elements before extracting text.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()
        raw = soup.get_text(separator="\n", strip=True)
        stripped = (line.strip() for line in raw.splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into chunks of roughly *chunk_size* characters.

    Splitting prefers natural boundaries — paragraphs, lines, sentences,
    then words — and falls back to hard character splits when no
    separator helps. When *overlap* > 0 and more than one chunk results,
    each chunk after the first is prefixed with the last *overlap*
    characters of its predecessor, so returned chunks may exceed
    *chunk_size* by up to *overlap* characters.

    Returns [] for empty input.
    (Fix: removed a dead ``chunks = []`` assignment that was immediately
    overwritten by the ``split_recursive`` result.)
    """
    if not text:
        return []
    # Coarse-to-fine separators tried in order.
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        # Base case: the piece already fits in one chunk.
        if len(text) <= chunk_size:
            return [text] if text.strip() else []
        if sep_index >= len(separators):
            # No separators left: hard split with a built-in overlap step.
            result = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result
        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""
        # Greedily merge parts (re-inserting the separator) until a merge
        # would exceed chunk_size; recurse with finer separators on any
        # accumulated piece that is still too long.
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part
        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
        return result

    chunks = split_recursive(text)
    # Prefix each chunk (except the first) with the tail of its
    # predecessor so neighbouring chunks share retrieval context.
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped
    return chunks
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()

View File

@@ -0,0 +1,313 @@
"""
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
Crawls official government documents about school certificates from
all 16 German federal states. Only indexes documents where AI training
is legally permitted.
"""
import asyncio
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass, field
import httpx
from .models import generate_id
from .text import (
extract_text_from_pdf,
extract_text_from_html,
chunk_text,
compute_hash,
)
from .storage import (
upload_to_minio,
generate_embeddings,
index_in_qdrant,
)
# =============================================================================
# Configuration
# =============================================================================
# Maximum fetch attempts per URL before giving up.
MAX_RETRIES = 3
# Base back-off between retries; multiplied by the attempt number.
RETRY_DELAY = 5  # seconds
# Per-request HTTP timeout.
REQUEST_TIMEOUT = 30  # seconds
# Identifies the crawler to the state portals (politeness / contact hint).
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
# =============================================================================
# Crawler State
# =============================================================================
@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable, process-wide progress snapshot consumed by the API layer.
    Field order is part of the dataclass __init__ signature — do not
    reorder.
    """
    # True while a crawl is in progress.
    is_running: bool = False
    # Source / Bundesland currently being crawled (None when idle).
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # Pending crawl jobs (dict shape defined by the API layer).
    queue: List[Dict] = field(default_factory=list)
    # Daily counters. NOTE(review): nothing visible here resets them at
    # midnight — confirm a reset happens elsewhere.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last successful crawler action.
    last_activity: Optional[datetime] = None
# Module-level singleton, mutated in place by ZeugnisCrawler methods.
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Return the process-wide CrawlerState singleton."""
    return _crawler_state
# =============================================================================
# Crawler Worker
# =============================================================================
class ZeugnisCrawler:
"""Rights-aware crawler for zeugnis documents."""
def __init__(self):
self.http_client: Optional[httpx.AsyncClient] = None
self.db_pool = None
async def init(self):
"""Initialize crawler resources."""
self.http_client = httpx.AsyncClient(
timeout=REQUEST_TIMEOUT,
follow_redirects=True,
headers={"User-Agent": USER_AGENT},
)
# Initialize database connection
try:
from metrics_db import get_pool
self.db_pool = await get_pool()
except Exception as e:
print(f"Failed to get database pool: {e}")
async def close(self):
"""Close crawler resources."""
if self.http_client:
await self.http_client.aclose()
async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
"""Fetch URL with retry logic."""
for attempt in range(MAX_RETRIES):
try:
response = await self.http_client.get(url)
response.raise_for_status()
content_type = response.headers.get("content-type", "")
return response.content, content_type
except httpx.HTTPStatusError as e:
print(f"HTTP error {e.response.status_code} for {url}")
if e.response.status_code == 404:
return None, None
except Exception as e:
print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
if attempt < MAX_RETRIES - 1:
await asyncio.sleep(RETRY_DELAY * (attempt + 1))
return None, None
    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Pipeline: fetch -> extract text (PDF or HTML) -> archive raw bytes
        in MinIO -> insert a zeugnis_documents row -> and, ONLY when
        *training_allowed* is True, chunk/embed/index the text in Qdrant.
        Never raises: failures are reported via the returned dict's
        "error" field and counted in the global crawler state.
        """
        global _crawler_state
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }
        try:
            # Fetch content (retries handled inside fetch_url).
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result
            # Determine file type from the response header or URL suffix.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")
            # Extract text; the derived filename is reused as the stored title.
            if is_pdf:
                text = extract_text_from_pdf(content)
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"
            if not text:
                result["error"] = "No text extracted"
                return result
            # Compute hash for versioning / change detection.
            content_hash = compute_hash(content)
            # Archive the raw bytes in MinIO (path is returned, or None).
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )
            # Generate document ID
            doc_id = generate_id()
            # Store document metadata; tolerated to be skipped when no DB.
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )
            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1
            # Rights gate: only index for RAG when training is allowed.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1
                            # Mark the stored document as indexed.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                # Document is archived but deliberately NOT indexed.
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"
            _crawler_state.last_activity = datetime.now()
        except Exception as e:
            result["error"] = str(e)
            _crawler_state.errors_today += 1
        return result
    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all seed URLs for a source.

        Loads the source row (for bundesland and the training permission),
        then crawls every seed URL whose status is not 'completed',
        updating each URL's status ('running' -> 'completed'/'failed') as
        it goes. Returns a summary dict with counts, per-URL errors, and
        start/completion timestamps. Requires a database pool.
        """
        global _crawler_state
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result
        try:
            async with self.db_pool.acquire() as conn:
                # Get source info (bundesland + rights flag drive the crawl).
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result
                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]
                # Publish progress for the status API.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland
                # Get all seed URLs that still need crawling.
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )
                for seed_url in seed_urls:
                    # Mark as in-flight so concurrent views see progress.
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )
                    # Crawl (never raises; reports errors in its result).
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )
                    # Persist the outcome per seed URL.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )
                    # Politeness delay between requests to the same portal.
                    await asyncio.sleep(1)
        except Exception as e:
            result["errors"].append(str(e))
        finally:
            # Always stamp completion and clear the progress indicators,
            # even when the crawl aborted mid-way.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None
        return result