Restructure: Move 52 files into 7 domain packages
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s

korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 22:10:48 +02:00
parent 0504d22b8e
commit 165c493d1e
111 changed files with 11859 additions and 11609 deletions

View File

@@ -0,0 +1,6 @@
"""
zeugnis package — certificate crawler, models, storage.
Backward-compatible re-exports: consumers can still use
``from zeugnis_api import ...`` etc. via the shim files in backend/.
"""

View File

@@ -0,0 +1,19 @@
"""
Zeugnis Rights-Aware Crawler — barrel re-export.
All implementation split into:
zeugnis_api_sources — sources, seed URLs, initialization
zeugnis_api_docs — documents, crawler, statistics, audit
FastAPI router for managing zeugnis sources, documents, and crawler operations.
"""
from fastapi import APIRouter
from .api_sources import router as _src_router  # noqa: F401
from .api_docs import router as _doc_router  # noqa: F401

# Aggregate router exposed to main.py. Source-management routes are
# registered before document/crawler routes, matching the split modules.
router = APIRouter()
router.include_router(_src_router)
router.include_router(_doc_router)

View File

@@ -0,0 +1,321 @@
"""
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from .models import (
CrawlRequest, EventType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_license_for_bundesland,
)
from .crawler import (
start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
get_zeugnis_documents, get_zeugnis_stats,
log_zeugnis_event, get_pool,
)
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Documents Endpoints
# =============================================================================
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland, paginated."""
    return await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Fetch one document joined with its source metadata; logs a VIEWED audit event.

    Raises 503 without a DB pool, 404 when the id is unknown, 500 on query errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            record = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id
            )
            if record is None:
                raise HTTPException(status_code=404, detail="Document not found")
            # Record the access for the audit trail before returning.
            await log_zeugnis_event(document_id, EventType.VIEWED.value)
            return dict(record)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """List all stored versions of a document, newest version first."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            versions = await conn.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
        return [dict(v) for v in versions]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Crawler Control Endpoints
# =============================================================================
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report the crawler's current runtime state (flags, counters, queue size)."""
    status = get_crawler_status()
    return status
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler for the requested Bundesland/source; 409 if already running."""
    # NOTE(review): `background_tasks` is injected by FastAPI but unused here —
    # the crawler schedules its own asyncio task internally; confirm it can be dropped.
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop a running crawler; 409 when no crawl is in progress."""
    stopped = await stop_crawler()
    if not stopped:
        raise HTTPException(status_code=409, detail="Crawler not running")
    return {"success": True, "message": "Crawler stopped"}
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return queued crawl jobs joined with source info; empty list when no DB."""
    pool = await get_pool()
    if not pool:
        return []
    try:
        async with pool.acquire() as conn:
            items = await conn.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
        return [dict(item) for item in items]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a crawl job for an explicit source id or a Bundesland's source.

    Raises 503 without a DB, 400 when no source can be resolved, 500 otherwise.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    queue_id = generate_id()
    try:
        async with pool.acquire() as conn:
            target = request.source_id
            # Resolve the source from the Bundesland when no explicit id was given.
            if not target and request.bundesland:
                row = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                if row:
                    target = row["id"]
            if not target:
                raise HTTPException(status_code=400, detail="Source not found")
            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                queue_id, target, request.priority
            )
        return {"id": queue_id, "success": True}
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Statistics Endpoints
# =============================================================================
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return aggregate zeugnis crawler statistics from the metrics store."""
    return await get_zeugnis_stats()
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return per-Bundesland document/index counts.

    Always yields one entry per Bundesland from the static tables; DB-derived
    counters are filled in best-effort and left at their defaults when the
    database is unavailable or a query fails (preserving the original
    silent-fallback behavior).

    Fix: the original acquired a new pool connection inside the 16-iteration
    loop; a single connection is now acquired once for all queries.
    """
    pool = await get_pool()
    # Start every entry from static defaults so the response always covers
    # all Bundeslaender even without a database.
    stats: List[dict] = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats
    try:
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        stat["bundesland"]
                    )
                    if row:
                        stat["document_count"] = row["doc_count"] or 0
                        stat["indexed_count"] = row["indexed_count"] or 0
                        stat["last_crawled"] = (
                            row["last_crawled"].isoformat() if row["last_crawled"] else None
                        )
                except Exception:
                    # Best-effort: keep static defaults for this Bundesland.
                    continue
    except Exception:
        # Connection acquisition failed — fall back to static defaults entirely.
        pass
    return stats
# =============================================================================
# Audit Endpoints
# =============================================================================
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return audit events from the last ``days``, optionally filtered.

    Filters on document id and/or event type may be combined; results are
    newest-first and capped at ``limit``. Returns an empty list when no DB
    pool is available; raises 500 on query errors.

    Fix (consistency): the document_id filter previously hard-coded the
    ``$2`` placeholder while event_type computed its index; both now use the
    computed ``${len(params) + 1}`` form — identical behavior, one convention.
    """
    pool = await get_pool()
    if not pool:
        return []
    try:
        since = datetime.now() - timedelta(days=days)
        query = """
            SELECT * FROM zeugnis_usage_events
            WHERE created_at >= $1
        """
        params: list = [since]
        # Placeholders are numbered dynamically so the filters compose in any
        # combination ($2, $3, ... in the order the params are appended).
        if document_id:
            query += f" AND document_id = ${len(params) + 1}"
            params.append(document_id)
        if event_type:
            query += f" AND event_type = ${len(params) + 1}"
            params.append(event_type)
        query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
        params.append(limit)
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export all usage events from the last ``days`` for GDPR compliance."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        since = datetime.now() - timedelta(days=days)
        async with pool.acquire() as conn:
            events = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )
            distinct_docs = await conn.fetchval(
                "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
                since
            )
        return {
            "export_date": datetime.now().isoformat(),
            "requested_by": requested_by,
            "events": [dict(e) for e in events],
            "document_count": distinct_docs or 0,
            "date_range_start": since.isoformat(),
            "date_range_end": datetime.now().isoformat(),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,232 @@
"""
Zeugnis API Sources — source and seed URL management endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from typing import Optional, List
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from .models import (
ZeugnisSourceCreate, ZeugnisSourceVerify,
SeedUrlCreate,
LicenseType, DocType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
)
from metrics_db import (
get_zeugnis_sources, upsert_zeugnis_source, get_pool,
)
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Sources Endpoints
# =============================================================================
@router.get("/sources", response_model=List[dict])
async def list_sources():
    """List all zeugnis sources; synthesize defaults when none are persisted."""
    sources = await get_zeugnis_sources()
    if sources:
        return sources
    # No persisted sources yet: build placeholder entries for every Bundesland.
    defaults = []
    for code, info in BUNDESLAENDER.items():
        defaults.append({
            "id": None,
            "bundesland": code,
            "name": info["name"],
            "base_url": None,
            "license_type": str(get_license_for_bundesland(code).value),
            "training_allowed": get_training_allowed(code),
            "verified_by": None,
            "verified_at": None,
            "created_at": None,
            "updated_at": None,
        })
    return defaults
@router.post("/sources", response_model=dict)
async def create_source(source: ZeugnisSourceCreate):
    """Create (or update, via upsert) a zeugnis source; 500 on storage failure."""
    new_id = generate_id()
    stored = await upsert_zeugnis_source(
        id=new_id,
        bundesland=source.bundesland,
        name=source.name,
        license_type=source.license_type.value,
        training_allowed=source.training_allowed,
        base_url=source.base_url,
    )
    if not stored:
        raise HTTPException(status_code=500, detail="Failed to create source")
    return {"id": new_id, "success": True}
@router.put("/sources/{source_id}/verify", response_model=dict)
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
    """Record a manual license verification for a source (stamps verified_at)."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE zeugnis_sources
                SET license_type = $2,
                    training_allowed = $3,
                    verified_by = $4,
                    verified_at = NOW(),
                    updated_at = NOW()
                WHERE id = $1
                """,
                source_id,
                verification.license_type.value,
                verification.training_allowed,
                verification.verified_by,
            )
        return {"success": True, "source_id": source_id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/sources/{bundesland}", response_model=dict)
async def get_source_by_bundesland(bundesland: str):
    """Get source details for a specific Bundesland.

    Returns the stored row (plus a document count) when the DB has one,
    otherwise a static default payload derived from the BUNDESLAENDER tables.

    Fix: the default payload dict was duplicated verbatim in two branches;
    it is now built by one local helper.
    """
    def _default_payload() -> dict:
        # Static fallback built from the module-level lookup tables (no DB row).
        return {
            "bundesland": bundesland,
            "name": get_bundesland_name(bundesland),
            "training_allowed": get_training_allowed(bundesland),
            "license_type": get_license_for_bundesland(bundesland).value,
            "document_count": 0,
        }

    pool = await get_pool()
    if not pool:
        # NOTE(review): only this no-DB path 404s on unknown codes; the DB path
        # below returns a default payload instead — confirm the asymmetry is wanted.
        if bundesland not in BUNDESLAENDER:
            raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
        return _default_payload()
    try:
        async with pool.acquire() as conn:
            source = await conn.fetchrow(
                "SELECT * FROM zeugnis_sources WHERE bundesland = $1",
                bundesland
            )
            if source:
                doc_count = await conn.fetchval(
                    """
                    SELECT COUNT(*) FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    WHERE u.source_id = $1
                    """,
                    source["id"]
                )
                return {**dict(source), "document_count": doc_count or 0}
        # No stored row: fall back to defaults.
        return _default_payload()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Seed URLs Endpoints
# =============================================================================
@router.get("/sources/{source_id}/urls", response_model=List[dict])
async def list_seed_urls(source_id: str):
    """List a source's seed URLs in creation order; empty list when no DB."""
    pool = await get_pool()
    if not pool:
        return []
    try:
        async with pool.acquire() as conn:
            urls = await conn.fetch(
                "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at",
                source_id
            )
        return [dict(u) for u in urls]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/sources/{source_id}/urls", response_model=dict)
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
    """Attach a new seed URL (status 'pending') to the given source."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    new_id = generate_id()
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                VALUES ($1, $2, $3, $4, 'pending')
                """,
                new_id, source_id, seed_url.url, seed_url.doc_type.value
            )
        return {"id": new_id, "success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.delete("/urls/{url_id}", response_model=dict)
async def delete_seed_url(url_id: str):
    """Delete a seed URL by id; reports success even when no row matched."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                "DELETE FROM zeugnis_seed_urls WHERE id = $1",
                url_id
            )
        return {"success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =============================================================================
# Initialization Endpoint
# =============================================================================
@router.post("/init", response_model=dict)
async def initialize_sources():
    """Seed the sources table with defaults for every Bundesland."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    created = 0
    try:
        for code, info in BUNDESLAENDER.items():
            # NOTE(review): a fresh id is generated on every call — assumes the
            # upsert deduplicates on bundesland rather than id; confirm in metrics_db.
            if await upsert_zeugnis_source(
                id=generate_id(),
                bundesland=code,
                name=info["name"],
                license_type=get_license_for_bundesland(code).value,
                training_allowed=get_training_allowed(code),
            ):
                created += 1
        return {"success": True, "sources_created": created}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,105 @@
"""
Zeugnis Crawler - Start/stop/status control functions.
"""
import asyncio
from typing import Optional, Dict, Any
from .worker import ZeugnisCrawler, get_crawler_state
_crawler_instance: Optional[ZeugnisCrawler] = None
_crawler_task: Optional[asyncio.Task] = None
async def start_crawler(bundesland: Optional[str] = None, source_id: Optional[str] = None) -> bool:
    """Start the crawler as a background asyncio task.

    Args:
        bundesland: restrict the run to one Bundesland's source.
        source_id: restrict the run to one explicit source id (takes precedence).

    Returns:
        False when a run is already in progress; True once the task is scheduled.

    Fix: previously, if ``_crawler_instance.init()`` raised, ``state.is_running``
    stayed True forever and the crawler could never be started again. The flag
    is now reset and the original exception re-raised to the caller.
    """
    global _crawler_instance, _crawler_task
    state = get_crawler_state()
    if state.is_running:
        return False
    state.is_running = True
    state.documents_crawled_today = 0
    state.documents_indexed_today = 0
    state.errors_today = 0
    _crawler_instance = ZeugnisCrawler()
    try:
        await _crawler_instance.init()
    except Exception:
        # Undo the running flag so a later start attempt is not blocked.
        state.is_running = False
        _crawler_instance = None
        raise

    async def run_crawler():
        try:
            from metrics_db import get_pool
            pool = await get_pool()
            if pool:
                async with pool.acquire() as conn:
                    # Pick the sources to crawl: explicit id > bundesland > all.
                    if source_id:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE id = $1",
                            source_id
                        )
                    elif bundesland:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources WHERE bundesland = $1",
                            bundesland
                        )
                    else:
                        sources = await conn.fetch(
                            "SELECT id, bundesland FROM zeugnis_sources ORDER BY bundesland"
                        )
                    for source in sources:
                        if not state.is_running:
                            break  # stop_crawler() cleared the flag
                        await _crawler_instance.crawl_source(source["id"])
        except Exception as e:
            print(f"Crawler error: {e}")
        finally:
            state.is_running = False
            if _crawler_instance:
                await _crawler_instance.close()

    _crawler_task = asyncio.create_task(run_crawler())
    return True
async def stop_crawler() -> bool:
    """Request a crawler stop and wait for the background task to wind down.

    Returns False when no crawl is running, True after the task has finished
    (or its cancellation has been observed).
    """
    global _crawler_task
    state = get_crawler_state()
    if not state.is_running:
        return False
    # Clear the flag first so the worker loop exits between sources,
    # then cancel the task outright and await its termination.
    state.is_running = False
    task = _crawler_task
    if task is not None:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    return True
def get_crawler_status() -> Dict[str, Any]:
    """Snapshot the shared crawler state as a JSON-serializable dict."""
    state = get_crawler_state()
    last_seen = state.last_activity
    return {
        "is_running": state.is_running,
        "current_source": state.current_source_id,
        "current_bundesland": state.current_bundesland,
        "queue_length": len(state.queue),
        "documents_crawled_today": state.documents_crawled_today,
        "documents_indexed_today": state.documents_indexed_today,
        "errors_today": state.errors_today,
        "last_activity": last_seen.isoformat() if last_seen else None,
    }

View File

@@ -0,0 +1,26 @@
"""
Zeugnis Rights-Aware Crawler
Barrel re-export: all public symbols for backward compatibility.
"""
from .text import ( # noqa: F401
extract_text_from_pdf,
extract_text_from_html,
chunk_text,
compute_hash,
)
from .storage import ( # noqa: F401
generate_embeddings,
upload_to_minio,
index_in_qdrant,
)
from .worker import ( # noqa: F401
CrawlerState,
ZeugnisCrawler,
)
from .control import ( # noqa: F401
start_crawler,
stop_crawler,
get_crawler_status,
)

View File

@@ -0,0 +1,340 @@
"""
Zeugnis Rights-Aware Crawler - Data Models
Pydantic models for API requests/responses and internal data structures.
Database schema is defined in metrics_db.py.
"""
from datetime import datetime
from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
import uuid
# =============================================================================
# Enums
# =============================================================================
class LicenseType(str, Enum):
    """License classification for training permission.

    Values commented "NO TRAINING" must not be used for model training;
    the per-Bundesland mapping lives in TRAINING_PERMISSIONS below.
    """
    PUBLIC_DOMAIN = "public_domain"        # Amtliche Werke (§5 UrhG)
    CC_BY = "cc_by"                        # Creative Commons Attribution
    CC_BY_SA = "cc_by_sa"                  # CC Attribution-ShareAlike
    CC_BY_NC = "cc_by_nc"                  # CC NonCommercial - NO TRAINING
    CC_BY_NC_SA = "cc_by_nc_sa"            # CC NC-SA - NO TRAINING
    GOV_STATUTE_FREE_USE = "gov_statute"   # Government statutes (gemeinfrei)
    ALL_RIGHTS_RESERVED = "all_rights"     # Standard copyright - NO TRAINING
    UNKNOWN_REQUIRES_REVIEW = "unknown"    # Needs manual review
class CrawlStatus(str, Enum):
    """Status of a crawl job or seed URL."""
    PENDING = "pending"      # queued / created, not yet processed
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"
class DocType(str, Enum):
    """Type of zeugnis document (German administrative categories)."""
    VERORDNUNG = "verordnung"        # Official regulation
    HANDREICHUNG = "handreichung"    # Implementation guide
    FORMULAR = "formular"            # Form template
    ERLASS = "erlass"                # Decree
    SCHULORDNUNG = "schulordnung"    # School regulations
    SONSTIGES = "sonstiges"          # Other
class EventType(str, Enum):
    """Audit event types recorded in the zeugnis_usage_events table."""
    CRAWLED = "crawled"
    INDEXED = "indexed"
    DOWNLOADED = "downloaded"
    VIEWED = "viewed"        # logged when a document detail is fetched via the API
    EXPORTED = "exported"
    TRAINED_ON = "trained_on"
    DELETED = "deleted"
# =============================================================================
# Bundesland Definitions
# =============================================================================
# All 16 German federal states, keyed by lowercase two-letter code.
BUNDESLAENDER = {
    "bw": {"name": "Baden-Württemberg", "short": "BW"},
    "by": {"name": "Bayern", "short": "BY"},
    "be": {"name": "Berlin", "short": "BE"},
    "bb": {"name": "Brandenburg", "short": "BB"},
    "hb": {"name": "Bremen", "short": "HB"},
    "hh": {"name": "Hamburg", "short": "HH"},
    "he": {"name": "Hessen", "short": "HE"},
    "mv": {"name": "Mecklenburg-Vorpommern", "short": "MV"},
    "ni": {"name": "Niedersachsen", "short": "NI"},
    "nw": {"name": "Nordrhein-Westfalen", "short": "NW"},
    "rp": {"name": "Rheinland-Pfalz", "short": "RP"},
    "sl": {"name": "Saarland", "short": "SL"},
    "sn": {"name": "Sachsen", "short": "SN"},
    "st": {"name": "Sachsen-Anhalt", "short": "ST"},
    "sh": {"name": "Schleswig-Holstein", "short": "SH"},
    "th": {"name": "Thüringen", "short": "TH"},
}
# Training permission based on Word document analysis
# (presumably "Bundesland URL Zeugnisse.docx" — see the seed data module).
# True  => Amtliches Werk (§5 UrhG official work) — training allowed.
# False => no license stated, or "Eingeschränkt" treated as not allowed for safety.
TRAINING_PERMISSIONS = {
    "bw": True,   # Amtliches Werk
    "by": True,   # Amtliches Werk
    "be": False,  # Keine Lizenz
    "bb": False,  # Keine Lizenz
    "hb": False,  # Eingeschränkt -> False for safety
    "hh": False,  # Keine Lizenz
    "he": True,   # Amtliches Werk
    "mv": False,  # Eingeschränkt -> False for safety
    "ni": True,   # Amtliches Werk
    "nw": True,   # Amtliches Werk
    "rp": True,   # Amtliches Werk
    "sl": False,  # Keine Lizenz
    "sn": True,   # Amtliches Werk
    "st": False,  # Eingeschränkt -> False for safety
    "sh": True,   # Amtliches Werk
    "th": True,   # Amtliches Werk
}
# =============================================================================
# API Models - Sources
# =============================================================================
class ZeugnisSourceBase(BaseModel):
    """Base model for a zeugnis source (fields shared by create/read models)."""
    bundesland: str = Field(..., description="Bundesland code (e.g., 'ni', 'by')")
    name: str = Field(..., description="Full name of the source")
    base_url: Optional[str] = Field(None, description="Base URL for the source")
    license_type: LicenseType = Field(..., description="License classification")
    training_allowed: bool = Field(False, description="Whether AI training is permitted")
class ZeugnisSourceCreate(ZeugnisSourceBase):
    """Model for creating a new source (no fields beyond the base)."""
    pass
class ZeugnisSource(ZeugnisSourceBase):
    """Full source model with all fields as stored in the database."""
    id: str
    verified_by: Optional[str] = None      # reviewer's user id, set on verification
    verified_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class ZeugnisSourceVerify(BaseModel):
    """Payload for recording a manual license verification of a source."""
    verified_by: str = Field(..., description="User ID who verified")
    license_type: LicenseType
    training_allowed: bool
    notes: Optional[str] = None  # free-form reviewer remarks
# =============================================================================
# API Models - Seed URLs
# =============================================================================
class SeedUrlBase(BaseModel):
    """Base model for a crawler seed URL."""
    url: str = Field(..., description="URL to crawl")
    doc_type: DocType = Field(DocType.VERORDNUNG, description="Type of document")
class SeedUrlCreate(SeedUrlBase):
    """Model for creating a new seed URL under an existing source."""
    source_id: str
class SeedUrl(SeedUrlBase):
    """Full seed URL model as stored in zeugnis_seed_urls."""
    id: str
    source_id: str
    status: CrawlStatus = CrawlStatus.PENDING
    last_crawled: Optional[datetime] = None  # None until first crawl
    error_message: Optional[str] = None      # last crawl error, if any
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
# =============================================================================
# API Models - Documents
# =============================================================================
class ZeugnisDocumentBase(BaseModel):
    """Base model for a crawled zeugnis document."""
    title: Optional[str] = None
    url: str
    content_type: Optional[str] = None  # MIME type as reported by the server
    file_size: Optional[int] = None     # size in bytes — TODO confirm unit
class ZeugnisDocument(ZeugnisDocumentBase):
    """Full document model, including storage/index state and source metadata."""
    id: str
    seed_url_id: str
    content_hash: Optional[str] = None   # used for change/version detection
    minio_path: Optional[str] = None     # object-storage location of the raw file
    training_allowed: bool = False
    indexed_in_qdrant: bool = False
    bundesland: Optional[str] = None     # denormalized from the joined source
    source_name: Optional[str] = None    # denormalized from the joined source
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class ZeugnisDocumentVersion(BaseModel):
    """Snapshot of a document revision, for version-history tracking."""
    id: str
    document_id: str
    version: int                          # monotonically listed, newest first in the API
    content_hash: str
    minio_path: Optional[str] = None
    change_summary: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
# =============================================================================
# API Models - Crawler
# =============================================================================
class CrawlerStatus(BaseModel):
    """Current status of the crawler (mirrors the dict from get_crawler_status)."""
    is_running: bool = False
    current_source: Optional[str] = None
    current_bundesland: Optional[str] = None
    queue_length: int = 0
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    last_activity: Optional[datetime] = None
    errors_today: int = 0
class CrawlQueueItem(BaseModel):
    """One entry in the zeugnis_crawler_queue table."""
    id: str
    source_id: str
    bundesland: str
    source_name: str
    priority: int = 5                         # 1 = lowest, 10 = highest (see CrawlRequest)
    status: CrawlStatus = CrawlStatus.PENDING
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    documents_found: int = 0
    documents_indexed: int = 0
    error_count: int = 0
    created_at: datetime
class CrawlRequest(BaseModel):
    """Request to start a crawl; source_id takes precedence over bundesland."""
    bundesland: Optional[str] = Field(None, description="Specific Bundesland to crawl")
    source_id: Optional[str] = Field(None, description="Specific source ID to crawl")
    priority: int = Field(5, ge=1, le=10, description="Priority (1=lowest, 10=highest)")
class CrawlResult(BaseModel):
    """Outcome summary of a single crawl operation."""
    source_id: str
    bundesland: str
    documents_found: int
    documents_indexed: int
    documents_skipped: int
    errors: List[str]           # human-readable error messages collected during the run
    duration_seconds: float
# =============================================================================
# API Models - Statistics
# =============================================================================
class ZeugnisStats(BaseModel):
    """Aggregate statistics for the zeugnis crawler."""
    total_sources: int = 0
    total_documents: int = 0
    indexed_documents: int = 0
    training_allowed_documents: int = 0
    active_crawls: int = 0
    # NOTE: mutable default is safe here — pydantic copies field defaults per instance.
    per_bundesland: List[Dict[str, Any]] = []
class BundeslandStats(BaseModel):
    """Per-Bundesland document/index statistics."""
    bundesland: str            # two-letter code (see BUNDESLAENDER)
    name: str
    training_allowed: bool
    document_count: int
    indexed_count: int
    last_crawled: Optional[datetime] = None
# =============================================================================
# API Models - Audit
# =============================================================================
class UsageEvent(BaseModel):
    """One audit-trail entry from zeugnis_usage_events."""
    id: str
    document_id: str
    event_type: EventType
    user_id: Optional[str] = None            # None for system-generated events
    details: Optional[Dict[str, Any]] = None
    created_at: datetime

    class Config:
        # Allow construction from ORM/row objects via attribute access.
        from_attributes = True
class AuditExport(BaseModel):
    """GDPR-compliant audit export (shape of the /audit/export response)."""
    export_date: datetime
    requested_by: str
    events: List[UsageEvent]
    document_count: int        # distinct documents referenced in the export window
    date_range_start: datetime
    date_range_end: datetime
# =============================================================================
# Helper Functions
# =============================================================================
def generate_id() -> str:
    """Return a fresh random UUID4 in canonical 36-character string form."""
    return f"{uuid.uuid4()}"
def get_training_allowed(bundesland: str) -> bool:
    """Return the Bundesland's training permission; unknown codes default to False."""
    key = bundesland.lower()
    return TRAINING_PERMISSIONS.get(key, False)
def get_bundesland_name(code: str) -> str:
    """Resolve a Bundesland code to its full name, echoing the code when unknown."""
    return BUNDESLAENDER.get(code.lower(), {}).get("name", code)
def get_license_for_bundesland(bundesland: str) -> LicenseType:
    """Map a Bundesland's training permission onto a license class.

    Training-allowed states are treated as free-use government statutes;
    everything else requires manual license review.
    """
    allowed = TRAINING_PERMISSIONS.get(bundesland.lower(), False)
    return LicenseType.GOV_STATUTE_FREE_USE if allowed else LicenseType.UNKNOWN_REQUIRES_REVIEW

View File

@@ -0,0 +1,415 @@
"""
Zeugnis Seed Data - Initial URLs from Word Document
Contains seed URLs for all 16 German federal states (Bundesländer)
based on the "Bundesland URL Zeugnisse.docx" document.
Training permissions:
- Ja: Amtliches Werk (§5 UrhG) - training allowed
- Nein: Keine Lizenz angegeben - training NOT allowed
- Eingeschränkt: Treated as NOT allowed for safety
"""
from typing import Dict, List, Any
# Seed data structure: bundesland -> list of seed URLs
# Schema per entry:
#   name             — full Bundesland name
#   license          — "gov_statute" (amtliches Werk, §5 UrhG) or "unknown"
#   training_allowed — bool; "Eingeschränkt" states are False for safety
#   base_url         — portal root of the state's legal-document site
#   urls             — list of {url, doc_type, title} seed documents
SEED_DATA: Dict[str, Dict[str, Any]] = {
    "bw": {
        "name": "Baden-Württemberg",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.landesrecht-bw.de",
        "urls": [
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cru/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGBWpP5&doc.part=X&doc.price=0.0&doc.hl=1",
                "doc_type": "verordnung",
                "title": "Schulgesetz BW - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-bw.de/jportal/portal/t/cs9/page/bsbawueprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-NotenBildVBW2016rahmen&doc.part=X&doc.price=0.0",
                "doc_type": "verordnung",
                "title": "Notenbildungsverordnung"
            }
        ]
    },
    "by": {
        "name": "Bayern",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-bayern.de",
        "urls": [
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BaySchO2016",
                "doc_type": "schulordnung",
                "title": "Bayerische Schulordnung"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayGSO",
                "doc_type": "schulordnung",
                "title": "Grundschulordnung Bayern"
            },
            {
                "url": "https://www.gesetze-bayern.de/Content/Document/BayVSO",
                "doc_type": "schulordnung",
                "title": "Volksschulordnung Bayern"
            }
        ]
    },
    "be": {
        "name": "Berlin",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://gesetze.berlin.de",
        "urls": [
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SchulGBEpP58",
                "doc_type": "verordnung",
                "title": "Berliner Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://gesetze.berlin.de/bsbe/document/jlr-SekIVBE2010rahmen",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I-Verordnung"
            }
        ]
    },
    "bb": {
        "name": "Brandenburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://bravors.brandenburg.de",
        "urls": [
            {
                "url": "https://bravors.brandenburg.de/verordnungen/vvzeugnis",
                "doc_type": "verordnung",
                "title": "Verwaltungsvorschriften Zeugnisse"
            },
            {
                "url": "https://bravors.brandenburg.de/verordnungen/gostv",
                "doc_type": "verordnung",
                "title": "GOST-Verordnung Brandenburg"
            }
        ]
    },
    "hb": {
        "name": "Bremen",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.transparenz.bremen.de",
        "urls": [
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/bremisches-schulgesetz-bremschg-vom-28-juni-2005-121009",
                "doc_type": "verordnung",
                "title": "Bremisches Schulgesetz"
            },
            {
                "url": "https://www.transparenz.bremen.de/metainformationen/verordnung-ueber-die-sekundarstufe-i-der-oberschule-vom-20-juni-2017-130380",
                "doc_type": "verordnung",
                "title": "Sekundarstufe I Verordnung Bremen"
            }
        ]
    },
    "hh": {
        "name": "Hamburg",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://www.landesrecht-hamburg.de",
        "urls": [
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-SchulGHA2009pP44",
                "doc_type": "verordnung",
                "title": "Hamburgisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-hamburg.de/bsha/document/jlr-AusglLeistVHA2011rahmen",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung"
            }
        ]
    },
    "he": {
        "name": "Hessen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.rv.hessenrecht.hessen.de",
        "urls": [
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-SchulGHE2017pP73",
                "doc_type": "verordnung",
                "title": "Hessisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.rv.hessenrecht.hessen.de/bshe/document/jlr-VOBGM11HE2011rahmen",
                "doc_type": "verordnung",
                "title": "Verordnung zur Gestaltung des Schulverhältnisses"
            }
        ]
    },
    "mv": {
        "name": "Mecklenburg-Vorpommern",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht-mv.de",
        "urls": [
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-SchulGMV2010pP63",
                "doc_type": "verordnung",
                "title": "Schulgesetz MV - Zeugnisse"
            },
            {
                "url": "https://www.landesrecht-mv.de/bsmv/document/jlr-ZeugnVMVrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung MV"
            }
        ]
    },
    "ni": {
        "name": "Niedersachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.nds-voris.de",
        "urls": [
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGNDpP59",
                "doc_type": "verordnung",
                "title": "Niedersächsisches Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://www.nds-voris.de/jportal/portal/t/1gxi/page/bsvorisprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ErgZeugnErlNDrahmen",
                "doc_type": "erlass",
                "title": "Ergänzende Bestimmungen für Zeugnisse"
            },
            {
                "url": "https://www.mk.niedersachsen.de/startseite/schule/unsere_schulen/allgemein_bildende_schulen/zeugnisse_versetzungen/zeugnisse-und-versetzungen-6351.html",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NI"
            }
        ]
    },
    "nw": {
        "name": "Nordrhein-Westfalen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://recht.nrw.de",
        "urls": [
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000521",
                "doc_type": "verordnung",
                "title": "Schulgesetz NRW"
            },
            {
                "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000525",
                "doc_type": "verordnung",
                "title": "Ausbildungs- und Prüfungsordnung Sek I"
            },
            {
                "url": "https://www.schulministerium.nrw/zeugnisse",
                "doc_type": "handreichung",
                "title": "Handreichung Zeugnisse NRW"
            }
        ]
    },
    "rp": {
        "name": "Rheinland-Pfalz",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.rlp.de",
        "urls": [
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-SchulGRPpP61",
                "doc_type": "verordnung",
                "title": "Schulgesetz RP - Zeugnisse"
            },
            {
                "url": "https://landesrecht.rlp.de/bsrp/document/jlr-ZeugnVRPrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung RP"
            }
        ]
    },
    "sl": {
        "name": "Saarland",
        "license": "unknown",
        "training_allowed": False,
        "base_url": "https://recht.saarland.de",
        "urls": [
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-SchulOGSLrahmen",
                "doc_type": "schulordnung",
                "title": "Schulordnungsgesetz Saarland"
            },
            {
                "url": "https://recht.saarland.de/bssl/document/jlr-ZeugnVSL2014rahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung Saarland"
            }
        ]
    },
    "sn": {
        "name": "Sachsen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.revosax.sachsen.de",
        "urls": [
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/4192-Schulgesetz-fuer-den-Freistaat-Sachsen",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen"
            },
            {
                "url": "https://www.revosax.sachsen.de/vorschrift/13500-Schulordnung-Gymnasien-Abiturpruefung",
                "doc_type": "schulordnung",
                "title": "Schulordnung Gymnasien Sachsen"
            }
        ]
    },
    "st": {
        "name": "Sachsen-Anhalt",
        "license": "unknown",
        "training_allowed": False,  # Eingeschränkt -> False for safety
        "base_url": "https://www.landesrecht.sachsen-anhalt.de",
        "urls": [
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-SchulGSTpP27",
                "doc_type": "verordnung",
                "title": "Schulgesetz Sachsen-Anhalt"
            },
            {
                "url": "https://www.landesrecht.sachsen-anhalt.de/bsst/document/jlr-VersetzVST2017rahmen",
                "doc_type": "verordnung",
                "title": "Versetzungsverordnung ST"
            }
        ]
    },
    "sh": {
        "name": "Schleswig-Holstein",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://www.gesetze-rechtsprechung.sh.juris.de",
        "urls": [
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-SchulGSHpP22",
                "doc_type": "verordnung",
                "title": "Schulgesetz SH - Zeugnisse"
            },
            {
                "url": "https://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/10wx/page/bsshoprod.psml?pid=Dokumentanzeige&showdoccase=1&js_peid=Trefferliste&documentnumber=1&numberofresults=1&fromdoctodoc=yes&doc.id=jlr-ZeugnVSHrahmen",
                "doc_type": "verordnung",
                "title": "Zeugnisverordnung SH"
            }
        ]
    },
    "th": {
        "name": "Thüringen",
        "license": "gov_statute",
        "training_allowed": True,
        "base_url": "https://landesrecht.thueringen.de",
        "urls": [
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulGTHpP58",
                "doc_type": "verordnung",
                "title": "Thüringer Schulgesetz - Zeugnisse"
            },
            {
                "url": "https://landesrecht.thueringen.de/bsth/document/jlr-SchulOTH2018rahmen",
                "doc_type": "schulordnung",
                "title": "Thüringer Schulordnung"
            }
        ]
    }
}
async def populate_seed_data():
    """Insert or refresh all SEED_DATA sources and their seed URLs.

    Returns True on success, False when the database pool is unavailable
    or an error occurred. Safe to re-run: sources are upserted and URL
    inserts use ON CONFLICT DO NOTHING.
    """
    from metrics_db import get_pool, upsert_zeugnis_source
    from zeugnis_models import generate_id
    pool = await get_pool()
    if not pool:
        print("Database not available")
        return False
    try:
        async with pool.acquire() as conn:
            for bundesland, data in SEED_DATA.items():
                # Upsert keeps existing rows; the fresh id is only used
                # when the source does not exist yet.
                source_id = generate_id()
                await upsert_zeugnis_source(
                    id=source_id,
                    bundesland=bundesland,
                    name=data["name"],
                    license_type=data["license"],
                    training_allowed=data["training_allowed"],
                    base_url=data.get("base_url"),
                )
                # The source may already have existed — use its real id.
                row = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    bundesland
                )
                if row:
                    source_id = row["id"]
                urls = data.get("urls", [])
                for url_data in urls:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
                        VALUES ($1, $2, $3, $4, 'pending')
                        ON CONFLICT DO NOTHING
                        """,
                        generate_id(), source_id, url_data["url"], url_data["doc_type"]
                    )
                print(f"Populated {bundesland}: {len(urls)} URLs")
        print("Seed data population complete!")
        return True
    except Exception as e:
        print(f"Failed to populate seed data: {e}")
        return False
def get_training_summary() -> Dict[str, List[str]]:
    """Summarize which Bundesländer allow AI training on their documents.

    Returns a dict with sorted display labels ("Name (code)") for both
    groups plus the respective counts.
    """
    buckets = {True: [], False: []}
    for code, entry in SEED_DATA.items():
        label = f"{entry['name']} ({code})"
        buckets[bool(entry["training_allowed"])].append(label)
    return {
        "training_allowed": sorted(buckets[True]),
        "training_not_allowed": sorted(buckets[False]),
        "total_allowed": len(buckets[True]),
        "total_not_allowed": len(buckets[False]),
    }
if __name__ == "__main__":
    # CLI helper: print which Bundesländer permit AI training. The database
    # is intentionally NOT populated here — see the hint printed below.
    # (Removed an unused `import asyncio`; this block never awaits anything.)
    print("=" * 60)
    print("Zeugnis Seed Data Summary")
    print("=" * 60)
    summary = get_training_summary()
    print(f"\nTraining ALLOWED ({summary['total_allowed']} Bundesländer):")
    for bl in summary["training_allowed"]:
        print(bl)
    print(f"\nTraining NOT ALLOWED ({summary['total_not_allowed']} Bundesländer):")
    for bl in summary["training_not_allowed"]:
        print(bl)
    print("\n" + "=" * 60)
    print("To populate database, run:")
    print(" python -c 'import asyncio; from zeugnis_seed_data import populate_seed_data; asyncio.run(populate_seed_data())'")

View File

@@ -0,0 +1,180 @@
"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""
import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
# =============================================================================
# Configuration
# =============================================================================
# Qdrant vector-database endpoint used for semantic indexing.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# MinIO object storage (raw document archive); defaults are dev credentials.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# "local" = sentence-transformers on this host; "openai" = remote API.
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
# Qdrant collection holding zeugnis document chunks.
ZEUGNIS_COLLECTION = "bp_zeugnis"
# =============================================================================
# Embedding Generation
# =============================================================================
# Lazily-created SentenceTransformer singleton (local backend only).
_embedding_model = None


def get_embedding_model():
    """Return the shared local embedding model, loading it on first use.

    Only loads when EMBEDDING_BACKEND == "local". Returns None when the
    backend is remote or sentence-transformers is not installed.
    """
    global _embedding_model
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")
    return _embedding_model
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured backend.

    Returns one vector per input text. Returns [] when the input is
    empty, the backend is unavailable, or it is misconfigured.
    """
    if not texts:
        return []
    if EMBEDDING_BACKEND == "openai":
        import openai
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []
        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]
    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model is None:
            return []
        vectors = model.encode(texts, show_progress_bar=False)
        return [vec.tolist() for vec in vectors]
    return []
# =============================================================================
# MinIO Storage
# =============================================================================
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a raw document to MinIO.

    Stores *content* under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>`` (the year
    defaults to the current year) and returns the object name, or None
    on any failure (missing client library, connection error, ...).
    """
    try:
        from minio import Minio
        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )
        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)
        # Build path. BUG FIX: the filename was previously dropped from the
        # object name (a literal placeholder was baked in), so every upload
        # within the same bundesland/year overwrote the same object.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"
        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )
        return object_name
    except Exception as e:
        print(f"MinIO upload failed: {e}")
        return None
# =============================================================================
# Qdrant Indexing
# =============================================================================
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Write chunk vectors for one document into the zeugnis collection.

    Creates the collection on first use (cosine distance, dimensionality
    taken from the first embedding, default 384). Returns the number of
    points written; 0 on failure or empty input.
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct
        client = QdrantClient(url=QDRANT_URL)
        # Create the collection if this is the first ever indexing run.
        existing = {c.name for c in client.get_collections().collections}
        if ZEUGNIS_COLLECTION not in existing:
            dim = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=dim,
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")
        points = []
        for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)):
            payload = {
                "document_id": doc_id,
                "chunk_index": idx,
                "chunk_text": chunk[:500],  # short preview only, not full text
                "bundesland": metadata.get("bundesland"),
                "doc_type": metadata.get("doc_type"),
                "title": metadata.get("title"),
                "source_url": metadata.get("url"),
                "training_allowed": metadata.get("training_allowed", False),
                "indexed_at": datetime.now().isoformat(),
            }
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload=payload,
            ))
        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )
        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0

View File

@@ -0,0 +1,110 @@
"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""
import hashlib
from typing import List
# Target chunk length (characters) for RAG indexing.
CHUNK_SIZE = 1000
# Characters of the previous chunk prepended to the next one for context.
CHUNK_OVERLAP = 200
def extract_text_from_pdf(content: bytes) -> str:
    """Return the plain text of a PDF given as raw bytes.

    Page texts are joined with blank lines. Returns "" when PyPDF2 is
    missing or the document cannot be parsed.
    """
    try:
        import io
        from PyPDF2 import PdfReader
        reader = PdfReader(io.BytesIO(content))
        extracted = [page.extract_text() for page in reader.pages]
        return "\n\n".join(part for part in extracted if part)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Return the visible text of an HTML page given as raw bytes.

    Script/style/nav/header/footer elements are dropped and the result
    contains one non-empty line per text line. Returns "" on parse errors
    or when BeautifulSoup is unavailable.
    """
    try:
        from bs4 import BeautifulSoup
        markup = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(markup, "html.parser")
        # Strip boilerplate / non-content elements before extracting text.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()
        raw = soup.get_text(separator="\n", strip=True)
        stripped = (line.strip() for line in raw.splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into chunks of roughly *chunk_size* characters.

    Splitting prefers natural boundaries — paragraphs, lines, sentences,
    then words — and falls back to hard character splits when no
    separator helps. When *overlap* > 0 and more than one chunk results,
    each chunk after the first is prefixed with the last *overlap*
    characters of its predecessor, so returned chunks may exceed
    *chunk_size* by up to *overlap* characters.

    Returns [] for empty input.
    (Fix: removed a dead ``chunks = []`` assignment that was immediately
    overwritten by the ``split_recursive`` result.)
    """
    if not text:
        return []
    # Coarse-to-fine separators tried in order.
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        # Base case: the piece already fits in one chunk.
        if len(text) <= chunk_size:
            return [text] if text.strip() else []
        if sep_index >= len(separators):
            # No separators left: hard split with a built-in overlap step.
            result = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result
        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""
        # Greedily merge parts (re-inserting the separator) until a merge
        # would exceed chunk_size; recurse with finer separators on any
        # accumulated piece that is still too long.
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part
        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
        return result

    chunks = split_recursive(text)
    # Prefix each chunk (except the first) with the tail of its
    # predecessor so neighbouring chunks share retrieval context.
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                prev_end = chunks[i - 1][-overlap:]
                chunk = prev_end + chunk
            overlapped.append(chunk)
        chunks = overlapped
    return chunks
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()

View File

@@ -0,0 +1,313 @@
"""
Zeugnis Crawler - ZeugnisCrawler worker class and CrawlerState.
Crawls official government documents about school certificates from
all 16 German federal states. Only indexes documents where AI training
is legally permitted.
"""
import asyncio
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass, field
import httpx
from .models import generate_id
from .text import (
extract_text_from_pdf,
extract_text_from_html,
chunk_text,
compute_hash,
)
from .storage import (
upload_to_minio,
generate_embeddings,
index_in_qdrant,
)
# =============================================================================
# Configuration
# =============================================================================
# Maximum fetch attempts per URL before giving up.
MAX_RETRIES = 3
# Base back-off between retries; multiplied by the attempt number.
RETRY_DELAY = 5  # seconds
# Per-request HTTP timeout.
REQUEST_TIMEOUT = 30  # seconds
# Identifies the crawler to the state portals (politeness / contact hint).
USER_AGENT = "BreakPilot-Zeugnis-Crawler/1.0 (Educational Research)"
# =============================================================================
# Crawler State
# =============================================================================
@dataclass
class CrawlerState:
    """Global crawler state.

    Mutable, process-wide progress snapshot consumed by the API layer.
    Field order is part of the dataclass __init__ signature — do not
    reorder.
    """
    # True while a crawl is in progress.
    is_running: bool = False
    # Source / Bundesland currently being crawled (None when idle).
    current_source_id: Optional[str] = None
    current_bundesland: Optional[str] = None
    # Pending crawl jobs (dict shape defined by the API layer).
    queue: List[Dict] = field(default_factory=list)
    # Daily counters. NOTE(review): nothing visible here resets them at
    # midnight — confirm a reset happens elsewhere.
    documents_crawled_today: int = 0
    documents_indexed_today: int = 0
    errors_today: int = 0
    # Timestamp of the last successful crawler action.
    last_activity: Optional[datetime] = None
# Module-level singleton, mutated in place by ZeugnisCrawler methods.
_crawler_state = CrawlerState()


def get_crawler_state() -> CrawlerState:
    """Return the process-wide CrawlerState singleton."""
    return _crawler_state
# =============================================================================
# Crawler Worker
# =============================================================================
class ZeugnisCrawler:
"""Rights-aware crawler for zeugnis documents."""
def __init__(self):
self.http_client: Optional[httpx.AsyncClient] = None
self.db_pool = None
async def init(self):
"""Initialize crawler resources."""
self.http_client = httpx.AsyncClient(
timeout=REQUEST_TIMEOUT,
follow_redirects=True,
headers={"User-Agent": USER_AGENT},
)
# Initialize database connection
try:
from metrics_db import get_pool
self.db_pool = await get_pool()
except Exception as e:
print(f"Failed to get database pool: {e}")
async def close(self):
"""Close crawler resources."""
if self.http_client:
await self.http_client.aclose()
async def fetch_url(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
"""Fetch URL with retry logic."""
for attempt in range(MAX_RETRIES):
try:
response = await self.http_client.get(url)
response.raise_for_status()
content_type = response.headers.get("content-type", "")
return response.content, content_type
except httpx.HTTPStatusError as e:
print(f"HTTP error {e.response.status_code} for {url}")
if e.response.status_code == 404:
return None, None
except Exception as e:
print(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
if attempt < MAX_RETRIES - 1:
await asyncio.sleep(RETRY_DELAY * (attempt + 1))
return None, None
    async def crawl_seed_url(
        self,
        seed_url_id: str,
        url: str,
        bundesland: str,
        doc_type: str,
        training_allowed: bool,
    ) -> Dict[str, Any]:
        """Crawl a single seed URL.

        Pipeline: fetch -> extract text (PDF or HTML) -> archive raw bytes
        in MinIO -> insert a zeugnis_documents row -> and, ONLY when
        *training_allowed* is True, chunk/embed/index the text in Qdrant.
        Never raises: failures are reported via the returned dict's
        "error" field and counted in the global crawler state.
        """
        global _crawler_state
        result = {
            "seed_url_id": seed_url_id,
            "url": url,
            "success": False,
            "document_id": None,
            "indexed": False,
            "error": None,
        }
        try:
            # Fetch content (retries handled inside fetch_url).
            content, content_type = await self.fetch_url(url)
            if not content:
                result["error"] = "Failed to fetch URL"
                return result
            # Determine file type from the response header or URL suffix.
            is_pdf = "pdf" in content_type.lower() or url.lower().endswith(".pdf")
            # Extract text; the derived filename is reused as the stored title.
            if is_pdf:
                text = extract_text_from_pdf(content)
                filename = url.split("/")[-1] or f"document_{seed_url_id}.pdf"
            else:
                text = extract_text_from_html(content)
                filename = f"document_{seed_url_id}.html"
            if not text:
                result["error"] = "No text extracted"
                return result
            # Compute hash for versioning / change detection.
            content_hash = compute_hash(content)
            # Archive the raw bytes in MinIO (path is returned, or None).
            minio_path = await upload_to_minio(
                content,
                bundesland,
                filename,
                content_type=content_type or "application/octet-stream",
            )
            # Generate document ID
            doc_id = generate_id()
            # Store document metadata; tolerated to be skipped when no DB.
            if self.db_pool:
                async with self.db_pool.acquire() as conn:
                    await conn.execute(
                        """
                        INSERT INTO zeugnis_documents
                        (id, seed_url_id, title, url, content_hash, minio_path,
                         training_allowed, file_size, content_type)
                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                        ON CONFLICT DO NOTHING
                        """,
                        doc_id, seed_url_id, filename, url, content_hash,
                        minio_path, training_allowed, len(content), content_type
                    )
            result["document_id"] = doc_id
            result["success"] = True
            _crawler_state.documents_crawled_today += 1
            # Rights gate: only index for RAG when training is allowed.
            if training_allowed:
                chunks = chunk_text(text)
                if chunks:
                    embeddings = await generate_embeddings(chunks)
                    if embeddings:
                        indexed_count = await index_in_qdrant(
                            doc_id,
                            chunks,
                            embeddings,
                            {
                                "bundesland": bundesland,
                                "doc_type": doc_type,
                                "title": filename,
                                "url": url,
                                "training_allowed": True,
                            }
                        )
                        if indexed_count > 0:
                            result["indexed"] = True
                            _crawler_state.documents_indexed_today += 1
                            # Mark the stored document as indexed.
                            if self.db_pool:
                                async with self.db_pool.acquire() as conn:
                                    await conn.execute(
                                        "UPDATE zeugnis_documents SET indexed_in_qdrant = true WHERE id = $1",
                                        doc_id
                                    )
            else:
                # Document is archived but deliberately NOT indexed.
                result["indexed"] = False
                result["error"] = "Training not allowed for this source"
            _crawler_state.last_activity = datetime.now()
        except Exception as e:
            result["error"] = str(e)
            _crawler_state.errors_today += 1
        return result
    async def crawl_source(self, source_id: str) -> Dict[str, Any]:
        """Crawl all seed URLs for a source.

        Loads the source row (for bundesland and the training permission),
        then crawls every seed URL whose status is not 'completed',
        updating each URL's status ('running' -> 'completed'/'failed') as
        it goes. Returns a summary dict with counts, per-URL errors, and
        start/completion timestamps. Requires a database pool.
        """
        global _crawler_state
        result = {
            "source_id": source_id,
            "documents_found": 0,
            "documents_indexed": 0,
            "errors": [],
            "started_at": datetime.now(),
            "completed_at": None,
        }
        if not self.db_pool:
            result["errors"].append("Database not available")
            return result
        try:
            async with self.db_pool.acquire() as conn:
                # Get source info (bundesland + rights flag drive the crawl).
                source = await conn.fetchrow(
                    "SELECT * FROM zeugnis_sources WHERE id = $1",
                    source_id
                )
                if not source:
                    result["errors"].append(f"Source not found: {source_id}")
                    return result
                bundesland = source["bundesland"]
                training_allowed = source["training_allowed"]
                # Publish progress for the status API.
                _crawler_state.current_source_id = source_id
                _crawler_state.current_bundesland = bundesland
                # Get all seed URLs that still need crawling.
                seed_urls = await conn.fetch(
                    "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 AND status != 'completed'",
                    source_id
                )
                for seed_url in seed_urls:
                    # Mark as in-flight so concurrent views see progress.
                    await conn.execute(
                        "UPDATE zeugnis_seed_urls SET status = 'running' WHERE id = $1",
                        seed_url["id"]
                    )
                    # Crawl (never raises; reports errors in its result).
                    crawl_result = await self.crawl_seed_url(
                        seed_url["id"],
                        seed_url["url"],
                        bundesland,
                        seed_url["doc_type"],
                        training_allowed,
                    )
                    # Persist the outcome per seed URL.
                    if crawl_result["success"]:
                        result["documents_found"] += 1
                        if crawl_result["indexed"]:
                            result["documents_indexed"] += 1
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'completed', last_crawled = NOW() WHERE id = $1",
                            seed_url["id"]
                        )
                    else:
                        result["errors"].append(f"{seed_url['url']}: {crawl_result['error']}")
                        await conn.execute(
                            "UPDATE zeugnis_seed_urls SET status = 'failed', error_message = $2 WHERE id = $1",
                            seed_url["id"], crawl_result["error"]
                        )
                    # Politeness delay between requests to the same portal.
                    await asyncio.sleep(1)
        except Exception as e:
            result["errors"].append(str(e))
        finally:
            # Always stamp completion and clear the progress indicators,
            # even when the crawl aborted mid-way.
            result["completed_at"] = datetime.now()
            _crawler_state.current_source_id = None
            _crawler_state.current_bundesland = None
        return result