fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,638 @@
"""
Qdrant Vector Database Service for BYOEH
Manages vector storage and semantic search for Erwartungshorizonte.
"""
import os
from typing import List, Dict, Optional
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.models import VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue
# Qdrant endpoint; override via the QDRANT_URL env var for docker/production.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# Default collection for BYOEH (tenant-uploaded Erwartungshorizonte).
COLLECTION_NAME = "bp_eh"
VECTOR_SIZE = 1536  # OpenAI text-embedding-3-small
# Module-level singleton, created lazily by get_qdrant_client().
_client: Optional[QdrantClient] = None
def get_qdrant_client() -> QdrantClient:
    """Return the process-wide Qdrant client, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = QdrantClient(url=QDRANT_URL)
    return _client
async def init_qdrant_collection() -> bool:
    """Create the BYOEH collection in Qdrant if it does not exist yet.

    Returns:
        True when the collection exists or was created, False when
        Qdrant could not be reached or the creation failed.
    """
    try:
        client = get_qdrant_client()
        existing = {c.name for c in client.get_collections().collections}
        if COLLECTION_NAME in existing:
            print(f"Qdrant collection {COLLECTION_NAME} already exists")
        else:
            client.create_collection(
                collection_name=COLLECTION_NAME,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=Distance.COSINE
                )
            )
            print(f"Created Qdrant collection: {COLLECTION_NAME}")
        return True
    except Exception as e:
        print(f"Failed to initialize Qdrant: {e}")
        return False
async def index_eh_chunks(
    eh_id: str,
    tenant_id: str,
    subject: str,
    chunks: List[Dict]
) -> int:
    """
    Index EH chunks in Qdrant.

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant/School ID for isolation
        subject: Subject (deutsch, englisch, etc.)
        chunks: List of {text, embedding, encrypted_content}

    Returns:
        Number of indexed chunks
    """
    import uuid

    client = get_qdrant_client()
    points = []
    for i, chunk in enumerate(chunks):
        # Qdrant only accepts unsigned integers or UUIDs as point IDs, so a
        # raw f"{eh_id}_{i}" string would be rejected on upsert.  Derive a
        # deterministic UUID from it (same uuid5 scheme as
        # QdrantService.upsert_points) and keep the readable ID in the
        # payload for traceability.
        original_id = f"{eh_id}_{i}"
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, original_id))
        points.append(
            PointStruct(
                id=point_id,
                vector=chunk["embedding"],
                payload={
                    "tenant_id": tenant_id,
                    "eh_id": eh_id,
                    "chunk_index": i,
                    "subject": subject,
                    "original_id": original_id,
                    "encrypted_content": chunk.get("encrypted_content", ""),
                    "training_allowed": False  # ALWAYS FALSE - critical for compliance
                }
            )
        )
    if points:
        client.upsert(collection_name=COLLECTION_NAME, points=points)
    return len(points)
async def search_eh(
    query_embedding: List[float],
    tenant_id: str,
    subject: Optional[str] = None,
    limit: int = 5
) -> List[Dict]:
    """
    Run a tenant-scoped semantic search over the BYOEH collection.

    Args:
        query_embedding: Query vector (1536 dimensions)
        tenant_id: Tenant ID for isolation
        subject: Optional subject filter
        limit: Max results

    Returns:
        List of matching chunks with scores
    """
    client = get_qdrant_client()

    # Tenant isolation is mandatory; the subject filter is opt-in.
    conditions = [FieldCondition(key="tenant_id", match=MatchValue(value=tenant_id))]
    if subject:
        conditions.append(FieldCondition(key="subject", match=MatchValue(value=subject)))

    hits = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding,
        query_filter=Filter(must=conditions),
        limit=limit
    )

    matches = []
    for hit in hits:
        matches.append({
            "id": str(hit.id),
            "score": hit.score,
            "eh_id": hit.payload.get("eh_id"),
            "chunk_index": hit.payload.get("chunk_index"),
            "encrypted_content": hit.payload.get("encrypted_content"),
            "subject": hit.payload.get("subject")
        })
    return matches
async def delete_eh_vectors(eh_id: str) -> int:
    """
    Delete all vectors for a specific Erwartungshorizont.

    Args:
        eh_id: Erwartungshorizont ID

    Returns:
        Number of deleted points
    """
    client = get_qdrant_client()
    eh_filter = Filter(
        must=[FieldCondition(key="eh_id", match=MatchValue(value=eh_id))]
    )
    # Count first so the caller still learns how many points were removed.
    count_result = client.count(
        collection_name=COLLECTION_NAME,
        count_filter=eh_filter
    )
    # Delete by filter instead of scroll + PointIdsList: the previous
    # scroll-based approach was capped at 1000 points and silently left
    # any remaining chunks behind for large Erwartungshorizonte.
    client.delete(
        collection_name=COLLECTION_NAME,
        points_selector=eh_filter
    )
    return count_result.count
async def get_collection_info() -> Dict:
    """Return basic statistics for the BYOEH collection, or an error dict."""
    try:
        info = get_qdrant_client().get_collection(COLLECTION_NAME)
        return {
            "name": COLLECTION_NAME,
            "vectors_count": info.vectors_count,
            "points_count": info.points_count,
            "status": info.status.value
        }
    except Exception as e:
        return {"error": str(e)}
# =============================================================================
# QdrantService Class (for NiBiS Ingestion Pipeline)
# =============================================================================
class QdrantService:
    """
    Class-based Qdrant service for flexible collection management.

    Unlike the module-level helpers, which are hard-wired to the BYOEH
    collection, instances of this class can target arbitrary collections.
    Used by nibis_ingestion.py for bulk indexing.
    """

    def __init__(self, url: Optional[str] = None):
        """
        Args:
            url: Qdrant endpoint URL; defaults to the module-level
                QDRANT_URL (env var or localhost).
        """
        # Annotation fixed: the default is None, so the parameter is Optional.
        self.url = url or QDRANT_URL
        self._client: Optional[QdrantClient] = None  # created lazily

    @property
    def client(self) -> QdrantClient:
        """Lazily instantiate and cache the underlying QdrantClient."""
        if self._client is None:
            self._client = QdrantClient(url=self.url)
        return self._client

    async def ensure_collection(self, collection_name: str, vector_size: int = VECTOR_SIZE) -> bool:
        """
        Ensure collection exists, create if needed.

        Args:
            collection_name: Name of the collection
            vector_size: Dimension of vectors

        Returns:
            True if collection exists/created, False on error
        """
        try:
            collections = self.client.get_collections().collections
            collection_names = [c.name for c in collections]
            if collection_name not in collection_names:
                self.client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=vector_size,
                        distance=Distance.COSINE
                    )
                )
                print(f"Created collection: {collection_name}")
            return True
        except Exception as e:
            print(f"Error ensuring collection: {e}")
            return False

    async def upsert_points(self, collection_name: str, points: List[Dict]) -> int:
        """
        Upsert points into collection.

        Args:
            collection_name: Target collection
            points: List of {id, vector, payload}

        Returns:
            Number of upserted points
        """
        import uuid
        if not points:
            return 0
        qdrant_points = []
        for p in points:
            # Qdrant only accepts unsigned ints or UUIDs as point IDs, so
            # map string IDs to a deterministic uuid5 (DNS namespace) and
            # preserve the original ID in the payload.
            point_id = p["id"]
            if isinstance(point_id, str):
                point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, point_id))
            qdrant_points.append(
                PointStruct(
                    id=point_id,
                    vector=p["vector"],
                    payload={**p.get("payload", {}), "original_id": p["id"]}
                )
            )
        self.client.upsert(collection_name=collection_name, points=qdrant_points)
        return len(qdrant_points)

    async def search(
        self,
        collection_name: str,
        query_vector: List[float],
        filter_conditions: Optional[Dict] = None,
        limit: int = 10
    ) -> List[Dict]:
        """
        Semantic search in collection.

        Args:
            collection_name: Collection to search
            query_vector: Query embedding
            filter_conditions: Optional equality filters ({key: value})
            limit: Max results

        Returns:
            List of matching points with scores
        """
        query_filter = None
        if filter_conditions:
            must_conditions = [
                FieldCondition(key=k, match=MatchValue(value=v))
                for k, v in filter_conditions.items()
            ]
            query_filter = Filter(must=must_conditions)
        results = self.client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            query_filter=query_filter,
            limit=limit
        )
        return [
            {
                "id": str(r.id),
                "score": r.score,
                "payload": r.payload
            }
            for r in results
        ]

    async def get_stats(self, collection_name: str) -> Dict:
        """Get collection statistics, or {"error": ...} on failure."""
        try:
            info = self.client.get_collection(collection_name)
            return {
                "name": collection_name,
                "vectors_count": info.vectors_count,
                "points_count": info.points_count,
                "status": info.status.value
            }
        except Exception as e:
            return {"error": str(e), "name": collection_name}
# =============================================================================
# NiBiS RAG Search (for Klausurkorrektur Module)
# =============================================================================
async def search_nibis_eh(
    query_embedding: List[float],
    year: Optional[int] = None,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
    limit: int = 5
) -> List[Dict]:
    """
    Search the public, pre-indexed NiBiS Erwartungshorizonte.

    Unlike search_eh(), this queries the public NiBiS collection and
    returns plaintext (not encrypted) chunks.

    Args:
        query_embedding: Query vector
        year: Optional year filter (2016, 2017, 2024, 2025)
        subject: Optional subject filter
        niveau: Optional niveau filter (eA, gA)
        limit: Max results

    Returns:
        List of matching chunks with metadata
    """
    client = get_qdrant_client()

    # Only the filters that were actually supplied become conditions.
    conditions = [
        FieldCondition(key=key, match=MatchValue(value=value))
        for key, value in (("year", year), ("subject", subject), ("niveau", niveau))
        if value
    ]
    nibis_filter = Filter(must=conditions) if conditions else None

    try:
        hits = client.search(
            collection_name="bp_nibis_eh",
            query_vector=query_embedding,
            query_filter=nibis_filter,
            limit=limit
        )
    except Exception as e:
        print(f"NiBiS search error: {e}")
        return []

    return [
        {
            "id": str(hit.id),
            "score": hit.score,
            "text": hit.payload.get("text", ""),
            "year": hit.payload.get("year"),
            "subject": hit.payload.get("subject"),
            "niveau": hit.payload.get("niveau"),
            "task_number": hit.payload.get("task_number"),
            "doc_type": hit.payload.get("doc_type"),
            "variant": hit.payload.get("variant"),
        }
        for hit in hits
    ]
# =============================================================================
# Legal Templates RAG Search (for Document Generator)
# =============================================================================
# Collection holding license-cleared legal document templates.
LEGAL_TEMPLATES_COLLECTION = "bp_legal_templates"
LEGAL_TEMPLATES_VECTOR_SIZE = 1024  # BGE-M3 embedding dimension
async def init_legal_templates_collection() -> bool:
    """Create the legal-templates collection if missing; True on success."""
    try:
        client = get_qdrant_client()
        known = {c.name for c in client.get_collections().collections}
        if LEGAL_TEMPLATES_COLLECTION in known:
            print(f"Qdrant collection {LEGAL_TEMPLATES_COLLECTION} already exists")
            return True
        client.create_collection(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            vectors_config=VectorParams(
                size=LEGAL_TEMPLATES_VECTOR_SIZE,
                distance=Distance.COSINE
            )
        )
        print(f"Created Qdrant collection: {LEGAL_TEMPLATES_COLLECTION}")
        return True
    except Exception as e:
        print(f"Failed to initialize legal templates collection: {e}")
        return False
async def search_legal_templates(
    query_embedding: List[float],
    template_type: Optional[str] = None,
    license_types: Optional[List[str]] = None,
    language: Optional[str] = None,
    jurisdiction: Optional[str] = None,
    attribution_required: Optional[bool] = None,
    limit: int = 10
) -> List[Dict]:
    """
    Search in legal templates collection for document generation.

    Args:
        query_embedding: Query vector (1024 dimensions, BGE-M3)
        template_type: Filter by template type (privacy_policy, terms_of_service, etc.)
        license_types: Filter by license types (cc0, mit, cc_by_4, etc.)
        language: Filter by language (de, en)
        jurisdiction: Filter by jurisdiction (DE, EU, US, etc.)
        attribution_required: Filter by attribution requirement
        limit: Max results

    Returns:
        List of matching template chunks with full metadata
    """
    client = get_qdrant_client()

    # AND-conditions for every supplied scalar filter.
    must_conditions = [
        FieldCondition(key=key, match=MatchValue(value=value))
        for key, value in (
            ("template_type", template_type),
            ("language", language),
            ("jurisdiction", jurisdiction),
        )
        if value
    ]
    # attribution_required is a bool, so test against None explicitly.
    if attribution_required is not None:
        must_conditions.append(
            FieldCondition(key="attribution_required", match=MatchValue(value=attribution_required))
        )

    # License types are OR-ed: a hit needs to match any one of them.
    should_conditions = [
        FieldCondition(key="license_id", match=MatchValue(value=license_type))
        for license_type in (license_types or [])
    ]

    query_filter = None
    if must_conditions or should_conditions:
        filter_args = {}
        if must_conditions:
            filter_args["must"] = must_conditions
        if should_conditions:
            filter_args["should"] = should_conditions
        query_filter = Filter(**filter_args)

    try:
        hits = client.search(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            query_vector=query_embedding,
            query_filter=query_filter,
            limit=limit
        )
    except Exception as e:
        print(f"Legal templates search error: {e}")
        return []

    # (payload key, default) pairs, in the order they appear in the result.
    payload_fields = (
        ("text", ""),
        ("document_title", None),
        ("template_type", None),
        ("clause_category", None),
        ("language", None),
        ("jurisdiction", None),
        ("license_id", None),
        ("license_name", None),
        ("license_url", None),
        ("attribution_required", None),
        ("attribution_text", None),
        ("source_name", None),
        ("source_url", None),
        ("source_repo", None),
        ("placeholders", []),
        ("is_complete_document", None),
        ("is_modular", None),
        ("requires_customization", None),
        ("output_allowed", None),
        ("modification_allowed", None),
        ("distortion_prohibited", None),
    )
    matches = []
    for hit in hits:
        record = {"id": str(hit.id), "score": hit.score}
        for key, default in payload_fields:
            record[key] = hit.payload.get(key, default)
        matches.append(record)
    return matches
def _count_legal_templates(client, field: str, values, *, keep_zero: bool = False) -> Dict:
    """Count legal-template points per value of one payload field.

    Args:
        client: Qdrant client to query with.
        field: Payload key to match on (e.g. "language").
        values: Candidate values to count.
        keep_zero: When True, include values with a count of 0
            (the language breakdown reports zeros; the others do not).

    Returns:
        Mapping of value -> point count.
    """
    counts = {}
    for value in values:
        result = client.count(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            count_filter=Filter(
                must=[FieldCondition(key=field, match=MatchValue(value=value))]
            )
        )
        if keep_zero or result.count > 0:
            counts[value] = result.count
    return counts


async def get_legal_templates_stats() -> Dict:
    """Get statistics for the legal templates collection.

    Returns:
        Collection totals plus per-template-type, per-language and
        per-license counts; {"error": ...} when Qdrant is unreachable.
    """
    try:
        client = get_qdrant_client()
        info = client.get_collection(LEGAL_TEMPLATES_COLLECTION)
        template_types = ["privacy_policy", "terms_of_service", "cookie_banner",
                          "impressum", "widerruf", "dpa", "sla", "agb"]
        licenses = ["cc0", "mit", "cc_by_4", "public_domain", "unlicense"]
        return {
            "collection": LEGAL_TEMPLATES_COLLECTION,
            "vectors_count": info.vectors_count,
            "points_count": info.points_count,
            "status": info.status.value,
            "template_types": _count_legal_templates(client, "template_type", template_types),
            # Languages intentionally include zero counts.
            "languages": _count_legal_templates(client, "language", ["de", "en"], keep_zero=True),
            "licenses": _count_legal_templates(client, "license_id", licenses),
        }
    except Exception as e:
        return {"error": str(e), "collection": LEGAL_TEMPLATES_COLLECTION}
async def delete_legal_templates_by_source(source_name: str) -> int:
    """
    Remove every legal-template chunk that came from one source.

    Args:
        source_name: Name of the source to delete

    Returns:
        Number of deleted points
    """
    client = get_qdrant_client()
    source_filter = Filter(
        must=[FieldCondition(key="source_name", match=MatchValue(value=source_name))]
    )
    # Count before deleting so the caller learns how many points matched.
    matched = client.count(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        count_filter=source_filter
    )
    client.delete(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        points_selector=source_filter
    )
    return matched.count