fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
39
breakpilot-compliance-sdk/services/rag-service/Dockerfile
Normal file
39
breakpilot-compliance-sdk/services/rag-service/Dockerfile
Normal file
@@ -0,0 +1,39 @@
|
||||
# Build stage: compile/install Python dependencies with build tooling
# that we do not want in the final image.
FROM python:3.11-slim AS builder

WORKDIR /app

# Install build dependencies (compilers for any wheels built from source);
# clean apt lists in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies into the user site (/root/.local) so the
# runtime stage can copy them without the build toolchain.
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# Runtime stage: slim image without compilers.
FROM python:3.11-slim

WORKDIR /app

# Copy installed packages from builder
COPY --from=builder /root/.local /root/.local
ENV PATH=/root/.local/bin:$PATH

# Copy application code
COPY . .

# Create non-root user and drop privileges.
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

# Expose port (matches the default `port` in config.py)
EXPOSE 8082

# Health check: probe the FastAPI /health endpoint via httpx (installed
# through requirements.txt).
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
    CMD python -c "import httpx; httpx.get('http://localhost:8082/health')" || exit 1

# Run
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8082"]
|
||||
34
breakpilot-compliance-sdk/services/rag-service/config.py
Normal file
34
breakpilot-compliance-sdk/services/rag-service/config.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
Configuration for RAG Service
"""

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Field names map to environment variables case-insensitively
    (e.g. ``QDRANT_URL``); a local ``.env`` file is read as a fallback.
    """

    # pydantic v2 style: the nested `class Config` is deprecated in
    # pydantic-settings 2.x (pinned 2.1.0 in requirements.txt) in favour
    # of model_config / SettingsConfigDict.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Service
    environment: str = "development"
    port: int = 8082

    # Qdrant (vector store)
    qdrant_url: str = "http://localhost:6333"
    qdrant_collection: str = "legal_documents"

    # Ollama (embeddings + LLM)
    ollama_url: str = "http://localhost:11434"
    embedding_model: str = "bge-m3"
    llm_model: str = "qwen2.5:32b"

    # Document Processing
    chunk_size: int = 512
    chunk_overlap: int = 50

    # Legal Corpus
    corpus_path: str = "./legal-corpus"
|
||||
245
breakpilot-compliance-sdk/services/rag-service/main.py
Normal file
245
breakpilot-compliance-sdk/services/rag-service/main.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""
BreakPilot Compliance SDK - RAG Service

Retrieval-Augmented Generation service for legal document search and Q&A.
"""

import os
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import structlog

from rag.search import SearchService
from rag.assistant import AssistantService
from rag.documents import DocumentService
from config import Settings

# Configure logging: ISO timestamps + JSON lines (machine-readable)
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer()
    ]
)
logger = structlog.get_logger()

# Load settings from environment / .env (see config.Settings)
settings = Settings()

# Module-level service singletons; populated by lifespan() on startup,
# so they are None only before the app begins serving requests.
search_service: Optional[SearchService] = None
assistant_service: Optional[AssistantService] = None
document_service: Optional[DocumentService] = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler."""
    global search_service, assistant_service, document_service

    logger.info("Starting RAG Service", version="0.0.1")

    # Build all three services, then warm up the vector store once.
    search_service, assistant_service, document_service = (
        SearchService(settings),
        AssistantService(settings),
        DocumentService(settings),
    )
    await search_service.initialize()

    logger.info("RAG Service ready",
                regulations=len(search_service.regulations),
                total_chunks=search_service.total_chunks)

    yield

    logger.info("Shutting down RAG Service")
|
||||
|
||||
|
||||
# FastAPI application; `lifespan` wires up the service singletons.
app = FastAPI(
    title="BreakPilot RAG Service",
    description="Legal document search and Q&A service",
    version="0.0.1",
    lifespan=lifespan
)

# CORS is wide open ("*"). NOTE(review): fine for internal tooling, but
# tighten allow_origins before exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
|
||||
# =============================================================================
# Models
# =============================================================================

class SearchRequest(BaseModel):
    """Request body for /api/v1/search."""
    query: str
    # Optional filter: restrict search to these regulation codes (e.g. "DSGVO").
    regulation_codes: Optional[List[str]] = None
    limit: int = 10
    min_score: float = 0.7


class SearchResult(BaseModel):
    """One matched chunk of legal text with provenance and similarity score."""
    content: str
    regulation_code: str
    article: Optional[str] = None
    paragraph: Optional[str] = None
    score: float
    metadata: dict = {}


class SearchResponse(BaseModel):
    """Response body for /api/v1/search."""
    query: str
    results: List[SearchResult]
    total: int


class AskRequest(BaseModel):
    """Request body for /api/v1/ask."""
    question: str
    # Extra caller-supplied context, prepended to the retrieved chunks.
    context: Optional[str] = None
    regulation_codes: Optional[List[str]] = None
    include_citations: bool = True


class Citation(BaseModel):
    """A source reference attached to an answer."""
    regulation_code: str
    article: str
    text: str
    relevance: float


class AskResponse(BaseModel):
    """Response body for /api/v1/ask."""
    question: str
    answer: str
    citations: List[Citation]
    confidence: float


class RegulationInfo(BaseModel):
    """Summary entry for /api/v1/regulations."""
    code: str
    name: str
    chunks: int
    last_updated: str
|
||||
|
||||
|
||||
# =============================================================================
# Endpoints
# =============================================================================

@app.get("/health")
async def health():
    """Health check endpoint."""
    # search_service is None until lifespan() has run.
    loaded = len(search_service.regulations) if search_service else 0
    return {
        "status": "healthy",
        "service": "rag-service",
        "version": "0.0.1",
        "regulations": loaded,
    }
|
||||
|
||||
|
||||
@app.post("/api/v1/search", response_model=SearchResponse)
async def search(request: SearchRequest):
    """Perform semantic search across legal documents."""
    try:
        hits = await search_service.search(
            query=request.query,
            regulation_codes=request.regulation_codes,
            limit=request.limit,
            min_score=request.min_score,
        )
        matches = [SearchResult(**hit) for hit in hits]
        return SearchResponse(
            query=request.query,
            results=matches,
            total=len(matches),
        )
    except Exception as e:
        # Any failure (Qdrant, embedding, validation) surfaces as a 500.
        logger.error("Search failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/api/v1/ask", response_model=AskRequest.__name__ and AskResponse)
async def ask(request: AskRequest):
    """Ask a question about legal requirements."""
    try:
        rag_result = await assistant_service.ask(
            question=request.question,
            context=request.context,
            regulation_codes=request.regulation_codes,
            include_citations=request.include_citations,
        )
        cites = [Citation(**c) for c in rag_result.get("citations", [])]
        return AskResponse(
            question=request.question,
            answer=rag_result["answer"],
            citations=cites,
            confidence=rag_result.get("confidence", 0.9),
        )
    except Exception as e:
        # LLM or retrieval failures surface as a 500 with the error text.
        logger.error("Ask failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/api/v1/regulations", response_model=List[RegulationInfo])
async def get_regulations():
    """Get list of available regulations."""
    # Thin delegation to the search service's in-memory metadata.
    regulations = search_service.get_regulations()
    return regulations
|
||||
|
||||
|
||||
@app.get("/api/v1/regulations/{code}")
async def get_regulation(code: str):
    """Get details of a specific regulation."""
    info = search_service.get_regulation(code)
    if not info:
        # Unknown code -> 404 rather than an empty body.
        raise HTTPException(status_code=404, detail="Regulation not found")
    return info
|
||||
|
||||
|
||||
@app.post("/api/v1/documents")
async def upload_document(
    file: UploadFile = File(...),
    regulation_code: Optional[str] = None
):
    """Upload a custom document for indexing."""
    try:
        outcome = await document_service.process_upload(
            file=file,
            regulation_code=regulation_code,
        )
        # Echo back id/chunk count so the caller can track the document.
        return {
            "id": outcome["id"],
            "filename": file.filename,
            "chunks": outcome["chunks"],
            "status": "INDEXED",
        }
    except Exception as e:
        logger.error("Document upload failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.delete("/api/v1/documents/{document_id}")
async def delete_document(document_id: str):
    """Delete a custom document."""
    try:
        await document_service.delete(document_id)
    except Exception as e:
        logger.error("Document deletion failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
    else:
        return {"status": "deleted", "id": document_id}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn

    # Hot reload everywhere except production.
    dev_reload = os.getenv("ENVIRONMENT") != "production"
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", "8082")),
        reload=dev_reload,
    )
|
||||
@@ -0,0 +1,7 @@
|
||||
"""RAG module for BreakPilot Compliance SDK."""

from .search import SearchService
from .assistant import AssistantService
from .documents import DocumentService

# Public API of the rag package.
__all__ = ["SearchService", "AssistantService", "DocumentService"]
|
||||
139
breakpilot-compliance-sdk/services/rag-service/rag/assistant.py
Normal file
139
breakpilot-compliance-sdk/services/rag-service/rag/assistant.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""
Assistant Service for RAG

Handles Q&A using LLM with retrieved context.
"""

import httpx
from typing import List, Optional, Dict, Any
import structlog

from .search import SearchService

logger = structlog.get_logger()


# System prompt sent to the LLM (German, runtime string — do not alter):
# act as a data-protection/compliance law expert, answer from the supplied
# legal texts, always cite articles/paragraphs, reply in German, and state
# uncertainty explicitly.
SYSTEM_PROMPT = """Du bist ein Experte für Datenschutz- und Compliance-Recht.
Beantworte Fragen basierend auf den bereitgestellten Rechtstexten.
Zitiere immer die relevanten Artikel und Paragraphen.
Antworte auf Deutsch.
Wenn du dir nicht sicher bist, sage das klar.
"""
|
||||
|
||||
|
||||
class AssistantService:
    """Service for legal Q&A using RAG.

    Retrieves relevant legal-text chunks via SearchService, builds a German
    prompt from them, and queries the Ollama LLM for an answer plus citations.
    """

    def __init__(self, settings, search_service=None):
        """
        Args:
            settings: Application settings (Ollama URL, model names, ...).
            search_service: Optional pre-initialized SearchService to reuse.
                Fix: previously a private SearchService was always created
                here, and its ``initialize()`` was never awaited, so its
                vector store could be empty. Passing the shared, initialized
                instance avoids that; omitting it preserves old behavior.
        """
        self.settings = settings
        self.search_service = search_service if search_service is not None else SearchService(settings)

    async def ask(
        self,
        question: str,
        context: Optional[str] = None,
        regulation_codes: Optional[List[str]] = None,
        include_citations: bool = True
    ) -> Dict[str, Any]:
        """Answer a legal question using RAG.

        Args:
            question: The user's legal question.
            context: Optional caller-supplied context, prepended to retrieval.
            regulation_codes: Optional filter restricting retrieval.
            include_citations: Whether to return per-chunk citations.

        Returns:
            Dict with "answer" (str), "citations" (list of dicts matching the
            Citation model) and "confidence" (float, capped at 1.0).
        """

        # Search for relevant context
        search_results = await self.search_service.search(
            query=question,
            regulation_codes=regulation_codes,
            limit=5,
            min_score=0.6
        )

        # Build context from search results
        retrieved_context = "\n\n".join([
            f"[{r['regulation_code']} Art. {r['article']}]: {r['content']}"
            for r in search_results
        ])

        # Add user-provided context if any
        if context:
            retrieved_context = f"{context}\n\n{retrieved_context}"

        # Build prompt (German runtime string — kept verbatim)
        prompt = f"""Kontext aus Rechtstexten:
{retrieved_context}

Frage: {question}

Beantworte die Frage basierend auf dem Kontext. Zitiere relevante Artikel."""

        # Generate answer
        answer = await self._generate_response(prompt)

        # Extract citations. Fix: the payload may contain "article" with a
        # literal None value; Citation.article is a plain str, so coerce
        # None (and missing) to "" — .get(key, "") alone does not cover the
        # key-present-but-None case.
        citations = []
        if include_citations:
            for result in search_results:
                citations.append({
                    "regulation_code": result["regulation_code"],
                    "article": result.get("article") or "",
                    "text": result["content"][:200] + "...",
                    "relevance": result["score"]
                })

        return {
            "answer": answer,
            "citations": citations,
            "confidence": self._calculate_confidence(search_results)
        }

    async def _generate_response(self, prompt: str) -> str:
        """Generate a response via Ollama /api/generate.

        Falls back to a static German notice when the LLM times out or fails.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.settings.ollama_url}/api/generate",
                    json={
                        "model": self.settings.llm_model,
                        "prompt": prompt,
                        "system": SYSTEM_PROMPT,
                        "stream": False,
                        # Low temperature: legal answers should be conservative.
                        "options": {
                            "temperature": 0.3,
                            "top_p": 0.9
                        }
                    },
                    timeout=120.0
                )
                response.raise_for_status()
                return response.json()["response"]
        except httpx.TimeoutException:
            logger.error("LLM request timed out")
            return "Die Anfrage hat zu lange gedauert. Bitte versuchen Sie es erneut."
        except Exception as e:
            logger.error("LLM generation failed", error=str(e))
            # Return fallback response
            return self._generate_fallback_response(prompt)

    def _generate_fallback_response(self, prompt: str) -> str:
        """Generate a fallback response without LLM (static German notice)."""
        return """Basierend auf den verfügbaren Rechtstexten:

Die relevanten Regelungen finden sich in den zitierten Artikeln.
Für eine detaillierte rechtliche Bewertung empfehle ich die Konsultation
der vollständigen Gesetzestexte oder eines Rechtsbeistands.

Hinweis: Dies ist eine automatisch generierte Antwort.
Der LLM-Dienst war nicht verfügbar."""

    def _calculate_confidence(self, search_results: List[Dict]) -> float:
        """Calculate confidence in [0, 1] from retrieval scores.

        No results -> 0.3; otherwise the mean score, boosted by 10% when at
        least 3 chunks were retrieved (more support), damped by 10% otherwise,
        and capped at 1.0.
        """
        if not search_results:
            return 0.3

        # Average relevance score
        avg_score = sum(r["score"] for r in search_results) / len(search_results)

        # Adjust based on number of results
        if len(search_results) >= 3:
            confidence = avg_score * 1.1
        else:
            confidence = avg_score * 0.9

        return min(confidence, 1.0)
|
||||
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""
Document Service for RAG

Handles document upload, processing, and indexing.
"""

import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog

logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DocumentService:
    """Service for document processing and indexing.

    Extracts text from uploaded files (PDF, DOCX, Markdown, or plain text),
    splits it into overlapping chunks, and records per-document metadata in
    memory. Persisting chunks to Qdrant is still a TODO (see below).
    """

    def __init__(self, settings):
        self.settings = settings
        # In-memory registry of processed documents, keyed by doc id.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Args:
            file: Uploaded file; type is dispatched on the filename extension.
            regulation_code: Regulation to associate; defaults to "CUSTOM".

        Returns:
            Dict with "id", "filename" and "chunks" (chunk count).
        """
        doc_id = str(uuid.uuid4())

        # Read file content
        content = await file.read()

        # Determine file type and extract text
        filename = file.filename or "unknown"
        if filename.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif filename.endswith(".docx"):
            text = await self._extract_docx(content)
        elif filename.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            # Fallback: treat as UTF-8 text, dropping undecodable bytes.
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text
        chunks = self._chunk_text(text)

        # Store document metadata
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF; returns "" on any extraction failure."""
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX; returns "" on any extraction failure."""
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            text = ""
            for para in doc.paragraphs:
                text += para.text + "\n"
            return text
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown (render to HTML, then strip tags).

        Falls back to a raw UTF-8 decode when rendering fails.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping chunks, preferring sentence boundaries.

        Fix: the advance step now guarantees forward progress. Previously
        ``start = end - chunk_overlap`` could fail to advance when
        ``chunk_overlap >= chunk_size`` (misconfigured settings), producing
        an infinite loop. Output is unchanged for sane configurations.
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < len(text):
                # Look for sentence end within overlap window
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]

                for sep in [". ", ".\n", "! ", "? "]:
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Advance with guaranteed progress (never move backwards/stall).
            start = max(end - chunk_overlap, start + 1)

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document's metadata; True if it existed, else False."""
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata, or None if unknown."""
        return self.documents.get(document_id)
|
||||
235
breakpilot-compliance-sdk/services/rag-service/rag/search.py
Normal file
235
breakpilot-compliance-sdk/services/rag-service/rag/search.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""
Search Service for RAG

Handles semantic search across legal documents using Qdrant and embeddings.
"""

import httpx
from typing import List, Optional, Dict, Any
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    Filter, FieldCondition, MatchValue
)
import structlog

logger = structlog.get_logger()
|
||||
|
||||
|
||||
class SearchService:
    """Service for semantic search across legal documents.

    Stores embedded chunks in a Qdrant collection; embeddings come from the
    Ollama embedding model configured in settings (bge-m3, 1024-dim).
    """

    def __init__(self, settings):
        self.settings = settings
        self.qdrant = QdrantClient(url=settings.qdrant_url)
        self.collection = settings.qdrant_collection
        # Built-in regulation metadata, keyed by code (filled in initialize()).
        self.regulations: Dict[str, Dict] = {}
        # Number of points in the Qdrant collection after initialize().
        self.total_chunks = 0

    async def initialize(self):
        """Initialize the search service and load legal corpus."""
        # Ensure collection exists
        try:
            self.qdrant.get_collection(self.collection)
            logger.info("Using existing collection", collection=self.collection)
        except Exception:
            # Create collection
            self.qdrant.create_collection(
                collection_name=self.collection,
                vectors_config=VectorParams(
                    size=1024,  # bge-m3 dimension
                    distance=Distance.COSINE
                )
            )
            logger.info("Created collection", collection=self.collection)

        # Load built-in regulations metadata
        self._load_regulations_metadata()

        # Index legal corpus if empty
        info = self.qdrant.get_collection(self.collection)
        if info.points_count == 0:
            await self._index_legal_corpus()
            # Bug fix: re-read the collection after indexing — the earlier
            # `info` snapshot was taken before indexing and still reports 0
            # points, which left total_chunks stale on first boot.
            info = self.qdrant.get_collection(self.collection)

        self.total_chunks = info.points_count

    def _load_regulations_metadata(self):
        """Load metadata for available regulations (static, built-in)."""
        self.regulations = {
            "DSGVO": {
                "code": "DSGVO",
                "name": "Datenschutz-Grundverordnung",
                "full_name": "Verordnung (EU) 2016/679",
                "effective": "2018-05-25",
                "chunks": 99,
                "articles": list(range(1, 100))
            },
            "AI_ACT": {
                "code": "AI_ACT",
                "name": "EU AI Act",
                "full_name": "Verordnung über Künstliche Intelligenz",
                "effective": "2025-02-02",
                "chunks": 85,
                "articles": list(range(1, 114))
            },
            "NIS2": {
                "code": "NIS2",
                "name": "NIS 2 Directive",
                "full_name": "Richtlinie (EU) 2022/2555",
                "effective": "2024-10-17",
                "chunks": 46,
                "articles": list(range(1, 47))
            },
            "TDDDG": {
                "code": "TDDDG",
                "name": "TDDDG",
                "full_name": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz",
                "effective": "2021-12-01",
                "chunks": 30,
                "articles": list(range(1, 31))
            },
            "BDSG": {
                "code": "BDSG",
                "name": "BDSG",
                "full_name": "Bundesdatenschutzgesetz",
                "effective": "2018-05-25",
                "chunks": 86,
                "articles": list(range(1, 87))
            }
        }

    async def _index_legal_corpus(self):
        """Index the legal corpus into Qdrant."""
        logger.info("Indexing legal corpus...")

        # Sample chunks for demonstration
        # In production, this would load actual legal documents
        sample_chunks = [
            {
                "content": "Art. 9 Abs. 1 DSGVO: Die Verarbeitung personenbezogener Daten, aus denen die rassische und ethnische Herkunft, politische Meinungen, religiöse oder weltanschauliche Überzeugungen oder die Gewerkschaftszugehörigkeit hervorgehen, sowie die Verarbeitung von genetischen Daten, biometrischen Daten zur eindeutigen Identifizierung einer natürlichen Person, Gesundheitsdaten oder Daten zum Sexualleben oder der sexuellen Orientierung einer natürlichen Person ist untersagt.",
                "regulation_code": "DSGVO",
                "article": "9",
                "paragraph": "1"
            },
            {
                "content": "Art. 6 Abs. 1 DSGVO: Die Verarbeitung ist nur rechtmäßig, wenn mindestens eine der nachstehenden Bedingungen erfüllt ist: a) Die betroffene Person hat ihre Einwilligung zu der Verarbeitung der sie betreffenden personenbezogenen Daten für einen oder mehrere bestimmte Zwecke gegeben.",
                "regulation_code": "DSGVO",
                "article": "6",
                "paragraph": "1"
            },
            {
                "content": "Art. 32 DSGVO: Unter Berücksichtigung des Stands der Technik, der Implementierungskosten und der Art, des Umfangs, der Umstände und der Zwecke der Verarbeitung sowie der unterschiedlichen Eintrittswahrscheinlichkeit und Schwere des Risikos für die Rechte und Freiheiten natürlicher Personen treffen der Verantwortliche und der Auftragsverarbeiter geeignete technische und organisatorische Maßnahmen.",
                "regulation_code": "DSGVO",
                "article": "32",
                "paragraph": "1"
            },
            {
                "content": "Art. 6 AI Act: Hochrisiko-KI-Systeme. Als Hochrisiko-KI-Systeme gelten KI-Systeme, die als Sicherheitskomponente eines Produkts oder selbst als Produkt bestimmungsgemäß verwendet werden sollen.",
                "regulation_code": "AI_ACT",
                "article": "6",
                "paragraph": "1"
            },
            {
                "content": "Art. 21 NIS2: Risikomanagementmaßnahmen im Bereich der Cybersicherheit. Die Mitgliedstaaten stellen sicher, dass wesentliche und wichtige Einrichtungen geeignete und verhältnismäßige technische, operative und organisatorische Maßnahmen ergreifen.",
                "regulation_code": "NIS2",
                "article": "21",
                "paragraph": "1"
            }
        ]

        # Generate embeddings and index
        points = []
        for i, chunk in enumerate(sample_chunks):
            embedding = await self._get_embedding(chunk["content"])
            points.append(PointStruct(
                id=i,
                vector=embedding,
                payload=chunk
            ))

        self.qdrant.upsert(
            collection_name=self.collection,
            points=points
        )

        logger.info("Indexed legal corpus", chunks=len(points))

    async def _get_embedding(self, text: str) -> List[float]:
        """Get embedding for text using Ollama; zero vector on failure."""
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.settings.ollama_url}/api/embeddings",
                    json={
                        "model": self.settings.embedding_model,
                        "prompt": text
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                return response.json()["embedding"]
        except Exception as e:
            logger.error("Embedding failed", error=str(e))
            # Return zero vector as fallback (matches the collection's 1024 dim)
            return [0.0] * 1024

    async def search(
        self,
        query: str,
        regulation_codes: Optional[List[str]] = None,
        limit: int = 10,
        min_score: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Perform semantic search.

        Args:
            query: Free-text query to embed and match.
            regulation_codes: Optional OR-filter on payload regulation_code.
            limit: Maximum number of hits.
            min_score: Minimum cosine similarity to include a hit.

        Returns:
            List of dicts with content, provenance, score, and raw payload.
        """
        # Get query embedding
        query_embedding = await self._get_embedding(query)

        # Build filter ("should" = OR across the requested codes)
        search_filter = None
        if regulation_codes:
            search_filter = Filter(
                should=[
                    FieldCondition(
                        key="regulation_code",
                        match=MatchValue(value=code)
                    )
                    for code in regulation_codes
                ]
            )

        # Search
        results = self.qdrant.search(
            collection_name=self.collection,
            query_vector=query_embedding,
            query_filter=search_filter,
            limit=limit,
            score_threshold=min_score
        )

        return [
            {
                "content": hit.payload.get("content", ""),
                "regulation_code": hit.payload.get("regulation_code", ""),
                "article": hit.payload.get("article"),
                "paragraph": hit.payload.get("paragraph"),
                "score": hit.score,
                "metadata": hit.payload
            }
            for hit in results
        ]

    def get_regulations(self) -> List[Dict]:
        """Get list of available regulations (summary view)."""
        return [
            {
                "code": reg["code"],
                "name": reg["name"],
                "chunks": reg["chunks"],
                "last_updated": reg["effective"]
            }
            for reg in self.regulations.values()
        ]

    def get_regulation(self, code: str) -> Optional[Dict]:
        """Get details of a specific regulation, or None if unknown."""
        return self.regulations.get(code)
|
||||
@@ -0,0 +1,31 @@
|
||||
# BreakPilot Compliance SDK - RAG Service Dependencies
# (pinned versions; install into the builder stage of the Dockerfile)

# Web Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
python-multipart==0.0.6

# Vector Database
qdrant-client==1.7.0

# LLM & Embeddings (Ollama is accessed over HTTP via httpx)
httpx==0.26.0
ollama==0.1.6

# Document Processing
pypdf==3.17.4
python-docx==1.1.0
beautifulsoup4==4.12.3
markdown==3.5.2

# Text Processing
tiktoken==0.5.2
langchain-text-splitters==0.0.1

# Utilities
pydantic==2.5.3
pydantic-settings==2.1.0
python-dotenv==1.0.0

# Logging & Monitoring
structlog==24.1.0
|
||||
Reference in New Issue
Block a user