Initial commit: breakpilot-compliance - Compliance SDK Platform

Services: Admin-Compliance, Backend-Compliance,
AI-Compliance-SDK, Consent-SDK, Developer-Portal,
PCA-Platform, DSMS

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-11 23:47:28 +01:00
commit 4435e7ea0a
734 changed files with 251369 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
"""RAG module for BreakPilot Compliance SDK."""
from .search import SearchService
from .assistant import AssistantService
from .documents import DocumentService
__all__ = ["SearchService", "AssistantService", "DocumentService"]

View File

@@ -0,0 +1,139 @@
"""
Assistant Service for RAG
Handles Q&A using LLM with retrieved context.
"""
import httpx
from typing import List, Optional, Dict, Any
import structlog
from .search import SearchService
logger = structlog.get_logger()
SYSTEM_PROMPT = """Du bist ein Experte für Datenschutz- und Compliance-Recht.
Beantworte Fragen basierend auf den bereitgestellten Rechtstexten.
Zitiere immer die relevanten Artikel und Paragraphen.
Antworte auf Deutsch.
Wenn du dir nicht sicher bist, sage das klar.
"""
class AssistantService:
    """Service for legal Q&A using retrieval-augmented generation (RAG).

    Retrieves relevant legal passages via :class:`SearchService`, then asks
    a local Ollama LLM to answer, attaching citations for the passages used.
    """

    # Maximum number of characters of a cited passage shown in a citation.
    _CITATION_PREVIEW_LEN = 200

    def __init__(self, settings):
        self.settings = settings
        self.search_service = SearchService(settings)

    async def ask(
        self,
        question: str,
        context: Optional[str] = None,
        regulation_codes: Optional[List[str]] = None,
        include_citations: bool = True
    ) -> Dict[str, Any]:
        """Answer a legal question using RAG.

        Args:
            question: The user's legal question.
            context: Optional caller-supplied context, prepended to the
                retrieved passages.
            regulation_codes: Optional list restricting retrieval to these
                regulations (e.g. ["DSGVO"]).
            include_citations: Whether to include source citations.

        Returns:
            Dict with keys ``answer`` (str), ``citations`` (list of dicts)
            and ``confidence`` (float in [0, 1]).
        """
        # Retrieve the most relevant passages for the question.
        search_results = await self.search_service.search(
            query=question,
            regulation_codes=regulation_codes,
            limit=5,
            min_score=0.6
        )

        # Build the LLM context from the retrieved passages.
        retrieved_context = "\n\n".join(
            f"[{r['regulation_code']} Art. {r['article']}]: {r['content']}"
            for r in search_results
        )

        # Prepend user-provided context, if any.
        if context:
            retrieved_context = f"{context}\n\n{retrieved_context}"

        # Final prompt (German, matching SYSTEM_PROMPT's language).
        prompt = f"""Kontext aus Rechtstexten:
{retrieved_context}
Frage: {question}
Beantworte die Frage basierend auf dem Kontext. Zitiere relevante Artikel."""

        answer = await self._generate_response(prompt)

        citations = self._build_citations(search_results) if include_citations else []

        return {
            "answer": answer,
            "citations": citations,
            "confidence": self._calculate_confidence(search_results)
        }

    def _build_citations(self, search_results: List[Dict]) -> List[Dict[str, Any]]:
        """Build citation entries from search hits.

        Bug fix: the ellipsis is appended only when the passage was actually
        truncated (previously "..." was appended unconditionally, even to
        passages shorter than the preview length).
        """
        citations = []
        for result in search_results:
            content = result["content"]
            preview = content[:self._CITATION_PREVIEW_LEN]
            if len(content) > self._CITATION_PREVIEW_LEN:
                preview += "..."
            citations.append({
                "regulation_code": result["regulation_code"],
                "article": result.get("article", ""),
                "text": preview,
                "relevance": result["score"]
            })
        return citations

    async def _generate_response(self, prompt: str) -> str:
        """Generate a response via the Ollama /api/generate endpoint.

        Returns a static German notice on timeout, and falls back to
        :meth:`_generate_fallback_response` on any other failure, so the
        caller never sees an exception from this method.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.settings.ollama_url}/api/generate",
                    json={
                        "model": self.settings.llm_model,
                        "prompt": prompt,
                        "system": SYSTEM_PROMPT,
                        "stream": False,
                        "options": {
                            # Low temperature: legal answers should be precise.
                            "temperature": 0.3,
                            "top_p": 0.9
                        }
                    },
                    timeout=120.0
                )
                response.raise_for_status()
                return response.json()["response"]
        except httpx.TimeoutException:
            logger.error("LLM request timed out")
            return "Die Anfrage hat zu lange gedauert. Bitte versuchen Sie es erneut."
        except Exception as e:
            logger.error("LLM generation failed", error=str(e))
            # Best-effort degradation instead of propagating the error.
            return self._generate_fallback_response(prompt)

    def _generate_fallback_response(self, prompt: str) -> str:
        """Return a static German fallback answer when the LLM is unavailable."""
        return """Basierend auf den verfügbaren Rechtstexten:
Die relevanten Regelungen finden sich in den zitierten Artikeln.
Für eine detaillierte rechtliche Bewertung empfehle ich die Konsultation
der vollständigen Gesetzestexte oder eines Rechtsbeistands.
Hinweis: Dies ist eine automatisch generierte Antwort.
Der LLM-Dienst war nicht verfügbar."""

    def _calculate_confidence(self, search_results: List[Dict]) -> float:
        """Derive a confidence score in [0, 1] from search relevance scores.

        Returns a low default (0.3) when nothing was retrieved; otherwise the
        average relevance, boosted by 10% when >= 3 hits support the answer
        and damped by 10% for fewer, capped at 1.0.
        """
        if not search_results:
            return 0.3
        avg_score = sum(r["score"] for r in search_results) / len(search_results)
        if len(search_results) >= 3:
            confidence = avg_score * 1.1
        else:
            confidence = avg_score * 0.9
        return min(confidence, 1.0)

View File

@@ -0,0 +1,153 @@
"""
Document Service for RAG
Handles document upload, processing, and indexing.
"""
import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog
logger = structlog.get_logger()
class DocumentService:
    """Service for document upload, text extraction, chunking and indexing."""

    def __init__(self, settings):
        self.settings = settings
        # In-memory document registry keyed by document id.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: "UploadFile",
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Extracts text according to the file extension (PDF, DOCX, Markdown;
        anything else is decoded as UTF-8 text), chunks it, and records the
        document metadata in memory.

        Args:
            file: The uploaded file (FastAPI ``UploadFile``).
            regulation_code: Optional regulation this document belongs to;
                defaults to "CUSTOM".

        Returns:
            Dict with ``id``, ``filename`` and ``chunks`` (chunk count).
        """
        doc_id = str(uuid.uuid4())
        content = await file.read()

        filename = file.filename or "unknown"
        # Extension matching is case-insensitive (bug fix: ".PDF" etc. was
        # previously treated as plain text).
        name_lower = filename.lower()
        if name_lower.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif name_lower.endswith(".docx"):
            text = await self._extract_docx(content)
        elif name_lower.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            text = content.decode("utf-8", errors="ignore")

        chunks = self._chunk_text(text)

        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }
        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))
        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from a PDF; returns "" if extraction fails."""
        try:
            from pypdf import PdfReader
            from io import BytesIO
            reader = PdfReader(BytesIO(content))
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from a DOCX; returns "" if extraction fails."""
        try:
            from docx import Document
            from io import BytesIO
            doc = Document(BytesIO(content))
            text = ""
            for para in doc.paragraphs:
                text += para.text + "\n"
            return text
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown via HTML rendering.

        Falls back to the raw decoded bytes if rendering fails.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup
            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks, preferring sentence boundaries.

        Bug fixes relative to the original implementation:
        - Guarantees forward progress: ``start = end - chunk_overlap`` could
          stall or move backwards (infinite loop) whenever the sentence-break
          adjustment made ``end - chunk_overlap <= start``, e.g. with
          ``chunk_overlap`` close to or larger than ``chunk_size``.
        - Picks the latest sentence boundary across all separators instead of
          the first separator in list order that matches anywhere.
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap
        chunks: List[str] = []
        start = 0
        text_len = len(text)
        while start < text_len:
            end = start + chunk_size
            if end < text_len:
                # Look for the latest sentence end within the overlap window.
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]
                best = -1
                best_len = 0
                for sep in (". ", ".\n", "! ", "? "):
                    pos = search_text.rfind(sep)
                    if pos > best:
                        best = pos
                        best_len = len(sep)
                if best > 0:
                    end = search_start + best + best_len
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            # Advance by at least one character to guarantee termination.
            start = max(end - chunk_overlap, start + 1)
        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Returns True if the document existed, False otherwise.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata, or None if unknown."""
        return self.documents.get(document_id)

View File

@@ -0,0 +1,235 @@
"""
Search Service for RAG
Handles semantic search across legal documents using Qdrant and embeddings.
"""
import httpx
from typing import List, Optional, Dict, Any
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance, VectorParams, PointStruct,
Filter, FieldCondition, MatchValue
)
import structlog
logger = structlog.get_logger()
class SearchService:
    """Service for semantic search across legal documents.

    Uses Ollama for embeddings and Qdrant as the vector store.
    """

    # Embedding dimension of the configured embedding model (bge-m3).
    # Hoisted to a constant so collection creation and the fallback vector
    # cannot drift apart.
    EMBEDDING_DIM = 1024

    def __init__(self, settings):
        self.settings = settings
        self.qdrant = QdrantClient(url=settings.qdrant_url)
        self.collection = settings.qdrant_collection
        # Metadata of the built-in regulations, keyed by regulation code.
        self.regulations: Dict[str, Dict] = {}
        self.total_chunks = 0

    async def initialize(self):
        """Initialize the search service and load the legal corpus.

        Creates the Qdrant collection if missing, loads regulation metadata,
        and indexes the built-in corpus when the collection is empty.
        """
        try:
            self.qdrant.get_collection(self.collection)
            logger.info("Using existing collection", collection=self.collection)
        except Exception:
            # Collection does not exist yet - create it.
            self.qdrant.create_collection(
                collection_name=self.collection,
                vectors_config=VectorParams(
                    size=self.EMBEDDING_DIM,
                    distance=Distance.COSINE
                )
            )
            logger.info("Created collection", collection=self.collection)

        self._load_regulations_metadata()

        info = self.qdrant.get_collection(self.collection)
        if info.points_count == 0:
            await self._index_legal_corpus()
            # Bug fix: re-fetch collection info so total_chunks reflects the
            # points just indexed (previously the stale pre-index count, i.e.
            # 0, was stored after a fresh indexing run).
            info = self.qdrant.get_collection(self.collection)
        self.total_chunks = info.points_count

    def _load_regulations_metadata(self):
        """Populate ``self.regulations`` with the built-in regulation catalog."""
        self.regulations = {
            "DSGVO": {
                "code": "DSGVO",
                "name": "Datenschutz-Grundverordnung",
                "full_name": "Verordnung (EU) 2016/679",
                "effective": "2018-05-25",
                "chunks": 99,
                "articles": list(range(1, 100))
            },
            "AI_ACT": {
                "code": "AI_ACT",
                "name": "EU AI Act",
                "full_name": "Verordnung über Künstliche Intelligenz",
                "effective": "2025-02-02",
                "chunks": 85,
                "articles": list(range(1, 114))
            },
            "NIS2": {
                "code": "NIS2",
                "name": "NIS 2 Directive",
                "full_name": "Richtlinie (EU) 2022/2555",
                "effective": "2024-10-17",
                "chunks": 46,
                "articles": list(range(1, 47))
            },
            "TDDDG": {
                "code": "TDDDG",
                "name": "TDDDG",
                "full_name": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz",
                "effective": "2021-12-01",
                "chunks": 30,
                "articles": list(range(1, 31))
            },
            "BDSG": {
                "code": "BDSG",
                "name": "BDSG",
                "full_name": "Bundesdatenschutzgesetz",
                "effective": "2018-05-25",
                "chunks": 86,
                "articles": list(range(1, 87))
            }
        }

    async def _index_legal_corpus(self):
        """Index the (sample) legal corpus into Qdrant.

        In production this would load the actual legal documents; here a
        handful of representative chunks are embedded and upserted.
        """
        logger.info("Indexing legal corpus...")
        sample_chunks = [
            {
                "content": "Art. 9 Abs. 1 DSGVO: Die Verarbeitung personenbezogener Daten, aus denen die rassische und ethnische Herkunft, politische Meinungen, religiöse oder weltanschauliche Überzeugungen oder die Gewerkschaftszugehörigkeit hervorgehen, sowie die Verarbeitung von genetischen Daten, biometrischen Daten zur eindeutigen Identifizierung einer natürlichen Person, Gesundheitsdaten oder Daten zum Sexualleben oder der sexuellen Orientierung einer natürlichen Person ist untersagt.",
                "regulation_code": "DSGVO",
                "article": "9",
                "paragraph": "1"
            },
            {
                "content": "Art. 6 Abs. 1 DSGVO: Die Verarbeitung ist nur rechtmäßig, wenn mindestens eine der nachstehenden Bedingungen erfüllt ist: a) Die betroffene Person hat ihre Einwilligung zu der Verarbeitung der sie betreffenden personenbezogenen Daten für einen oder mehrere bestimmte Zwecke gegeben.",
                "regulation_code": "DSGVO",
                "article": "6",
                "paragraph": "1"
            },
            {
                "content": "Art. 32 DSGVO: Unter Berücksichtigung des Stands der Technik, der Implementierungskosten und der Art, des Umfangs, der Umstände und der Zwecke der Verarbeitung sowie der unterschiedlichen Eintrittswahrscheinlichkeit und Schwere des Risikos für die Rechte und Freiheiten natürlicher Personen treffen der Verantwortliche und der Auftragsverarbeiter geeignete technische und organisatorische Maßnahmen.",
                "regulation_code": "DSGVO",
                "article": "32",
                "paragraph": "1"
            },
            {
                "content": "Art. 6 AI Act: Hochrisiko-KI-Systeme. Als Hochrisiko-KI-Systeme gelten KI-Systeme, die als Sicherheitskomponente eines Produkts oder selbst als Produkt bestimmungsgemäß verwendet werden sollen.",
                "regulation_code": "AI_ACT",
                "article": "6",
                "paragraph": "1"
            },
            {
                "content": "Art. 21 NIS2: Risikomanagementmaßnahmen im Bereich der Cybersicherheit. Die Mitgliedstaaten stellen sicher, dass wesentliche und wichtige Einrichtungen geeignete und verhältnismäßige technische, operative und organisatorische Maßnahmen ergreifen.",
                "regulation_code": "NIS2",
                "article": "21",
                "paragraph": "1"
            }
        ]
        # Embed each chunk and upsert the whole batch in one call.
        points = []
        for i, chunk in enumerate(sample_chunks):
            embedding = await self._get_embedding(chunk["content"])
            points.append(PointStruct(
                id=i,
                vector=embedding,
                payload=chunk
            ))
        self.qdrant.upsert(
            collection_name=self.collection,
            points=points
        )
        logger.info("Indexed legal corpus", chunks=len(points))

    async def _get_embedding(self, text: str) -> List[float]:
        """Get an embedding for *text* via the Ollama /api/embeddings endpoint.

        Returns a zero vector of EMBEDDING_DIM on failure.
        NOTE(review): a zero vector has undefined cosine similarity, so
        anything indexed or queried with it yields meaningless scores —
        consider raising or retrying instead; kept for backward compatibility.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.settings.ollama_url}/api/embeddings",
                    json={
                        "model": self.settings.embedding_model,
                        "prompt": text
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                return response.json()["embedding"]
        except Exception as e:
            logger.error("Embedding failed", error=str(e))
            return [0.0] * self.EMBEDDING_DIM

    async def search(
        self,
        query: str,
        regulation_codes: Optional[List[str]] = None,
        limit: int = 10,
        min_score: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Perform semantic search over the indexed corpus.

        Args:
            query: Free-text query; embedded before searching.
            regulation_codes: Optional regulation codes to restrict hits to
                (OR-combined via a Qdrant ``should`` filter).
            limit: Maximum number of hits.
            min_score: Minimum cosine similarity for a hit to be returned.

        Returns:
            List of hit dicts with content, regulation_code, article,
            paragraph, score and the full payload as metadata.
        """
        query_embedding = await self._get_embedding(query)

        # OR-filter across the requested regulation codes, if any.
        search_filter = None
        if regulation_codes:
            search_filter = Filter(
                should=[
                    FieldCondition(
                        key="regulation_code",
                        match=MatchValue(value=code)
                    )
                    for code in regulation_codes
                ]
            )

        results = self.qdrant.search(
            collection_name=self.collection,
            query_vector=query_embedding,
            query_filter=search_filter,
            limit=limit,
            score_threshold=min_score
        )
        return [
            {
                "content": hit.payload.get("content", ""),
                "regulation_code": hit.payload.get("regulation_code", ""),
                "article": hit.payload.get("article"),
                "paragraph": hit.payload.get("paragraph"),
                "score": hit.score,
                "metadata": hit.payload
            }
            for hit in results
        ]

    def get_regulations(self) -> List[Dict]:
        """Get a summary list of the available regulations."""
        return [
            {
                "code": reg["code"],
                "name": reg["name"],
                "chunks": reg["chunks"],
                "last_updated": reg["effective"]
            }
            for reg in self.regulations.values()
        ]

    def get_regulation(self, code: str) -> Optional[Dict]:
        """Get full details of a specific regulation, or None if unknown."""
        return self.regulations.get(code)