Initial commit: breakpilot-compliance - Compliance SDK Platform

Services: Admin-Compliance, Backend-Compliance,
AI-Compliance-SDK, Consent-SDK, Developer-Portal,
PCA-Platform, DSMS

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-11 23:47:28 +01:00
commit 4435e7ea0a
734 changed files with 251369 additions and 0 deletions

View File

@@ -0,0 +1,153 @@
"""
Document Service for RAG
Handles document upload, processing, and indexing.
"""
import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog
logger = structlog.get_logger()
class DocumentService:
    """Service for document upload processing, extraction, and chunking.

    Extracted text is split into overlapping chunks; metadata is held in
    an in-memory dict keyed by a generated UUID. Vector indexing (Qdrant)
    is still a TODO, so nothing here survives a restart.
    """

    def __init__(self, settings):
        # settings must expose integer `chunk_size` and `chunk_overlap`
        # attributes (read by _chunk_text) — TODO confirm against the
        # project Settings object.
        self.settings = settings
        # doc_id -> metadata dict; in-memory only.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: "UploadFile",
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Picks an extractor from the file extension (case-insensitive),
        chunks the extracted text, and records metadata.

        Args:
            file: Uploaded file (FastAPI UploadFile).
            regulation_code: Optional regulation tag; defaults to "CUSTOM".

        Returns:
            Dict with the new document's ``id``, ``filename`` and ``chunks``.
        """
        doc_id = str(uuid.uuid4())

        # Read the full upload into memory.
        content = await file.read()

        filename = file.filename or "unknown"
        # Compare extensions case-insensitively so "REPORT.PDF" is routed
        # the same as "report.pdf" (the original check missed uppercase).
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            # Fallback: treat as plain text, dropping undecodable bytes.
            text = content.decode("utf-8", errors="ignore")

        chunks = self._chunk_text(text)

        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }
        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))
        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from a PDF; returns "" on any failure."""
        try:
            from pypdf import PdfReader
            from io import BytesIO
            reader = PdfReader(BytesIO(content))
            # extract_text() may yield None for pages without a text layer
            # (e.g. scanned images); the original would raise TypeError on
            # `None + "\n"`. Guard with `or ""` and join once.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from a DOCX; returns "" on any failure."""
        try:
            from docx import Document
            from io import BytesIO
            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown via an HTML round-trip.

        Falls back to a lossy UTF-8 decode if markdown/bs4 are missing
        or parsing fails.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup
            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping chunks, preferring sentence ends.

        Returns a list of non-empty, stripped chunk strings. Guaranteed
        to terminate even for degenerate chunk_size/chunk_overlap values.
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            # Try to break at a sentence boundary, searching an
            # overlap-sized window around the nominal cut point.
            if end < len(text):
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]
                for sep in [". ", ".\n", "! ", "? "]:
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            # BUG FIX: the original `start = end - chunk_overlap` could
            # fail to advance when the sentence boundary landed near
            # `start` (or when chunk_overlap >= chunk_size), looping
            # forever. Force forward progress by at least one character.
            start = max(end - chunk_overlap, start + 1)
        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document's metadata.

        Returns:
            True if the document existed and was removed, else False.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Return stored metadata for `document_id`, or None if unknown."""
        return self.documents.get(document_id)