Files
breakpilot-compliance/breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Benjamin Boenisch 4435e7ea0a Initial commit: breakpilot-compliance - Compliance SDK Platform
Services: Admin-Compliance, Backend-Compliance,
AI-Compliance-SDK, Consent-SDK, Developer-Portal,
PCA-Platform, DSMS

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:28 +01:00

154 lines
4.6 KiB
Python

"""
Document Service for RAG
Handles document upload, processing, and indexing.
"""
import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog
logger = structlog.get_logger()
class DocumentService:
    """Service for document processing and indexing.

    Uploaded files are converted to plain text, split into overlapping
    chunks, and their metadata is kept in an in-memory dict keyed by the
    generated document id. Indexing the chunks in the vector store is not
    implemented yet (see the TODO markers below).
    """

    def __init__(self, settings):
        """Create the service.

        Args:
            settings: Configuration object. ``chunk_size`` and
                ``chunk_overlap`` are read from it when text is chunked.
        """
        self.settings = settings
        # Metadata of processed documents, keyed by document id.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process an uploaded document and record its metadata.

        The extractor is chosen from the filename extension, compared
        case-insensitively: ``.pdf``, ``.docx`` and ``.md`` use their
        dedicated extractors; any other file is decoded as UTF-8 with
        undecodable bytes dropped.

        Args:
            file: The uploaded file. Its whole content is read into memory.
            regulation_code: Optional regulation identifier stored with the
                document; ``"CUSTOM"`` is stored when it is omitted or empty.

        Returns:
            A dict with the new document ``id``, its ``filename`` and the
            number of ``chunks`` produced.

        Raises:
            ValueError: If ``settings.chunk_size`` is not positive.
        """
        doc_id = str(uuid.uuid4())

        # Read file content
        content = await file.read()

        # Determine file type and extract text. The extension is lowered so
        # that e.g. "REPORT.PDF" is not mistaken for a plain-text file.
        filename = file.filename or "unknown"
        lowered_name = filename.lower()
        if lowered_name.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered_name.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered_name.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text
        chunks = self._chunk_text(text)
        chunk_count = len(chunks)

        # Store document metadata
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": chunk_count,
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=chunk_count)

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": chunk_count
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF.

        Each page's text is followed by a newline. Extraction is best
        effort: any failure is logged and an empty string is returned.
        """
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # Defensive ``or ""``: a page that yields no text must not
            # abort extraction of the remaining pages.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX.

        Only the document's paragraphs are read; each is followed by a
        newline. Any failure is logged and an empty string is returned.
        """
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract text from Markdown.

        The Markdown is rendered to HTML and the markup is then stripped.
        If anything fails (including strict UTF-8 decoding), the failure is
        logged and the raw bytes are decoded leniently instead.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping chunks.

        Chunks are at most about ``settings.chunk_size`` characters long and
        consecutive chunks share ``settings.chunk_overlap`` characters. When
        a chunk would end in the middle of the text, the end is moved to the
        last sentence separator found within ``chunk_overlap`` characters on
        either side of the nominal end, if there is one. Chunks are stripped
        of surrounding whitespace and empty chunks are dropped.

        Args:
            text: The text to split.

        Returns:
            The list of chunk strings, in document order.

        Raises:
            ValueError: If ``settings.chunk_size`` is not positive.
        """
        chunk_size = self.settings.chunk_size
        if chunk_size <= 0:
            # A non-positive size can never advance through the text.
            raise ValueError("chunk_size must be a positive integer")

        # An overlap of chunk_size or more would stop the window from moving
        # forward; a negative one would skip text between chunks.
        chunk_overlap = min(max(self.settings.chunk_overlap, 0), chunk_size - 1)

        text_length = len(text)
        chunks = []
        start = 0
        while start < text_length:
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < text_length:
                # Look for sentence end within overlap window
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]
                # Separators are tried in priority order; the first kind
                # present wins. A match at the very start of the window is
                # ignored (index 0).
                for sep in (". ", ".\n", "! ", "? "):
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_length:
                # The text is fully covered; stepping back by the overlap
                # would only emit a redundant suffix of this chunk.
                break

            # Step back by the overlap, but always move forward by at least
            # one character so an early sentence break cannot stall the loop.
            start = max(end - chunk_overlap, start + 1)

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Args:
            document_id: Id of the document to remove.

        Returns:
            ``True`` if the document was known and its metadata was removed,
            ``False`` otherwise.
        """
        if document_id not in self.documents:
            return False

        del self.documents[document_id]
        # TODO: Delete from Qdrant
        logger.info("Document deleted", doc_id=document_id)
        return True

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata.

        Returns:
            The stored metadata dict, or ``None`` if the id is unknown.
        """
        return self.documents.get(document_id)