Initial commit: breakpilot-compliance - Compliance SDK Platform
Services: Admin-Compliance, Backend-Compliance, AI-Compliance-SDK, Consent-SDK, Developer-Portal, PCA-Platform, DSMS Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""
|
||||
Document Service for RAG
|
||||
|
||||
Handles document upload, processing, and indexing.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import UploadFile
|
||||
import structlog
|
||||
|
||||
# Module-level structured logger (structlog); methods below log with
# keyword-argument context (doc_id=..., filename=..., error=...).
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DocumentService:
    """Service for document processing and indexing.

    Extracts text from uploaded files (PDF, DOCX, Markdown, or plain
    text), splits the text into overlapping chunks, and keeps document
    metadata in an in-memory dict.  Vector indexing (Qdrant) is still a
    TODO at the marked call sites.
    """

    def __init__(self, settings):
        # settings must expose integer `chunk_size` and `chunk_overlap`
        # (both in characters) — consumed by _chunk_text().
        self.settings = settings
        # doc_id -> metadata dict.  NOTE(review): in-memory only, lost
        # on restart — presumably Qdrant becomes the durable store.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: "UploadFile",
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Args:
            file: Uploaded file; its extension selects the extractor.
            regulation_code: Regulation the document belongs to;
                stored as "CUSTOM" when omitted.

        Returns:
            Dict with the generated ``id``, the ``filename``, and the
            number of ``chunks`` produced.
        """
        doc_id = str(uuid.uuid4())

        # Read the whole upload into memory.
        content = await file.read()

        # Dispatch on the file extension, lower-cased first so e.g.
        # "REPORT.PDF" reaches the PDF extractor instead of falling
        # through to the lossy utf-8 decode (case-sensitivity bug in
        # the original dispatch).
        filename = file.filename or "unknown"
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            # Unknown type: best-effort plain-text decode.
            text = content.decode("utf-8", errors="ignore")

        chunks = self._chunk_text(text)

        # Store document metadata (chunks themselves are not yet kept).
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from a PDF; best-effort, returns "" on failure."""
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # page.extract_text() returns None for pages with no text
            # layer; the original `text += page.extract_text() + "\n"`
            # raised TypeError there and the broad except then threw
            # away the whole document.  Treat such pages as empty.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from a DOCX; best-effort, returns "" on failure."""
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            # str.join instead of repeated += (linear, not quadratic).
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown by rendering to HTML and
        stripping tags; falls back to a lossy utf-8 decode on failure."""
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split *text* into overlapping chunks of ~chunk_size chars.

        Where possible a chunk ends at a sentence boundary found inside
        the overlap window around the nominal cut point.  Consecutive
        chunks overlap by roughly chunk_overlap characters.
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at a sentence boundary, but only when we are
            # not already at the tail of the text.
            if end < len(text):
                # Search a window of +/- chunk_overlap around `end`.
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]

                for sep in (". ", ".\n", "! ", "? "):
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # BUG FIX: the original `start = end - chunk_overlap` could
            # stall forever when the boundary search pulled `end` back
            # to within chunk_overlap of `start`, or whenever
            # chunk_overlap >= chunk_size.  Guarantee forward progress.
            start = max(end - chunk_overlap, start + 1)

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document's metadata.

        Returns:
            True if the document existed and was removed, else False.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Return stored metadata for *document_id*, or None if unknown."""
        return self.documents.get(document_id)
|
||||
Reference in New Issue
Block a user