Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
154 lines
4.6 KiB
Python
154 lines
4.6 KiB
Python
"""
Document Service for RAG

Handles document upload, processing, and indexing.
"""
# Standard library
import uuid
from typing import Any, Dict, Optional

# Third-party
import structlog
from fastapi import UploadFile

# Module-level structured logger shared by the service below.
logger = structlog.get_logger()
|
class DocumentService:
    """Service for document processing and indexing.

    Uploaded documents have their text extracted according to file type
    (PDF, DOCX, Markdown, or plain text), split into overlapping chunks,
    and their metadata stored in an in-memory registry keyed by a
    generated UUID.  Vector indexing in Qdrant is still a TODO.
    """

    def __init__(self, settings):
        # ``settings`` must expose integer ``chunk_size`` and
        # ``chunk_overlap`` attributes used by :meth:`_chunk_text`.
        self.settings = settings
        # In-memory metadata store: doc_id -> metadata dict.
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Args:
            file: The uploaded file.
            regulation_code: Optional regulation identifier; recorded as
                "CUSTOM" when not provided.

        Returns:
            Dict with the generated ``id``, the ``filename``, and the
            number of ``chunks`` produced.
        """
        doc_id = str(uuid.uuid4())

        # Read the whole upload into memory.
        content = await file.read()

        # Dispatch on the file extension (case-insensitive, so ``.PDF``
        # works too); unknown types fall back to lenient UTF-8 decoding.
        filename = file.filename or "unknown"
        suffix = filename.lower()
        if suffix.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif suffix.endswith(".docx"):
            text = await self._extract_docx(content)
        elif suffix.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text for downstream indexing.
        chunks = self._chunk_text(text)

        # Store document metadata.
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from a PDF; returns "" on any failure."""
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # extract_text() may return None for pages without a text
            # layer (e.g. scanned images) -- coerce to "" so one such
            # page does not discard the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from a DOCX; returns "" on any failure."""
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown.

        Renders the Markdown to HTML and strips the tags; falls back to
        lenient UTF-8 decoding of the raw bytes on any failure.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split ``text`` into overlapping chunks.

        Chunks are roughly ``settings.chunk_size`` characters long with
        ``settings.chunk_overlap`` characters of overlap between adjacent
        chunks.  Where possible the split point is moved to a nearby
        sentence boundary.  Whitespace-only chunks are dropped.

        Returns:
            List of non-empty chunk strings (empty list for empty text).
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at a sentence boundary near the nominal end.
            if end < len(text):
                # Look for a sentence end within the overlap window.
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]

                for sep in [". ", ".\n", "! ", "? "]:
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Advance with overlap.  Guard against a sentence-boundary
            # adjustment pulling ``end`` so far back that the window
            # would stall or move backwards (the unguarded version could
            # loop forever when chunk_overlap is close to chunk_size).
            next_start = end - chunk_overlap
            start = next_start if next_start > start else end

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Returns:
            True if the document existed and was removed, False otherwise.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata, or None if the id is unknown."""
        return self.documents.get(document_id)