"""
Document Service for RAG

Handles document upload, processing, and indexing.
"""

import uuid
from typing import Optional, Dict, Any, List

from fastapi import UploadFile
import structlog

logger = structlog.get_logger()


class DocumentService:
    """Service for document processing and indexing.

    Uploaded files are converted to plain text, split into overlapping
    chunks, and their metadata is recorded in ``self.documents`` (an
    in-memory dict keyed by document id). Indexing the chunks in the vector
    store is still a TODO.
    """

    # Sentence terminators tried, in priority order, when looking for a
    # natural place to end a chunk.
    _SENTENCE_SEPARATORS = (". ", ".\n", "! ", "? ")

    def __init__(self, settings):
        """Create the service.

        Args:
            settings: Configuration object; ``chunk_size`` and
                ``chunk_overlap`` are read from it when chunking text.
        """
        self.settings = settings
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        The whole upload is read into memory, converted to text according
        to its file extension, chunked, and its metadata stored under a
        freshly generated UUID.

        Args:
            file: The uploaded file.
            regulation_code: Optional code stored with the document;
                ``"CUSTOM"`` is stored when it is missing or empty.

        Returns:
            A dict with the new document's ``id``, its ``filename`` and the
            number of ``chunks`` produced. If text extraction fails the
            extractor logs the error and yields an empty string, so the
            document is still recorded, with zero chunks.
        """
        doc_id = str(uuid.uuid4())

        # Read file content
        content = await file.read()

        # Determine file type and extract text. Extensions are compared
        # case-insensitively so "REPORT.PDF" is parsed as a PDF instead of
        # being decoded as raw text; the stored filename keeps its case.
        filename = file.filename or "unknown"
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text
        chunks = self._chunk_text(text)

        # Store document metadata
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant

        logger.info("Document processed", doc_id=doc_id, filename=filename, chunks=len(chunks))

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF.

        Returns the text of every page, each followed by a newline, or an
        empty string if anything goes wrong (the error is logged).
        """
        try:
            # Imported lazily so the dependency is only needed for PDFs.
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # "or ''" keeps one text-less page from discarding the whole
            # document should extract_text() hand back None.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX.

        Returns the text of every paragraph, each followed by a newline, or
        an empty string if anything goes wrong (the error is logged).
        """
        try:
            # Imported lazily so the dependency is only needed for DOCX.
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract text from Markdown.

        The Markdown is rendered to HTML and the HTML's text is returned.
        If that fails (the error is logged) the raw bytes are decoded as
        UTF-8, ignoring undecodable bytes, and returned unrendered.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks.

        Each chunk nominally spans ``settings.chunk_size`` characters and
        the next chunk starts ``settings.chunk_overlap`` characters before
        the previous one ended. When a chunk does not reach the end of the
        text, its end is moved to just after a sentence terminator found
        within ``chunk_overlap`` characters either side of the nominal end,
        so a chunk can be up to ``chunk_overlap`` characters longer than
        ``chunk_size``.

        Args:
            text: The text to split.

        Returns:
            The whitespace-stripped, non-empty chunks in document order.

        Raises:
            ValueError: If ``settings.chunk_size`` is not positive (the
                loop could otherwise never advance).
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap

        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        text_length = len(text)
        chunks: List[str] = []
        start = 0

        while start < text_length:
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < text_length:
                # Look for sentence end within overlap window
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]

                for sep in self._SENTENCE_SEPARATORS:
                    last_sep = search_text.rfind(sep)
                    # A hit at offset 0 is ignored, as is "not found" (-1).
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_length:
                # This chunk already covers the tail; stepping back by the
                # overlap would only emit a duplicate suffix of it.
                break

            next_start = end - chunk_overlap
            # Guarantee forward progress: if the overlap would move the
            # cursor backwards (or not at all), continue without overlap.
            start = next_start if next_start > start else end

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Returns:
            ``True`` if the document was known and has been removed from
            the in-memory store, ``False`` if the id was not found.
        """
        if document_id not in self.documents:
            return False

        del self.documents[document_id]
        # TODO: Delete from Qdrant
        logger.info("Document deleted", doc_id=document_id)
        return True

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Get document metadata, or ``None`` if the id is unknown."""
        return self.documents.get(document_id)