feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
153
breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""
|
||||
Document Service for RAG
|
||||
|
||||
Handles document upload, processing, and indexing.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import UploadFile
|
||||
import structlog
|
||||
|
||||
# Module-level structlog logger used by DocumentService for event logging.
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DocumentService:
    """Service for document processing and indexing.

    Uploaded documents are converted to plain text, split into overlapping
    chunks, and registered in an in-memory dict keyed by a generated UUID.
    Vector indexing is not implemented yet (see the TODO markers).
    """

    def __init__(self, settings):
        """Keep the settings object and start with an empty registry.

        ``settings`` must expose ``chunk_size`` and ``chunk_overlap``;
        both are read by ``_chunk_text``.
        """
        self.settings = settings
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        The extractor is picked from the file extension, compared
        case-insensitively so ``REPORT.PDF`` is handled like ``report.pdf``.
        Anything that is not ``.pdf``, ``.docx`` or ``.md`` is decoded as
        UTF-8 with undecodable bytes dropped.

        Args:
            file: The uploaded file; its full content is read into memory.
            regulation_code: Optional regulation identifier stored with the
                document metadata. Falls back to ``"CUSTOM"`` when empty.

        Returns:
            A dict with the generated ``id``, the original ``filename`` and
            the number of ``chunks`` produced.
        """
        doc_id = str(uuid.uuid4())

        # Read file content
        content = await file.read()

        # Determine file type and extract text. Only the comparison is
        # lower-cased; the stored filename keeps its original spelling.
        filename = file.filename or "unknown"
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text
        chunks = self._chunk_text(text)

        # Store document metadata
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text),
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks),
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF.

        Each page's text is followed by a newline. Extraction is
        best-effort: any failure is logged and an empty string returned.
        """
        try:
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # `or ""` guards against a page that yields no text object.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX.

        Paragraph texts are concatenated, each followed by a newline.
        Extraction is best-effort: any failure is logged and an empty
        string returned.
        """
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract text from Markdown.

        The Markdown is rendered to HTML and the tags are stripped. If
        anything fails (including a strict UTF-8 decode), the error is
        logged and the raw bytes are decoded leniently instead.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping, whitespace-stripped chunks.

        Each chunk targets ``settings.chunk_size`` characters. When a chunk
        would end mid-text, the end is moved to just after the last
        sentence separator found within ``settings.chunk_overlap``
        characters on either side of the nominal end (separators are tried
        in the order ``". "``, ``".\\n"``, ``"! "``, ``"? "``). Consecutive
        chunks overlap by ``chunk_overlap`` characters.

        Returns:
            List of non-empty chunk strings; empty list for empty text.

        Raises:
            ValueError: If ``settings.chunk_size`` is not positive.
        """
        chunk_size = self.settings.chunk_size
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        # A negative overlap would skip text between chunks.
        chunk_overlap = max(0, self.settings.chunk_overlap)

        text_len = len(text)
        chunks = []
        start = 0

        while start < text_len:
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < text_len:
                # Look for sentence end within overlap window
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]

                for sep in (". ", ".\n", "! ", "? "):
                    last_sep = search_text.rfind(sep)
                    if last_sep > 0:
                        end = search_start + last_sep + len(sep)
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # The chunk reached the end of the text; stepping back by the
            # overlap would only emit a suffix of what was just stored.
            if end >= text_len:
                break

            # Guarantee forward progress: if stepping back by the overlap
            # would not move past the current start (overlap >= effective
            # chunk length), continue from the chunk end without overlap.
            next_start = end - chunk_overlap
            start = next_start if next_start > start else end

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Returns:
            True if the document was registered and has been removed,
            False if the id is unknown.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata, or None if the id is unknown."""
        return self.documents.get(document_id)
|
||||
Reference in New Issue
Block a user