This repository was archived on 2026-02-15. You can view and clone its files, but you cannot open issues, create pull requests, or push commits.
Files
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

154 lines
4.6 KiB
Python

"""
Document Service for RAG
Handles document upload, processing, and indexing.
"""
import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog
logger = structlog.get_logger()
class DocumentService:
    """Service for document processing and indexing.

    Uploaded files are text-extracted, chunked, and recorded in an
    in-memory dict keyed by a generated UUID. Vector indexing in Qdrant
    is still a TODO in both `process_upload` and `delete`.
    """

    def __init__(self, settings):
        # settings must expose integer `chunk_size` and `chunk_overlap`
        # (read in _chunk_text); presumably a pydantic Settings — confirm.
        self.settings = settings
        # doc_id -> metadata (filename, regulation_code, chunk/text sizes)
        self.documents: Dict[str, Dict] = {}

    async def process_upload(
        self,
        file: "UploadFile",
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Args:
            file: Uploaded file (FastAPI UploadFile); read fully into memory.
            regulation_code: Optional tag stored with the document;
                defaults to "CUSTOM".

        Returns:
            Dict with the new document's "id", "filename", and "chunks" count.
        """
        doc_id = str(uuid.uuid4())
        content = await file.read()

        filename = file.filename or "unknown"
        # Case-insensitive suffix match so "REPORT.PDF" / "Notes.MD" are
        # routed to the right extractor instead of raw byte decoding.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            text = await self._extract_pdf(content)
        elif lowered.endswith(".docx"):
            text = await self._extract_docx(content)
        elif lowered.endswith(".md"):
            text = await self._extract_markdown(content)
        else:
            # Unknown types: best-effort UTF-8 decode, dropping bad bytes.
            text = content.decode("utf-8", errors="ignore")

        chunks = self._chunk_text(text)

        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": len(chunks),
            "text_length": len(text)
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=len(chunks))
        return {
            "id": doc_id,
            "filename": filename,
            "chunks": len(chunks)
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF bytes; returns "" on any failure."""
        try:
            from pypdf import PdfReader
            from io import BytesIO
            reader = PdfReader(BytesIO(content))
            # extract_text() may return None for pages without a text
            # layer; coalesce to "" so one bad page doesn't raise and
            # discard the whole document. join avoids quadratic `+=`.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX bytes; returns "" on any failure."""
        try:
            from docx import Document
            from io import BytesIO
            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract plain text from Markdown by rendering to HTML and
        stripping tags; falls back to a lossy UTF-8 decode on failure."""
        try:
            import markdown
            from bs4 import BeautifulSoup
            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping chunks, preferring sentence ends.

        Each chunk targets `settings.chunk_size` characters; a sentence
        boundary is searched within +/- `settings.chunk_overlap` of the
        cut point, and consecutive chunks overlap by `chunk_overlap`.

        Returns:
            List of non-empty, stripped chunk strings ([] for empty text).
        """
        chunk_size = self.settings.chunk_size
        chunk_overlap = self.settings.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            if end < len(text):
                # Look for the LATEST sentence end in the window; the old
                # code stopped at the first separator *type* in list order,
                # so an early ". " beat a later "! ".
                search_start = max(end - chunk_overlap, start)
                search_text = text[search_start:end + chunk_overlap]
                best_pos = -1
                best_len = 0
                for sep in (". ", ".\n", "! ", "? "):
                    pos = search_text.rfind(sep)
                    if pos > best_pos:
                        best_pos = pos
                        best_len = len(sep)
                if best_pos > 0:
                    end = search_start + best_pos + best_len
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            # `max(..., start + 1)` guarantees forward progress; the old
            # `start = end - chunk_overlap` could stall or move backwards
            # (infinite loop) when chunk_overlap >= chunk_size or a very
            # early boundary shrank `end`.
            start = max(end - chunk_overlap, start + 1)
        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document's metadata.

        Returns:
            True if the document existed and was removed, False otherwise.
        """
        if document_id in self.documents:
            del self.documents[document_id]
            # TODO: Delete from Qdrant
            logger.info("Document deleted", doc_id=document_id)
            return True
        return False

    def get_document(self, document_id: str) -> Optional[Dict]:
        """Return stored metadata for `document_id`, or None if unknown."""
        return self.documents.get(document_id)