This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/breakpilot-compliance-sdk/services/rag-service/rag/documents.py
Benjamin Admin bfdaf63ba9 fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

154 lines
4.6 KiB
Python

"""
Document Service for RAG
Handles document upload, processing, and indexing.
"""
import uuid
from typing import Optional, Dict, Any
from fastapi import UploadFile
import structlog
# Module-level structlog logger used by the DocumentService methods below.
logger = structlog.get_logger()
class DocumentService:
    """Service for document processing and indexing.

    Uploaded files are converted to plain text, split into overlapping
    chunks, and their metadata is recorded in an in-process dictionary keyed
    by document id. Indexing the chunks in a vector store is not implemented
    yet (see the TODO markers).
    """

    # Lower-case file extensions that have a dedicated text extractor.
    _KNOWN_EXTENSIONS = (".pdf", ".docx", ".md")

    # Character sequences treated as the end of a sentence when chunking.
    _SENTENCE_ENDINGS = (". ", ".\n", "! ", "? ")

    def __init__(self, settings):
        """Create the service.

        Args:
            settings: Configuration object. ``chunk_size`` and
                ``chunk_overlap`` are read from it whenever text is chunked.
        """
        self.settings = settings
        # Metadata of processed documents, keyed by document id.
        self.documents: Dict[str, Dict[str, Any]] = {}

    @staticmethod
    def _detect_extension(filename: str) -> str:
        """Return the known extension *filename* ends with, or ``""``.

        The comparison is case-insensitive so that ``REPORT.PDF`` is routed
        to the PDF extractor instead of being decoded as raw UTF-8 text.
        """
        lowered = filename.lower()
        for extension in DocumentService._KNOWN_EXTENSIONS:
            if lowered.endswith(extension):
                return extension
        return ""

    async def process_upload(
        self,
        file: UploadFile,
        regulation_code: Optional[str] = None
    ) -> Dict[str, Any]:
        """Process and index an uploaded document.

        Args:
            file: The uploaded file; its content is read fully into memory.
            regulation_code: Optional regulation identifier stored with the
                document metadata. Falls back to ``"CUSTOM"`` when omitted
                or empty.

        Returns:
            A dict with the new document ``id``, its ``filename`` and the
            number of ``chunks`` produced.
        """
        doc_id = str(uuid.uuid4())

        # Read file content
        content = await file.read()

        # Determine file type and extract text
        filename = file.filename or "unknown"
        extension = self._detect_extension(filename)
        if extension == ".pdf":
            text = await self._extract_pdf(content)
        elif extension == ".docx":
            text = await self._extract_docx(content)
        elif extension == ".md":
            text = await self._extract_markdown(content)
        else:
            # Unknown type: treat as plain text and drop undecodable bytes.
            text = content.decode("utf-8", errors="ignore")

        # Chunk the text
        chunks = self._chunk_text(text)
        chunk_count = len(chunks)

        # Store document metadata
        self.documents[doc_id] = {
            "id": doc_id,
            "filename": filename,
            "regulation_code": regulation_code or "CUSTOM",
            "chunks": chunk_count,
            "text_length": len(text),
        }

        # TODO: Index chunks in Qdrant
        logger.info("Document processed",
                    doc_id=doc_id,
                    filename=filename,
                    chunks=chunk_count)

        return {
            "id": doc_id,
            "filename": filename,
            "chunks": chunk_count,
        }

    async def _extract_pdf(self, content: bytes) -> str:
        """Extract text from PDF.

        Best effort: any failure (including a missing PDF library) is logged
        and an empty string is returned.
        """
        try:
            # Imported lazily inside the try so an ImportError is handled
            # like any other extraction failure.
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(content))
            # ``or ""`` keeps the join safe should a page yield no text
            # object (NOTE(review): some PDF library versions may return
            # None for image-only pages - confirm for the pinned version).
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            logger.error("PDF extraction failed", error=str(e))
            return ""

    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX.

        Best effort: any failure is logged and an empty string is returned.
        Only paragraph text is collected, one paragraph per line.
        """
        try:
            from docx import Document
            from io import BytesIO

            doc = Document(BytesIO(content))
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("DOCX extraction failed", error=str(e))
            return ""

    async def _extract_markdown(self, content: bytes) -> str:
        """Extract text from Markdown.

        The Markdown is rendered to HTML and the markup is stripped. If that
        fails, the error is logged and the raw content is returned decoded
        as UTF-8 with undecodable bytes dropped.
        """
        try:
            import markdown
            from bs4 import BeautifulSoup

            html = markdown.markdown(content.decode("utf-8"))
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text()
        except Exception as e:
            logger.error("Markdown extraction failed", error=str(e))
            return content.decode("utf-8", errors="ignore")

    def _chunk_text(self, text: str) -> list:
        """Split text into overlapping chunks, preferring sentence ends.

        Each chunk targets ``settings.chunk_size`` characters. When a chunk
        does not reach the end of the text, its end is moved to the latest
        sentence ending found within ``settings.chunk_overlap`` characters
        before or after the target position. Consecutive chunks overlap by
        ``chunk_overlap`` characters. Chunks are whitespace-stripped and
        empty ones are dropped.

        Args:
            text: The text to split.

        Returns:
            The list of chunk strings, in document order.

        Raises:
            ValueError: If ``settings.chunk_size`` is not positive (the
                loop could never make progress).
        """
        if not text:
            return []

        chunk_size = self.settings.chunk_size
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        # An overlap of chunk_size or more would keep the window from ever
        # moving forward, and a negative one would skip text; clamp both.
        chunk_overlap = max(0, min(self.settings.chunk_overlap, chunk_size - 1))

        text_length = len(text)
        chunks = []
        start = 0
        while start < text_length:
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < text_length:
                window_start = max(end - chunk_overlap, start)
                window = text[window_start:end + chunk_overlap]
                # Take the latest sentence ending of any kind; a match at
                # offset 0 of the window is ignored.
                boundary = 0
                for sep in self._SENTENCE_ENDINGS:
                    pos = window.rfind(sep)
                    if pos > 0:
                        boundary = max(boundary, pos + len(sep))
                if boundary:
                    end = window_start + boundary

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # The chunk reached the end of the text: stop here instead of
            # emitting a trailing fragment that only repeats the overlap.
            if end >= text_length:
                break

            # Step back by the overlap, but always advance at least one
            # character so the loop is guaranteed to terminate.
            start = max(end - chunk_overlap, start + 1)

        return chunks

    async def delete(self, document_id: str) -> bool:
        """Delete a document and its chunks.

        Returns:
            ``True`` if the document was known and has been removed,
            ``False`` otherwise.
        """
        if document_id not in self.documents:
            return False

        del self.documents[document_id]
        # TODO: Delete from Qdrant
        logger.info("Document deleted", doc_id=document_id)
        return True

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Get document metadata, or ``None`` if the id is unknown."""
        return self.documents.get(document_id)