Files
breakpilot-lehrer/klausur-service/backend/dsfa_rag_embedding.py
Benjamin Admin 34da9f4cda [split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00

117 lines
4.0 KiB
Python

"""
DSFA RAG Embedding Service Integration.
Handles embedding generation, text extraction, and fallback logic.
"""
import hashlib
import html
import logging
import math
import os
import re
import struct
from typing import List

import httpx
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Embedding service configuration
# Base URL of the embedding service; taken from the EMBEDDING_SERVICE_URL
# environment variable when it is set.
# NOTE(review): the default is a hard-coded private IPv4 address, presumably a
# container-network address -- confirm it matches the actual deployment.
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")
async def get_embedding(text: str) -> List[float]:
    """
    Get embedding for text using the embedding-service.

    Posts ``{"text": text}`` to ``{EMBEDDING_SERVICE_URL}/embed-single`` and
    returns the ``"embedding"`` field of the JSON response.
    Uses BGE-M3 model which produces 1024-dimensional vectors.

    Args:
        text: The text to embed.

    Returns:
        The embedding reported by the service, or an empty list when the
        response has no ``"embedding"`` field. If the request fails, the
        service answers with an error status, or the body is not valid JSON,
        a deterministic hash-based pseudo-embedding is returned instead.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed-single",
                json={"text": text},
            )
            response.raise_for_status()
            payload = response.json()
            return payload.get("embedding", [])
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError: a 2xx response with a
            # non-JSON body (e.g. a proxy error page) must take the same
            # fallback path as a transport or status error.
            logger.error("Embedding service error: %s", e)
            # Fallback to hash-based pseudo-embedding for development
            return _generate_fallback_embedding(text)
async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts in batch.

    Posts ``{"texts": texts}`` to ``{EMBEDDING_SERVICE_URL}/embed`` and returns
    the ``"embeddings"`` field of the JSON response.

    Args:
        texts: The texts to embed.

    Returns:
        The embeddings reported by the service, or an empty list when
        ``texts`` is empty or the response has no ``"embeddings"`` field.
        If the request fails, the service answers with an error status, or
        the body is not valid JSON, one hash-based pseudo-embedding per input
        text is returned instead.
    """
    if not texts:
        # Nothing to embed: skip the network round-trip entirely.
        return []

    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
            )
            response.raise_for_status()
            payload = response.json()
            return payload.get("embeddings", [])
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError raised for a non-JSON
            # body, which would otherwise bypass the fallback below.
            logger.error("Embedding service batch error: %s", e)
            # Fallback
            return [_generate_fallback_embedding(t) for t in texts]
def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalized plain text."""
    # Remove scripts and styles together with their contents.
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    # Replace the remaining tags with a space so neighbouring words stay apart.
    text = re.sub(r'<[^>]+>', ' ', markup)
    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    text = html.unescape(text)
    # Collapse whitespace (this also covers the U+00A0 produced by "&nbsp;").
    return re.sub(r'\s+', ' ', text).strip()


async def extract_text_from_url(url: str) -> str:
    """
    Extract text from a document URL (PDF, HTML, etc.).

    First posts ``{"url": url}`` to ``{EMBEDDING_SERVICE_URL}/extract-pdf`` and
    returns the ``"text"`` field of the JSON response. If that request fails
    or returns a body that is not valid JSON, the URL is fetched directly and,
    when the response's content type mentions HTML, its markup is reduced to
    plain text.

    Args:
        url: Location of the document to extract text from.

    Returns:
        The extracted text. An empty string is returned when the extraction
        response has no ``"text"`` field, when the directly fetched document
        is not HTML, or when the direct fetch fails.
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url},
            )
            response.raise_for_status()
            payload = response.json()
            return payload.get("text", "")
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError, so a malformed body
            # falls through to the direct fetch instead of propagating.
            logger.error("PDF extraction error for %s: %s", url, e)

        # Fallback: try to fetch HTML content directly
        try:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            # Media types are case-insensitive, so compare in lower case.
            if "html" not in content_type.lower():
                return ""
            return _html_to_text(response.text)
        except Exception as fetch_err:
            # Deliberately broad: this path is best-effort and must never
            # raise to the caller.
            logger.error("Fallback fetch error for %s: %s", url, fetch_err)
            return ""
def _generate_fallback_embedding(text: str) -> List[float]:
    """
    Generate deterministic pseudo-embedding from text hash.
    Used as fallback when embedding service is unavailable.

    The SHA-256 digest of the UTF-8 encoded text is read as eight
    little-endian 32-bit floats; each is reduced with ``% 1.0`` and the
    resulting eight values are repeated to fill 1024 dimensions.

    Args:
        text: The text to derive the pseudo-embedding from.

    Returns:
        A list of 1024 finite floats in the closed range [0.0, 1.0]. The same
        text always yields the same list. The values carry no semantic
        meaning.
    """
    dimensions = 1024
    digest = hashlib.sha256(text.encode()).digest()

    # Arbitrary hash bytes can decode to NaN or +/-inf, and ``x % 1.0`` is NaN
    # for those; map them to 0.0 so the vector never contains non-finite
    # values. The explicit "<" pins the byte order so the result does not
    # depend on the host platform.
    seed = [
        value % 1.0 if math.isfinite(value) else 0.0
        for (value,) in struct.iter_unpack('<f', digest)
    ]

    # Pad to 1024 dimensions by cycling through the seed values.
    repeats = -(-dimensions // len(seed))  # ceiling division
    return (seed * repeats)[:dimensions]