[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
116
klausur-service/backend/dsfa_rag_embedding.py
Normal file
116
klausur-service/backend/dsfa_rag_embedding.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
DSFA RAG Embedding Service Integration.
|
||||
|
||||
Handles embedding generation, text extraction, and fallback logic.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import logging
|
||||
import struct
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Embedding service configuration
|
||||
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")
|
||||
|
||||
|
||||
async def get_embedding(text: str) -> List[float]:
    """
    Get the embedding for a single text from the embedding-service.

    POSTs ``{"text": text}`` to ``{EMBEDDING_SERVICE_URL}/embed-single`` and
    returns the ``"embedding"`` field of the JSON response (an empty list
    when the field is absent). The service uses the BGE-M3 model, which
    produces 1024-dimensional vectors.

    Args:
        text: The text to embed.

    Returns:
        The embedding reported by the service. If the request fails, the
        service answers with an error status, or the response body is not a
        JSON object, the error is logged and a deterministic hash-based
        pseudo-embedding is returned instead. That fallback exists for
        development and carries no semantic meaning.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed-single",
                json={"text": text},
            )
            response.raise_for_status()
            # Decode inside the try block: a 2xx reply with a non-JSON body
            # raises ValueError, which must trigger the fallback as well.
            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected response payload type {type(data).__name__}"
                )
        except (httpx.HTTPError, ValueError) as e:
            logger.error(f"Embedding service error: {e}")
            # Fallback to hash-based pseudo-embedding for development
            return _generate_fallback_embedding(text)

        return data.get("embedding", [])
|
||||
|
||||
|
||||
async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts with one batch request.

    POSTs ``{"texts": texts}`` to ``{EMBEDDING_SERVICE_URL}/embed`` and
    returns the ``"embeddings"`` field of the JSON response (an empty list
    when the field is absent).

    Args:
        texts: The texts to embed.

    Returns:
        The embeddings reported by the service. An empty ``texts`` list
        yields ``[]`` without contacting the service. If the request fails,
        the service answers with an error status, or the response body is
        not a JSON object, the error is logged and one deterministic
        hash-based pseudo-embedding per input text is returned instead.
    """
    # Nothing to embed: skip the network round-trip entirely.
    if not texts:
        return []

    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
            )
            response.raise_for_status()
            # Decode inside the try block: a 2xx reply with a non-JSON body
            # raises ValueError, which must trigger the fallback as well.
            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected response payload type {type(data).__name__}"
                )
        except (httpx.HTTPError, ValueError) as e:
            logger.error(f"Embedding service batch error: {e}")
            # Fallback
            return [_generate_fallback_embedding(t) for t in texts]

        return data.get("embeddings", [])
|
||||
|
||||
|
||||
async def extract_text_from_url(url: str) -> str:
    """
    Extract text from a document URL (PDF, HTML, etc.).

    First asks the embedding-service's ``/extract-pdf`` endpoint to extract
    the text. If that request fails, answers with an error status, or
    returns a body that is not a JSON object, the URL is fetched directly
    and, when the response's content type mentions HTML, reduced to plain
    text.

    Args:
        url: Location of the document.

    Returns:
        The extracted text. An empty string is returned when the service
        reports no text, when the directly fetched document is not HTML, or
        when the fallback fetch fails as well (the error is logged).
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url},
            )
            response.raise_for_status()
            # A 2xx reply with a non-JSON body raises ValueError here; it is
            # handled like any other service failure so the fallback runs.
            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected response payload type {type(data).__name__}"
                )
            # Map a missing or null "text" field to "" to honour the -> str
            # contract.
            return data.get("text") or ""
        except (httpx.HTTPError, ValueError) as e:
            logger.error(f"PDF extraction error for {url}: {e}")

        # Fallback: try to fetch HTML content directly
        try:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            # Media types are case-insensitive, so compare in lower case.
            if "html" not in content_type.lower():
                return ""
            return _html_to_text(response.text)
        except Exception as fetch_err:
            # Best-effort boundary: any failure here means "no text".
            logger.error(f"Fallback fetch error for {url}: {fetch_err}")
            return ""


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to plain text with normalised whitespace."""
    import html

    # Remove scripts and styles together with their content
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    # Remove the remaining tags
    text = re.sub(r'<[^>]+>', ' ', markup)
    # Decode character references (&uuml;, &amp;, &nbsp;, ...) only after the
    # tags are gone, so an escaped "&lt;b&gt;" is not mistaken for markup.
    text = html.unescape(text)
    # Clean whitespace (also collapses the non-breaking spaces from &nbsp;)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
|
||||
def _generate_fallback_embedding(text: str) -> List[float]:
    """
    Generate a deterministic pseudo-embedding from the SHA-256 hash of text.

    Used as fallback when the embedding service is unavailable. The vector
    carries no semantic meaning; it only guarantees that equal texts map to
    equal vectors of the expected size.

    Args:
        text: The text to derive the vector from.

    Returns:
        A list of 1024 floats, each in the half-open range [0, 1). The
        eight values derived from the digest are repeated to fill the
        vector.
    """
    dimensions = 1024
    digest = hashlib.sha256(text.encode("utf-8")).digest()

    # Read the 32-byte digest as eight big-endian unsigned 32-bit integers
    # and scale them into [0, 1). Reinterpreting the raw bytes as float32
    # could yield NaN or infinity, and native byte order would make the
    # result platform-dependent.
    words = struct.unpack(">8I", digest)
    seed = [word / 4294967296.0 for word in words]

    # Tile the seed values until the target dimensionality is reached.
    repeats = -(-dimensions // len(seed))  # ceiling division
    return (seed * repeats)[:dimensions]
|
||||
Reference in New Issue
Block a user