backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
117 lines
4.0 KiB
Python
117 lines
4.0 KiB
Python
"""
|
|
DSFA RAG Embedding Service Integration.
|
|
|
|
Handles embedding generation, text extraction, and fallback logic.
|
|
"""
|
|
|
|
import os
|
|
import hashlib
|
|
import logging
|
|
import struct
|
|
import re
|
|
from typing import List
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Embedding service configuration
|
|
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")
|
|
|
|
|
|
async def get_embedding(text: str) -> List[float]:
    """
    Get embedding for text using the embedding-service.

    POSTs ``{"text": text}`` to ``{EMBEDDING_SERVICE_URL}/embed-single`` and
    returns the ``"embedding"`` field of the JSON reply.

    Uses BGE-M3 model which produces 1024-dimensional vectors.

    Args:
        text: The text to embed.

    Returns:
        The embedding reported by the service. If the reply is valid JSON
        but has no ``"embedding"`` key, an empty list is returned. If the
        request fails or the reply body is not valid JSON, a hash-based
        pseudo-embedding from ``_generate_fallback_embedding`` is returned
        instead.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed-single",
                json={"text": text},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embedding", [])
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError: a 2xx reply with a
            # non-JSON body (e.g. a proxy error page) must take the same
            # fallback path as a transport or status error.
            logger.error(f"Embedding service error: {e}")
            # Fallback to hash-based pseudo-embedding for development
            return _generate_fallback_embedding(text)
|
|
|
|
|
|
async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts in batch.

    POSTs ``{"texts": texts}`` to ``{EMBEDDING_SERVICE_URL}/embed`` and
    returns the ``"embeddings"`` field of the JSON reply.

    Args:
        texts: The texts to embed.

    Returns:
        The list of embeddings reported by the service. If the reply is
        valid JSON but has no ``"embeddings"`` key, an empty list is
        returned. If the request fails or the reply body is not valid JSON,
        one hash-based pseudo-embedding per input text is returned instead.
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embeddings", [])
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError, so a malformed reply
            # body degrades to the fallback instead of propagating.
            logger.error(f"Embedding service batch error: {e}")
            # Fallback: one pseudo-embedding per input, in input order
            return [_generate_fallback_embedding(t) for t in texts]
|
|
|
|
|
|
async def extract_text_from_url(url: str) -> str:
    """
    Extract text from a document URL (PDF, HTML, etc.).

    First asks the embedding-service's ``/extract-pdf`` endpoint to extract
    the text. If that request fails or its reply is not valid JSON, the URL
    is fetched directly and, when the response is HTML, reduced to plain
    text with a simple regex-based tag stripper.

    Args:
        url: Location of the document to extract.

    Returns:
        The extracted text. An empty string is returned when the service
        reply has no ``"text"`` key, when the directly fetched content is
        not HTML, or when the fallback fetch fails.
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("text", "")
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError so that a malformed
            # service reply still reaches the direct-fetch fallback below.
            logger.error(f"PDF extraction error for {url}: {e}")

        # Fallback: try to fetch HTML content directly
        try:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            # Media types are case-insensitive, so normalize before matching.
            content_type = response.headers.get("content-type", "").lower()
            if "html" not in content_type:
                return ""
            return _html_to_text(response.text)
        except Exception as fetch_err:
            # Deliberately broad: this is a best-effort path and the
            # caller-supplied URL may fail in ways that are not
            # httpx.HTTPError (e.g. an invalid URL).
            logger.error(f"Fallback fetch error for {url}: {fetch_err}")
            return ""


def _html_to_text(html: str) -> str:
    """Strip script/style elements and all tags from HTML, collapsing whitespace."""
    # Remove scripts and styles, including their contents
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace remaining tags with a space so adjacent words do not fuse
    text = re.sub(r'<[^>]+>', ' ', html)
    # Clean whitespace
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
def _generate_fallback_embedding(text: str) -> List[float]:
|
|
"""
|
|
Generate deterministic pseudo-embedding from text hash.
|
|
Used as fallback when embedding service is unavailable.
|
|
"""
|
|
hash_bytes = hashlib.sha256(text.encode()).digest()
|
|
embedding = []
|
|
for i in range(0, min(len(hash_bytes), 128), 4):
|
|
val = struct.unpack('f', hash_bytes[i:i+4])[0]
|
|
embedding.append(val % 1.0)
|
|
|
|
# Pad to 1024 dimensions
|
|
while len(embedding) < 1024:
|
|
embedding.extend(embedding[:min(len(embedding), 1024 - len(embedding))])
|
|
|
|
return embedding[:1024]
|