[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions
--- a/klausur-service/backend/dsfa_rag_embedding.py
+++ b/klausur-service/backend/dsfa_rag_embedding.py
@@ -0,0 +1,116 @@
+"""
+DSFA RAG Embedding Service Integration.
+
+Handles embedding generation, text extraction, and fallback logic.
+"""
+
+import os
+import hashlib
+import logging
+import struct
+import re
+from typing import List
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Base URL of the embedding-service, overridable via the EMBEDDING_SERVICE_URL env var
+EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")
+
+
+async def get_embedding(text: str) -> List[float]:
+    """
+    Fetch the embedding vector for ``text`` from the embedding-service.
+
+    Posts to ``/embed-single`` and returns the reply's ``embedding`` field
+    (``[]`` if absent); the service is documented as BGE-M3, 1024 dimensions.
+    On ``httpx.HTTPError`` it logs and returns a hash-based pseudo-embedding.
+    """
+    endpoint = f"{EMBEDDING_SERVICE_URL}/embed-single"
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        try:
+            reply = await client.post(endpoint, json={"text": text})
+            reply.raise_for_status()
+            payload = reply.json()
+        except httpx.HTTPError as e:
+            logger.error(f"Embedding service error: {e}")
+            # Development fallback: deterministic vector derived from the text
+            return _generate_fallback_embedding(text)
+    return payload.get("embedding", [])
+
+
+async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
+    """
+    Fetch embedding vectors for several texts with one ``/embed`` request.
+
+    Returns the reply's ``embeddings`` field (``[]`` if absent). On
+    ``httpx.HTTPError`` it logs and returns one pseudo-embedding per text.
+    """
+    endpoint = f"{EMBEDDING_SERVICE_URL}/embed"
+    async with httpx.AsyncClient(timeout=120.0) as client:
+        try:
+            reply = await client.post(endpoint, json={"texts": texts})
+            reply.raise_for_status()
+            payload = reply.json()
+        except httpx.HTTPError as e:
+            logger.error(f"Embedding service batch error: {e}")
+            return list(map(_generate_fallback_embedding, texts))
+    return payload.get("embeddings", [])
+
+
+async def extract_text_from_url(url: str) -> str:
+    """
+    Return the plain text of the document at ``url`` (PDF, HTML, etc.).
+
+    The embedding-service ``/extract-pdf`` endpoint is asked first and its
+    ``text`` field returned (empty string if absent). If that request raises
+    ``httpx.HTTPError``, the URL is fetched directly: an HTML response is
+    reduced to text with regexes, any other content type yields an empty
+    string. Errors during the direct fetch are logged and also yield one.
+    """
+    async with httpx.AsyncClient(timeout=120.0) as client:
+        try:
+            # Preferred path: let the embedding-service do the extraction
+            extraction = await client.post(
+                f"{EMBEDDING_SERVICE_URL}/extract-pdf", json={"url": url}
+            )
+            extraction.raise_for_status()
+            payload = extraction.json()
+            return payload.get("text", "")
+        except httpx.HTTPError as e:
+            logger.error(f"PDF extraction error for {url}: {e}")
+        # Fallback: fetch the URL directly and strip the markup ourselves
+        try:
+            page = await client.get(url, follow_redirects=True)
+            page.raise_for_status()
+            if "html" not in page.headers.get("content-type", ""):
+                return ""
+            markup = page.text
+            # Drop <script>/<style> elements together with their contents
+            for element in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
+                markup = re.sub(element, '', markup, flags=re.DOTALL | re.IGNORECASE)
+            # Turn every remaining tag into a space, then collapse whitespace
+            without_tags = re.sub(r'<[^>]+>', ' ', markup)
+            return re.sub(r'\s+', ' ', without_tags).strip()
+        except Exception as fetch_err:
+            logger.error(f"Fallback fetch error for {url}: {fetch_err}")
+            return ""
+
+
+def _generate_fallback_embedding(text: str) -> List[float]:
+    """
+    Build a deterministic 1024-dim pseudo-embedding from SHA-256 of ``text``.
+
+    The 32 digest bytes are read as eight little-endian float32 values, reduced
+    to ``value % 1.0`` and tiled up to 1024 entries. Bit patterns that decode to
+    NaN/inf become 0.0, since a NaN component would poison the whole vector.
+    """
+    digest = hashlib.sha256(text.encode()).digest()
+    seed = []
+    for (raw,) in struct.iter_unpack('<f', digest):
+        frac = raw % 1.0
+        # inf % 1.0 and NaN % 1.0 are both NaN, and only NaN differs from itself
+        seed.append(frac if frac == frac else 0.0)
+    # 1024 is a multiple of len(seed) == 8, so plain repetition fills it exactly
+    return (seed * (1024 // len(seed) + 1))[:1024]