"""
Translation Service — Batch-translates vocabulary words into target languages.

Uses Ollama (local LLM) to translate EN/DE word pairs into TR, AR, UK, RU, PL
(display names for FR and ES are also mapped in LANGUAGE_NAMES).
Translations are cached in vocabulary_words.translations JSONB field.

All processing happens locally — no external API calls, GDPR-compliant.
"""

import json
import logging
import os
import re
import uuid
from typing import Any, Dict, List, Optional

import httpx

logger = logging.getLogger(__name__)

OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
TRANSLATION_MODEL = os.getenv("TRANSLATION_MODEL", "qwen3:30b-a3b")

# ISO 639-1 code -> language name inserted into the LLM prompt.
LANGUAGE_NAMES = {
    "tr": "Turkish",
    "ar": "Arabic",
    "uk": "Ukrainian",
    "ru": "Russian",
    "pl": "Polish",
    "fr": "French",
    "es": "Spanish",
}

# Reasoning models may wrap their scratch work in <think>...</think>; that text
# can contain stray brackets, so it is removed before looking for the JSON array.
_THINK_BLOCK_RE = re.compile(r"<think>[\s\S]*?</think>", re.IGNORECASE)
_JSON_DECODER = json.JSONDecoder()


def _extract_json_array(text: str) -> Optional[List[Dict[str, Any]]]:
    """Return the dict entries of the first usable JSON array found in *text*.

    Every '[' is tried as the start of a JSON document, so prose or bracketed
    notes before/after the real array do not break parsing (a single greedy
    first-'['-to-last-']' match would).

    Returns:
        The dict elements of the first array that contains at least one dict;
        an empty list if only arrays without dict elements were found;
        None if no JSON array could be decoded at all.
    """
    cleaned = _THINK_BLOCK_RE.sub("", text)
    found_array = False
    start = cleaned.find("[")
    while start != -1:
        try:
            value, _end = _JSON_DECODER.raw_decode(cleaned, start)
        except ValueError:
            value = None
        if isinstance(value, list):
            entries = [item for item in value if isinstance(item, dict)]
            if entries:
                return entries
            found_array = True
        start = cleaned.find("[", start + 1)
    return [] if found_array else None


async def translate_words_batch(
    words: List[Dict[str, str]],
    target_language: str,
    batch_size: int = 30,
) -> List[Dict[str, str]]:
    """
    Translate a batch of EN/DE word pairs into a target language.

    Words are sent to the Ollama ``/api/generate`` endpoint in chunks of
    ``batch_size``. A chunk whose request or response parsing fails is logged
    and skipped, so the result may contain fewer entries than ``words``.

    Args:
        words: List of dicts with 'english' and 'german' keys
        target_language: ISO 639-1 code (tr, ar, uk, ru, pl)
        batch_size: Words per LLM request

    Returns:
        List of dicts as returned by the LLM; the prompt asks for
        'english', 'translation', 'example' keys, but the keys are not
        validated here.
    """
    lang_name = LANGUAGE_NAMES.get(target_language, target_language)
    all_translations: List[Dict[str, str]] = []
    if not words:
        return all_translations

    # One client for all batches so the connection to Ollama is reused.
    async with httpx.AsyncClient(timeout=120.0) as client:
        batch_starts = range(0, len(words), batch_size)
        for batch_no, start in enumerate(batch_starts, start=1):
            batch = words[start:start + batch_size]
            # `or ''` keeps a missing/NULL German word from showing up as "None".
            word_list = "\n".join(
                f"{j}. {w['english']} = {w.get('german') or ''}"
                for j, w in enumerate(batch, start=1)
            )

            prompt = f"""Translate these English/German word pairs into {lang_name}.
For each word, provide the translation and a short example sentence in {lang_name}.

Words:
{word_list}

Reply ONLY with a JSON array, no explanation:
[
  {{"english": "word", "translation": "...", "example": "..."}},
  ...
]"""

            try:
                resp = await client.post(
                    f"{OLLAMA_BASE_URL}/api/generate",
                    json={
                        "model": TRANSLATION_MODEL,
                        "prompt": prompt,
                        "stream": False,
                        "options": {"temperature": 0.2, "num_predict": 4096},
                    },
                )
                resp.raise_for_status()
                response_text = resp.json().get("response") or ""
                batch_translations = _extract_json_array(str(response_text))
            except Exception as e:
                # Deliberately best-effort: one failed batch must not abort
                # the remaining ones.
                logger.error("Translation batch failed (%s): %s", lang_name, e)
                continue

            if batch_translations is None:
                logger.warning("No JSON array in LLM response for %s", lang_name)
                continue

            all_translations.extend(batch_translations)
            logger.info(
                "Translated batch %d: %d words → %s",
                batch_no,
                len(batch_translations),
                lang_name,
            )

    return all_translations


async def translate_and_store(
    word_ids: List[str],
    target_language: str,
) -> int:
    """
    Translate vocabulary words and store in the database.

    Fetches words from DB, translates via LLM, stores in translations JSONB.
    Skips words that already have a translation for the target language.
    LLM results are matched back to rows by their lower-cased English word;
    results that match no row or carry no translation text are ignored.

    Args:
        word_ids: UUID strings of vocabulary_words rows
        target_language: ISO 639-1 code used as the key inside translations

    Returns:
        Count of newly translated words.

    Raises:
        ValueError: If an entry of word_ids is not a valid UUID string.
    """
    # Imported locally as in the original code — presumably to avoid an
    # import cycle with vocabulary_db; confirm before moving to module level.
    from vocabulary_db import get_pool

    ids = [uuid.UUID(wid) for wid in word_ids]
    if not ids:
        return 0

    pool = await get_pool()

    async with pool.acquire() as conn:
        # Fetch words that need translation
        rows = await conn.fetch(
            """
            SELECT id, english, german, translations
            FROM vocabulary_words
            WHERE id = ANY($1::uuid[])
            """,
            ids,
        )

    words_to_translate = []
    word_map = {}
    for row in rows:
        translations = row["translations"] or {}
        if isinstance(translations, str):
            # JSONB may arrive as text when no codec is registered on the pool.
            translations = json.loads(translations)
        if target_language in translations:
            continue
        words_to_translate.append({
            "english": row["english"],
            "german": row["german"],
        })
        # NOTE: rows sharing the same English word collide here (last one wins).
        word_map[(row["english"] or "").strip().lower()] = row["id"]

    if not words_to_translate:
        logger.info(
            "All %d words already translated to %s", len(rows), target_language
        )
        return 0

    # Translate (no DB connection is held while waiting for the LLM)
    results = await translate_words_batch(words_to_translate, target_language)

    # Store results
    updated = 0
    async with pool.acquire() as conn:
        for result in results:
            # LLM output is untrusted: tolerate non-dict entries and
            # non-string fields instead of crashing mid-loop.
            if not isinstance(result, dict):
                continue
            english = result.get("english")
            translation = result.get("translation")
            if not isinstance(english, str) or not isinstance(translation, str):
                continue
            translation = translation.strip()
            if not translation:
                continue
            word_id = word_map.get(english.strip().lower())
            if word_id is None:
                continue
            example = result.get("example")
            if not isinstance(example, str):
                example = ""

            # COALESCE is required: NULL || jsonb is NULL in PostgreSQL, which
            # would silently discard the translation for rows whose
            # translations column has never been set.
            await conn.execute(
                """
                UPDATE vocabulary_words
                SET translations = COALESCE(translations, '{}'::jsonb) || $1::jsonb
                WHERE id = $2
                """,
                json.dumps({target_language: {
                    "text": translation,
                    "example": example,
                }}),
                word_id,
            )
            updated += 1

    logger.info("Stored %d translations for %s", updated, target_language)
    return updated