Add image service: Wikipedia photos + emoji fallback for vocabulary

image_service.py:
- Fetches thumbnails from the Wikipedia REST API (free, no account needed).
- Falls back to emoji for abstract words (30 mapped).

Auto-enrichment: When a learning unit is created, images are automatically fetched for all words that don't have one yet.

Manual endpoint: POST /api/vocabulary/enrich-images fills in images for existing words that have none.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 23:36:48 +02:00
parent 91d6918e2c
commit f6caa3091f
2 changed files with 142 additions and 0 deletions
@@ -0,0 +1,115 @@
 """
 Image Service — Fetches vocabulary images from Wikipedia + Emoji fallback.
 On-demand: Images are fetched when a learning unit is created,
 then cached in the vocabulary_words.image_url field.
 Sources (in priority order):
 1. Wikipedia REST API (free, no account needed, CC license)
 2. Emoji fallback for abstract words
 Later: Unsplash API (needs account), Stable Diffusion (local batch)
 """
 import logging
 import os
 from typing import Optional
 import httpx
 logger = logging.getLogger(__name__)
 # Emoji stand-ins for common words that rarely have a usable photo.
 # Keys are lowercase. Entry order is significant: the lookup walks this
 # mapping in insertion order and returns the first key that matches.
 EMOJI_FALLBACK: dict[str, str] = {
     "strong": "💪",
     "weak": "😩",
     "hard-working": "📚",
     "skinny": "🦴",
     "female": "👩",
     "male": "👨",
     "definite": "✅",
     "definitely": "✅",
     "even": "⚖️",
     "violent": "⚡",
     "opinion": "💭",
     "message": "💬",
     "beginning": "🏁",
     "mention": "🗣️",
     "summarize": "📋",
     "mark": "✏️",
     "throw": "🤾",
     "take": "🤲",
     "sum": "➕",
     "on the one hand": "👐",
     "apple": "🍎",
     "gym": "🏋️",
     "medal": "🏅",
     "sportswoman": "🏃‍♀️",
     "role model": "⭐",
     "tourist office": "🏨",
     "the olympics": "🏅",
     "box": "🥊",
     "football": "⚽",
     "footballer": "⚽",
 }
 async def fetch_wikipedia_image(word: str) -> Optional[str]:
     """Fetch a thumbnail image URL from Wikipedia for a vocabulary word.

     The vocabulary entry is first reduced to a lookup term: only the part
     before the first comma is kept ("throw, threw, thrown" → "throw"), the
     placeholders "sth." and "sb." are removed, runs of whitespace are
     collapsed and a leading "the " is dropped. The term is then
     percent-encoded and requested from the English Wikipedia page-summary
     endpoint.

     Args:
         word: Vocabulary entry as stored; may contain inflected forms or
             placeholders.

     Returns:
         The ``thumbnail.source`` URL from the summary response, or ``None``
         when the entry cleans down to nothing, the response is not HTTP 200,
         the page has no thumbnail, or the lookup fails for any reason.
     """
     # Function-scope import, matching how this module pulls in its other
     # late dependencies.
     from urllib.parse import quote

     # Reduce the vocabulary entry to a single lookup term.
     query = word.split(",")[0]  # "throw, threw, thrown" → "throw"
     query = query.replace("sth.", "").replace("sb.", "")
     query = " ".join(query.split())  # trims and closes gaps left by the removals
     if query.startswith("the "):
         query = query[4:]
     if not query:
         # Nothing left to look up; do not request the bare endpoint.
         return None

     # The term becomes a URL path segment: spaces turn into underscores and
     # everything else is percent-encoded so "/", "?" or "#" inside a
     # vocabulary entry cannot change which resource is requested.
     title = quote(query.replace(" ", "_"), safe="")

     try:
         # httpx does not follow redirects unless asked to; without this any
         # redirect answer from the endpoint would be treated as "no image".
         async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
             resp = await client.get(
                 f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}",
                 headers={"User-Agent": "BreakPilot/1.0 (education platform)"},
             )
             if resp.status_code != 200:
                 return None
             data = resp.json()
     except Exception as e:
         # Deliberately broad: the image is optional, so a network or decoding
         # problem must never propagate to the caller.
         logger.debug("Wikipedia image lookup failed for '%s': %s", word, e)
         return None

     # Tolerate a missing or null "thumbnail" and non-object JSON bodies.
     thumb = data.get("thumbnail") if isinstance(data, dict) else None
     url = thumb.get("source") if isinstance(thumb, dict) else None
     if url:
         logger.info("Wikipedia image for '%s': %s", word, url)
         return url
     return None
 def get_emoji_for_word(
     word: str, emoji_map: Optional[dict[str, str]] = None
 ) -> str:
     """Return an emoji that represents a vocabulary word.

     Matching is case-insensitive and works on whole words: a key matches when
     it appears in ``word`` delimited by non-word characters, optionally
     followed by a plain plural ending ("s" or "es"). This keeps "seven" from
     matching "even" and "mailbox" from matching "box", while "apples" still
     finds "apple". An exact match of the whole entry is tried first; after
     that the keys are tested in mapping order and the first hit wins.

     Args:
         word: Vocabulary entry to represent.
         emoji_map: Optional mapping of lowercase key → emoji to use instead
             of the module-level ``EMOJI_FALLBACK`` table.

     Returns:
         The matching emoji, or the generic "📝" when nothing matches.
     """
     import re  # function-scope import; the module does not import re itself

     table = EMOJI_FALLBACK if emoji_map is None else emoji_map
     lower = word.lower()

     exact = table.get(lower.strip())
     if exact:
         return exact

     for key, emoji in table.items():
         if not key:
             continue  # an empty key would match every word
         # Lookarounds instead of \b so keys with spaces or hyphens behave the
         # same as single words.
         pattern = rf"(?<!\w){re.escape(key.lower())}(?:e?s)?(?!\w)"
         if re.search(pattern, lower):
             return emoji
     # Generic fallback by part of speech could be added here
     return "📝"
 async def get_image_for_word(word: str) -> str:
     """Pick the best available image for a vocabulary word.

     A Wikipedia thumbnail is preferred; when none is found the emoji lookup
     supplies the result instead. The returned string is therefore either a
     URL or an emoji, and is meant to be stored in vocabulary_words.image_url.
     """
     wikipedia_url = await fetch_wikipedia_image(word)
     return wikipedia_url if wikipedia_url else get_emoji_for_word(word)
 async def enrich_words_with_images(word_ids: list[str]) -> int:
     """Fetch and store images for vocabulary words that don't have one yet.

     Only rows whose ``image_url`` is NULL or empty are processed. Image
     lookups run in small concurrent batches, and no database connection is
     held while they are in flight; a connection is taken from the pool only
     for the initial SELECT and for writing each batch of results.

     Args:
         word_ids: IDs of ``vocabulary_words`` rows, as UUID strings. Entries
             that cannot be parsed as a UUID are skipped with a warning.

     Returns:
         The number of rows whose ``image_url`` was filled in by this call.
     """
     import asyncio
     import uuid

     if not word_ids:
         return 0  # nothing to do; skip the database round trip

     parsed_ids = []
     for wid in word_ids:
         try:
             parsed_ids.append(uuid.UUID(str(wid)))
         except ValueError:
             # One malformed ID should not abort enrichment of the others.
             logger.warning("Skipping invalid vocabulary word id: %r", wid)
     if not parsed_ids:
         return 0

     # Imported at call time, as the original code did.
     from vocabulary.db import get_pool

     pool = await get_pool()
     async with pool.acquire() as conn:
         pending = list(
             await conn.fetch(
                 "SELECT id, english FROM vocabulary_words "
                 "WHERE id = ANY($1::uuid[]) "
                 "AND (image_url IS NULL OR image_url = '')",
                 parsed_ids,
             )
         )

     batch_size = 10  # bounds concurrent outbound requests
     updated = 0
     for start in range(0, len(pending), batch_size):
         batch = pending[start:start + batch_size]
         # Network lookups happen without holding a pooled connection.
         images = await asyncio.gather(
             *(get_image_for_word(row["english"]) for row in batch)
         )
         async with pool.acquire() as conn:
             for row, image in zip(batch, images):
                 if not image:
                     continue
                 # The extra condition keeps an image stored in the meantime
                 # from being overwritten; RETURNING tells us whether the row
                 # was actually changed.
                 changed = await conn.fetch(
                     "UPDATE vocabulary_words SET image_url = $1 "
                     "WHERE id = $2 AND (image_url IS NULL OR image_url = '') "
                     "RETURNING id",
                     image, row["id"],
                 )
                 if changed:
                     updated += 1
                     logger.info("Image for '%s': %s...", row["english"], image[:60])
     logger.info("Enriched %d words with images", updated)
     return updated
@@ -270,6 +270,13 @@ async def api_create_unit_from_words(payload: CreateUnitFromWordsPayload):
            },
        }, f, ensure_ascii=False, indent=2)
    # Auto-enrich words with images (Wikipedia + emoji fallback)
    try:
        from services.image_service import enrich_words_with_images
        await enrich_words_with_images(payload.word_ids)
    except Exception as e:
        logger.warning(f"Image enrichment failed (non-critical): {e}")
    logger.info(f"Created vocab unit {lu.id} with {len(words)} words")
    return {
@@ -347,6 +354,26 @@ async def api_bulk_import(payload: BulkImportPayload):
 # ---------------------------------------------------------------------------
@router.post("/enrich-images")
async def api_enrich_images(word_ids: List[str] = None):
    """Fetch and store images for vocabulary words (Wikipedia + emoji fallback).

    Args:
        word_ids: IDs of the words to enrich. When omitted or empty, every
            row of ``vocabulary_words`` whose ``image_url`` is empty or NULL
            is selected instead.

    Returns:
        ``{"enriched": 0, "message": ...}`` when there is nothing to process,
        otherwise ``{"enriched": <count>, "total": <number of ids processed>}``.
    """
    from services.image_service import enrich_words_with_images
    from vocabulary.db import get_pool

    if not word_ids:
        # No explicit selection: fall back to all words still lacking an image.
        pool = await get_pool()
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT id FROM vocabulary_words WHERE image_url = '' OR image_url IS NULL"
            )
        # Rows are already in memory, so the connection can be released first.
        word_ids = [str(r["id"]) for r in rows]

    if not word_ids:
        return {"enriched": 0, "message": "All words already have images"}

    count = await enrich_words_with_images(word_ids)
    return {"enriched": count, "total": len(word_ids)}
 class TranslateRequest(BaseModel):
    word_ids: List[str]
    target_language: str