Add image service: Wikipedia photos + emoji fallback for vocabulary

image_service.py: Fetches thumbnail from Wikipedia REST API (free,
no account). Falls back to emoji for abstract words (40+ mapped).

Auto-enrichment: When a learning unit is created, images are
automatically fetched for all words that don't have one yet.

Manual endpoint: POST /api/vocabulary/enrich-images fills images
for existing words without images.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-27 23:36:48 +02:00
parent 91d6918e2c
commit f6caa3091f
2 changed files with 142 additions and 0 deletions

View File

@@ -0,0 +1,115 @@
"""
Image Service — Fetches vocabulary images from Wikipedia + Emoji fallback.
On-demand: Images are fetched when a learning unit is created,
then cached in the vocabulary_words.image_url field.
Sources (in priority order):
1. Wikipedia REST API (free, no account needed, CC license)
2. Emoji fallback for abstract words
Later: Unsplash API (needs account), Stable Diffusion (local batch)
"""
import logging
import os
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
# Emoji stand-ins for words (mostly abstract ones) where a photo is a poor fit.
# Insertion order matters: lookups walk this mapping from top to bottom and
# stop at the first hit.
# NOTE(review): several entries map to an empty string — confirm whether an
# emoji was intended there and got lost, or whether blank is deliberate.
EMOJI_FALLBACK: dict[str, str] = {
    "strong": "💪",
    "weak": "😩",
    "hard-working": "📚",
    "skinny": "🦴",
    "female": "👩",
    "male": "👨",
    "definite": "",
    "definitely": "",
    "even": "⚖️",
    "violent": "",
    "opinion": "💭",
    "message": "💬",
    "beginning": "🏁",
    "mention": "🗣️",
    "summarize": "📋",
    "mark": "✏️",
    "throw": "🤾",
    "take": "🤲",
    "sum": "",
    "on the one hand": "👐",
    "apple": "🍎",
    "gym": "🏋️",
    "medal": "🏅",
    "sportswoman": "🏃‍♀️",
    "role model": "",
    "tourist office": "🏨",
    "the olympics": "🏅",
    "box": "🥊",
    "football": "",
    "footballer": "",
}
async def fetch_wikipedia_image(word: str) -> Optional[str]:
    """Fetch a thumbnail image URL from Wikipedia for a vocabulary word.

    The word is reduced to a lookup term first: only the part before the
    first comma is kept, the placeholders "sth." / "sb." are removed,
    whitespace is collapsed and a leading "the " is dropped. The term is
    then requested from the English Wikipedia page-summary endpoint.

    Args:
        word: Vocabulary entry as stored, e.g. "throw, threw, thrown".

    Returns:
        The thumbnail URL reported by Wikipedia, or None when the lookup
        term is empty, the page has no thumbnail, the response is not
        HTTP 200, or the request fails for any reason (best effort).
    """
    from urllib.parse import quote

    # "throw, threw, thrown" -> "throw": only the first listed form is used.
    query = (word or "").split(",")[0]
    # Drop the placeholders for "something" / "somebody".
    query = query.replace("sth.", "").replace("sb.", "")
    # Removing placeholders can leave double spaces ("take  away").
    query = " ".join(query.split())
    if query.lower().startswith("the "):
        query = query[4:]
    if not query:
        # Nothing left to look up; avoid a pointless request.
        return None

    # The title is a single path segment: encode everything, including "/",
    # "?" and "#", so it cannot alter the request path or query string.
    title = quote(query.replace(" ", "_"), safe="")

    try:
        # httpx does not follow redirects unless asked to; without this a
        # 3xx answer from the summary endpoint would count as "no image".
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            resp = await client.get(
                f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}",
                headers={"User-Agent": "BreakPilot/1.0 (education platform)"},
            )
        if resp.status_code == 200:
            data = resp.json()
            # "thumbnail" may be missing or null for pages without an image.
            thumb = data.get("thumbnail") or {}
            url = thumb.get("source")
            if url:
                logger.info("Wikipedia image for '%s': %s", word, url)
                return url
    except Exception as e:
        # Deliberately best effort: a failed lookup must never break the
        # caller, which falls back to an emoji instead.
        logger.debug("Wikipedia image lookup failed for '%s': %s", word, e)
    return None
def get_emoji_for_word(word: str) -> str:
    """Return an emoji representation for a vocabulary word.

    The word and every key of EMOJI_FALLBACK are normalised to lower-case
    alphanumeric tokens, and a key matches only as a whole word or phrase.
    So "take sth." matches "take", but "seven" does not match "even" and
    "summer" does not match "sum". Entries are tried in mapping order, and
    entries whose emoji is blank are skipped.

    Args:
        word: Vocabulary entry, e.g. "throw, threw, thrown".

    Returns:
        The first matching non-empty emoji, or the generic "📝" when
        nothing matches.
    """

    def _normalise(text: str) -> str:
        # Lower-case, turn punctuation into spaces, collapse whitespace.
        spaced = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return " ".join(spaced.split())

    # Pad with spaces so phrase containment only succeeds on token boundaries.
    haystack = f" {_normalise(word or '')} "
    for key, emoji in EMOJI_FALLBACK.items():
        if not emoji:
            # A blank entry carries no usable symbol; keep looking.
            continue
        needle = _normalise(key)
        if needle and f" {needle} " in haystack:
            return emoji
    # Generic fallback by part of speech could be added here
    return "📝"
async def get_image_for_word(word: str) -> str:
    """Pick the best available picture for a vocabulary word.

    A Wikipedia thumbnail URL wins when one is found; otherwise the emoji
    chosen by get_emoji_for_word is returned. The result is meant to be
    stored in vocabulary_words.image_url.
    """
    wikipedia_url = await fetch_wikipedia_image(word)
    # An empty or missing URL falls through to the emoji lookup.
    return wikipedia_url or get_emoji_for_word(word)
async def enrich_words_with_images(word_ids: list[str]) -> int:
    """Fetch and store images for vocabulary words that don't have one yet.

    Rows are loaded from vocabulary_words by id; rows that already have a
    non-empty image_url are left alone. For each remaining row an image is
    resolved via get_image_for_word and written back to image_url.

    A pooled connection is held only for the initial SELECT and for each
    individual UPDATE, never while waiting on an image lookup, so slow
    lookups do not keep a database connection checked out.

    Args:
        word_ids: String UUIDs of the vocabulary words to enrich.

    Returns:
        The number of rows whose image_url was updated.

    Raises:
        ValueError: If an entry in word_ids is not a valid UUID string.
    """
    if not word_ids:
        # Nothing to do; skip the database round trip entirely.
        return 0

    # Imported here as in the original code (presumably to avoid an import
    # cycle with the vocabulary package — verify before moving to the top).
    from vocabulary.db import get_pool
    import uuid

    ids = [uuid.UUID(wid) for wid in word_ids]
    pool = await get_pool()

    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT id, english, image_url FROM vocabulary_words WHERE id = ANY($1::uuid[])",
            ids,
        )

    updated = 0
    for row in rows:
        if row["image_url"]:
            continue  # Already has an image
        # Image lookup happens without holding a pooled connection.
        image = await get_image_for_word(row["english"])
        if not image:
            continue
        # Persist each result immediately so completed work survives a
        # later failure in the loop.
        async with pool.acquire() as conn:
            await conn.execute(
                "UPDATE vocabulary_words SET image_url = $1 WHERE id = $2",
                image, row["id"],
            )
        updated += 1
        logger.info(f"Image for '{row['english']}': {image[:60]}...")

    logger.info(f"Enriched {updated} words with images")
    return updated