Files
breakpilot-lehrer/backend-lehrer/services/image_service.py
T
Benjamin Admin fdde5d43b3 Fix: Wikipedia User-Agent (was 403 Forbidden)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 23:51:46 +02:00

117 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Image Service — Fetches vocabulary images from Wikipedia + Emoji fallback.
On-demand: Images are fetched when a learning unit is created,
then cached in the vocabulary_words.image_url field.
Sources (in priority order):
1. Wikipedia REST API (free, no account needed, CC license)
2. Emoji fallback for abstract words
Later: Unsplash API (needs account), Stable Diffusion (local batch)
"""
import logging
import os
from typing import Optional
import httpx
logger = logging.getLogger(__name__)

# Hand-picked emoji for vocabulary entries that rarely have a usable photo.
# Keys are lower-case; insertion order matters because the lookup in
# get_emoji_for_word returns the first key that matches.
# NOTE(review): several entries map to an empty string — confirm whether the
# emoji were lost to an encoding problem or are intentionally blank.
EMOJI_FALLBACK: dict[str, str] = {
    "strong": "💪",
    "weak": "😩",
    "hard-working": "📚",
    "skinny": "🦴",
    "female": "👩",
    "male": "👨",
    "definite": "",
    "definitely": "",
    "even": "⚖️",
    "violent": "",
    "opinion": "💭",
    "message": "💬",
    "beginning": "🏁",
    "mention": "🗣️",
    "summarize": "📋",
    "mark": "✏️",
    "throw": "🤾",
    "take": "🤲",
    "sum": "",
    "on the one hand": "👐",
    "apple": "🍎",
    "gym": "🏋️",
    "medal": "🏅",
    "sportswoman": "🏃‍♀️",
    "role model": "",
    "tourist office": "🏨",
    "the olympics": "🏅",
    "box": "🥊",
    "football": "",
    "footballer": "",
}
def _wikipedia_title(word: str) -> str:
    """Reduce a vocabulary entry to a URL-safe Wikipedia title ("" if nothing is left)."""
    from urllib.parse import quote

    # Keep only the base form: "throw, threw, thrown" -> "throw".
    query = word.split(",")[0]
    # Drop the dictionary placeholders for "something" / "somebody".
    query = query.replace("sth.", "").replace("sb.", "")
    # Collapse the gaps left by the removals ("take sth. away" -> "take away").
    query = " ".join(query.split())
    # Strip a leading article regardless of case: "The Olympics" -> "Olympics".
    if query.lower().startswith("the "):
        query = query[4:]
    # Encode every reserved character (including "/", "?" and "#") so the word
    # always stays a single path segment; spaces become underscores first.
    return quote(query.replace(" ", "_"), safe="")


async def fetch_wikipedia_image(word: str) -> Optional[str]:
    """Fetch the thumbnail image URL of the English Wikipedia page for a word.

    The word is cleaned first (only the part before the first comma is kept,
    "sth."/"sb." and a leading "the" are removed) and URL-encoded before it
    is placed into the request path.

    Args:
        word: Vocabulary entry, e.g. "throw, threw, thrown" or "the Olympics".

    Returns:
        The thumbnail URL from the page summary, or None when the cleaned word
        is empty, the response is not 200, the summary has no thumbnail, or
        the lookup fails for any reason.
    """
    title = _wikipedia_title(word)
    if not title:
        # Nothing left to look up; skip the pointless request.
        return None

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(
                f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}",
                headers={"User-Agent": "BreakPilot/1.0 (https://breakpilot.com; education platform; contact@breakpilot.com)"},
                follow_redirects=True,
            )
        if resp.status_code == 200:
            # "thumbnail" may be absent or null; treat both as "no image".
            thumb = resp.json().get("thumbnail") or {}
            url = thumb.get("source")
            if url:
                logger.info("Wikipedia image for '%s': %s", word, url)
                return url
    except Exception as e:
        # Deliberately best-effort: any failure means "no image" so the caller
        # can fall back to something else.
        logger.debug("Wikipedia image lookup failed for '%s': %s", word, e)
    return None
def get_emoji_for_word(word: str) -> str:
    """Get an emoji representation for a word.

    The lower-cased word is checked against the keys of EMOJI_FALLBACK in
    insertion order; the first key that occurs as a whole word or phrase wins.
    Matching on word boundaries prevents accidental hits such as "even"
    inside "seven" or "take" inside "mistake".

    Args:
        word: Vocabulary entry, possibly a phrase such as "to throw sth.".

    Returns:
        The mapped emoji, or the generic "📝" when no key matches or the
        matching entry has no emoji. Never returns an empty string.
    """
    import re

    lower = word.lower()
    for key, emoji in EMOJI_FALLBACK.items():
        if not emoji:
            # A blank entry carries no usable emoji; keep searching so the
            # caller never receives an empty (falsy) result.
            continue
        # Lookarounds instead of \b so keys with hyphens or spaces work too.
        if re.search(rf"(?<!\w){re.escape(key)}(?!\w)", lower):
            return emoji
    # Generic fallback by part of speech could be added here
    return "📝"
async def get_image_for_word(word: str) -> str:
    """Resolve the best available image for a vocabulary word.

    A Wikipedia thumbnail URL is preferred; when the lookup yields nothing,
    the emoji chosen by get_emoji_for_word is returned instead. The result is
    meant to be stored in vocabulary_words.image_url.
    """
    # `or` falls through to the emoji only when Wikipedia gave no URL.
    return (await fetch_wikipedia_image(word)) or get_emoji_for_word(word)
async def enrich_words_with_images(word_ids: list[str]) -> int:
    """Fetch and store images for vocabulary words that don't have one yet.

    Works in three phases so that no pooled database connection is held
    while slow external image lookups are running:
    1. read the requested rows,
    2. resolve an image for every row whose image_url is empty,
    3. write the resolved images back.

    Args:
        word_ids: String UUIDs of rows in vocabulary_words.

    Returns:
        Number of rows for which an UPDATE was issued.

    Raises:
        ValueError: If an entry of word_ids is not a valid UUID string.
    """
    from vocabulary.db import get_pool
    import uuid

    # Parse IDs up front so malformed input fails before a connection is taken.
    ids = [uuid.UUID(wid) for wid in word_ids]
    pool = await get_pool()

    # Phase 1: read, then release the connection straight away.
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT id, english, image_url FROM vocabulary_words WHERE id = ANY($1::uuid[])",
            ids,
        )

    # Phase 2: network lookups, without occupying a pool connection.
    pending = []
    for row in rows:
        if row["image_url"]:
            continue  # Already has an image
        image = await get_image_for_word(row["english"])
        if image:
            pending.append((row["id"], row["english"], image))

    # Phase 3: write results back in one short-lived connection.
    updated = 0
    if pending:
        async with pool.acquire() as conn:
            for word_id, english, image in pending:
                await conn.execute(
                    "UPDATE vocabulary_words SET image_url = $1 WHERE id = $2",
                    image, word_id,
                )
                updated += 1
                logger.info("Image for '%s': %s...", english, image[:60])

    logger.info("Enriched %d words with images", updated)
    return updated