Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s

grid/ package (16 files):
  grid/build/   — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/  — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:30:20 +02:00
parent 098a2ff092
commit 59c400b9aa
58 changed files with 8803 additions and 8659 deletions

View File

@@ -0,0 +1,6 @@
"""
Vocab package — restructured from vocab_* flat modules.
Backward-compatible re-exports: consumers can still use
``from vocab_worksheet_api import ...`` etc. via the shim files in backend/.
"""

View File

@@ -0,0 +1,196 @@
"""
Vocab Learn Bridge — Converts vocabulary session data into Learning Units.
Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators).
Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation.
DATENSCHUTZ (data protection): All communication stays within the Docker network (breakpilot-network).
"""
import os
import json
import logging
import httpx
from typing import List, Dict, Any, Optional
logger = logging.getLogger(__name__)
BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001")
def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Convert vocabulary entries from a vocab session into the analysis_data format
expected by backend-lehrer generators (MC, Cloze, QA).
The generators consume:
- title: Display name
- subject: Subject area
- grade_level: Target grade
- canonical_text: Full text representation
- printed_blocks: Individual text blocks
- vocabulary: Original vocab data (for vocab-specific modules)
"""
canonical_lines = []
printed_blocks = []
for v in vocabulary:
en = v.get("english", "").strip()
de = v.get("german", "").strip()
example = v.get("example_sentence", "").strip()
if not en and not de:
continue
line = f"{en} = {de}"
if example:
line += f" ({example})"
canonical_lines.append(line)
block_text = f"{en} = {de}"
if example:
block_text += f" | {example}"
printed_blocks.append({"text": block_text})
return {
"title": session_name,
"subject": "English Vocabulary",
"grade_level": "5-8",
"canonical_text": "\n".join(canonical_lines),
"printed_blocks": printed_blocks,
"vocabulary": vocabulary,
}
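# Example (sketch): vocab_to_analysis_data("Unit 1",
#     [{"english": "apple", "german": "Apfel", "example_sentence": "I eat an apple."}])
# returns canonical_text "apple = Apfel (I eat an apple.)", one printed block per
# non-empty entry, and the original vocabulary list passed through unchanged.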
async def create_learning_unit(
session_name: str,
vocabulary: List[Dict[str, Any]],
grade: Optional[str] = None,
) -> Dict[str, Any]:
"""
Create a Learning Unit in backend-lehrer from vocabulary data.
Steps:
1. Create the unit via POST /api/learning-units/
2. Write the analysis_data JSON to ~/Arbeitsblaetter/Lerneinheiten/<unit_id>_analyse.json
3. Return the created unit info
Returns dict with unit_id, status, vocabulary_count.
"""
if not vocabulary:
raise ValueError("No vocabulary entries provided")
analysis_data = vocab_to_analysis_data(session_name, vocabulary)
async with httpx.AsyncClient(timeout=30.0) as client:
# 1. Create Learning Unit
create_payload = {
"title": session_name,
"subject": "Englisch",
"grade": grade or "5-8",
}
try:
resp = await client.post(
f"{BACKEND_LEHRER_URL}/api/learning-units/",
json=create_payload,
)
resp.raise_for_status()
unit = resp.json()
except httpx.HTTPError as e:
logger.error(f"Failed to create learning unit: {e}")
raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}")
unit_id = unit.get("id")
if not unit_id:
raise RuntimeError("Learning Unit created but no ID returned")
logger.info(f"Created learning unit {unit_id} with {len(vocabulary)} vocabulary entries")
# 2. Save analysis_data as JSON file for generators
analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
os.makedirs(analysis_dir, exist_ok=True)
analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json")
with open(analysis_path, "w", encoding="utf-8") as f:
json.dump(analysis_data, f, ensure_ascii=False, indent=2)
logger.info(f"Saved analysis data to {analysis_path}")
return {
"unit_id": unit_id,
"unit": unit,
"analysis_path": analysis_path,
"vocabulary_count": len(vocabulary),
"status": "created",
}
async def generate_learning_modules(
unit_id: str,
analysis_path: str,
) -> Dict[str, Any]:
"""
Trigger MC, Cloze, and QA generation from analysis data.
Calls the backend-lehrer generation endpoints (QA, MC, Cloze) over the Docker
network; each module's result is reported independently.
Returns dict with generation results.
"""
results = {
"unit_id": unit_id,
"mc": {"status": "pending"},
"cloze": {"status": "pending"},
"qa": {"status": "pending"},
}
# Load analysis data
with open(analysis_path, "r", encoding="utf-8") as f:
analysis_data = json.load(f)
# Try to generate via backend-lehrer API
async with httpx.AsyncClient(timeout=120.0) as client:
# Generate QA (includes Leitner fields)
try:
resp = await client.post(
f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-qa",
json={"analysis_data": analysis_data, "num_questions": min(len(analysis_data.get("vocabulary", [])), 20)},
)
if resp.status_code == 200:
results["qa"] = {"status": "generated", "data": resp.json()}
else:
logger.warning(f"QA generation returned {resp.status_code}")
results["qa"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
except Exception as e:
logger.warning(f"QA generation failed: {e}")
results["qa"] = {"status": "error", "reason": str(e)}
# Generate MC
try:
resp = await client.post(
f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-mc",
json={"analysis_data": analysis_data, "num_questions": min(len(analysis_data.get("vocabulary", [])), 10)},
)
if resp.status_code == 200:
results["mc"] = {"status": "generated", "data": resp.json()}
else:
results["mc"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
except Exception as e:
logger.warning(f"MC generation failed: {e}")
results["mc"] = {"status": "error", "reason": str(e)}
# Generate Cloze
try:
resp = await client.post(
f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-cloze",
json={"analysis_data": analysis_data},
)
if resp.status_code == 200:
results["cloze"] = {"status": "generated", "data": resp.json()}
else:
results["cloze"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"}
except Exception as e:
logger.warning(f"Cloze generation failed: {e}")
results["cloze"] = {"status": "error", "reason": str(e)}
return results
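
A hedged usage sketch of the bridge (session name, vocabulary payload, and grade are illustrative; both calls assume backend-lehrer is reachable at BACKEND_LEHRER_URL):

import asyncio
from vocab.learn_bridge import create_learning_unit, generate_learning_modules

async def demo():
    vocabulary = [
        {"english": "apple", "german": "Apfel", "example_sentence": "I eat an apple."},
        {"english": "house", "german": "Haus"},
    ]
    # Step 1: create the Learning Unit and persist the analysis JSON to disk
    created = await create_learning_unit("Unit 3 vocabulary", vocabulary, grade="6")
    # Step 2: trigger QA/MC/Cloze generation for the new unit
    generation = await generate_learning_modules(created["unit_id"], created["analysis_path"])
    print(created["unit_id"], generation["qa"]["status"])

asyncio.run(demo())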

View File

@@ -0,0 +1,427 @@
"""
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
Replaces in-memory storage with database persistence.
See migrations/001_vocab_sessions.sql for schema.
"""
import os
import uuid
import logging
import json
from typing import Optional, List, Dict, Any
import asyncpg
logger = logging.getLogger(__name__)
# Database configuration
DATABASE_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
)
# Connection pool (initialized lazily)
_pool: Optional[asyncpg.Pool] = None
async def get_pool() -> asyncpg.Pool:
"""Get or create the database connection pool."""
global _pool
if _pool is None:
_pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
return _pool
async def init_vocab_tables():
"""
Initialize vocab tables if they don't exist.
This is called at startup.
"""
pool = await get_pool()
async with pool.acquire() as conn:
# Check if tables exist
tables_exist = await conn.fetchval("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vocab_sessions'
)
""")
if not tables_exist:
logger.info("Creating vocab tables...")
# Read and execute migration
migration_path = os.path.join(
os.path.dirname(__file__),
"migrations/001_vocab_sessions.sql"
)
if os.path.exists(migration_path):
with open(migration_path, "r") as f:
sql = f.read()
await conn.execute(sql)
logger.info("Vocab tables created successfully")
else:
logger.warning(f"Migration file not found: {migration_path}")
else:
logger.debug("Vocab tables already exist")
# =============================================================================
# SESSION OPERATIONS
# =============================================================================
async def create_session_db(
session_id: str,
name: str,
description: str = "",
source_language: str = "en",
target_language: str = "de"
) -> Dict[str, Any]:
"""Create a new vocabulary session in the database."""
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow("""
INSERT INTO vocab_sessions (
id, name, description, source_language, target_language,
status, vocabulary_count
) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
RETURNING *
""", uuid.UUID(session_id), name, description, source_language, target_language)
return _row_to_dict(row)
async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
"""Get a session by ID."""
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT * FROM vocab_sessions WHERE id = $1
""", uuid.UUID(session_id))
if row:
return _row_to_dict(row)
return None
async def list_sessions_db(
limit: int = 50,
offset: int = 0,
status: Optional[str] = None
) -> List[Dict[str, Any]]:
"""List all sessions with optional filtering."""
pool = await get_pool()
async with pool.acquire() as conn:
if status:
rows = await conn.fetch("""
SELECT * FROM vocab_sessions
WHERE status = $1
ORDER BY created_at DESC
LIMIT $2 OFFSET $3
""", status, limit, offset)
else:
rows = await conn.fetch("""
SELECT * FROM vocab_sessions
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
""", limit, offset)
return [_row_to_dict(row) for row in rows]
async def update_session_db(
session_id: str,
**kwargs
) -> Optional[Dict[str, Any]]:
"""Update a session with given fields."""
pool = await get_pool()
# Build dynamic UPDATE query
fields = []
values = []
param_idx = 1
allowed_fields = [
'name', 'description', 'status', 'vocabulary_count',
'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'
]
for key, value in kwargs.items():
if key in allowed_fields:
fields.append(f"{key} = ${param_idx}")
# Convert dicts/lists to JSON for JSONB columns
if key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages']:
value = json.dumps(value) if value else None
values.append(value)
param_idx += 1
if not fields:
return await get_session_db(session_id)
values.append(uuid.UUID(session_id))
async with pool.acquire() as conn:
row = await conn.fetchrow(f"""
UPDATE vocab_sessions
SET {', '.join(fields)}
WHERE id = ${param_idx}
RETURNING *
""", *values)
if row:
return _row_to_dict(row)
return None
async def delete_session_db(session_id: str) -> bool:
"""Delete a session and all related data (cascades)."""
pool = await get_pool()
async with pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM vocab_sessions WHERE id = $1
""", uuid.UUID(session_id))
return result == "DELETE 1"
# =============================================================================
# VOCABULARY OPERATIONS
# =============================================================================
async def add_vocabulary_db(
session_id: str,
vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Add vocabulary entries to a session."""
if not vocab_list:
return []
pool = await get_pool()
results = []
async with pool.acquire() as conn:
for vocab in vocab_list:
vocab_id = str(uuid.uuid4())
row = await conn.fetchrow("""
INSERT INTO vocab_entries (
id, session_id, english, german, example_sentence,
example_sentence_gap, word_type, source_page
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
RETURNING *
""",
uuid.UUID(vocab_id),
uuid.UUID(session_id),
vocab.get('english', ''),
vocab.get('german', ''),
vocab.get('example_sentence'),
vocab.get('example_sentence_gap'),
vocab.get('word_type'),
vocab.get('source_page')
)
results.append(_row_to_dict(row))
# Update vocabulary count
await conn.execute("""
UPDATE vocab_sessions
SET vocabulary_count = (
SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
)
WHERE id = $1
""", uuid.UUID(session_id))
return results
async def get_vocabulary_db(
session_id: str,
source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
"""Get vocabulary entries for a session."""
pool = await get_pool()
async with pool.acquire() as conn:
if source_page is not None:
rows = await conn.fetch("""
SELECT * FROM vocab_entries
WHERE session_id = $1 AND source_page = $2
ORDER BY created_at
""", uuid.UUID(session_id), source_page)
else:
rows = await conn.fetch("""
SELECT * FROM vocab_entries
WHERE session_id = $1
ORDER BY source_page NULLS LAST, created_at
""", uuid.UUID(session_id))
return [_row_to_dict(row) for row in rows]
async def update_vocabulary_db(
entry_id: str,
**kwargs
) -> Optional[Dict[str, Any]]:
"""Update a single vocabulary entry."""
pool = await get_pool()
fields = []
values = []
param_idx = 1
allowed_fields = [
'english', 'german', 'example_sentence', 'example_sentence_gap',
'word_type', 'source_page'
]
for key, value in kwargs.items():
if key in allowed_fields:
fields.append(f"{key} = ${param_idx}")
values.append(value)
param_idx += 1
if not fields:
return None
values.append(uuid.UUID(entry_id))
async with pool.acquire() as conn:
row = await conn.fetchrow(f"""
UPDATE vocab_entries
SET {', '.join(fields)}
WHERE id = ${param_idx}
RETURNING *
""", *values)
if row:
return _row_to_dict(row)
return None
async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
"""Clear all vocabulary for a specific page."""
pool = await get_pool()
async with pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM vocab_entries
WHERE session_id = $1 AND source_page = $2
""", uuid.UUID(session_id), page)
# Update vocabulary count
await conn.execute("""
UPDATE vocab_sessions
SET vocabulary_count = (
SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
)
WHERE id = $1
""", uuid.UUID(session_id))
# Return count of deleted rows
count = int(result.split()[-1]) if result else 0
return count
# =============================================================================
# WORKSHEET OPERATIONS
# =============================================================================
async def create_worksheet_db(
session_id: str,
worksheet_types: List[str],
pdf_path: Optional[str] = None,
solution_path: Optional[str] = None
) -> Dict[str, Any]:
"""Create a worksheet record."""
pool = await get_pool()
worksheet_id = str(uuid.uuid4())
async with pool.acquire() as conn:
row = await conn.fetchrow("""
INSERT INTO vocab_worksheets (
id, session_id, worksheet_types, pdf_path, solution_path
) VALUES ($1, $2, $3, $4, $5)
RETURNING *
""",
uuid.UUID(worksheet_id),
uuid.UUID(session_id),
json.dumps(worksheet_types),
pdf_path,
solution_path
)
return _row_to_dict(row)
async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
"""Get a worksheet by ID."""
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT * FROM vocab_worksheets WHERE id = $1
""", uuid.UUID(worksheet_id))
if row:
return _row_to_dict(row)
return None
async def delete_worksheets_for_session_db(session_id: str) -> int:
"""Delete all worksheets for a session."""
pool = await get_pool()
async with pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM vocab_worksheets WHERE session_id = $1
""", uuid.UUID(session_id))
count = int(result.split()[-1]) if result else 0
return count
# =============================================================================
# PDF CACHE OPERATIONS
# =============================================================================
# Simple in-memory cache for PDF data (temporary until served)
_pdf_cache: Dict[str, bytes] = {}
def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
"""Cache PDF data temporarily for download."""
_pdf_cache[worksheet_id] = pdf_data
def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
"""Get cached PDF data."""
return _pdf_cache.get(worksheet_id)
def clear_cached_pdf_data(worksheet_id: str) -> None:
"""Clear cached PDF data."""
_pdf_cache.pop(worksheet_id, None)
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
"""Convert asyncpg Record to dict with proper type handling."""
if row is None:
return {}
result = dict(row)
# Convert UUIDs to strings
for key in ['id', 'session_id']:
if key in result and result[key] is not None:
result[key] = str(result[key])
# Convert datetimes to ISO strings
for key in ['created_at', 'updated_at', 'generated_at']:
if key in result and result[key] is not None:
result[key] = result[key].isoformat()
# Parse JSONB fields back to dicts/lists
for key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])
return result
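
A hedged sketch of driving the store from startup or test code (session name and entries are illustrative; DATABASE_URL must point at a reachable PostgreSQL instance with the migration file in place):

import asyncio
import uuid
from vocab.session_store import (
    init_vocab_tables, create_session_db, add_vocabulary_db, get_vocabulary_db,
)

async def demo():
    await init_vocab_tables()                      # creates the tables on first run
    session_id = str(uuid.uuid4())
    await create_session_db(session_id, name="Unit 3 vocabulary")
    await add_vocabulary_db(session_id, [
        {"english": "apple", "german": "Apfel", "source_page": 1},
    ])
    entries = await get_vocabulary_db(session_id)
    print(f"{len(entries)} entries stored for session {session_id}")

asyncio.run(demo())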

View File

@@ -0,0 +1,5 @@
"""
Vocab worksheet sub-package.
Main entry point: ``from vocab.worksheet.api import router``
"""

View File

@@ -0,0 +1,472 @@
"""
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
extract-with-boxes, deskewed images, and learning unit generation.
The two large handlers (compare_ocr_methods, analyze_grid) live in
compare_api.py (vocab/worksheet/compare_api.py) and are included via compare_router.
"""
from fastapi import APIRouter, Body, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, Dict, Any
from datetime import datetime
import os
import io
import json
import logging
def _get_sessions():
from .api import _sessions
return _sessions
def _get_local_storage_path():
from .api import LOCAL_STORAGE_PATH
return LOCAL_STORAGE_PATH
from .generation import convert_pdf_page_to_image
# Try to import Tesseract extractor
try:
from tesseract_vocab_extractor import (
extract_bounding_boxes, TESSERACT_AVAILABLE,
)
except ImportError:
TESSERACT_AVAILABLE = False
# Try to import Grid Detection Service
try:
from services.grid_detection_service import GridDetectionService
GRID_SERVICE_AVAILABLE = True
except ImportError:
GRID_SERVICE_AVAILABLE = False
logger = logging.getLogger(__name__)
analysis_router = APIRouter()
def _ocr_export_dir():
return os.path.join(_get_local_storage_path(), "ocr-exports")
def _ground_truth_dir():
return os.path.join(_get_local_storage_path(), "ground-truth")
# =============================================================================
# OCR Export Endpoints (for cross-app OCR data sharing)
# =============================================================================
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
"""
Save OCR export data for cross-app sharing (admin-v2 -> studio-v2).
Both apps proxy to klausur-service via /klausur-api/, so this endpoint
serves as shared storage accessible from both ports.
"""
logger.info(f"Saving OCR export for session {session_id}, page {page_number}")
os.makedirs(_ocr_export_dir(), exist_ok=True)
# Save the export data
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
with open(export_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# Update latest pointer
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
with open(latest_path, 'w', encoding='utf-8') as f:
json.dump({
"session_id": session_id,
"page_number": page_number,
"saved_at": datetime.utcnow().isoformat(),
}, f, ensure_ascii=False, indent=2)
return {
"success": True,
"session_id": session_id,
"page_number": page_number,
"message": "OCR export saved successfully",
}
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
"""Load a specific OCR export by session and page number."""
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(export_path):
raise HTTPException(status_code=404, detail="OCR export not found")
with open(export_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
"""Load the most recently saved OCR export data."""
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
if not os.path.exists(latest_path):
raise HTTPException(status_code=404, detail="No OCR exports found")
with open(latest_path, 'r', encoding='utf-8') as f:
pointer = json.load(f)
session_id = pointer.get("session_id")
page_number = pointer.get("page_number")
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(export_path):
raise HTTPException(status_code=404, detail="Latest OCR export file not found")
with open(export_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
# =============================================================================
# Extract with Boxes & Deskewed Image
# =============================================================================
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.
Returns dict with 'entries' list and 'image_width'/'image_height'.
Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex.
All bbox coordinates are in percent (0-100).
"""
if not TESSERACT_AVAILABLE:
raise HTTPException(status_code=500, detail="Tesseract not available")
if not GRID_SERVICE_AVAILABLE:
raise HTTPException(status_code=500, detail="GridDetectionService not available")
# Step 1: Tesseract word-level bounding boxes
tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
words = tess_result.get("words", [])
img_w = tess_result.get("image_width", 0)
img_h = tess_result.get("image_height", 0)
if not words or img_w == 0 or img_h == 0:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 2: Convert to OCR regions (percentage-based)
service = GridDetectionService()
regions = service.convert_tesseract_regions(words, img_w, img_h)
if not regions:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 3: Detect grid
grid_result = service.detect_grid(regions)
if not grid_result.cells:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 4: Group cells by logical_row and column_type
from services.grid_detection_service import ColumnType
entries = []
for row_idx, row_cells in enumerate(grid_result.cells):
en_text = ""
de_text = ""
ex_text = ""
en_bbox = None
de_bbox = None
ex_bbox = None
row_conf_sum = 0.0
row_conf_count = 0
for cell in row_cells:
cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
"w": round(cell.width, 2), "h": round(cell.height, 2)}
if cell.column_type == ColumnType.ENGLISH:
en_text = cell.text.strip()
en_bbox = cell_bbox
elif cell.column_type == ColumnType.GERMAN:
de_text = cell.text.strip()
de_bbox = cell_bbox
elif cell.column_type == ColumnType.EXAMPLE:
ex_text = cell.text.strip()
ex_bbox = cell_bbox
if cell.text.strip():
row_conf_sum += cell.confidence
row_conf_count += 1
# Skip completely empty rows
if not en_text and not de_text and not ex_text:
continue
# Calculate whole-row bounding box
all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None]
if all_bboxes:
row_x = min(b["x"] for b in all_bboxes)
row_y = min(b["y"] for b in all_bboxes)
row_right = max(b["x"] + b["w"] for b in all_bboxes)
row_bottom = max(b["y"] + b["h"] for b in all_bboxes)
row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2),
"w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)}
else:
row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}
avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1)
entries.append({
"row_index": row_idx,
"english": en_text,
"german": de_text,
"example": ex_text,
"confidence": avg_conf,
"bbox": row_bbox,
"bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
"bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
"bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
})
return {"entries": entries, "image_width": img_w, "image_height": img_h}
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
"""Extract vocabulary entries with bounding boxes for ground truth labeling.
Uses Tesseract + GridDetectionService for spatial positioning.
page_number is 0-indexed.
"""
logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Convert page to hires image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
# Deskew image before OCR
deskew_angle = 0.0
try:
from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
if CV2_AVAILABLE:
image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
except Exception as e:
logger.warning(f"Deskew failed for page {page_number}: {e}")
# Cache deskewed image in session for later serving
if "deskewed_images" not in session:
session["deskewed_images"] = {}
session["deskewed_images"][str(page_number)] = image_data
# Extract entries with boxes (now on deskewed image)
result = await extract_entries_with_boxes(image_data)
# Cache in session
if "gt_entries" not in session:
session["gt_entries"] = {}
session["gt_entries"][str(page_number)] = result["entries"]
return {
"success": True,
"entries": result["entries"],
"entry_count": len(result["entries"]),
"image_width": result["image_width"],
"image_height": result["image_height"],
"deskew_angle": round(deskew_angle, 2),
"deskewed": abs(deskew_angle) > 0.05,
}
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
"""Return the deskewed page image as PNG.
Falls back to the original hires image if no deskewed version is cached.
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
deskewed = session.get("deskewed_images", {}).get(str(page_number))
if deskewed:
return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")
# Fallback: render original hires image
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
# =============================================================================
# Ground Truth Labeling
# =============================================================================
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
"""Save ground truth labels for a page.
Expects body with 'entries' list - each entry has english, german, example,
status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
"""
logger.info(f"Save ground truth for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
entries = data.get("entries", [])
if not entries:
raise HTTPException(status_code=400, detail="No entries provided")
# Save in session
session = _get_sessions()[session_id]
if "ground_truth" not in session:
session["ground_truth"] = {}
session["ground_truth"][str(page_number)] = entries
# Also save to disk
os.makedirs(_ground_truth_dir(), exist_ok=True)
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
gt_data = {
"session_id": session_id,
"page_number": page_number,
"saved_at": datetime.now().isoformat(),
"entry_count": len(entries),
"entries": entries,
}
with open(gt_path, 'w', encoding='utf-8') as f:
json.dump(gt_data, f, ensure_ascii=False, indent=2)
logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")
confirmed = sum(1 for e in entries if e.get("status") == "confirmed")
edited = sum(1 for e in entries if e.get("status") == "edited")
skipped = sum(1 for e in entries if e.get("status") == "skipped")
return {
"success": True,
"saved_count": len(entries),
"confirmed": confirmed,
"edited": edited,
"skipped": skipped,
"file_path": gt_path,
}
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
"""Load saved ground truth for a page."""
logger.info(f"Load ground truth for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
# Try session cache first
session = _get_sessions()[session_id]
cached = session.get("ground_truth", {}).get(str(page_number))
if cached:
return {"success": True, "entries": cached, "source": "cache"}
# Try disk
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(gt_path):
raise HTTPException(status_code=404, detail="No ground truth found for this page")
with open(gt_path, 'r', encoding='utf-8') as f:
gt_data = json.load(f)
return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"}
# ─── Learning Module Generation ─────────────────────────────────────────────
class GenerateLearningUnitRequest(BaseModel):
grade: Optional[str] = None
generate_modules: bool = True
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: Optional[GenerateLearningUnitRequest] = None):
"""
Create a Learning Unit from the vocabulary in this session.
1. Takes vocabulary from the session
2. Creates a Learning Unit in backend-lehrer
3. Optionally triggers MC/Cloze/QA generation
Returns the created unit info and generation status.
"""
if request is None:
request = GenerateLearningUnitRequest()
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
vocabulary = session.get("vocabulary", [])
if not vocabulary:
raise HTTPException(status_code=400, detail="No vocabulary in this session")
try:
from vocab.learn_bridge import create_learning_unit, generate_learning_modules
# Step 1: Create Learning Unit
result = await create_learning_unit(
session_name=session["name"],
vocabulary=vocabulary,
grade=request.grade,
)
# Step 2: Generate modules if requested
if request.generate_modules:
try:
gen_result = await generate_learning_modules(
unit_id=result["unit_id"],
analysis_path=result["analysis_path"],
)
result["generation"] = gen_result
except Exception as e:
logger.warning(f"Module generation failed (unit created): {e}")
result["generation"] = {"status": "error", "reason": str(e)}
return result
except ImportError:
raise HTTPException(status_code=501, detail="vocab.learn_bridge module not available")
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except RuntimeError as e:
raise HTTPException(status_code=502, detail=str(e))
# =============================================================================
# Include compare_ocr_methods & analyze_grid from companion module
# =============================================================================
from .compare_api import compare_router # noqa: E402
analysis_router.include_router(compare_router)
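
A hedged client-side sketch of the ground-truth round trip exposed above (base URL and session id are placeholders; the session must already hold an uploaded PDF):

import httpx

BASE = "http://localhost:8000/api/v1/vocab"     # placeholder host/port for klausur-service
SESSION_ID = "<existing-session-uuid>"          # placeholder

with httpx.Client(timeout=120.0) as client:
    # 1. Extract entries with bounding boxes for page 0 (deskews the page first)
    boxes = client.post(f"{BASE}/sessions/{SESSION_ID}/extract-with-boxes/0").json()
    # 2. Confirm every proposed entry and save it as ground truth for that page
    entries = [dict(e, status="confirmed") for e in boxes["entries"]]
    saved = client.post(
        f"{BASE}/sessions/{SESSION_ID}/ground-truth/0",
        json={"entries": entries},
    ).json()
    print(saved["confirmed"], "entries confirmed")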

View File

@@ -0,0 +1,498 @@
"""
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
vocabulary editing, worksheet generation, and PDF downloads.
Sub-routers (included at bottom):
- upload_api (vocab/worksheet/upload_api.py): PDF upload, thumbnails, page processing
- analysis_api (vocab/worksheet/analysis_api.py): OCR compare, grid analysis, ground truth
"""
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
from fastapi.responses import StreamingResponse
from typing import List, Dict, Any
from datetime import datetime
import uuid
import os
import io
import logging
logger = logging.getLogger(__name__)
# --- Imports from extracted sub-modules ---
from .models import (
SessionStatus,
VocabularyEntry,
SessionCreate,
SessionResponse,
VocabularyResponse,
VocabularyUpdate,
WorksheetGenerateRequest,
WorksheetResponse,
)
from .extraction import extract_vocabulary_from_image
from .generation import (
generate_worksheet_html, generate_worksheet_pdf,
convert_pdf_page_to_image,
)
# --- Database integration (used by main.py lifespan) ---
try:
from vocab.session_store import (
DATABASE_URL, get_pool, init_vocab_tables,
list_sessions_db, get_session_db,
)
except ImportError:
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
get_pool = None
init_vocab_tables = None
list_sessions_db = None
get_session_db = None
_db_pool = None
def set_db_pool(pool):
"""Set the database connection pool (called from main.py lifespan)."""
global _db_pool
_db_pool = pool
async def _init_vocab_table():
"""Initialize vocab tables in database."""
if init_vocab_tables:
try:
await init_vocab_tables()
logger.info("vocab_session_cache table ready")
except Exception as e:
logger.warning(f"Failed to init vocab tables: {e}")
else:
logger.info("vocab_session_cache table ready")
async def _load_all_sessions():
"""Load all vocab sessions from database into memory cache."""
if not list_sessions_db:
logger.info("Loaded 0 vocab sessions from database")
return
try:
sessions = await list_sessions_db(limit=500)
count = 0
for s in sessions:
sid = s.get("id") or s.get("session_id")
if sid and sid not in _sessions:
_sessions[sid] = {
"id": sid,
"name": s.get("name", ""),
"description": s.get("description", ""),
"status": s.get("status", "created"),
"vocabulary_count": s.get("vocabulary_count", 0),
"source_language": s.get("source_language", "en"),
"target_language": s.get("target_language", "de"),
"created_at": str(s.get("created_at", "")),
}
count += 1
logger.info(f"Loaded {count} vocab sessions from database")
except Exception as e:
logger.warning(f"Failed to load sessions from database: {e}")
# --- Router & module-level state ---
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
_sessions: Dict[str, Dict[str, Any]] = {}
_worksheets: Dict[str, Dict[str, Any]] = {}
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
"""Create a new vocabulary extraction session."""
session_id = str(uuid.uuid4())
session_data = {
"id": session_id,
"name": session.name,
"description": session.description,
"source_language": session.source_language,
"target_language": session.target_language,
"status": SessionStatus.PENDING.value,
"vocabulary": [],
"vocabulary_count": 0,
"image_path": None,
"extraction_confidence": None,
"created_at": datetime.utcnow(),
}
_sessions[session_id] = session_data
# Create storage directory
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
os.makedirs(session_dir, exist_ok=True)
return SessionResponse(
id=session_id,
name=session.name,
description=session.description,
source_language=session.source_language,
target_language=session.target_language,
status=SessionStatus.PENDING.value,
vocabulary_count=0,
image_path=None,
created_at=session_data["created_at"],
)
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
"""List all vocabulary sessions."""
sessions = sorted(
_sessions.values(),
key=lambda x: x["created_at"],
reverse=True
)[:limit]
return [
SessionResponse(
id=s["id"],
name=s["name"],
description=s.get("description"),
source_language=s["source_language"],
target_language=s["target_language"],
status=s["status"],
vocabulary_count=s.get("vocabulary_count", 0),
image_path=s.get("image_path"),
created_at=s["created_at"],
)
for s in sessions
]
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
"""Get a specific session."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
s = _sessions[session_id]
return SessionResponse(
id=s["id"],
name=s["name"],
description=s.get("description"),
source_language=s["source_language"],
target_language=s["target_language"],
status=s["status"],
vocabulary_count=s.get("vocabulary_count", 0),
image_path=s.get("image_path"),
created_at=s["created_at"],
)
@router.post("/sessions/{session_id}/upload")
async def upload_image(
session_id: str,
file: UploadFile = File(...),
):
"""
Upload a textbook page image or PDF and extract vocabulary.
Supported formats: PNG, JPG, JPEG, PDF
"""
logger.info(f"Upload request for session {session_id}")
logger.info(f"File: filename={file.filename}, content_type={file.content_type}")
if session_id not in _sessions:
logger.error(f"Session {session_id} not found")
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
# Validate file type - check both extension and content type
extension = file.filename.split('.')[-1].lower() if file.filename else ''
content_type = file.content_type or ''
# Accept images and PDFs
valid_image_extensions = ['png', 'jpg', 'jpeg']
valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
is_pdf = extension == 'pdf' or content_type == 'application/pdf'
is_image = extension in valid_image_extensions or content_type in valid_image_content_types
if not is_pdf and not is_image:
logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
raise HTTPException(
status_code=400,
detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
)
# Determine final extension for saving
if is_pdf:
save_extension = 'png' # PDFs will be converted to PNG
elif extension in valid_image_extensions:
save_extension = extension
elif content_type == 'image/png':
save_extension = 'png'
else:
save_extension = 'jpg'
# Read file content
content = await file.read()
logger.info(f"Read {len(content)} bytes from uploaded file")
# Convert PDF to image if needed
if is_pdf:
logger.info("Converting PDF to image...")
content = await convert_pdf_page_to_image(content, page_number=0)
logger.info(f"PDF converted, image size: {len(content)} bytes")
# Save image
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
os.makedirs(session_dir, exist_ok=True)
image_path = os.path.join(session_dir, f"source.{save_extension}")
with open(image_path, 'wb') as f:
f.write(content)
# Update session status
session["status"] = SessionStatus.PROCESSING.value
session["image_path"] = image_path
# Extract vocabulary using Vision LLM
vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)
# Update session with extracted vocabulary
session["vocabulary"] = [v.dict() for v in vocabulary]
session["vocabulary_count"] = len(vocabulary)
session["extraction_confidence"] = confidence
session["status"] = SessionStatus.EXTRACTED.value
result = {
"session_id": session_id,
"filename": file.filename,
"image_path": image_path,
"vocabulary_count": len(vocabulary),
"extraction_confidence": confidence,
"status": SessionStatus.EXTRACTED.value,
}
if error:
result["error"] = error
return result
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
"""Get extracted vocabulary for a session."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
return VocabularyResponse(
session_id=session_id,
vocabulary=vocabulary,
extraction_confidence=session.get("extraction_confidence"),
)
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
"""Update vocabulary entries (for manual corrections)."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
session["vocabulary"] = [v.dict() for v in update.vocabulary]
session["vocabulary_count"] = len(update.vocabulary)
return {
"session_id": session_id,
"vocabulary_count": len(update.vocabulary),
"message": "Vocabulary updated successfully",
}
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
"""Generate worksheet PDF(s) from extracted vocabulary."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
if not vocabulary:
raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")
worksheet_id = str(uuid.uuid4())
title = request.title or session["name"]
# Generate HTML for each worksheet type
combined_html = ""
for wtype in request.worksheet_types:
html = generate_worksheet_html(
vocabulary=vocabulary,
worksheet_type=wtype,
title=f"{title} - {wtype.value}",
show_solutions=False,
repetitions=request.repetitions,
line_height=request.line_height,
)
combined_html += html + '<div style="page-break-after: always;"></div>'
# Generate PDF
try:
pdf_bytes = await generate_worksheet_pdf(combined_html)
except Exception as e:
raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
# Save PDF
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
with open(pdf_path, 'wb') as f:
f.write(pdf_bytes)
# Generate solution PDF if requested
solution_path = None
if request.include_solutions:
solution_html = ""
for wtype in request.worksheet_types:
html = generate_worksheet_html(
vocabulary=vocabulary,
worksheet_type=wtype,
title=f"{title} - {wtype.value} (Loesung)",
show_solutions=True,
repetitions=request.repetitions,
line_height=request.line_height,
)
solution_html += html + '<div style="page-break-after: always;"></div>'
solution_bytes = await generate_worksheet_pdf(solution_html)
solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
with open(solution_path, 'wb') as f:
f.write(solution_bytes)
# Store worksheet info
worksheet_data = {
"id": worksheet_id,
"session_id": session_id,
"worksheet_types": [wt.value for wt in request.worksheet_types],
"pdf_path": pdf_path,
"solution_path": solution_path,
"generated_at": datetime.utcnow(),
}
_worksheets[worksheet_id] = worksheet_data
# Update session status
session["status"] = SessionStatus.COMPLETED.value
return WorksheetResponse(
id=worksheet_id,
session_id=session_id,
worksheet_types=worksheet_data["worksheet_types"],
pdf_path=pdf_path,
solution_path=solution_path,
generated_at=worksheet_data["generated_at"],
)
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
"""Download the generated worksheet PDF."""
if worksheet_id not in _worksheets:
raise HTTPException(status_code=404, detail="Worksheet not found")
worksheet = _worksheets[worksheet_id]
pdf_path = worksheet["pdf_path"]
if not os.path.exists(pdf_path):
raise HTTPException(status_code=404, detail="PDF file not found")
with open(pdf_path, 'rb') as f:
pdf_bytes = f.read()
return StreamingResponse(
io.BytesIO(pdf_bytes),
media_type="application/pdf",
headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
)
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
"""Download the solution PDF."""
if worksheet_id not in _worksheets:
raise HTTPException(status_code=404, detail="Worksheet not found")
worksheet = _worksheets[worksheet_id]
solution_path = worksheet.get("solution_path")
if not solution_path or not os.path.exists(solution_path):
raise HTTPException(status_code=404, detail="Solution PDF not found")
with open(solution_path, 'rb') as f:
pdf_bytes = f.read()
return StreamingResponse(
io.BytesIO(pdf_bytes),
media_type="application/pdf",
headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
)
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
"""Get the uploaded source image for a session."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
image_path = session.get("image_path")
if not image_path or not os.path.exists(image_path):
raise HTTPException(status_code=404, detail="Image not found")
# Determine content type
extension = image_path.split('.')[-1].lower()
content_type = {
'png': 'image/png',
'jpg': 'image/jpeg',
'jpeg': 'image/jpeg',
}.get(extension, 'application/octet-stream')
with open(image_path, 'rb') as f:
image_bytes = f.read()
return StreamingResponse(
io.BytesIO(image_bytes),
media_type=content_type,
)
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
"""Delete a vocabulary session and all associated files."""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
# Delete session directory
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
if os.path.exists(session_dir):
import shutil
shutil.rmtree(session_dir)
# Remove from storage
del _sessions[session_id]
# Remove associated worksheets
for wid, ws in list(_worksheets.items()):
if ws["session_id"] == session_id:
del _worksheets[wid]
return {"message": "Session deleted successfully", "session_id": session_id}
# --- Include sub-routers ---
from .upload_api import upload_router
from .analysis_api import analysis_router
router.include_router(upload_router)
router.include_router(analysis_router)
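
A hedged end-to-end sketch against the core routes (host/port and the scanned page file are placeholders):

import httpx

BASE = "http://localhost:8000/api/v1/vocab"     # placeholder host/port for klausur-service

with httpx.Client(timeout=300.0) as client:
    # 1. Create a session
    session = client.post(f"{BASE}/sessions", json={
        "name": "Unit 3 vocabulary",
        "description": "Textbook page 42",
        "source_language": "en",
        "target_language": "de",
    }).json()
    sid = session["id"]
    # 2. Upload a scanned page (PNG/JPG/PDF); extraction runs as part of the upload
    with open("page42.png", "rb") as fh:        # placeholder file
        upload = client.post(
            f"{BASE}/sessions/{sid}/upload",
            files={"file": ("page42.png", fh, "image/png")},
        ).json()
    print(upload["vocabulary_count"], "entries extracted")
    # 3. Fetch the extracted vocabulary for review or manual correction
    vocab = client.get(f"{BASE}/sessions/{sid}/vocabulary").json()

Worksheet generation and download then follow the same pattern via POST /sessions/{id}/generate (WorksheetGenerateRequest body) and GET /worksheets/{id}/pdf.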

View File

@@ -0,0 +1,542 @@
"""
Vocabulary Worksheet Compare & Grid Analysis API.
Split from vocab_worksheet_analysis_api.py — contains the two largest
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
"""
from fastapi import APIRouter, HTTPException, Query
import base64
import json
import logging
import os
from .extraction import extract_vocabulary_from_image
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
def _get_sessions():
from .api import _sessions
return _sessions
from .generation import convert_pdf_page_to_image
# Try to import Tesseract extractor
try:
from tesseract_vocab_extractor import (
run_tesseract_pipeline,
match_positions_to_vocab, TESSERACT_AVAILABLE,
)
except ImportError:
TESSERACT_AVAILABLE = False
# Try to import CV Pipeline
try:
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
CV_PIPELINE_AVAILABLE = False
# Try to import Grid Detection Service
try:
from services.grid_detection_service import GridDetectionService
GRID_SERVICE_AVAILABLE = True
except ImportError:
GRID_SERVICE_AVAILABLE = False
logger = logging.getLogger(__name__)
compare_router = APIRouter()
# =============================================================================
# OCR Compare & Grid Analysis Endpoints
# =============================================================================
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
"""
Run multiple OCR methods on a page and compare results.
This endpoint:
1. Gets the page image from the session's uploaded PDF
2. Runs Vision LLM extraction (primary method)
3. Optionally runs Tesseract extraction
4. Compares found vocabulary across methods
5. Returns structured comparison results
page_number is 0-indexed.
"""
import time
logger.info(f"Compare OCR for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Convert page to image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
methods_results = {}
all_vocab_sets = {}
# --- Method: Vision LLM ---
try:
start = time.time()
vocab, confidence, error = await extract_vocabulary_from_image(
image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
)
duration = time.time() - start
vocab_list = []
for v in vocab:
entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
vocab_list.append({
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example": entry.get("example_sentence", ""),
})
methods_results["vision_llm"] = {
"name": "Vision LLM",
"model": VISION_MODEL,
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list),
"vocabulary": vocab_list,
"confidence": confidence,
"success": len(vocab_list) > 0 and not error,
"error": error if error else None,
}
all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
except Exception as e:
logger.error(f"Vision LLM failed: {e}")
methods_results["vision_llm"] = {
"name": "Vision LLM",
"model": VISION_MODEL,
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["vision_llm"] = set()
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
if TESSERACT_AVAILABLE:
try:
start = time.time()
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
duration = time.time() - start
tess_vocab = tess_result.get("vocabulary", [])
tess_words = tess_result.get("words", [])
# Store Tesseract words in session for later use (grid analysis, position matching)
session["tesseract_words"] = tess_words
session["tesseract_image_width"] = tess_result.get("image_width", 0)
session["tesseract_image_height"] = tess_result.get("image_height", 0)
session[f"tesseract_page_{page_number}"] = tess_result
vocab_list_tess = []
for v in tess_vocab:
vocab_list_tess.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr (eng+deu)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_tess),
"vocabulary": vocab_list_tess,
"confidence": 0.7 if tess_vocab else 0,
"success": len(vocab_list_tess) > 0,
"error": tess_result.get("error"),
"word_count": tess_result.get("word_count", 0),
"columns_detected": len(tess_result.get("columns", [])),
}
all_vocab_sets["tesseract"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_tess if v["english"] and v["german"]
}
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
llm_vocab_with_bbox = match_positions_to_vocab(
tess_words,
methods_results["vision_llm"]["vocabulary"],
tess_result.get("image_width", 1),
tess_result.get("image_height", 1),
)
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
except Exception as e:
logger.error(f"Tesseract failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["tesseract"] = set()
# --- Method: CV Pipeline (Document Reconstruction) ---
if CV_PIPELINE_AVAILABLE:
try:
start = time.time()
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
duration = time.time() - start
cv_vocab = cv_result.vocabulary if not cv_result.error else []
vocab_list_cv = []
for v in cv_vocab:
vocab_list_cv.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_cv),
"vocabulary": vocab_list_cv,
"confidence": 0.8 if cv_vocab else 0,
"success": len(vocab_list_cv) > 0,
"error": cv_result.error,
"word_count": cv_result.word_count,
"columns_detected": cv_result.columns_detected,
"stages": cv_result.stages,
}
all_vocab_sets["cv_pipeline"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_cv if v["english"] and v["german"]
}
except Exception as e:
logger.error(f"CV Pipeline failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["cv_pipeline"] = set()
# --- Build comparison ---
all_unique = set()
for vs in all_vocab_sets.values():
all_unique |= vs
found_by_all = []
found_by_some = []
for english, german in sorted(all_unique):
found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
entry = {"english": english, "german": german, "methods": found_in}
if len(found_in) == len(all_vocab_sets):
found_by_all.append(entry)
else:
found_by_some.append(entry)
total_methods = max(len(all_vocab_sets), 1)
agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
# Find best method
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
return {
"session_id": session_id,
"page_number": page_number,
"methods": methods_results,
"comparison": {
"found_by_all_methods": found_by_all,
"found_by_some_methods": found_by_some,
"total_unique_vocabulary": len(all_unique),
"agreement_rate": agreement_rate,
},
"recommendation": {
"best_method": best_method,
"reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
},
}
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
"""
Analyze the grid/table structure of a vocabulary page.
Hybrid approach:
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
real spatial positions via GridDetectionService.
2. Otherwise fall back to Vision LLM for grid structure detection.
page_number is 0-indexed.
Returns GridData structure expected by the frontend GridOverlay component.
"""
import httpx
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Convert page to image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
tess_page_data = session.get(f"tesseract_page_{page_number}")
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
try:
# Run Tesseract if not already cached
if not tess_page_data:
logger.info("Running Tesseract for grid analysis (not cached)")
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
tess_page_data = await _run_tess(image_data, lang="eng+deu")
session[f"tesseract_page_{page_number}"] = tess_page_data
session["tesseract_words"] = tess_page_data.get("words", [])
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
tess_words = tess_page_data.get("words", [])
img_w = tess_page_data.get("image_width", 0)
img_h = tess_page_data.get("image_height", 0)
if tess_words and img_w > 0 and img_h > 0:
service = GridDetectionService()
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
if regions:
grid_result = service.detect_grid(regions)
grid_dict = grid_result.to_dict()
# Merge LLM text if available (better quality than Tesseract text)
# The LLM vocab was stored during compare-ocr
grid_dict["source"] = "tesseract+grid_service"
grid_dict["word_count"] = len(tess_words)
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
f"{grid_result.stats.get('recognized', 0)} recognized")
return {"success": True, "grid": grid_dict}
logger.info("Tesseract data insufficient, falling back to LLM")
except Exception as e:
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
import traceback
logger.debug(traceback.format_exc())
# --- Strategy 2: Fall back to Vision LLM ---
image_base64 = base64.b64encode(image_data).decode("utf-8")
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
Return a JSON object with this EXACT structure:
{
"rows": <number of rows>,
"columns": <number of columns>,
"column_types": ["english", "german", "example"],
"entries": [
{
"row": 0,
"col": 0,
"text": "the word or phrase in this cell",
"column_type": "english",
"confidence": 0.95
}
]
}
Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""
try:
import asyncio
raw_text = ""
max_retries = 3
for attempt in range(max_retries):
async with httpx.AsyncClient(timeout=300.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/chat",
json={
"model": VISION_MODEL,
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
timeout=300.0,
)
if response.status_code == 500 and attempt < max_retries - 1:
wait_time = 10 * (attempt + 1)
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
await asyncio.sleep(wait_time)
continue
elif response.status_code != 200:
error_detail = response.text[:200] if response.text else "Unknown error"
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
raw_text = response.json().get("message", {}).get("content", "")
break
# Parse JSON from response
import re
json_match = re.search(r'\{[\s\S]*\}', raw_text)
if not json_match:
return {"success": False, "error": "Could not parse grid structure from LLM response"}
grid_raw = json.loads(json_match.group())
num_rows = grid_raw.get("rows", 0)
num_cols = grid_raw.get("columns", 0)
column_types = grid_raw.get("column_types", [])
entries = grid_raw.get("entries", [])
if num_rows == 0 or num_cols == 0:
return {"success": False, "error": "No grid structure detected"}
# Ensure column_types has the right length
while len(column_types) < num_cols:
column_types.append("unknown")
# Build cell grid with percentage-based coordinates
row_height = 100.0 / num_rows
col_width = 100.0 / num_cols
# Track which cells have content
cell_map = {}
for entry in entries:
r = entry.get("row", 0)
c = entry.get("col", 0)
cell_map[(r, c)] = entry
cells = []
recognized_count = 0
empty_count = 0
problematic_count = 0
for r in range(num_rows):
row_cells = []
for c in range(num_cols):
x = c * col_width
y = r * row_height
if (r, c) in cell_map:
entry = cell_map[(r, c)]
text = entry.get("text", "").strip()
conf = entry.get("confidence", 0.8)
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
if text:
status = "recognized" if conf >= 0.5 else "problematic"
if status == "recognized":
recognized_count += 1
else:
problematic_count += 1
else:
status = "empty"
empty_count += 1
else:
text = ""
conf = 0.0
col_type = column_types[c] if c < len(column_types) else "unknown"
status = "empty"
empty_count += 1
row_cells.append({
"row": r,
"col": c,
"x": round(x, 2),
"y": round(y, 2),
"width": round(col_width, 2),
"height": round(row_height, 2),
"text": text,
"confidence": conf,
"status": status,
"column_type": col_type,
})
cells.append(row_cells)
total = num_rows * num_cols
coverage = (recognized_count + problematic_count) / max(total, 1)
# Column and row boundaries as percentages
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
grid_data = {
"rows": num_rows,
"columns": num_cols,
"cells": cells,
"column_types": column_types,
"column_boundaries": col_boundaries,
"row_boundaries": row_boundaries,
"deskew_angle": 0.0,
"source": "vision_llm",
"stats": {
"recognized": recognized_count,
"problematic": problematic_count,
"empty": empty_count,
"manual": 0,
"total": total,
"coverage": round(coverage, 3),
},
}
return {"success": True, "grid": grid_data}
except httpx.TimeoutException:
logger.error("Grid analysis timed out")
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
except Exception as e:
logger.error(f"Grid analysis failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}

View File

@@ -0,0 +1,325 @@
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
Contains:
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): Demo data for testing
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
"""
import base64
import json
import logging
import os
import re
import uuid
from typing import List
import httpx
from .models import VocabularyEntry
logger = logging.getLogger(__name__)
# Ollama Configuration
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
{
"vocabulary": [
{
"english": "to improve",
"german": "verbessern",
"example": "I want to improve my English."
}
]
}
REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
Beispiel-Output:
{
"vocabulary": [
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
]
}"""
async def extract_vocabulary_from_image(
image_data: bytes,
filename: str,
page_number: int = 0,
use_hybrid: bool = False # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
"""
Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).
Args:
image_data: Image bytes
filename: Original filename for logging
page_number: 0-indexed page number for error messages
use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text)
If False, use Vision LLM (slower, better for complex layouts)
Returns:
Tuple of (vocabulary_entries, confidence, error_message)
error_message is empty string on success
"""
# ==========================================================================
    # HYBRID APPROACH (optional, disabled by default): PaddleOCR + LLM Gateway
# ==========================================================================
if use_hybrid:
try:
from hybrid_vocab_extractor import extract_vocabulary_hybrid
logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)
if error:
logger.warning(f"Hybrid extraction had issues: {error}")
# Fall through to Vision LLM fallback
elif vocab_dicts:
# Convert dicts to VocabularyEntry objects
vocabulary = [
VocabularyEntry(
id=str(uuid.uuid4()),
english=v.get("english", ""),
german=v.get("german", ""),
example_sentence=v.get("example"),
source_page=page_number + 1
)
for v in vocab_dicts
if v.get("english") and v.get("german")
]
logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
return vocabulary, confidence, ""
except ImportError as e:
logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
except Exception as e:
logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
import traceback
logger.debug(traceback.format_exc())
# ==========================================================================
    # FALLBACK: Vision LLM (Ollama, model from OLLAMA_VISION_MODEL, default qwen2.5vl:32b)
# ==========================================================================
logger.info(f"Using VISION LLM extraction for {filename}")
try:
# First check if Ollama is available
async with httpx.AsyncClient(timeout=10.0) as check_client:
try:
health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
if health_response.status_code != 200:
logger.error(f"Ollama not available at {OLLAMA_URL}")
return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
except Exception as e:
logger.error(f"Ollama health check failed: {e}")
return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
image_base64 = base64.b64encode(image_data).decode("utf-8")
payload = {
"model": VISION_MODEL,
"messages": [
{
"role": "user",
"content": VOCAB_EXTRACTION_PROMPT,
"images": [image_base64]
}
],
"stream": False,
"options": {
"temperature": 0.1,
"num_predict": 4096,
}
}
logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")
# Increased timeout for Vision models (they can be slow)
async with httpx.AsyncClient(timeout=600.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/chat",
json=payload,
timeout=300.0 # 5 minutes per page
)
response.raise_for_status()
data = response.json()
extracted_text = data.get("message", {}).get("content", "")
logger.info(f"Ollama response received: {len(extracted_text)} chars")
# Parse JSON from response
vocabulary = parse_vocabulary_json(extracted_text)
# Set source_page for each entry
for v in vocabulary:
v.source_page = page_number + 1
# Estimate confidence
confidence = 0.85 if len(vocabulary) > 0 else 0.1
logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
return vocabulary, confidence, ""
except httpx.TimeoutException:
logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
except Exception as e:
logger.error(f"Vocabulary extraction failed for {filename}: {e}")
import traceback
logger.error(traceback.format_exc())
return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
def _get_demo_vocabulary() -> List[VocabularyEntry]:
"""Return demo vocabulary for testing when Vision LLM is not available."""
demo_entries = [
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
{"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
{"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
{"english": "success", "german": "Erfolg", "example": "The project was a success."},
{"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
{"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
{"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
]
return [
VocabularyEntry(
id=str(uuid.uuid4()),
english=e["english"],
german=e["german"],
example_sentence=e.get("example"),
)
for e in demo_entries
]
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
"""Parse vocabulary JSON from LLM response with robust error handling."""
def clean_json_string(s: str) -> str:
"""Clean a JSON string by removing control characters and fixing common issues."""
# Remove control characters except newlines and tabs
s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
# Replace unescaped newlines within strings with space
# This is a simplistic approach - replace actual newlines with escaped ones
s = s.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
return s
def try_parse_json(json_str: str) -> dict:
"""Try multiple strategies to parse JSON."""
# Strategy 1: Direct parse
try:
return json.loads(json_str)
except json.JSONDecodeError:
pass
# Strategy 2: Clean and parse
try:
cleaned = clean_json_string(json_str)
return json.loads(cleaned)
except json.JSONDecodeError:
pass
# Strategy 3: Try to fix common issues
try:
# Remove trailing commas before } or ]
fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
# Fix unquoted keys
fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
return json.loads(fixed)
except json.JSONDecodeError:
pass
return None
try:
# Find JSON in response (may have extra text)
start = text.find('{')
end = text.rfind('}') + 1
if start == -1 or end == 0:
logger.warning("No JSON found in response")
return []
json_str = text[start:end]
data = try_parse_json(json_str)
if data is None:
# Strategy 4: Extract vocabulary entries using regex as fallback
logger.warning("JSON parsing failed, trying regex extraction")
vocabulary = []
# Match patterns like {"english": "...", "german": "...", ...}
pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
for match in matches:
english = match[0].strip() if match[0] else ""
german = match[1].strip() if match[1] else ""
example = match[2].strip() if len(match) > 2 and match[2] else None
if english and german:
vocab_entry = VocabularyEntry(
id=str(uuid.uuid4()),
english=english,
german=german,
example_sentence=example,
)
vocabulary.append(vocab_entry)
if vocabulary:
logger.info(f"Regex extraction found {len(vocabulary)} entries")
return vocabulary
# Normal JSON parsing succeeded
vocabulary = []
for i, entry in enumerate(data.get("vocabulary", [])):
english = entry.get("english", "").strip()
german = entry.get("german", "").strip()
# Skip entries that look like hallucinations (very long or containing unusual patterns)
if len(english) > 100 or len(german) > 200:
logger.warning(f"Skipping suspicious entry: {english[:50]}...")
continue
if not english or not german:
continue
vocab_entry = VocabularyEntry(
id=str(uuid.uuid4()),
english=english,
german=german,
example_sentence=entry.get("example"),
word_type=entry.get("word_type"),
)
vocabulary.append(vocab_entry)
return vocabulary
except Exception as e:
logger.error(f"Failed to parse vocabulary JSON: {e}")
import traceback
logger.error(traceback.format_exc())
return []
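# =============================================================================
# Illustrative only: a minimal sketch of the parser's fallback behaviour on a
# slightly malformed LLM response (trailing comma). Strategy 3 strips the
# trailing comma, so both entries are recovered.
# =============================================================================
def _example_parse_malformed() -> None:
    raw = (
        '{"vocabulary": ['
        '{"english": "to improve", "german": "verbessern"},'
        '{"english": "success", "german": "Erfolg"},'
        ']}'
    )
    for entry in parse_vocabulary_json(raw):
        logger.info(f"{entry.english} = {entry.german}")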

View File

@@ -0,0 +1,258 @@
"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Functions:
- generate_worksheet_html(): Build HTML for various worksheet types
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
- convert_pdf_page_to_image(): Render single PDF page to PNG
- convert_pdf_to_images(): Render multiple PDF pages to PNG
"""
import logging
from typing import List, Optional
from fastapi import HTTPException
from .models import VocabularyEntry, WorksheetType
logger = logging.getLogger(__name__)
# Optional dependency: WeasyPrint
try:
from weasyprint import HTML as _WeasyHTML
WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
WEASYPRINT_AVAILABLE = False
logger.warning("WeasyPrint not available")
# Optional dependency: PyMuPDF
try:
import fitz # PyMuPDF
FITZ_AVAILABLE = True
except ImportError:
FITZ_AVAILABLE = False
logger.warning("PyMuPDF (fitz) not available")
# =============================================================================
# Worksheet HTML Generation
# =============================================================================
def generate_worksheet_html(
vocabulary: List[VocabularyEntry],
worksheet_type: WorksheetType,
title: str,
show_solutions: bool = False,
repetitions: int = 3,
line_height: str = "normal"
) -> str:
"""Generate HTML for a worksheet."""
# Line height CSS
line_heights = {
"normal": "2.5em",
"large": "3.5em",
"extra-large": "4.5em"
}
lh = line_heights.get(line_height, "2.5em")
html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{ size: A4; margin: 2cm; }}
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
h1 {{ font-size: 24px; margin-bottom: 10px; }}
.meta {{ color: #666; margin-bottom: 20px; }}
.name-line {{ margin-bottom: 30px; }}
.vocab-table {{ width: 100%; border-collapse: collapse; }}
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
.vocab-word {{ width: 40%; font-weight: 500; }}
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
.vocab-answer {{ width: 60%; color: #2563eb; }}
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
.section {{ margin-top: 30px; }}
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{title}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""
if worksheet_type == WorksheetType.EN_TO_DE:
html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
if show_solutions:
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-answer">{entry.german}</td></tr>'
else:
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-blank"></td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.DE_TO_EN:
html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
if show_solutions:
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-answer">{entry.english}</td></tr>'
else:
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-blank"></td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.COPY_PRACTICE:
html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
html += f'<tr><td class="vocab-word">{entry.english}</td>'
html += '<td class="vocab-blank">'
if show_solutions:
html += f' {entry.english} ' * repetitions
html += '</td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.GAP_FILL:
entries_with_examples = [e for e in vocabulary if e.example_sentence]
if entries_with_examples:
html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
for i, entry in enumerate(entries_with_examples, 1):
# Create gap sentence by removing the English word
gap_sentence = entry.example_sentence
for word in entry.english.split():
if word.lower() in gap_sentence.lower():
gap_sentence = gap_sentence.replace(word, '<span class="gap"></span>')
gap_sentence = gap_sentence.replace(word.capitalize(), '<span class="gap"></span>')
gap_sentence = gap_sentence.replace(word.lower(), '<span class="gap"></span>')
break
html += f'<p>{i}. {gap_sentence}</p>'
if show_solutions:
html += f'<p class="hint">Loesung: {entry.english}</p>'
else:
html += f'<p class="hint">({entry.german})</p>'
html += '</div>'
html += '</body></html>'
return html
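# =============================================================================
# Illustrative only: a minimal sketch, assuming two hard-coded entries, of
# building an EN -> DE worksheet plus its solution sheet with the function above.
# =============================================================================
def _example_build_en_de_sheets() -> tuple[str, str]:
    vocab = [
        VocabularyEntry(id="demo-1", english="to improve", german="verbessern"),
        VocabularyEntry(id="demo-2", english="success", german="Erfolg"),
    ]
    sheet = generate_worksheet_html(vocab, WorksheetType.EN_TO_DE, "Unit 3 Vokabeln")
    solution = generate_worksheet_html(vocab, WorksheetType.EN_TO_DE, "Unit 3 Vokabeln", show_solutions=True)
    return sheet, solution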
# =============================================================================
# Worksheet PDF Generation
# =============================================================================
async def generate_worksheet_pdf(html: str) -> bytes:
"""Generate PDF from HTML using WeasyPrint."""
try:
from weasyprint import HTML
pdf_bytes = HTML(string=html).write_pdf()
return pdf_bytes
except ImportError:
logger.warning("WeasyPrint not available, returning HTML")
return html.encode('utf-8')
except Exception as e:
logger.error(f"PDF generation failed: {e}")
raise
# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================
def get_pdf_page_count(pdf_data: bytes) -> int:
"""Get the number of pages in a PDF."""
try:
import fitz
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
count = pdf_document.page_count
pdf_document.close()
return count
except Exception as e:
logger.error(f"Failed to get PDF page count: {e}")
return 0
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
"""Convert a specific page of PDF to PNG image using PyMuPDF.
Args:
pdf_data: PDF file as bytes
page_number: 0-indexed page number
thumbnail: If True, return a smaller thumbnail image
"""
try:
import fitz # PyMuPDF
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
if pdf_document.page_count == 0:
raise ValueError("PDF has no pages")
if page_number >= pdf_document.page_count:
raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")
page = pdf_document[page_number]
# Render page to image
# For thumbnails: lower resolution, for OCR: higher resolution
zoom = 0.5 if thumbnail else 2.0
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
return png_data
except ImportError:
logger.error("PyMuPDF (fitz) not installed")
raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
except Exception as e:
logger.error(f"PDF conversion failed: {e}")
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
"""Convert multiple pages of PDF to PNG images.
Args:
pdf_data: PDF file as bytes
pages: List of 0-indexed page numbers to convert. If None, convert all pages.
"""
try:
import fitz
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
if pdf_document.page_count == 0:
raise ValueError("PDF has no pages")
# If no pages specified, convert all
if pages is None:
pages = list(range(pdf_document.page_count))
images = []
zoom = 2.0
mat = fitz.Matrix(zoom, zoom)
for page_num in pages:
if page_num < pdf_document.page_count:
page = pdf_document[page_num]
pix = page.get_pixmap(matrix=mat)
images.append(pix.tobytes("png"))
pdf_document.close()
logger.info(f"Converted {len(images)} PDF pages to images")
return images
except ImportError:
logger.error("PyMuPDF (fitz) not installed")
raise HTTPException(status_code=500, detail="PDF conversion not available")
except Exception as e:
logger.error(f"PDF conversion failed: {e}")
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")

View File

@@ -0,0 +1,86 @@
"""Pydantic models and enums for the Vocab Worksheet API."""
from datetime import datetime
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel
# =============================================================================
# Enums
# =============================================================================
class WorksheetType(str, Enum):
EN_TO_DE = "en_to_de" # English -> German translation
DE_TO_EN = "de_to_en" # German -> English translation
COPY_PRACTICE = "copy" # Write word multiple times
GAP_FILL = "gap_fill" # Fill in the blanks
COMBINED = "combined" # All types combined
class SessionStatus(str, Enum):
PENDING = "pending" # Session created, no upload yet
PROCESSING = "processing" # OCR in progress
EXTRACTED = "extracted" # Vocabulary extracted, ready to edit
COMPLETED = "completed" # Worksheet generated
# =============================================================================
# Pydantic Models
# =============================================================================
class VocabularyEntry(BaseModel):
id: str
english: str
german: str
example_sentence: Optional[str] = None
example_sentence_gap: Optional[str] = None # With ___ for gap-fill
word_type: Optional[str] = None # noun, verb, adjective, etc.
source_page: Optional[int] = None # Page number where entry was found (1-indexed)
class SessionCreate(BaseModel):
name: str
description: Optional[str] = None
source_language: str = "en" # Source language (default English)
target_language: str = "de" # Target language (default German)
class SessionResponse(BaseModel):
id: str
name: str
description: Optional[str]
source_language: str
target_language: str
status: str
vocabulary_count: int
image_path: Optional[str]
created_at: datetime
class VocabularyResponse(BaseModel):
session_id: str
vocabulary: List[VocabularyEntry]
extraction_confidence: Optional[float]
class VocabularyUpdate(BaseModel):
vocabulary: List[VocabularyEntry]
class WorksheetGenerateRequest(BaseModel):
worksheet_types: List[WorksheetType]
title: Optional[str] = None
include_solutions: bool = True
repetitions: int = 3 # For copy practice
line_height: str = "normal" # normal, large, extra-large
class WorksheetResponse(BaseModel):
id: str
session_id: str
worksheet_types: List[str]
pdf_path: str
solution_path: Optional[str]
generated_at: datetime
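# =============================================================================
# Illustrative only: a minimal sketch of how a consumer would build these
# models; all values are placeholders.
# =============================================================================
def _example_models() -> WorksheetGenerateRequest:
    entry = VocabularyEntry(
        id="demo-1",
        english="achievement",
        german="Leistung, Errungenschaft",
        example_sentence="Her achievements were impressive.",
        word_type="n",
        source_page=1,
    )
    assert entry.example_sentence_gap is None  # gap variant is only filled for gap-fill sheets
    return WorksheetGenerateRequest(
        worksheet_types=[WorksheetType.EN_TO_DE, WorksheetType.GAP_FILL],
        title="Unit 3 Vokabeln",
        include_solutions=True,
    )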

View File

@@ -0,0 +1,481 @@
"""
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
Pipeline steps:
orientation → deskew → dewarp → crop → scan-quality → enhance →
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
vocab extraction → row merging
"""
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Optional heavy dependencies (not available in every environment)
# ---------------------------------------------------------------------------
try:
import cv2
import numpy as np
except ImportError:
cv2 = None # type: ignore[assignment]
np = None # type: ignore[assignment]
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment]
try:
import pytesseract
except ImportError:
pytesseract = None # type: ignore[assignment]
# CV pipeline helpers
try:
from cv_vocab_pipeline import (
deskew_two_pass,
dewarp_image,
detect_and_fix_orientation,
_cells_to_vocab_entries,
_fix_phonetic_brackets,
)
except ImportError:
deskew_two_pass = None # type: ignore[assignment]
dewarp_image = None # type: ignore[assignment]
detect_and_fix_orientation = None # type: ignore[assignment]
_cells_to_vocab_entries = None # type: ignore[assignment]
_fix_phonetic_brackets = None # type: ignore[assignment]
try:
from cv_cell_grid import (
_merge_wrapped_rows,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
)
except ImportError:
_merge_wrapped_rows = None # type: ignore[assignment]
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
_merge_continuation_rows = None # type: ignore[assignment]
try:
from cv_ocr_engines import ocr_region_rapid
except ImportError:
ocr_region_rapid = None # type: ignore[assignment]
try:
from cv_vocab_types import PageRegion
except ImportError:
PageRegion = None # type: ignore[assignment]
try:
from ocr_pipeline_ocr_merge import (
_split_paddle_multi_words,
_merge_paddle_tesseract,
_deduplicate_words,
)
except ImportError:
_split_paddle_multi_words = None # type: ignore[assignment]
_merge_paddle_tesseract = None # type: ignore[assignment]
_deduplicate_words = None # type: ignore[assignment]
try:
from cv_words_first import build_grid_from_words
except ImportError:
build_grid_from_words = None # type: ignore[assignment]
try:
from ocr_pipeline_session_store import (
create_session_db as create_pipeline_session_db,
update_session_db as update_pipeline_session_db,
)
except ImportError:
create_pipeline_session_db = None # type: ignore[assignment]
update_pipeline_session_db = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Main pipeline function
# ---------------------------------------------------------------------------
async def _run_ocr_pipeline_for_page(
img_bgr: "np.ndarray",
page_number: int,
vocab_session_id: str,
*,
ipa_mode: str = "none",
syllable_mode: str = "none",
enable_enhance: bool = True,
max_columns: Optional[int] = 3,
override_min_conf: Optional[int] = None,
) -> tuple:
"""Run the full Kombi OCR pipeline on a single page and return vocab entries.
Uses the same pipeline as the admin OCR Kombi pipeline:
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
Args:
img_bgr: BGR numpy array.
page_number: 0-indexed page number.
vocab_session_id: Vocab session ID for logging.
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
    Returns (entries, rotation_deg, scan_quality_report): entries is a list of dicts,
    rotation_deg is the orientation correction applied (0, 90, 180, 270), and
    scan_quality_report is the scan-quality assessment (None if unavailable).
"""
import time as _time
t_total = _time.time()
img_h, img_w = img_bgr.shape[:2]
logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
# 1. Orientation detection (fix upside-down scans)
t0 = _time.time()
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
if rotation:
img_h, img_w = img_bgr.shape[:2]
logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
else:
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
# 2. Create pipeline session in DB (visible in admin Kombi UI)
pipeline_session_id = str(uuid.uuid4())
try:
_, png_buf = cv2.imencode(".png", img_bgr)
original_png = png_buf.tobytes()
await create_pipeline_session_db(
pipeline_session_id,
name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
filename=f"page_{page_number + 1}.png",
original_png=original_png,
)
except Exception as e:
logger.warning(f"Could not create pipeline session in DB: {e}")
# 3. Three-pass deskew
t0 = _time.time()
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
# 4. Dewarp
t0 = _time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
# 5. Content crop (removes scanner borders, gutter shadows)
t0 = _time.time()
try:
from page_crop import detect_and_crop_page
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
if crop_result.get("crop_applied"):
dewarped_bgr = cropped_bgr
logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
else:
logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
except Exception as e:
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 5b. Scan quality assessment
scan_quality_report = None
try:
from scan_quality import score_scan_quality
scan_quality_report = score_scan_quality(dewarped_bgr)
except Exception as e:
logger.warning(f" scan quality: failed ({e})")
if override_min_conf:
min_ocr_conf = override_min_conf
else:
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
# 5c. Image enhancement for degraded scans
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
if is_degraded and enable_enhance:
try:
from ocr_image_enhance import enhance_for_ocr
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
logger.info(" enhancement: applied (degraded scan)")
except Exception as e:
logger.warning(f" enhancement: failed ({e})")
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
t0 = _time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# RapidOCR (local ONNX)
try:
from cv_ocr_engines import ocr_region_rapid
from cv_vocab_types import PageRegion
full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
except Exception as e:
logger.warning(f" RapidOCR failed: {e}")
rapid_words = []
# Tesseract
from PIL import Image
import pytesseract
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
data = pytesseract.image_to_data(
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
output_type=pytesseract.Output.DICT,
)
tess_words = []
for i in range(len(data["text"])):
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < min_ocr_conf:
continue
tess_words.append({
"text": text,
"left": data["left"][i], "top": data["top"][i],
"width": data["width"][i], "height": data["height"][i],
"conf": conf,
})
# Merge dual-engine results
from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
from cv_words_first import build_grid_from_words
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
if rapid_split or tess_words:
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
merged_words = _deduplicate_words(merged_words)
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
n_cols = len(columns_meta)
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
# 7. Save word_result to pipeline session (needed by _build_grid_core)
word_result = {
"cells": cells,
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
"columns_used": columns_meta,
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": 0,
"ocr_engine": "rapid_kombi",
"raw_tesseract_words": tess_words,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
}
# Save images + word_result to pipeline session for admin visibility
try:
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
await update_pipeline_session_db(
pipeline_session_id,
deskewed_png=dsk_buf.tobytes(),
dewarped_png=dwp_buf.tobytes(),
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
word_result=word_result,
deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
current_step=8,
)
except Exception as e:
logger.warning(f"Could not update pipeline session: {e}")
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
t0 = _time.time()
try:
from grid_editor_api import _build_grid_core
session_data = {
"word_result": word_result,
}
grid_result = await _build_grid_core(
pipeline_session_id, session_data,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
f"({_time.time() - t0:.1f}s)")
# Save grid result to pipeline session
try:
await update_pipeline_session_db(
pipeline_session_id,
grid_editor_result=grid_result,
current_step=11,
)
except Exception:
pass
except Exception as e:
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
grid_result = None
# 9. Extract vocab entries
# Prefer grid-build result (better column detection, more cells) over
# the initial build_grid_from_words() which often under-clusters.
page_vocabulary = []
extraction_source = "none"
# A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
if grid_result and grid_result.get("zones"):
for zone in grid_result["zones"]:
zone_cols = zone.get("columns", [])
zone_cells = zone.get("cells", [])
if not zone_cols or not zone_cells:
continue
# Sort columns by x position to determine roles
sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
col_idx_to_pos = {}
for pos, col in enumerate(sorted_cols):
ci = col.get("col_index", col.get("index", -1))
col_idx_to_pos[ci] = pos
# Skip zones with only 1 column (likely headers/boxes)
if len(sorted_cols) < 2:
continue
# Group cells by row
rows_map: dict = {}
for cell in zone_cells:
ri = cell.get("row_index", 0)
if ri not in rows_map:
rows_map[ri] = {}
ci = cell.get("col_index", 0)
rows_map[ri][ci] = (cell.get("text") or "").strip()
n_cols = len(sorted_cols)
for ri in sorted(rows_map.keys()):
row = rows_map[ri]
# Collect texts in column-position order
texts = []
for col in sorted_cols:
ci = col.get("col_index", col.get("index", -1))
texts.append(row.get(ci, ""))
if not any(texts):
continue
# Map by position, skipping narrow first column (page refs/markers)
# Heuristic: if first column is very narrow (<15% of zone width),
# it's likely a marker/ref column — skip it for vocab
first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
data_texts = texts[1:] if skip_first else texts
entry = {
"id": str(uuid.uuid4()),
"english": data_texts[0] if len(data_texts) > 0 else "",
"german": data_texts[1] if len(data_texts) > 1 else "",
"example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
"source_page": page_number + 1,
}
if entry["english"] or entry["german"]:
page_vocabulary.append(entry)
if page_vocabulary:
extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
# B) Fallback: original cells with column classification
if not page_vocabulary:
col_types = {c.get("type") for c in columns_meta}
is_vocab = bool(col_types & {"column_en", "column_de"})
if is_vocab:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
for entry in entries:
if not entry.get("english") and not entry.get("german"):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example_sentence": entry.get("example", ""),
"source_page": page_number + 1,
})
extraction_source = f"classified ({len(columns_meta)} cols)"
else:
# Last resort: all cells by position
rows_map2: dict = {}
for cell in cells:
ri = cell.get("row_index", 0)
if ri not in rows_map2:
rows_map2[ri] = {}
ci = cell.get("col_index", 0)
rows_map2[ri][ci] = (cell.get("text") or "").strip()
all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
for ri in sorted(rows_map2.keys()):
row = rows_map2[ri]
texts = [row.get(ci, "") for ci in all_ci]
if not any(texts):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": texts[0] if len(texts) > 0 else "",
"german": texts[1] if len(texts) > 1 else "",
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
"source_page": page_number + 1,
})
extraction_source = f"generic ({len(all_ci)} cols)"
# --- Post-processing: merge cell-wrap continuation rows ---
if len(page_vocabulary) >= 2:
try:
# Convert to internal format (example_sentence → example)
internal = []
for v in page_vocabulary:
internal.append({
'row_index': len(internal),
'english': v.get('english', ''),
'german': v.get('german', ''),
'example': v.get('example_sentence', ''),
})
n_before = len(internal)
internal = _merge_wrapped_rows(internal)
internal = _merge_phonetic_continuation_rows(internal)
internal = _merge_continuation_rows(internal)
if len(internal) < n_before:
# Rebuild page_vocabulary from merged entries
merged_vocab = []
for entry in internal:
if not entry.get('english') and not entry.get('german'):
continue
merged_vocab.append({
'id': str(uuid.uuid4()),
'english': entry.get('english', ''),
'german': entry.get('german', ''),
'example_sentence': entry.get('example', ''),
'source_page': page_number + 1,
})
logger.info(f" row merging: {n_before}{len(merged_vocab)} entries")
page_vocabulary = merged_vocab
except Exception as e:
logger.warning(f" row merging failed (non-critical): {e}")
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
total_duration = _time.time() - t_total
logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation, scan_quality_report
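# ---------------------------------------------------------------------------
# Illustrative only: a minimal sketch of running the pipeline on one scanned
# page, assuming cv2 and the optional imports above are available. The image
# path and session id are placeholders.
# ---------------------------------------------------------------------------
async def _example_run_pipeline(image_path: str = "/tmp/page_1.png") -> list:
    if cv2 is None:
        logger.warning("cv2 not available, cannot run the pipeline")
        return []
    img_bgr = cv2.imread(image_path)
    entries, rotation, quality = await _run_ocr_pipeline_for_page(
        img_bgr, page_number=0, vocab_session_id="demo-session",
        ipa_mode="none", syllable_mode="none",
    )
    logger.info(f"{len(entries)} vocab entries, rotation={rotation}deg")
    return entries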

View File

@@ -0,0 +1,490 @@
"""
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Routes (no prefix — included into the main /api/v1/vocab router):
POST /sessions/{session_id}/upload-pdf-info
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
GET /sessions/{session_id}/pdf-page-image/{page_number}
POST /sessions/{session_id}/process-single-page/{page_number}
POST /sessions/{session_id}/process-pages
"""
import io
import logging
import os
import uuid
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from fastapi.responses import StreamingResponse
from .models import SessionStatus
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Local storage path
# ---------------------------------------------------------------------------
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# ---------------------------------------------------------------------------
# Optional heavy dependencies
# ---------------------------------------------------------------------------
try:
import numpy as np
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
OCR_PIPELINE_AVAILABLE = True
except ImportError:
np = None # type: ignore[assignment]
OCR_PIPELINE_AVAILABLE = False
logger.warning("OCR pipeline imports not available in upload module")
# Sub-module imports (already split out)
from .generation import (
convert_pdf_page_to_image,
convert_pdf_to_images,
get_pdf_page_count,
)
from .extraction import extract_vocabulary_from_image
try:
from .ocr import _run_ocr_pipeline_for_page
except ImportError:
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------
def _get_sessions():
from .api import _sessions
return _sessions
# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()
# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
session_id: str,
file: UploadFile = File(...),
):
"""
Upload a PDF and get page count and thumbnails for preview.
Use this before processing to let user select pages.
"""
logger.info(f"PDF info request for session {session_id}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
# Validate file type
extension = file.filename.split('.')[-1].lower() if file.filename else ''
content_type = file.content_type or ''
if extension != 'pdf' and content_type != 'application/pdf':
raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")
content = await file.read()
# Save PDF temporarily
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
os.makedirs(session_dir, exist_ok=True)
pdf_path = os.path.join(session_dir, "source.pdf")
with open(pdf_path, 'wb') as f:
f.write(content)
# Get page count
page_count = get_pdf_page_count(content)
# Store PDF data in session for later processing
session["pdf_data"] = content
session["pdf_path"] = pdf_path
session["pdf_page_count"] = page_count
session["status"] = "pdf_uploaded"
# Detect orientation for each page so thumbnails are shown correctly
page_rotations: dict = {}
if OCR_PIPELINE_AVAILABLE:
for pg in range(page_count):
try:
img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
_, rotation = detect_and_fix_orientation(img_bgr)
if rotation:
page_rotations[pg] = rotation
logger.info(f"Page {pg + 1}: orientation {rotation}°")
except Exception as e:
logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
session["page_rotations"] = page_rotations
return {
"session_id": session_id,
"page_count": page_count,
"filename": file.filename,
"page_rotations": page_rotations,
}
# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
"""Get a thumbnail image of a specific PDF page.
Uses fitz for rendering so that page_rotations (from OCR orientation
detection) are applied consistently.
Args:
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
try:
import fitz
zoom = 2.0 if hires else 0.5
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
page = pdf_document[page_number]
# Apply orientation correction detected during OCR processing
rot = session.get("page_rotations", {}).get(page_number, 0)
if rot:
page.set_rotation(rot)
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
except Exception as e:
logger.error(f"PDF thumbnail failed: {e}")
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
return StreamingResponse(
io.BytesIO(png_data),
media_type="image/png",
)
# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
"""PDF page as PNG at arbitrary resolution (for editor view).
Args:
        zoom: Zoom factor (0.5=36 DPI, 1.0=72 DPI, 2.0=144 DPI, 4.0=288 DPI).
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
try:
import fitz
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
page = pdf_document[page_number]
# Apply orientation correction detected during OCR processing
rot = session.get("page_rotations", {}).get(page_number, 0)
if rot:
page.set_rotation(rot)
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
except Exception as e:
logger.error(f"PDF page image failed: {e}")
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
return StreamingResponse(
io.BytesIO(png_data),
media_type="image/png",
)
# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
session_id: str,
page_number: int,
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
"""
Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
Query params:
ipa_mode: "none" (default), "auto", "all", "en", "de"
syllable_mode: "none" (default), "auto", "all", "en", "de"
enhance: true (default) -- apply CLAHE/denoise for degraded scans
max_cols: 3 (default) -- max column count (0=unlimited)
min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)
The frontend should call this sequentially for each page.
Returns the vocabulary for just this one page.
"""
logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
if session_id not in _get_sessions():
raise HTTPException(
status_code=404,
detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
)
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Derive pipeline-level variable names for the quality report
enable_enhance = enhance
max_columns = max_cols if max_cols > 0 else None
override_min_conf = min_conf if min_conf > 0 else None
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
rotation_deg = 0
quality_report = None
min_ocr_conf = 40 # default; overridden by pipeline when quality report is available
if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
try:
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
enable_enhance=enable_enhance,
max_columns=max_columns,
override_min_conf=override_min_conf,
)
# Update min_ocr_conf from quality report if available
if quality_report and hasattr(quality_report, 'recommended_min_conf'):
min_ocr_conf = quality_report.recommended_min_conf
except Exception as e:
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
return {
"session_id": session_id,
"page_number": page_number + 1,
"success": False,
"error": f"OCR pipeline error: {e}",
"vocabulary": [],
"vocabulary_count": 0,
}
else:
# Fallback to LLM vision extraction
logger.warning("OCR pipeline not available, falling back to LLM vision")
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
vocabulary, confidence, error = await extract_vocabulary_from_image(
image_data,
f"page_{page_number + 1}.png",
page_number=page_number
)
if error:
logger.warning(f"Page {page_number + 1} failed: {error}")
return {
"session_id": session_id,
"page_number": page_number + 1,
"success": False,
"error": error,
"vocabulary": [],
"vocabulary_count": 0,
}
page_vocabulary = []
for entry in vocabulary:
entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
entry_dict['source_page'] = page_number + 1
if 'id' not in entry_dict or not entry_dict['id']:
entry_dict['id'] = str(uuid.uuid4())
page_vocabulary.append(entry_dict)
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
# Store rotation for this page (used by image/thumbnail endpoints)
session.setdefault("page_rotations", {})[page_number] = rotation_deg
# Add to session's vocabulary (append, don't replace)
existing_vocab = session.get("vocabulary", [])
# Remove any existing entries from this page (in case of re-processing)
existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
existing_vocab.extend(page_vocabulary)
session["vocabulary"] = existing_vocab
session["vocabulary_count"] = len(existing_vocab)
session["status"] = SessionStatus.EXTRACTED.value
result = {
"session_id": session_id,
"page_number": page_number + 1,
"success": True,
"vocabulary": page_vocabulary,
"vocabulary_count": len(page_vocabulary),
"total_vocabulary_count": len(existing_vocab),
"extraction_confidence": 0.9,
"rotation": rotation_deg,
}
# Add scan quality report + active steps info
if quality_report:
sq = quality_report.to_dict()
sq["active_steps"] = {
"step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
"step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
"step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
}
result["scan_quality"] = sq
return result
# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
session_id: str,
    pages: Optional[List[int]] = None,
process_all: bool = False,
):
"""
Process specific pages of an uploaded PDF.
DEPRECATED: Use /process-single-page/{page_number} instead for better results.
Args:
pages: List of 0-indexed page numbers to process
process_all: If True, process all pages
"""
logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
# Determine which pages to process
if process_all:
pages = list(range(page_count))
elif pages is None or len(pages) == 0:
pages = [0] # Default to first page
# Convert selected pages to images
images = await convert_pdf_to_images(pdf_data, pages)
# Extract vocabulary from each page SEQUENTIALLY
all_vocabulary = []
total_confidence = 0.0
successful_pages = []
failed_pages = []
error_messages = []
for i, image_data in enumerate(images):
page_num = pages[i]
logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
vocabulary, confidence, error = await extract_vocabulary_from_image(
image_data,
f"page_{page_num + 1}.png",
page_number=page_num
)
if error:
failed_pages.append(page_num + 1)
error_messages.append(error)
logger.warning(f"Page {page_num + 1} failed: {error}")
else:
successful_pages.append(page_num + 1)
total_confidence += confidence
# Add page info to each entry and convert to dict
for entry in vocabulary:
entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
entry_dict['source_page'] = page_num + 1
all_vocabulary.append(entry_dict)
logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")
avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0
# Update session
session["vocabulary"] = all_vocabulary
session["vocabulary_count"] = len(all_vocabulary)
session["extraction_confidence"] = avg_confidence
session["processed_pages"] = pages
session["successful_pages"] = successful_pages
session["failed_pages"] = failed_pages
session["status"] = SessionStatus.EXTRACTED.value
# Save first page as preview image
if images:
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
image_path = os.path.join(session_dir, "source.png")
with open(image_path, 'wb') as f:
f.write(images[0])
session["image_path"] = image_path
result = {
"session_id": session_id,
"pages_processed": len(pages),
"pages_successful": len(successful_pages),
"pages_failed": len(failed_pages),
"successful_pages": successful_pages,
"failed_pages": failed_pages,
"vocabulary_count": len(all_vocabulary),
"extraction_confidence": avg_confidence,
"status": SessionStatus.EXTRACTED.value,
}
if error_messages:
result["errors"] = error_messages
return result
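# =============================================================================
# Illustrative only: a minimal client-side sketch of the sequential flow the
# process_single_page docstring describes (upload PDF info, then one call per
# page). Base URL, port and prefix are assumptions about the deployment.
# =============================================================================
async def _example_process_all_pages(session_id: str, pdf_path: str,
                                     base_url: str = "http://localhost:8003/api/v1/vocab") -> int:
    import httpx
    total = 0
    async with httpx.AsyncClient(timeout=600.0) as client:
        with open(pdf_path, "rb") as f:
            info = (await client.post(
                f"{base_url}/sessions/{session_id}/upload-pdf-info",
                files={"file": ("source.pdf", f, "application/pdf")},
            )).json()
        for page in range(info["page_count"]):
            res = (await client.post(
                f"{base_url}/sessions/{session_id}/process-single-page/{page}"
            )).json()
            if res.get("success"):
                total += res["vocabulary_count"]
    return total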