Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s
process-single-page now runs the full CV pipeline (deskew → dewarp → columns → rows → cell-first OCR v2 → LLM review) for much better extraction quality. Falls back to LLM vision if pipeline imports are unavailable. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
739 lines
25 KiB
Python
739 lines
25 KiB
Python
"""
|
|
Unit Tests for Vocab-Worksheet API
|
|
|
|
Tests cover:
|
|
- Session CRUD (create, read, list, delete)
|
|
- File upload (images and PDFs)
|
|
- PDF page handling (thumbnails, page selection)
|
|
- Vocabulary extraction (mocked Vision LLM)
|
|
- Vocabulary editing
|
|
- Worksheet generation
|
|
- PDF export
|
|
|
|
DSGVO Note: All tests run locally without external API calls.
|
|
|
|
BACKLOG: Feature not yet integrated into main.py
|
|
See: https://macmini:3002/infrastructure/tests
|
|
"""
|
|
|
|
import pytest
|
|
import sys
|
|
|
|
# The API module is still a backlog item: when it cannot be imported the
# whole test module is skipped.
pytest.importorskip(
    "vocab_worksheet_api",
    reason="vocab_worksheet_api not yet integrated - Backlog item",
)

# Every test in this module is marked as an expected failure (backlog item).
# strict=False: a test that unexpectedly passes does not fail the run.
pytestmark = pytest.mark.xfail(
    strict=False,
    reason="vocab_worksheet_api not yet integrated into main.py - Backlog item",
)
|
|
import json
|
|
import uuid
|
|
import io
|
|
from datetime import datetime
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from fastapi.testclient import TestClient
|
|
|
|
# Import the main app and vocab-worksheet components
|
|
sys.path.insert(0, '..')
|
|
from main import app
|
|
from vocab_worksheet_api import (
|
|
_sessions,
|
|
_worksheets,
|
|
SessionStatus,
|
|
WorksheetType,
|
|
VocabularyEntry,
|
|
SessionCreate,
|
|
VocabularyUpdate,
|
|
WorksheetGenerateRequest,
|
|
parse_vocabulary_json,
|
|
)
|
|
|
|
|
|
# =============================================
|
|
# FIXTURES
|
|
# =============================================
|
|
|
|
@pytest.fixture
def client():
    """Provide a ``TestClient`` wrapping the FastAPI application under test."""
    test_client = TestClient(app)
    return test_client
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def clear_storage():
    """Empty the in-memory session and worksheet stores around every test.

    The stores are cleared once before the test body runs and once more
    afterwards, so no state carries over between tests.
    """
    stores = (_sessions, _worksheets)
    for store in stores:
        store.clear()
    yield
    for store in stores:
        store.clear()
|
|
|
|
|
|
@pytest.fixture
def sample_session_data():
    """Return a complete request payload for creating a vocabulary session."""
    return dict(
        name="Englisch Klasse 7 - Unit 3",
        description="Vokabeln aus Green Line 3",
        source_language="en",
        target_language="de",
    )
|
|
|
|
|
|
@pytest.fixture
def sample_vocabulary():
    """Return three vocabulary entries for page 1, each with a fresh UUID."""
    # (english, german, example sentence, word type)
    rows = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals.", "v"),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement.", "n"),
        ("improve", "verbessern", "I want to improve my English.", "v"),
    )
    return [
        {
            "id": str(uuid.uuid4()),
            "english": english,
            "german": german,
            "example_sentence": sentence,
            "word_type": word_type,
            "source_page": 1,
        }
        for english, german, sentence, word_type in rows
    ]
|
|
|
|
|
|
@pytest.fixture
def sample_image_bytes():
    """Create a minimal valid PNG image (1x1 pixel, white).

    The image is assembled from its three mandatory chunks (IHDR, IDAT,
    IEND) instead of being spelled out as a hand-typed byte table. The
    previous literal carried a wrong byte inside the compressed IDAT
    payload, so the image could not be decoded; computing the compressed
    data and chunk CRCs here keeps the file valid by construction.

    Returns:
        bytes: A complete PNG file: 8-bit RGB, 1x1, single white pixel.
    """
    # Local imports keep this fixture self-contained.
    import struct
    import zlib

    def png_chunk(tag, payload):
        """Frame *payload* as a PNG chunk: length, type, data, CRC-32."""
        body = tag + payload
        checksum = zlib.crc32(body) & 0xFFFFFFFF
        return struct.pack(">I", len(payload)) + body + struct.pack(">I", checksum)

    signature = b"\x89PNG\r\n\x1a\n"
    # width=1, height=1, bit depth 8, colour type 2 (RGB),
    # compression 0, filter 0, no interlace.
    header = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
    # One scanline: filter type 0 followed by a single white RGB pixel.
    scanline = b"\x00\xff\xff\xff"

    return (
        signature
        + png_chunk(b"IHDR", header)
        + png_chunk(b"IDAT", zlib.compress(scanline))
        + png_chunk(b"IEND", b"")
    )
|
|
|
|
|
|
# =============================================
|
|
# SESSION TESTS
|
|
# =============================================
|
|
|
|
class TestSessionCRUD:
    """Exercise the session endpoints: create, list, fetch and delete."""

    _SESSIONS_URL = "/api/v1/vocab/sessions"

    def _create(self, client, payload):
        """POST *payload* to the sessions endpoint and return the new session id."""
        return client.post(self._SESSIONS_URL, json=payload).json()["id"]

    def test_create_session(self, client, sample_session_data):
        """A full payload is echoed back; the session starts pending and empty."""
        response = client.post(self._SESSIONS_URL, json=sample_session_data)
        assert response.status_code == 200

        created = response.json()
        assert "id" in created
        assert created["name"] == sample_session_data["name"]
        assert created["description"] == sample_session_data["description"]

        expected_fields = {
            "source_language": "en",
            "target_language": "de",
            "status": "pending",
            "vocabulary_count": 0,
        }
        for key, expected in expected_fields.items():
            assert created[key] == expected

    def test_create_session_minimal(self, client):
        """Only a name is required; both languages fall back to their defaults."""
        response = client.post(self._SESSIONS_URL, json={"name": "Test"})
        assert response.status_code == 200

        created = response.json()
        assert created["name"] == "Test"
        assert created["source_language"] == "en"  # Default
        assert created["target_language"] == "de"  # Default

    def test_list_sessions_empty(self, client):
        """Listing with no sessions stored yields an empty list."""
        response = client.get(self._SESSIONS_URL)

        assert response.status_code == 200
        assert response.json() == []

    def test_list_sessions(self, client, sample_session_data):
        """Listing returns every session that was created beforehand."""
        # Create 3 sessions named "Session 1" .. "Session 3"
        for number in range(1, 4):
            payload = {**sample_session_data, "name": f"Session {number}"}
            client.post(self._SESSIONS_URL, json=payload)

        response = client.get(self._SESSIONS_URL)

        assert response.status_code == 200
        assert len(response.json()) == 3

    def test_get_session(self, client, sample_session_data):
        """A created session can be fetched again by its id."""
        session_id = self._create(client, sample_session_data)

        response = client.get(f"{self._SESSIONS_URL}/{session_id}")

        assert response.status_code == 200
        fetched = response.json()
        assert fetched["id"] == session_id
        assert fetched["name"] == sample_session_data["name"]

    def test_get_session_not_found(self, client):
        """Fetching an unknown id answers 404 with a 'not found' detail."""
        unknown_id = str(uuid.uuid4())

        response = client.get(f"{self._SESSIONS_URL}/{unknown_id}")

        assert response.status_code == 404
        assert "not found" in response.json()["detail"].lower()

    def test_delete_session(self, client, sample_session_data):
        """Deleting a session confirms the deletion and removes the session."""
        session_id = self._create(client, sample_session_data)
        session_url = f"{self._SESSIONS_URL}/{session_id}"

        response = client.delete(session_url)

        assert response.status_code == 200
        assert "deleted" in response.json()["message"].lower()

        # The session must no longer be retrievable
        assert client.get(session_url).status_code == 404

    def test_delete_session_not_found(self, client):
        """Deleting an unknown id answers 404."""
        unknown_id = str(uuid.uuid4())

        response = client.delete(f"{self._SESSIONS_URL}/{unknown_id}")

        assert response.status_code == 404
|
|
|
|
|
|
# =============================================
|
|
# VOCABULARY TESTS
|
|
# =============================================
|
|
|
|
class TestVocabulary:
    """Read and replace the vocabulary list attached to a session."""

    @staticmethod
    def _vocabulary_url(session_id):
        """Build the vocabulary endpoint URL for *session_id*."""
        return f"/api/v1/vocab/sessions/{session_id}/vocabulary"

    @staticmethod
    def _new_session_id(client, payload):
        """Create a session from *payload* and return its id."""
        created = client.post("/api/v1/vocab/sessions", json=payload)
        return created.json()["id"]

    def test_get_vocabulary_empty(self, client, sample_session_data):
        """A freshly created session reports an empty vocabulary list."""
        session_id = self._new_session_id(client, sample_session_data)

        response = client.get(self._vocabulary_url(session_id))

        assert response.status_code == 200
        body = response.json()
        assert body["session_id"] == session_id
        assert body["vocabulary"] == []

    def test_update_vocabulary(self, client, sample_session_data, sample_vocabulary):
        """PUT stores the entries; a later GET returns all of them."""
        session_id = self._new_session_id(client, sample_session_data)
        vocabulary_url = self._vocabulary_url(session_id)

        response = client.put(vocabulary_url, json={"vocabulary": sample_vocabulary})

        assert response.status_code == 200
        assert response.json()["vocabulary_count"] == 3

        # Read the entries back to confirm they were saved
        stored = client.get(vocabulary_url).json()
        assert len(stored["vocabulary"]) == 3

    def test_update_vocabulary_not_found(self, client, sample_vocabulary):
        """Updating vocabulary of an unknown session answers 404."""
        unknown_id = str(uuid.uuid4())

        response = client.put(
            self._vocabulary_url(unknown_id),
            json={"vocabulary": sample_vocabulary},
        )

        assert response.status_code == 404
|
|
|
|
|
|
# =============================================
|
|
# WORKSHEET GENERATION TESTS
|
|
# =============================================
|
|
|
|
class TestWorksheetGeneration:
    """Generate worksheets from a session's vocabulary (PDF rendering mocked)."""

    @staticmethod
    def _new_session_id(client, payload):
        """Create a session from *payload* and return its id."""
        return client.post("/api/v1/vocab/sessions", json=payload).json()["id"]

    @staticmethod
    def _store_vocabulary(client, session_id, entries):
        """Attach *entries* to the session through the vocabulary endpoint."""
        client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": entries},
        )

    def test_generate_worksheet_no_vocabulary(self, client, sample_session_data):
        """Generation is rejected with 400 while the session has no vocabulary."""
        session_id = self._new_session_id(client, sample_session_data)

        request_body = {
            "worksheet_types": ["en_to_de"],
            "include_solutions": True,
        }
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json=request_body,
        )

        assert response.status_code == 400
        assert "no vocabulary" in response.json()["detail"].lower()

    @patch('vocab_worksheet_api.generate_worksheet_pdf')
    def test_generate_worksheet_success(
        self, pdf_stub, client, sample_session_data, sample_vocabulary
    ):
        """With vocabulary present, generation succeeds and echoes the requested types."""
        # The PDF renderer is replaced by a stub returning fake bytes
        pdf_stub.return_value = b"%PDF-1.4 fake pdf content"

        session_id = self._new_session_id(client, sample_session_data)
        self._store_vocabulary(client, session_id, sample_vocabulary)

        request_body = {
            "worksheet_types": ["en_to_de", "de_to_en"],
            "include_solutions": True,
            "line_height": "large",
        }
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json=request_body,
        )

        assert response.status_code == 200
        worksheet = response.json()
        assert "id" in worksheet
        assert worksheet["session_id"] == session_id
        for requested in ("en_to_de", "de_to_en"):
            assert requested in worksheet["worksheet_types"]

    def test_generate_worksheet_all_types(self, client, sample_session_data, sample_vocabulary):
        """Each supported worksheet type is accepted on its own."""
        session_id = self._new_session_id(client, sample_session_data)
        self._store_vocabulary(client, session_id, sample_vocabulary)
        generate_url = f"/api/v1/vocab/sessions/{session_id}/generate"

        # One request per worksheet type, each with a freshly patched renderer
        for wtype in ("en_to_de", "de_to_en", "copy", "gap_fill"):
            with patch('vocab_worksheet_api.generate_worksheet_pdf') as pdf_stub:
                pdf_stub.return_value = b"%PDF-1.4 fake"
                response = client.post(generate_url, json={"worksheet_types": [wtype]})
                assert response.status_code == 200, f"Failed for type: {wtype}"
|
|
|
|
|
|
# =============================================
|
|
# JSON PARSING TESTS
|
|
# =============================================
|
|
|
|
class TestJSONParsing:
    """Check how ``parse_vocabulary_json`` turns LLM text output into entries."""

    def test_parse_valid_json(self):
        """A clean JSON document yields one entry per vocabulary item."""
        llm_output = '''
        {
            "vocabulary": [
                {"english": "achieve", "german": "erreichen"},
                {"english": "improve", "german": "verbessern"}
            ]
        }
        '''
        entries = parse_vocabulary_json(llm_output)

        assert len(entries) == 2
        first = entries[0]
        assert first.english == "achieve"
        assert first.german == "erreichen"

    def test_parse_json_with_extra_text(self):
        """Prose before and after the JSON object does not prevent parsing."""
        llm_output = '''
        Here is the extracted vocabulary:

        {
            "vocabulary": [
                {"english": "success", "german": "Erfolg"}
            ]
        }

        I found 1 vocabulary entry.
        '''
        entries = parse_vocabulary_json(llm_output)

        assert len(entries) == 1
        assert entries[0].english == "success"

    def test_parse_json_with_examples(self):
        """The ``example`` key ends up in the entry's ``example_sentence``."""
        llm_output = '''
        {
            "vocabulary": [
                {
                    "english": "achieve",
                    "german": "erreichen",
                    "example": "She achieved her goals."
                }
            ]
        }
        '''
        entries = parse_vocabulary_json(llm_output)

        assert len(entries) == 1
        assert entries[0].example_sentence == "She achieved her goals."

    def test_parse_empty_response(self):
        """Empty text and text without any JSON both produce an empty list."""
        for unusable in ("", "no json here"):
            assert parse_vocabulary_json(unusable) == []

    def test_parse_json_missing_fields(self):
        """Items lacking a required field are dropped; complete ones are kept."""
        llm_output = '''
        {
            "vocabulary": [
                {"english": "valid", "german": "gueltig"},
                {"english": ""},
                {"german": "nur deutsch"},
                {"english": "also valid", "german": "auch gueltig"}
            ]
        }
        '''
        entries = parse_vocabulary_json(llm_output)

        # Only the two items carrying both english and german survive
        assert len(entries) == 2
        kept_first, kept_second = entries[0], entries[1]
        assert kept_first.english == "valid"
        assert kept_second.english == "also valid"
|
|
|
|
|
|
# =============================================
|
|
# FILE UPLOAD TESTS
|
|
# =============================================
|
|
|
|
class TestFileUpload:
    """Upload files to a session (vocabulary extraction mocked)."""

    @staticmethod
    def _new_session_id(client, payload):
        """Create a session from *payload* and return its id."""
        return client.post("/api/v1/vocab/sessions", json=payload).json()["id"]

    @patch('vocab_worksheet_api.extract_vocabulary_from_image')
    def test_upload_image(self, extract_stub, client, sample_session_data, sample_image_bytes):
        """Uploading a PNG stores the single entry returned by the extractor."""
        # The extractor stub returns (entries, 0.85, "")
        extracted_entry = VocabularyEntry(
            id=str(uuid.uuid4()),
            english="test",
            german="Test",
        )
        extract_stub.return_value = ([extracted_entry], 0.85, "")

        session_id = self._new_session_id(client, sample_session_data)

        upload = ("test.png", io.BytesIO(sample_image_bytes), "image/png")
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/upload",
            files={"file": upload},
        )

        assert response.status_code == 200
        body = response.json()
        assert body["session_id"] == session_id
        assert body["vocabulary_count"] == 1

    def test_upload_invalid_file_type(self, client, sample_session_data):
        """A plain-text upload is rejected with 400 and a 'supported' hint."""
        session_id = self._new_session_id(client, sample_session_data)

        upload = ("test.txt", io.BytesIO(b"hello"), "text/plain")
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/upload",
            files={"file": upload},
        )

        assert response.status_code == 400
        assert "supported" in response.json()["detail"].lower()
|
|
|
|
|
|
# =============================================
|
|
# STATUS WORKFLOW TESTS
|
|
# =============================================
|
|
|
|
class TestSessionStatus:
    """Follow the session status through pending, extracted and completed."""

    @staticmethod
    def _new_session_id(client, payload):
        """Create a session from *payload* and return its id."""
        return client.post("/api/v1/vocab/sessions", json=payload).json()["id"]

    @staticmethod
    def _current_status(client, session_id):
        """Fetch the session and return its ``status`` field."""
        return client.get(f"/api/v1/vocab/sessions/{session_id}").json()["status"]

    def test_initial_status_pending(self, client, sample_session_data):
        """A newly created session reports the status 'pending'."""
        created = client.post("/api/v1/vocab/sessions", json=sample_session_data)

        assert created.json()["status"] == "pending"

    @patch('vocab_worksheet_api.extract_vocabulary_from_image')
    def test_status_after_extraction(self, extract_stub, client, sample_session_data, sample_image_bytes):
        """After an image upload has been processed the status is 'extracted'."""
        extract_stub.return_value = ([], 0.0, "")

        session_id = self._new_session_id(client, sample_session_data)

        upload = ("test.png", io.BytesIO(sample_image_bytes), "image/png")
        client.post(
            f"/api/v1/vocab/sessions/{session_id}/upload",
            files={"file": upload},
        )

        assert self._current_status(client, session_id) == "extracted"

    @patch('vocab_worksheet_api.generate_worksheet_pdf')
    def test_status_after_generation(self, pdf_stub, client, sample_session_data, sample_vocabulary):
        """After a worksheet has been generated the status is 'completed'."""
        pdf_stub.return_value = b"%PDF"

        session_id = self._new_session_id(client, sample_session_data)

        # Store vocabulary first, then request a worksheet
        client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": sample_vocabulary},
        )
        client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json={"worksheet_types": ["en_to_de"]},
        )

        assert self._current_status(client, session_id) == "completed"
|
|
|
|
|
|
# =============================================
|
|
# EDGE CASES
|
|
# =============================================
|
|
|
|
class TestEdgeCases:
    """Cover unusual inputs: non-ASCII names, long entries and list limits."""

    _SESSIONS_URL = "/api/v1/vocab/sessions"

    def test_session_with_special_characters(self, client):
        """Umlauts and symbols in the session name survive the round trip."""
        payload = {"name": "Englisch Klasse 7 - äöü ß € @"}

        response = client.post(self._SESSIONS_URL, json=payload)

        assert response.status_code == 200
        assert "äöü" in response.json()["name"]

    def test_vocabulary_with_long_entries(self, client, sample_session_data):
        """Entries with 100/200/500-character fields are accepted."""
        created = client.post(self._SESSIONS_URL, json=sample_session_data)
        session_id = created.json()["id"]

        oversized_entry = {
            "id": str(uuid.uuid4()),
            "english": "a" * 100,
            "german": "b" * 200,
            "example_sentence": "c" * 500,
        }

        response = client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": [oversized_entry]},
        )

        assert response.status_code == 200

    def test_sessions_limit(self, client, sample_session_data):
        """The ``limit`` query parameter caps the number of listed sessions."""
        # Create 10 sessions named "Session 1" .. "Session 10"
        for number in range(1, 11):
            payload = {**sample_session_data, "name": f"Session {number}"}
            client.post(self._SESSIONS_URL, json=payload)

        response = client.get("/api/v1/vocab/sessions?limit=5")

        assert response.status_code == 200
        assert len(response.json()) == 5
|
|
|
|
|
|
# =============================================
|
|
# OCR PIPELINE INTEGRATION TESTS
|
|
# =============================================
|
|
|
|
class TestProcessSinglePageOCRPipeline:
    """Tests for the OCR pipeline integration in process-single-page."""

    @staticmethod
    def _seed_pdf_session(page_count):
        """Put an uploaded-PDF session directly into ``_sessions``; return its id."""
        session_id = str(uuid.uuid4())
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": b"%PDF-1.4 fake",
            "pdf_page_count": page_count,
            "vocabulary": [],
        }
        return session_id

    @staticmethod
    def _process_page_zero(client, session_id):
        """POST process-single-page/0 while PDF-to-image conversion is mocked."""
        with patch(
            "vocab_worksheet_api.convert_pdf_page_to_image",
            new_callable=AsyncMock,
        ) as render_stub:
            render_stub.return_value = b"fake-png-data"
            return client.post(
                f"/api/v1/vocab/sessions/{session_id}/process-single-page/0"
            )

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_uses_ocr_pipeline(self, pipeline_stub, client):
        """When OCR pipeline is available, process-single-page should use it."""
        session_id = self._seed_pdf_session(page_count=2)

        # Entries the mocked pipeline hands back
        pipeline_stub.return_value = [
            {
                "id": str(uuid.uuid4()),
                "english": "to achieve",
                "german": "erreichen",
                "example_sentence": "She achieved her goal.",
                "source_page": 1,
            },
            {
                "id": str(uuid.uuid4()),
                "english": "goal",
                "german": "Ziel",
                "example_sentence": "",
                "source_page": 1,
            },
        ]

        response = self._process_page_zero(client, session_id)

        assert response.status_code == 200
        body = response.json()
        assert body["success"] is True
        assert body["vocabulary_count"] == 2
        first_entry = body["vocabulary"][0]
        assert first_entry["english"] == "to achieve"
        assert first_entry["source_page"] == 1

        # The pipeline receives the rendered image, the page index and the session id
        pipeline_stub.assert_called_once_with(b"fake-png-data", 0, session_id)

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_ocr_pipeline_error_returns_failure(self, pipeline_stub, client):
        """When the OCR pipeline raises an exception, return success=False."""
        session_id = self._seed_pdf_session(page_count=1)
        pipeline_stub.side_effect = ValueError("Column detection failed")

        response = self._process_page_zero(client, session_id)

        assert response.status_code == 200
        body = response.json()
        assert body["success"] is False
        assert "OCR pipeline error" in body["error"]
        assert body["vocabulary"] == []

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", False)
    @patch("vocab_worksheet_api.extract_vocabulary_from_image", new_callable=AsyncMock)
    def test_process_single_page_fallback_to_llm(self, llm_stub, client):
        """When OCR pipeline is not available, fall back to LLM vision."""
        session_id = self._seed_pdf_session(page_count=1)

        # Stand-in entry object whose .dict() yields a plain vocabulary dict
        entry_stub = MagicMock()
        entry_stub.dict.return_value = {
            "id": str(uuid.uuid4()),
            "english": "house",
            "german": "Haus",
            "example_sentence": "",
        }
        llm_stub.return_value = ([entry_stub], 0.85, None)

        response = self._process_page_zero(client, session_id)

        assert response.status_code == 200
        body = response.json()
        assert body["success"] is True
        assert body["vocabulary_count"] == 1
        assert body["vocabulary"][0]["english"] == "house"
|
|
|
|
|
|
# =============================================
|
|
# RUN TESTS
|
|
# =============================================
|
|
|
|
if __name__ == "__main__":
    # Run this module's tests verbosely when the file is executed directly.
    # pytest.main() returns the exit code; raising SystemExit with it makes
    # the process exit status reflect the test outcome instead of always 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|