"""
Unit Tests for Vocab-Worksheet API

Tests cover:
- Session CRUD (create, read, list, delete)
- File upload (images and PDFs)
- PDF page handling (thumbnails, page selection)
- Vocabulary extraction (mocked Vision LLM)
- Vocabulary editing
- Worksheet generation
- PDF export

DSGVO (GDPR) Note: All tests run locally without external API calls.

BACKLOG: Feature not yet integrated into main.py
See: https://macmini:3002/infrastructure/tests
"""

import pytest
import sys

# Path setup must happen BEFORE the importorskip probe below: if
# vocab_worksheet_api is only reachable through this entry, probing first
# would skip the whole module even though the feature is present.
# NOTE(review): '..' is resolved against the current working directory, not
# against this file's location - confirm the tests are always started from
# the tests directory.
sys.path.insert(0, '..')

# Skip entire module if vocab_worksheet_api is not available
pytest.importorskip(
    "vocab_worksheet_api",
    reason="vocab_worksheet_api not yet integrated - Backlog item",
)

# Mark all tests in this module as expected failures (backlog item)
pytestmark = pytest.mark.xfail(
    reason="vocab_worksheet_api not yet integrated into main.py - Backlog item",
    strict=False  # Don't fail if test unexpectedly passes
)

# The remaining imports intentionally come after the skip guard so that a
# missing vocab_worksheet_api results in a skip instead of an import error.
import json
import uuid
import io
from datetime import datetime
from unittest.mock import AsyncMock, MagicMock, patch
from fastapi.testclient import TestClient

# Import the main app and vocab-worksheet components
from main import app
from vocab_worksheet_api import (
    _sessions,
    _worksheets,
    SessionStatus,
    WorksheetType,
    VocabularyEntry,
    SessionCreate,
    VocabularyUpdate,
    WorksheetGenerateRequest,
    parse_vocabulary_json,
)


# =============================================
# FIXTURES
# =============================================

@pytest.fixture
def client():
    """Test client for FastAPI app."""
    return TestClient(app)


@pytest.fixture(autouse=True)
def clear_storage():
    """Clear in-memory storage before and after each test."""
    _sessions.clear()
    _worksheets.clear()
    yield
    # Clear again on teardown so no state leaks into other test modules.
    _sessions.clear()
    _worksheets.clear()


@pytest.fixture
def sample_session_data():
    """Sample session creation data."""
    return {
        "name": "Englisch Klasse 7 - Unit 3",
        "description": "Vokabeln aus Green Line 3",
        "source_language": "en",
        "target_language": "de"
    }


@pytest.fixture
def sample_vocabulary():
    """Sample vocabulary entries (three entries, all from source page 1)."""
    return [
        {
            "id": str(uuid.uuid4()),
            "english": "to achieve",
            "german": "erreichen, erzielen",
            "example_sentence": "She achieved her goals.",
            "word_type": "v",
            "source_page": 1
        },
        {
            "id": str(uuid.uuid4()),
            "english": "achievement",
            "german": "Leistung, Errungenschaft",
            "example_sentence": "That was a great achievement.",
            "word_type": "n",
            "source_page": 1
        },
        {
            "id": str(uuid.uuid4()),
            "english": "improve",
            "german": "verbessern",
            "example_sentence": "I want to improve my English.",
            "word_type": "v",
            "source_page": 1
        }
    ]


@pytest.fixture
def sample_image_bytes():
    """Create a minimal valid PNG image (1x1 pixel, white)."""
    # Minimal PNG: 1x1 white pixel
    png_data = bytes([
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,  # PNG signature
        0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,  # IHDR chunk
        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,  # 1x1
        0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53,
        0xDE, 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41,  # IDAT chunk
        0x54, 0x08, 0xD7, 0x63, 0xF8, 0xFF, 0xFF, 0xFF,
        0x00, 0x05, 0xFE, 0x02, 0xFE, 0xDC, 0xCC, 0x59,
        0xE7, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E,  # IEND chunk
        0x44, 0xAE, 0x42, 0x60, 0x82
    ])
    return png_data


# =============================================
# SESSION TESTS
# =============================================

class TestSessionCRUD:
    """Test session create, read, list and delete operations."""

    def test_create_session(self, client, sample_session_data):
        """Test creating a new vocabulary session."""
        response = client.post("/api/v1/vocab/sessions", json=sample_session_data)

        assert response.status_code == 200
        data = response.json()

        assert "id" in data
        assert data["name"] == sample_session_data["name"]
        assert data["description"] == sample_session_data["description"]
        assert data["source_language"] == "en"
        assert data["target_language"] == "de"
        assert data["status"] == "pending"
        assert data["vocabulary_count"] == 0

    def test_create_session_minimal(self, client):
        """Test creating session with minimal data."""
        response = client.post("/api/v1/vocab/sessions", json={"name": "Test"})

        assert response.status_code == 200
        data = response.json()

        assert data["name"] == "Test"
        assert data["source_language"] == "en"  # Default
        assert data["target_language"] == "de"  # Default

    def test_list_sessions_empty(self, client):
        """Test listing sessions when none exist."""
        response = client.get("/api/v1/vocab/sessions")

        assert response.status_code == 200
        assert response.json() == []

    def test_list_sessions(self, client, sample_session_data):
        """Test listing sessions after creating some."""
        # Create 3 sessions
        for i in range(3):
            data = sample_session_data.copy()
            data["name"] = f"Session {i+1}"
            client.post("/api/v1/vocab/sessions", json=data)

        response = client.get("/api/v1/vocab/sessions")

        assert response.status_code == 200
        sessions = response.json()
        assert len(sessions) == 3

    def test_get_session(self, client, sample_session_data):
        """Test getting a specific session."""
        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Get session
        response = client.get(f"/api/v1/vocab/sessions/{session_id}")

        assert response.status_code == 200
        data = response.json()
        assert data["id"] == session_id
        assert data["name"] == sample_session_data["name"]

    def test_get_session_not_found(self, client):
        """Test getting non-existent session."""
        fake_id = str(uuid.uuid4())
        response = client.get(f"/api/v1/vocab/sessions/{fake_id}")

        assert response.status_code == 404
        assert "not found" in response.json()["detail"].lower()

    def test_delete_session(self, client, sample_session_data):
        """Test deleting a session."""
        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Delete session
        response = client.delete(f"/api/v1/vocab/sessions/{session_id}")

        assert response.status_code == 200
        assert "deleted" in response.json()["message"].lower()

        # Verify it's gone
        get_response = client.get(f"/api/v1/vocab/sessions/{session_id}")
        assert get_response.status_code == 404

    def test_delete_session_not_found(self, client):
        """Test deleting non-existent session."""
        fake_id = str(uuid.uuid4())
        response = client.delete(f"/api/v1/vocab/sessions/{fake_id}")

        assert response.status_code == 404


# =============================================
# VOCABULARY TESTS
# =============================================

class TestVocabulary:
    """Test vocabulary operations."""

    def test_get_vocabulary_empty(self, client, sample_session_data):
        """Test getting vocabulary from session with no vocabulary."""
        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Get vocabulary
        response = client.get(f"/api/v1/vocab/sessions/{session_id}/vocabulary")

        assert response.status_code == 200
        data = response.json()
        assert data["session_id"] == session_id
        assert data["vocabulary"] == []

    def test_update_vocabulary(self, client, sample_session_data, sample_vocabulary):
        """Test updating vocabulary entries."""
        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Update vocabulary
        response = client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": sample_vocabulary}
        )

        assert response.status_code == 200
        data = response.json()
        assert data["vocabulary_count"] == 3

        # Verify vocabulary was saved
        get_response = client.get(f"/api/v1/vocab/sessions/{session_id}/vocabulary")
        vocab_data = get_response.json()
        assert len(vocab_data["vocabulary"]) == 3

    def test_update_vocabulary_not_found(self, client, sample_vocabulary):
        """Test updating vocabulary for non-existent session."""
        fake_id = str(uuid.uuid4())
        response = client.put(
            f"/api/v1/vocab/sessions/{fake_id}/vocabulary",
            json={"vocabulary": sample_vocabulary}
        )

        assert response.status_code == 404


# =============================================
# WORKSHEET GENERATION TESTS
# =============================================

class TestWorksheetGeneration:
    """Test worksheet generation."""

    def test_generate_worksheet_no_vocabulary(self, client, sample_session_data):
        """Test generating worksheet without vocabulary fails."""
        # Create session (no vocabulary)
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Try to generate worksheet
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json={
                "worksheet_types": ["en_to_de"],
                "include_solutions": True
            }
        )

        assert response.status_code == 400
        assert "no vocabulary" in response.json()["detail"].lower()

    @patch('vocab_worksheet_api.generate_worksheet_pdf')
    def test_generate_worksheet_success(
        self, mock_pdf, client, sample_session_data, sample_vocabulary
    ):
        """Test successful worksheet generation."""
        # Mock PDF generation to return fake bytes
        mock_pdf.return_value = b"%PDF-1.4 fake pdf content"

        # Create session with vocabulary
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Add vocabulary
        client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": sample_vocabulary}
        )

        # Generate worksheet
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json={
                "worksheet_types": ["en_to_de", "de_to_en"],
                "include_solutions": True,
                "line_height": "large"
            }
        )

        assert response.status_code == 200
        data = response.json()
        assert "id" in data
        assert data["session_id"] == session_id
        assert "en_to_de" in data["worksheet_types"]
        assert "de_to_en" in data["worksheet_types"]

    def test_generate_worksheet_all_types(self, client, sample_session_data, sample_vocabulary):
        """Test that all worksheet types are accepted."""
        # Create session with vocabulary
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Add vocabulary
        client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": sample_vocabulary}
        )

        # Test each worksheet type; PDF rendering is mocked for every request
        # so only the acceptance of the type value is exercised.
        for wtype in ["en_to_de", "de_to_en", "copy", "gap_fill"]:
            with patch(
                'vocab_worksheet_api.generate_worksheet_pdf',
                return_value=b"%PDF-1.4 fake",
            ):
                response = client.post(
                    f"/api/v1/vocab/sessions/{session_id}/generate",
                    json={"worksheet_types": [wtype]}
                )

            assert response.status_code == 200, f"Failed for type: {wtype}"


# =============================================
# JSON PARSING TESTS
# =============================================

class TestJSONParsing:
    """Test vocabulary JSON parsing from LLM responses."""

    def test_parse_valid_json(self):
        """Test parsing valid JSON response."""
        response = '''
        {
            "vocabulary": [
                {"english": "achieve", "german": "erreichen"},
                {"english": "improve", "german": "verbessern"}
            ]
        }
        '''

        result = parse_vocabulary_json(response)

        assert len(result) == 2
        assert result[0].english == "achieve"
        assert result[0].german == "erreichen"

    def test_parse_json_with_extra_text(self):
        """Test parsing JSON with surrounding text."""
        response = '''
        Here is the extracted vocabulary:

        {
            "vocabulary": [
                {"english": "success", "german": "Erfolg"}
            ]
        }

        I found 1 vocabulary entry.
        '''

        result = parse_vocabulary_json(response)

        assert len(result) == 1
        assert result[0].english == "success"

    def test_parse_json_with_examples(self):
        """Test parsing JSON with example sentences."""
        # The input uses the key "example"; the parsed entry is expected to
        # expose it as `example_sentence`.
        response = '''
        {
            "vocabulary": [
                {
                    "english": "achieve",
                    "german": "erreichen",
                    "example": "She achieved her goals."
                }
            ]
        }
        '''

        result = parse_vocabulary_json(response)

        assert len(result) == 1
        assert result[0].example_sentence == "She achieved her goals."

    def test_parse_empty_response(self):
        """Test parsing empty/invalid response."""
        result = parse_vocabulary_json("")
        assert result == []

        result = parse_vocabulary_json("no json here")
        assert result == []

    def test_parse_json_missing_fields(self):
        """Test that entries without required fields are skipped."""
        response = '''
        {
            "vocabulary": [
                {"english": "valid", "german": "gueltig"},
                {"english": ""},
                {"german": "nur deutsch"},
                {"english": "also valid", "german": "auch gueltig"}
            ]
        }
        '''

        result = parse_vocabulary_json(response)

        # Only entries with both english and german should be included
        assert len(result) == 2
        assert result[0].english == "valid"
        assert result[1].english == "also valid"


# =============================================
# FILE UPLOAD TESTS
# =============================================

class TestFileUpload:
    """Test file upload functionality."""

    @patch('vocab_worksheet_api.extract_vocabulary_from_image')
    def test_upload_image(self, mock_extract, client, sample_session_data, sample_image_bytes):
        """Test uploading an image file."""
        # Mock extraction to return sample vocabulary
        # NOTE(review): the tuple looks like (entries, confidence, error) -
        # confirm against extract_vocabulary_from_image.
        mock_extract.return_value = (
            [
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english="test",
                    german="Test"
                )
            ],
            0.85,
            ""
        )

        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Upload image
        files = {"file": ("test.png", io.BytesIO(sample_image_bytes), "image/png")}
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/upload",
            files=files
        )

        assert response.status_code == 200
        data = response.json()
        assert data["session_id"] == session_id
        assert data["vocabulary_count"] == 1

    def test_upload_invalid_file_type(self, client, sample_session_data):
        """Test uploading invalid file type."""
        # Create session
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Try to upload a text file
        files = {"file": ("test.txt", io.BytesIO(b"hello"), "text/plain")}
        response = client.post(
            f"/api/v1/vocab/sessions/{session_id}/upload",
            files=files
        )

        assert response.status_code == 400
        assert "supported" in response.json()["detail"].lower()


# =============================================
# STATUS WORKFLOW TESTS
# =============================================

class TestSessionStatus:
    """Test session status transitions."""

    def test_initial_status_pending(self, client, sample_session_data):
        """Test that new session has PENDING status."""
        response = client.post("/api/v1/vocab/sessions", json=sample_session_data)

        assert response.json()["status"] == "pending"

    @patch('vocab_worksheet_api.extract_vocabulary_from_image')
    def test_status_after_extraction(self, mock_extract, client, sample_session_data, sample_image_bytes):
        """Test that status becomes EXTRACTED after processing."""
        # An empty extraction result is enough here: only the status
        # transition is asserted, not the vocabulary.
        mock_extract.return_value = ([], 0.0, "")

        # Create and upload
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        files = {"file": ("test.png", io.BytesIO(sample_image_bytes), "image/png")}
        client.post(f"/api/v1/vocab/sessions/{session_id}/upload", files=files)

        # Check status
        get_response = client.get(f"/api/v1/vocab/sessions/{session_id}")
        assert get_response.json()["status"] == "extracted"

    @patch('vocab_worksheet_api.generate_worksheet_pdf')
    def test_status_after_generation(self, mock_pdf, client, sample_session_data, sample_vocabulary):
        """Test that status becomes COMPLETED after worksheet generation."""
        mock_pdf.return_value = b"%PDF"

        # Create session with vocabulary
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Add vocabulary
        client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": sample_vocabulary}
        )

        # Generate worksheet
        client.post(
            f"/api/v1/vocab/sessions/{session_id}/generate",
            json={"worksheet_types": ["en_to_de"]}
        )

        # Check status
        get_response = client.get(f"/api/v1/vocab/sessions/{session_id}")
        assert get_response.json()["status"] == "completed"


# =============================================
# EDGE CASES
# =============================================

class TestEdgeCases:
    """Test edge cases and error handling."""

    def test_session_with_special_characters(self, client):
        """Test session with special characters in name."""
        response = client.post(
            "/api/v1/vocab/sessions",
            json={"name": "Englisch Klasse 7 - äöü ß € @"}
        )

        assert response.status_code == 200
        assert "äöü" in response.json()["name"]

    def test_vocabulary_with_long_entries(self, client, sample_session_data):
        """Test vocabulary with very long entries."""
        create_response = client.post("/api/v1/vocab/sessions", json=sample_session_data)
        session_id = create_response.json()["id"]

        # Create vocabulary with long entries
        long_vocab = [{
            "id": str(uuid.uuid4()),
            "english": "a" * 100,
            "german": "b" * 200,
            "example_sentence": "c" * 500
        }]

        response = client.put(
            f"/api/v1/vocab/sessions/{session_id}/vocabulary",
            json={"vocabulary": long_vocab}
        )

        assert response.status_code == 200

    def test_sessions_limit(self, client, sample_session_data):
        """Test session listing with limit parameter."""
        # Create 10 sessions
        for i in range(10):
            data = sample_session_data.copy()
            data["name"] = f"Session {i+1}"
            client.post("/api/v1/vocab/sessions", json=data)

        # Get with limit
        response = client.get("/api/v1/vocab/sessions?limit=5")

        assert response.status_code == 200
        assert len(response.json()) == 5


# =============================================
# OCR PIPELINE INTEGRATION TESTS
# =============================================

class TestProcessSinglePageOCRPipeline:
    """Tests for the OCR pipeline integration in process-single-page.

    These tests seed `_sessions` directly instead of going through the
    upload endpoint, and mock `convert_pdf_page_to_image` so that no real
    PDF rendering takes place.
    """

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_uses_ocr_pipeline(self, mock_pipeline, client):
        """When OCR pipeline is available, process-single-page should use it."""
        # Create a session with PDF data
        session_id = str(uuid.uuid4())
        fake_pdf = b"%PDF-1.4 fake"
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": fake_pdf,
            "pdf_page_count": 2,
            "vocabulary": [],
        }

        # Mock the pipeline to return vocab entries
        mock_pipeline.return_value = [
            {
                "id": str(uuid.uuid4()),
                "english": "to achieve",
                "german": "erreichen",
                "example_sentence": "She achieved her goal.",
                "source_page": 1,
            },
            {
                "id": str(uuid.uuid4()),
                "english": "goal",
                "german": "Ziel",
                "example_sentence": "",
                "source_page": 1,
            },
        ]

        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
            mock_convert.return_value = b"fake-png-data"
            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")

        assert response.status_code == 200
        data = response.json()
        assert data["success"] is True
        assert data["vocabulary_count"] == 2
        assert data["vocabulary"][0]["english"] == "to achieve"
        assert data["vocabulary"][0]["source_page"] == 1

        # Verify pipeline was called with correct args
        mock_pipeline.assert_called_once_with(b"fake-png-data", 0, session_id)

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_ocr_pipeline_error_returns_failure(self, mock_pipeline, client):
        """When the OCR pipeline raises an exception, return success=False."""
        session_id = str(uuid.uuid4())
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": b"%PDF-1.4 fake",
            "pdf_page_count": 1,
            "vocabulary": [],
        }

        mock_pipeline.side_effect = ValueError("Column detection failed")

        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
            mock_convert.return_value = b"fake-png-data"
            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")

        # The endpoint is expected to report the failure in the payload
        # rather than through the HTTP status code.
        assert response.status_code == 200
        data = response.json()
        assert data["success"] is False
        assert "OCR pipeline error" in data["error"]
        assert data["vocabulary"] == []

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", False)
    @patch("vocab_worksheet_api.extract_vocabulary_from_image", new_callable=AsyncMock)
    def test_process_single_page_fallback_to_llm(self, mock_llm_extract, client):
        """When OCR pipeline is not available, fall back to LLM vision."""
        session_id = str(uuid.uuid4())
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": b"%PDF-1.4 fake",
            "pdf_page_count": 1,
            "vocabulary": [],
        }

        # Stand-in for an extracted entry; only its `.dict()` output is
        # configured here.
        mock_entry = MagicMock()
        mock_entry.dict.return_value = {
            "id": str(uuid.uuid4()),
            "english": "house",
            "german": "Haus",
            "example_sentence": "",
        }
        mock_llm_extract.return_value = ([mock_entry], 0.85, None)

        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
            mock_convert.return_value = b"fake-png-data"
            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")

        assert response.status_code == 200
        data = response.json()
        assert data["success"] is True
        assert data["vocabulary_count"] == 1
        assert data["vocabulary"][0]["english"] == "house"


# =============================================
# RUN TESTS
# =============================================

if __name__ == "__main__":
    pytest.main([__file__, "-v"])