feat: use OCR pipeline instead of LLM vision for vocab worksheet extraction

process-single-page now runs the full CV pipeline (deskew → dewarp → columns → rows → cell-first OCR v2 → LLM review) for much better extraction quality. It falls back to LLM vision if the pipeline imports are unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 15:35:44 +01:00
parent 9ea77ba157
commit b7ae36e92b
2 changed files with 400 additions and 29 deletions
@@ -615,6 +615,121 @@ class TestEdgeCases:
        assert len(response.json()) == 5


+# =============================================
+# OCR PIPELINE INTEGRATION TESTS
+# =============================================
+
+class TestProcessSinglePageOCRPipeline:
+    """Tests for the OCR pipeline integration in process-single-page.
+
+    Each test registers a session dict directly in ``_sessions``, stubs
+    ``convert_pdf_page_to_image`` so no real PDF rendering happens, and then
+    POSTs to ``/api/v1/vocab/sessions/{session_id}/process-single-page/0``.
+    The ``OCR_PIPELINE_AVAILABLE`` flag is patched per test to select the
+    branch under test.
+
+    NOTE(review): the sessions added here are not removed by the tests
+    themselves; confirm that a fixture resets ``_sessions`` between tests.
+    ``client`` is presumably a pytest fixture defined elsewhere in this
+    test module -- TODO confirm.
+    """
+
+    # The OCR_PIPELINE_AVAILABLE patch is given an explicit replacement value
+    # (True), so it injects no argument; only the
+    # `_run_ocr_pipeline_for_page` patch supplies `mock_pipeline`.
+    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
+    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
+    def test_process_single_page_uses_ocr_pipeline(self, mock_pipeline, client):
+        """When OCR pipeline is available, process-single-page should use it."""
+        # Create a session with PDF data
+        session_id = str(uuid.uuid4())
+        fake_pdf = b"%PDF-1.4 fake"
+        _sessions[session_id] = {
+            "id": session_id,
+            "name": "Test",
+            "status": "uploaded",
+            "pdf_data": fake_pdf,
+            "pdf_page_count": 2,
+            "vocabulary": [],
+        }
+
+        # Mock the pipeline to return vocab entries
+        mock_pipeline.return_value = [
+            {
+                "id": str(uuid.uuid4()),
+                "english": "to achieve",
+                "german": "erreichen",
+                "example_sentence": "She achieved her goal.",
+                "source_page": 1,
+            },
+            {
+                "id": str(uuid.uuid4()),
+                "english": "goal",
+                "german": "Ziel",
+                "example_sentence": "",
+                "source_page": 1,
+            },
+        ]
+
+        # Stub the PDF-to-image conversion; AsyncMock is used so the patched
+        # callable can be awaited (presumably the endpoint awaits it).
+        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
+            mock_convert.return_value = b"fake-png-data"
+            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert data["vocabulary_count"] == 2
+        assert data["vocabulary"][0]["english"] == "to achieve"
+        # NOTE(review): source_page == 1 is the value set in the mocked
+        # pipeline result above, so this only checks it is passed through.
+        assert data["vocabulary"][0]["source_page"] == 1
+
+        # Verify pipeline was called with correct args: the stubbed image
+        # bytes, the page number from the URL (0), and the session id.
+        mock_pipeline.assert_called_once_with(b"fake-png-data", 0, session_id)
+
+    # Same patch layout as above: the flag patch injects nothing, the
+    # pipeline patch supplies `mock_pipeline`.
+    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
+    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
+    def test_process_single_page_ocr_pipeline_error_returns_failure(self, mock_pipeline, client):
+        """When the OCR pipeline raises an exception, return success=False."""
+        session_id = str(uuid.uuid4())
+        _sessions[session_id] = {
+            "id": session_id,
+            "name": "Test",
+            "status": "uploaded",
+            "pdf_data": b"%PDF-1.4 fake",
+            "pdf_page_count": 1,
+            "vocabulary": [],
+        }
+
+        # Make the mocked pipeline raise when it is called.
+        mock_pipeline.side_effect = ValueError("Column detection failed")
+
+        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
+            mock_convert.return_value = b"fake-png-data"
+            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")
+
+        # The failure is expected in the JSON body with HTTP 200, not as an
+        # HTTP error status.
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is False
+        assert "OCR pipeline error" in data["error"]
+        assert data["vocabulary"] == []
+
+    # Here the flag is patched to False and the LLM extractor is replaced
+    # with an AsyncMock, which is injected as `mock_llm_extract`.
+    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", False)
+    @patch("vocab_worksheet_api.extract_vocabulary_from_image", new_callable=AsyncMock)
+    def test_process_single_page_fallback_to_llm(self, mock_llm_extract, client):
+        """When OCR pipeline is not available, fall back to LLM vision."""
+        session_id = str(uuid.uuid4())
+        _sessions[session_id] = {
+            "id": session_id,
+            "name": "Test",
+            "status": "uploaded",
+            "pdf_data": b"%PDF-1.4 fake",
+            "pdf_page_count": 1,
+            "vocabulary": [],
+        }
+
+        # Stand-in for an extracted entry; only `.dict()` is configured
+        # (presumably the endpoint serializes entries via `.dict()` --
+        # TODO confirm).
+        mock_entry = MagicMock()
+        mock_entry.dict.return_value = {
+            "id": str(uuid.uuid4()),
+            "english": "house",
+            "german": "Haus",
+            "example_sentence": "",
+        }
+        # Three-element result; presumably (entries, confidence, error) --
+        # verify against extract_vocabulary_from_image.
+        mock_llm_extract.return_value = ([mock_entry], 0.85, None)
+
+        with patch("vocab_worksheet_api.convert_pdf_page_to_image", new_callable=AsyncMock) as mock_convert:
+            mock_convert.return_value = b"fake-png-data"
+            response = client.post(f"/api/v1/vocab/sessions/{session_id}/process-single-page/0")
+
+        # NOTE(review): this test does not assert that mock_llm_extract was
+        # called; it only checks the response contents.
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert data["vocabulary_count"] == 1
+        assert data["vocabulary"][0]["english"] == "house"
+
+
 # =============================================
 # RUN TESTS
 # =============================================