feat: use OCR pipeline instead of LLM vision for vocab worksheet extraction
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s
process-single-page now runs the full CV pipeline (deskew → dewarp → columns → rows → cell-first OCR v2 → LLM review) for much better extraction quality. Falls back to LLM vision if pipeline imports are unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -615,6 +615,121 @@ class TestEdgeCases:
|
||||
assert len(response.json()) == 5
|
||||
|
||||
|
||||
# =============================================
|
||||
# OCR PIPELINE INTEGRATION TESTS
|
||||
# =============================================
|
||||
|
||||
class TestProcessSinglePageOCRPipeline:
    """Tests for the OCR pipeline integration in process-single-page.

    Covers three paths of the endpoint:
      * OCR pipeline available and succeeding,
      * OCR pipeline available but raising (graceful failure payload),
      * OCR pipeline unavailable (fallback to LLM vision extraction).
    """

    @staticmethod
    def _make_session(page_count: int) -> str:
        """Register a fake uploaded-PDF session in ``_sessions``; return its id.

        Centralizes the session dict that all three tests previously
        duplicated inline, so the fixture shape only has to change in
        one place.
        """
        session_id = str(uuid.uuid4())
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": b"%PDF-1.4 fake",
            "pdf_page_count": page_count,
            "vocabulary": [],
        }
        return session_id

    @staticmethod
    def _post_first_page(client, session_id: str):
        """POST process-single-page for page 0 with PDF rendering mocked.

        Patches ``convert_pdf_page_to_image`` (async) to return a fixed
        PNG payload so no real PDF rasterization happens; returns the
        HTTP response.
        """
        with patch(
            "vocab_worksheet_api.convert_pdf_page_to_image",
            new_callable=AsyncMock,
        ) as mock_convert:
            mock_convert.return_value = b"fake-png-data"
            return client.post(
                f"/api/v1/vocab/sessions/{session_id}/process-single-page/0"
            )

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_uses_ocr_pipeline(self, mock_pipeline, client):
        """When OCR pipeline is available, process-single-page should use it."""
        session_id = self._make_session(page_count=2)

        # Mock the pipeline to return vocab entries
        mock_pipeline.return_value = [
            {
                "id": str(uuid.uuid4()),
                "english": "to achieve",
                "german": "erreichen",
                "example_sentence": "She achieved her goal.",
                "source_page": 1,
            },
            {
                "id": str(uuid.uuid4()),
                "english": "goal",
                "german": "Ziel",
                "example_sentence": "",
                "source_page": 1,
            },
        ]

        response = self._post_first_page(client, session_id)

        assert response.status_code == 200
        data = response.json()
        assert data["success"] is True
        assert data["vocabulary_count"] == 2
        assert data["vocabulary"][0]["english"] == "to achieve"
        assert data["vocabulary"][0]["source_page"] == 1

        # Verify pipeline was called with correct args
        mock_pipeline.assert_called_once_with(b"fake-png-data", 0, session_id)

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_ocr_pipeline_error_returns_failure(
        self, mock_pipeline, client
    ):
        """When the OCR pipeline raises an exception, return success=False."""
        session_id = self._make_session(page_count=1)

        mock_pipeline.side_effect = ValueError("Column detection failed")

        response = self._post_first_page(client, session_id)

        # Endpoint reports the failure in the payload, not via HTTP status.
        assert response.status_code == 200
        data = response.json()
        assert data["success"] is False
        assert "OCR pipeline error" in data["error"]
        assert data["vocabulary"] == []

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", False)
    @patch(
        "vocab_worksheet_api.extract_vocabulary_from_image",
        new_callable=AsyncMock,
    )
    def test_process_single_page_fallback_to_llm(self, mock_llm_extract, client):
        """When OCR pipeline is not available, fall back to LLM vision."""
        session_id = self._make_session(page_count=1)

        # Fallback path returns pydantic-like entries; only .dict() is used.
        mock_entry = MagicMock()
        mock_entry.dict.return_value = {
            "id": str(uuid.uuid4()),
            "english": "house",
            "german": "Haus",
            "example_sentence": "",
        }
        # (entries, confidence, error) tuple shape of the LLM extractor.
        mock_llm_extract.return_value = ([mock_entry], 0.85, None)

        response = self._post_first_page(client, session_id)

        assert response.status_code == 200
        data = response.json()
        assert data["success"] is True
        assert data["vocabulary_count"] == 1
        assert data["vocabulary"][0]["english"] == "house"
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user