feat: use OCR pipeline instead of LLM vision for vocab worksheet extraction
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s

process-single-page now runs the full CV pipeline (deskew → dewarp → columns →
rows → cell-first OCR v2 → LLM review) for much better extraction quality.
Falls back to LLM vision if pipeline imports are unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-05 15:35:44 +01:00
parent 9ea77ba157
commit b7ae36e92b
2 changed files with 400 additions and 29 deletions

View File

@@ -615,6 +615,121 @@ class TestEdgeCases:
assert len(response.json()) == 5
# =============================================
# OCR PIPELINE INTEGRATION TESTS
# =============================================
class TestProcessSinglePageOCRPipeline:
    """Tests for the OCR pipeline integration in process-single-page.

    Covers three paths of the endpoint: the OCR pipeline being available
    and succeeding, the OCR pipeline raising, and the fallback to LLM
    vision when ``OCR_PIPELINE_AVAILABLE`` is patched to ``False``.

    Every test registers its own session in the module-level ``_sessions``
    store and removes it again in a ``finally`` clause so that no state
    leaks into tests that run afterwards.
    """

    @staticmethod
    def _register_session(pdf_page_count):
        """Insert a minimal "uploaded" session into ``_sessions``.

        Args:
            pdf_page_count: Value stored under the ``pdf_page_count`` key.

        Returns:
            The freshly generated session id (a UUID4 string).
        """
        session_id = str(uuid.uuid4())
        _sessions[session_id] = {
            "id": session_id,
            "name": "Test",
            "status": "uploaded",
            "pdf_data": b"%PDF-1.4 fake",
            "pdf_page_count": pdf_page_count,
            "vocabulary": [],
        }
        return session_id

    @staticmethod
    def _post_first_page(client, session_id):
        """POST process-single-page for page 0 with PDF rendering stubbed.

        ``convert_pdf_page_to_image`` is replaced by an ``AsyncMock`` that
        yields ``b"fake-png-data"``, so the fake PDF bytes are never parsed.

        Returns:
            The response object produced by ``client.post``.
        """
        with patch(
            "vocab_worksheet_api.convert_pdf_page_to_image",
            new_callable=AsyncMock,
        ) as mock_convert:
            mock_convert.return_value = b"fake-png-data"
            return client.post(
                f"/api/v1/vocab/sessions/{session_id}/process-single-page/0"
            )

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_uses_ocr_pipeline(self, mock_pipeline, client):
        """When OCR pipeline is available, process-single-page should use it."""
        session_id = self._register_session(pdf_page_count=2)
        try:
            # Mock the pipeline to return vocab entries
            mock_pipeline.return_value = [
                {
                    "id": str(uuid.uuid4()),
                    "english": "to achieve",
                    "german": "erreichen",
                    "example_sentence": "She achieved her goal.",
                    "source_page": 1,
                },
                {
                    "id": str(uuid.uuid4()),
                    "english": "goal",
                    "german": "Ziel",
                    "example_sentence": "",
                    "source_page": 1,
                },
            ]

            response = self._post_first_page(client, session_id)

            assert response.status_code == 200
            data = response.json()
            assert data["success"] is True
            assert data["vocabulary_count"] == 2
            assert data["vocabulary"][0]["english"] == "to achieve"
            assert data["vocabulary"][0]["source_page"] == 1
            # Verify pipeline was called with correct args:
            # (rendered image bytes, page index from the URL, session id)
            mock_pipeline.assert_called_once_with(b"fake-png-data", 0, session_id)
        finally:
            # Do not leave the fake session behind in the shared store.
            _sessions.pop(session_id, None)

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", True)
    @patch("vocab_worksheet_api._run_ocr_pipeline_for_page")
    def test_process_single_page_ocr_pipeline_error_returns_failure(self, mock_pipeline, client):
        """When the OCR pipeline raises an exception, return success=False."""
        session_id = self._register_session(pdf_page_count=1)
        try:
            mock_pipeline.side_effect = ValueError("Column detection failed")

            response = self._post_first_page(client, session_id)

            # The endpoint reports the failure in the payload, not via HTTP status.
            assert response.status_code == 200
            data = response.json()
            assert data["success"] is False
            assert "OCR pipeline error" in data["error"]
            assert data["vocabulary"] == []
        finally:
            _sessions.pop(session_id, None)

    @patch("vocab_worksheet_api.OCR_PIPELINE_AVAILABLE", False)
    @patch("vocab_worksheet_api.extract_vocabulary_from_image", new_callable=AsyncMock)
    def test_process_single_page_fallback_to_llm(self, mock_llm_extract, client):
        """When OCR pipeline is not available, fall back to LLM vision."""
        session_id = self._register_session(pdf_page_count=1)
        try:
            # The extractor's entries are consumed through ``.dict()``.
            mock_entry = MagicMock()
            mock_entry.dict.return_value = {
                "id": str(uuid.uuid4()),
                "english": "house",
                "german": "Haus",
                "example_sentence": "",
            }
            mock_llm_extract.return_value = ([mock_entry], 0.85, None)

            response = self._post_first_page(client, session_id)

            assert response.status_code == 200
            data = response.json()
            assert data["success"] is True
            assert data["vocabulary_count"] == 1
            assert data["vocabulary"][0]["english"] == "house"
            # Make the fallback explicit: the LLM extractor must have run.
            mock_llm_extract.assert_awaited()
        finally:
            _sessions.pop(session_id, None)
# =============================================
# RUN TESTS
# =============================================