feat(ocr): Add Grid Detection v4 tests, docs, and SBOM update
- Add comprehensive tests for grid_detection_service.py (31 tests)
  - mm coordinate conversion tests
  - Deskew calculation tests
  - Column detection tests
  - Integration tests for vocabulary tables
- Add OCR-Compare documentation (OCR-Compare.md)
  - mm coordinate system documentation
  - Deskew correction documentation
  - Worksheet Editor integration guide
  - API endpoints documentation
- Add TypeScript tests for ocr-integration.ts
  - mm to pixel conversion tests
  - OCR export format tests
  - localStorage operations tests
- Update SBOM to v1.5.0
  - Add OCR Grid Detection System section
  - Document Fabric.js (MIT) for Worksheet Editor
  - Document NumPy and OpenCV usage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
466
studio-v2/lib/worksheet-editor/ocr-integration.test.ts
Normal file
466
studio-v2/lib/worksheet-editor/ocr-integration.test.ts
Normal file
@@ -0,0 +1,466 @@
|
||||
/**
|
||||
* Tests for OCR Integration Utility
|
||||
*
|
||||
* Tests cover:
|
||||
* - mm to pixel conversion
|
||||
* - OCR data export format
|
||||
* - LocalStorage operations
|
||||
* - Canvas integration
|
||||
*/
|
||||
|
||||
import {
|
||||
MM_TO_PX,
|
||||
A4_WIDTH_MM,
|
||||
A4_HEIGHT_MM,
|
||||
A4_WIDTH_PX,
|
||||
A4_HEIGHT_PX,
|
||||
mmToPixel,
|
||||
pixelToMm,
|
||||
getColumnColor,
|
||||
createTextProps,
|
||||
exportOCRData,
|
||||
saveOCRExportToStorage,
|
||||
loadLatestOCRExport,
|
||||
loadOCRExport,
|
||||
clearOCRExports,
|
||||
type OCRWord,
|
||||
type OCRExportData,
|
||||
type ColumnType,
|
||||
} from './ocr-integration'
|
||||
|
||||
/**
 * In-memory stand-in for `window.localStorage`.
 *
 * The Storage-style methods are `jest.fn` wrappers so tests can assert on the
 * calls made by the code under test and, where needed, swap implementations.
 */
const localStorageMock = (() => {
  // A Map (rather than a plain object) keeps inherited names such as
  // 'toString' from resolving to prototype members and lets an empty-string
  // value be told apart from a missing key.
  const store = new Map<string, string>()

  return {
    // Like Storage.getItem: null only when the key is absent; '' stays ''.
    getItem: jest.fn((key: string): string | null => store.get(key) ?? null),
    setItem: jest.fn((key: string, value: string): void => {
      store.set(key, value)
    }),
    removeItem: jest.fn((key: string): void => {
      store.delete(key)
    }),
    clear: jest.fn((): void => {
      store.clear()
    }),
    // Test-only helper (not part of the Storage API): lists the stored keys.
    keys: (): string[] => Array.from(store.keys()),
  }
})()

// Install the mock as window.localStorage for the code under test.
Object.defineProperty(window, 'localStorage', { value: localStorageMock })
|
||||
|
||||
// Checks on the exported conversion factor and A4 page-size constants.
describe('Constants', () => {
  test('MM_TO_PX is correct for 96 DPI', () => {
    // 96 px per inch over 25.4 mm per inch gives 3.7795275591 px per mm.
    const expectedPxPerMm = 3.7795275591

    expect(MM_TO_PX).toBeCloseTo(expectedPxPerMm, 8)
  })

  test('A4 dimensions in mm are correct', () => {
    const widthMm = 210
    const heightMm = 297

    expect(A4_WIDTH_MM).toBe(widthMm)
    expect(A4_HEIGHT_MM).toBe(heightMm)
  })

  test('A4 dimensions in pixels are calculated correctly', () => {
    // Rounded to whole pixels: roughly 794 x 1123.
    const expectedWidthPx = Math.round(210 * MM_TO_PX)
    const expectedHeightPx = Math.round(297 * MM_TO_PX)

    expect(A4_WIDTH_PX).toBe(expectedWidthPx)
    expect(A4_HEIGHT_PX).toBe(expectedHeightPx)
  })
})
|
||||
|
||||
// Millimetre -> pixel conversion at the 96 DPI factor checked above.
describe('mmToPixel', () => {
  test('converts 0mm to 0px', () => {
    const px = mmToPixel(0)

    expect(px).toBe(0)
  })

  test('converts 1mm correctly', () => {
    const px = mmToPixel(1)

    expect(px).toBeCloseTo(3.7795275591, 8)
  })

  test('converts 100mm correctly', () => {
    const px = mmToPixel(100)

    expect(px).toBeCloseTo(377.95275591, 6)
  })

  test('converts A4 width correctly', () => {
    const px = mmToPixel(210)

    expect(px).toBeCloseTo(793.7, 1)
  })
})
|
||||
|
||||
// Pixel -> millimetre conversion, including a round trip through mmToPixel.
describe('pixelToMm', () => {
  test('converts 0px to 0mm', () => {
    const mm = pixelToMm(0)

    expect(mm).toBe(0)
  })

  test('converts 100px correctly', () => {
    const mm = pixelToMm(100)

    expect(mm).toBeCloseTo(26.458, 2)
  })

  test('round-trip conversion is accurate', () => {
    const startMm = 50

    // mm -> px -> mm should land back on the starting value.
    const roundTripped = pixelToMm(mmToPixel(startMm))

    expect(roundTripped).toBeCloseTo(startMm, 8)
  })
})
|
||||
|
||||
// Default colour per column type, plus override through the options argument.
describe('getColumnColor', () => {
  test('returns blue for english column', () => {
    const color = getColumnColor('english')

    expect(color).toBe('#1e40af')
  })

  test('returns green for german column', () => {
    const color = getColumnColor('german')

    expect(color).toBe('#166534')
  })

  test('returns purple for example column', () => {
    const color = getColumnColor('example')

    expect(color).toBe('#6b21a8')
  })

  test('returns gray for unknown column', () => {
    const color = getColumnColor('unknown')

    expect(color).toBe('#374151')
  })

  test('uses custom colors from options', () => {
    const customColors = { englishColor: '#ff0000' }

    const color = getColumnColor('english', customColors)

    expect(color).toBe('#ff0000')
  })
})
|
||||
|
||||
/**
 * createTextProps: turns an OCR word (mm coordinates) into text-object
 * properties (pixel coordinates, colour, font settings and OCR metadata).
 */
describe('createTextProps', () => {
  // One recognised English word, positioned in millimetres.
  const mockWord: OCRWord = {
    text: 'house',
    x_mm: 21.0,
    y_mm: 44.55,
    width_mm: 52.5,
    height_mm: 8.91,
    column_type: 'english',
    logical_row: 0,
  }

  test('creates correct type', () => {
    const props = createTextProps(mockWord)
    expect(props.type).toBe('i-text')
  })

  test('converts mm to pixels for left position', () => {
    const props = createTextProps(mockWord)
    expect(props.left).toBeCloseTo(21.0 * MM_TO_PX, 2)
  })

  test('converts mm to pixels for top position', () => {
    const props = createTextProps(mockWord)
    expect(props.top).toBeCloseTo(44.55 * MM_TO_PX, 2)
  })

  test('applies offset correctly', () => {
    // Expected values treat the offsets as millimetres added before conversion.
    const props = createTextProps(mockWord, { offsetX: 5, offsetY: 10 })
    expect(props.left).toBeCloseTo((21.0 + 5) * MM_TO_PX, 2)
    expect(props.top).toBeCloseTo((44.55 + 10) * MM_TO_PX, 2)
  })

  test('sets fill color based on column type', () => {
    const props = createTextProps(mockWord)
    expect(props.fill).toBe('#1e40af') // English blue
  })

  test('includes OCR metadata', () => {
    const props = createTextProps(mockWord)
    expect(props.ocrMetadata).toBeDefined()
    // A partial structural match checks the same three fields without
    // casting the metadata to `any`.
    expect(props.ocrMetadata).toMatchObject({
      x_mm: 21.0,
      column_type: 'english',
      logical_row: 0,
    })
  })

  test('uses custom font family', () => {
    const props = createTextProps(mockWord, { fontFamily: 'Times New Roman' })
    expect(props.fontFamily).toBe('Times New Roman')
  })

  test('uses custom font size', () => {
    const props = createTextProps(mockWord, { fontSize: 16 })
    expect(props.fontSize).toBe(16)
  })
})
|
||||
|
||||
// exportOCRData: converts grid-detection output into the OCR export payload.
describe('exportOCRData', () => {
  // A one-row grid with an English and a German cell, on an A4 page.
  const gridFixture = {
    cells: [
      [
        {
          text: 'house',
          x_mm: 21.0,
          y_mm: 44.55,
          width_mm: 52.5,
          height_mm: 8.91,
          column_type: 'english' as ColumnType,
          logical_row: 0,
          status: 'recognized',
        },
        {
          text: 'Haus',
          x_mm: 80.0,
          y_mm: 44.55,
          width_mm: 40.0,
          height_mm: 8.91,
          column_type: 'german' as ColumnType,
          logical_row: 0,
          status: 'recognized',
        },
      ],
    ],
    detected_columns: [
      { column_type: 'english', x_start_mm: 20.0, x_end_mm: 73.5 },
      { column_type: 'german', x_start_mm: 74.0, x_end_mm: 140.0 },
    ],
    page_dimensions: {
      width_mm: 210,
      height_mm: 297,
      format: 'A4',
    },
  }

  // Exports the shared fixture with the session/page used throughout.
  const exportFixture = () => exportOCRData(gridFixture, 'session-123', 1)

  test('creates correct version', () => {
    expect(exportFixture().version).toBe('1.0')
  })

  test('sets correct source', () => {
    expect(exportFixture().source).toBe('ocr-compare')
  })

  test('includes session ID and page number', () => {
    const exported = exportFixture()

    expect(exported.session_id).toBe('session-123')
    expect(exported.page_number).toBe(1)
  })

  test('includes page dimensions', () => {
    const { page_dimensions: page } = exportFixture()

    expect(page.width_mm).toBe(210)
    expect(page.height_mm).toBe(297)
    expect(page.format).toBe('A4')
  })

  test('converts cells to words', () => {
    const { words } = exportFixture()

    expect(words).toHaveLength(2)
    expect(words[0].text).toBe('house')
    expect(words[0].column_type).toBe('english')
  })

  test('filters empty cells', () => {
    // Same row as the fixture plus one trailing cell without text.
    const rowWithBlank = [
      ...gridFixture.cells[0],
      { text: '', status: 'empty' }, // Empty cell
    ]
    const dataWithEmpty = { ...gridFixture, cells: [rowWithBlank] }

    const exported = exportOCRData(dataWithEmpty, 'session-123', 1)

    expect(exported.words).toHaveLength(2) // Empty cell excluded
  })

  test('includes detected columns', () => {
    const { detected_columns: columns } = exportFixture()

    expect(columns).toHaveLength(2)
    expect(columns[0].column_type).toBe('english')
  })

  test('sets exported_at timestamp', () => {
    // Bracket the call with two ISO timestamps; the comparisons below are
    // plain string comparisons against those bounds.
    const lowerBound = new Date().toISOString()
    const exported = exportFixture()
    const upperBound = new Date().toISOString()

    expect(exported.exported_at >= lowerBound).toBe(true)
    expect(exported.exported_at <= upperBound).toBe(true)
  })
})
|
||||
|
||||
/**
 * Persistence helpers: saving, loading and clearing OCR exports in
 * localStorage.
 *
 * Tests seed the in-memory mock store through `setItem` rather than replacing
 * `getItem`'s implementation, and every temporary override is undone, so no
 * test can influence the ones that run after it.
 */
describe('localStorage operations', () => {
  // Keys these tests expect the helpers to use for session-123, page 1.
  const EXPORT_KEY = 'ocr_export_session-123_1'
  const LATEST_KEY = 'ocr_export_latest'

  beforeEach(() => {
    // Forget calls recorded by earlier tests so toHaveBeenCalledWith only
    // matches calls made by the test that is currently running. This clears
    // call history only; mock implementations are left in place.
    jest.clearAllMocks()
    localStorageMock.clear()
  })

  const mockExportData: OCRExportData = {
    version: '1.0',
    source: 'ocr-compare',
    exported_at: '2026-02-08T12:00:00Z',
    session_id: 'session-123',
    page_number: 1,
    page_dimensions: {
      width_mm: 210,
      height_mm: 297,
      format: 'A4',
    },
    words: [
      {
        text: 'house',
        x_mm: 21.0,
        y_mm: 44.55,
        width_mm: 52.5,
        height_mm: 8.91,
        column_type: 'english',
        logical_row: 0,
      },
    ],
    detected_columns: [],
  }

  describe('saveOCRExportToStorage', () => {
    test('saves data to localStorage', () => {
      saveOCRExportToStorage(mockExportData)

      expect(localStorageMock.setItem).toHaveBeenCalledWith(
        EXPORT_KEY,
        expect.any(String)
      )
    })

    test('sets latest export key', () => {
      saveOCRExportToStorage(mockExportData)

      expect(localStorageMock.setItem).toHaveBeenCalledWith(
        LATEST_KEY,
        EXPORT_KEY
      )
    })
  })

  describe('loadLatestOCRExport', () => {
    test('returns null when no export exists', () => {
      const result = loadLatestOCRExport()
      expect(result).toBeNull()
    })

    test('loads latest export data', () => {
      // Seed the store: the export itself plus the pointer to it.
      localStorageMock.setItem(EXPORT_KEY, JSON.stringify(mockExportData))
      localStorageMock.setItem(LATEST_KEY, EXPORT_KEY)

      const result = loadLatestOCRExport()
      expect(result).not.toBeNull()
      expect(result?.session_id).toBe('session-123')
    })
  })

  describe('loadOCRExport', () => {
    test('returns null for non-existent session', () => {
      const result = loadOCRExport('nonexistent', 1)
      expect(result).toBeNull()
    })

    test('loads specific export by session and page', () => {
      localStorageMock.setItem(EXPORT_KEY, JSON.stringify(mockExportData))

      const result = loadOCRExport('session-123', 1)
      expect(result).not.toBeNull()
      expect(result?.page_number).toBe(1)
    })

    test('handles JSON parse errors gracefully', () => {
      // Store a value that is not valid JSON under the expected key.
      localStorageMock.setItem(EXPORT_KEY, 'invalid json')

      const result = loadOCRExport('session-123', 1)
      expect(result).toBeNull()
    })
  })

  describe('clearOCRExports', () => {
    test('removes all OCR export keys', () => {
      const storedKeys = [
        'ocr_export_session-1_1',
        'ocr_export_session-2_1',
        'ocr_export_latest',
        'other_key',
      ]

      // Expose the keys through the mock's own keys() helper...
      const originalMockKeys = localStorageMock.keys
      Object.defineProperty(localStorageMock, 'keys', {
        value: () => storedKeys.slice(),
      })

      // ...and through Object.keys(localStorage); every other object is
      // delegated to the real implementation.
      const originalKeys = Object.keys
      Object.keys = jest.fn((obj: object) =>
        obj === localStorage ? storedKeys.slice() : originalKeys(obj)
      )

      try {
        clearOCRExports()
      } finally {
        // Always undo the global patch, even if clearOCRExports throws;
        // a patched Object.keys would otherwise affect every later test.
        Object.keys = originalKeys
        Object.defineProperty(localStorageMock, 'keys', {
          value: originalMockKeys,
        })
      }

      expect(localStorageMock.removeItem).toHaveBeenCalledWith(
        'ocr_export_session-1_1'
      )
      expect(localStorageMock.removeItem).toHaveBeenCalledWith(
        'ocr_export_session-2_1'
      )
      expect(localStorageMock.removeItem).toHaveBeenCalledWith(
        'ocr_export_latest'
      )
    })
  })
})
|
||||
|
||||
// Boundary inputs: negative and very large lengths, a minimal word, an empty grid.
describe('Edge Cases', () => {
  test('handles negative mm values', () => {
    const px = mmToPixel(-10)

    expect(px).toBeCloseTo(-37.795, 2)
  })

  test('handles very large mm values', () => {
    const px = mmToPixel(10000)

    expect(px).toBeCloseTo(37795.275591, 2)
  })

  test('handles word with missing optional fields', () => {
    // Only the fields set here are supplied; the column type is 'unknown'.
    const minimalWord: OCRWord = {
      text: 'test',
      x_mm: 0,
      y_mm: 0,
      width_mm: 10,
      height_mm: 5,
      column_type: 'unknown',
      logical_row: 0,
    }

    const textProps = createTextProps(minimalWord)

    expect(textProps).toBeDefined()
    expect(textProps.text).toBe('test')
  })

  test('handles empty words array in export', () => {
    const emptyGrid = {
      cells: [],
      detected_columns: [],
      page_dimensions: { width_mm: 210, height_mm: 297, format: 'A4' },
    }

    const exported = exportOCRData(emptyGrid, 'session', 1)

    expect(exported.words).toHaveLength(0)
  })
})
|
||||
Reference in New Issue
Block a user