feat(ocr-pipeline): word-based 5-column detection for vocabulary pages
Replace projection-profile layout analysis with Tesseract word bounding-box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when fewer than 3 clusters are found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,8 @@ const TYPE_COLORS: Record<string, string> = {
|
||||
column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400',
|
||||
column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400',
|
||||
column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400',
|
||||
page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400',
|
||||
column_marker: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
|
||||
header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
||||
footer: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
||||
}
|
||||
@@ -23,6 +25,8 @@ const TYPE_LABELS: Record<string, string> = {
|
||||
column_en: 'EN',
|
||||
column_de: 'DE',
|
||||
column_example: 'Beispiel',
|
||||
page_ref: 'Seite',
|
||||
column_marker: 'Marker',
|
||||
header: 'Header',
|
||||
footer: 'Footer',
|
||||
}
|
||||
@@ -32,8 +36,8 @@ export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, i
|
||||
|
||||
if (!columnResult) return null
|
||||
|
||||
- const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column'))
|
||||
- const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column'))
|
||||
+ const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column') || c.type === 'page_ref')
|
||||
+ const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column') && c.type !== 'page_ref')
|
||||
|
||||
const handleGt = (isCorrect: boolean) => {
|
||||
onGroundTruth({ is_correct: isCorrect })
|
||||
|
||||
Reference in New Issue
Block a user