feat(ocr-pipeline): word-based 5-column detection for vocabulary pages

Replace projection-profile layout analysis with clustering of Tesseract
word bounding boxes to detect 5-column vocabulary layouts (page_ref, EN,
DE, markers, examples). Falls back to projection profiles when there are
fewer than 3 clusters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-26 23:08:14 +01:00
parent aa06ae0f61
commit cf27a95308
4 changed files with 235 additions and 13 deletions

View File

@@ -15,6 +15,8 @@ const TYPE_COLORS: Record<string, string> = {
column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400',
column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400',
column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400',
page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400',
column_marker: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
footer: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
}
@@ -23,6 +25,8 @@ const TYPE_LABELS: Record<string, string> = {
column_en: 'EN',
column_de: 'DE',
column_example: 'Beispiel',
page_ref: 'Seite',
column_marker: 'Marker',
header: 'Header',
footer: 'Footer',
}
@@ -32,8 +36,8 @@ export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, i
if (!columnResult) return null
const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column'))
const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column'))
const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column') || c.type === 'page_ref')
const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column') && c.type !== 'page_ref')
const handleGt = (isCorrect: boolean) => {
onGroundTruth({ is_correct: isCorrect })