feat(ocr-pipeline): word-based 5-column detection for vocabulary pages

Replace projection-profile layout analysis with clustering of Tesseract
word bounding boxes to detect 5-column vocabulary layouts (page_ref, EN,
DE, markers, examples). Falls back to projection profiles when fewer than
3 clusters are found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-26 23:08:14 +01:00
parent aa06ae0f61
commit cf27a95308
4 changed files with 235 additions and 13 deletions

View File

@@ -64,7 +64,7 @@ export interface DewarpGroundTruth {
}
export interface PageRegion {
type: 'column_en' | 'column_de' | 'column_example' | 'header' | 'footer'
type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' | 'column_marker' | 'header' | 'footer'
x: number
y: number
width: number