feat(ocr-pipeline): word-based 5-column detection for vocabulary pages
Replace projection-profile layout analysis with Tesseract word bounding box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when < 3 clusters. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -64,7 +64,7 @@ export interface DewarpGroundTruth {
|
||||
}
|
||||
|
||||
export interface PageRegion {
|
||||
type: 'column_en' | 'column_de' | 'column_example' | 'header' | 'footer'
|
||||
type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' | 'column_marker' | 'header' | 'footer'
|
||||
x: number
|
||||
y: number
|
||||
width: number
|
||||
|
||||
Reference in New Issue
Block a user