feat(ocr-pipeline): generic sub-column detection via left-edge clustering
Detects hidden sub-columns (e.g. page references like "p.59") within already-recognized columns by clustering word left-edge positions and splitting when a clear minority cluster exists. The sub-column is then classified as page_ref and mapped to VocabRow.source_page. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
_detect_sub_columns,
|
||||
_fix_character_confusion,
|
||||
_fix_phonetic_brackets,
|
||||
analyze_layout,
|
||||
@@ -698,6 +699,9 @@ async def detect_columns(session_id: str):
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
# Split sub-columns (e.g. page references) before classification
|
||||
geometries = _detect_sub_columns(geometries, content_w)
|
||||
|
||||
# Phase B: Content-based classification
|
||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||
left_x=left_x, right_x=right_x, inv=inv)
|
||||
|
||||
Reference in New Issue
Block a user