feat(ocr-pipeline): generic sub-column detection via left-edge clustering

Detects hidden sub-columns (e.g. page references like "p.59") within
already-recognized columns by clustering the left-edge positions of
words and splitting a column when a clear minority cluster exists. The
sub-column is then classified as page_ref and mapped to
VocabRow.source_page.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 18:18:02 +01:00
parent 0532b2a797
commit 1a246eb059
3 changed files with 343 additions and 2 deletions

View File

@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
PageRegion,
RowGeometry,
_cells_to_vocab_entries,
_detect_sub_columns,
_fix_character_confusion,
_fix_phonetic_brackets,
analyze_layout,
@@ -698,6 +699,9 @@ async def detect_columns(session_id: str):
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
# Split sub-columns (e.g. page references) before classification
geometries = _detect_sub_columns(geometries, content_w)
# Phase B: Content-based classification
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
left_x=left_x, right_x=right_x, inv=inv)