feat(ocr-pipeline): generic sub-column detection via left-edge clustering
Detect hidden sub-columns (e.g. page references like "p.59") within already-recognized columns by clustering word left-edge positions and splitting when a clear minority cluster exists. The sub-column is then classified as page_ref and mapped to VocabRow.source_page.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -140,6 +140,7 @@ class VocabRow:
|
||||
english: str = ""
|
||||
german: str = ""
|
||||
example: str = ""
|
||||
source_page: str = ""
|
||||
confidence: float = 0.0
|
||||
y_position: int = 0
|
||||
|
||||
@@ -1033,6 +1034,147 @@ def _detect_columns_by_clustering(
|
||||
)
|
||||
|
||||
|
||||
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge clustering.

    Detects cases where a minority of words in a column are left-aligned at a
    different position than the majority (e.g. page references "p.59" next to
    vocabulary words).

    Args:
        geometries: Detected column geometries, each carrying its word dicts
            (OCR word dicts with at least a 'left' key and optionally 'conf').
        content_w: Width of the page content area in pixels; used to compute
            the width_ratio of any new sub-columns.

    Returns:
        A new list of ColumnGeometry — potentially longer than the input.
        NOTE: all returned geometries (including unsplit input objects) have
        their ``index`` attribute rewritten in place to left-to-right order.
    """
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words; narrow or
        # sparse columns cannot meaningfully hide a sub-column.
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words only — low-confidence boxes
        # are too noisy to drive the clustering decision.
        left_edges: List[int] = [
            w['left'] for w in geo.words if w.get('conf', 0) >= 30
        ]
        if len(left_edges) < 3:
            result.append(geo)
            continue

        # Find the largest gap between consecutive left-edge values; the
        # midpoint of that gap becomes the candidate split position.
        sorted_edges = sorted(left_edges)
        best_gap = 0
        best_gap_pos = 0  # split point: words with left <= best_gap_pos go left
        for lo, hi in zip(sorted_edges, sorted_edges[1:]):
            gap = hi - lo
            if gap > best_gap:
                best_gap = gap
                best_gap_pos = (lo + hi) // 2

        # Gap must be significant relative to column width, so ordinary
        # ragged alignment does not trigger a split.
        min_gap = max(15, int(geo.width * 0.08))
        if best_gap < min_gap:
            result.append(geo)
            continue

        # Partition ALL words (any confidence) by position in a single pass so
        # each group keeps the original reading order of geo.words.
        # (The previous two-pass approach — confident words first, then
        # low-confidence words appended afterwards — scrambled word order
        # inside each group and thus downstream text assembly.)
        left_words: List[Dict] = []
        right_words: List[Dict] = []
        for w in geo.words:
            if w['left'] <= best_gap_pos:
                left_words.append(w)
            else:
                right_words.append(w)

        total = len(left_words) + len(right_words)
        if total == 0:
            result.append(geo)
            continue

        # The smaller group is the sub-column candidate ("minority").
        if len(left_words) <= len(right_words):
            minority, majority = left_words, right_words
            minority_is_left = True
        else:
            minority, majority = right_words, left_words
            minority_is_left = False

        # A genuine sub-column is a clear minority (< 35% of the words) but
        # still needs at least 2 words to rule out stray noise.
        minority_ratio = len(minority) / total
        if minority_ratio >= 0.35 or len(minority) < 2:
            result.append(geo)
            continue

        # Derive pixel extents of the two sub-columns from the split point.
        if minority_is_left:
            # Minority is left sub-column, majority is right
            sub_x = geo.x
            sub_width = best_gap_pos - geo.x
            main_x = best_gap_pos
            main_width = (geo.x + geo.width) - best_gap_pos
        else:
            # Minority is right sub-column, majority is left
            main_x = geo.x
            main_width = best_gap_pos - geo.x
            sub_x = best_gap_pos
            sub_width = (geo.x + geo.width) - best_gap_pos

        # Sanity check: a degenerate width means the split point fell outside
        # the column box — keep the column unsplit.
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
            index=0,  # will be re-indexed below
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(minority),
            words=minority,
            width_ratio=sub_width / content_w,  # content_w > 0 guaranteed above
        )
        main_geo = ColumnGeometry(
            index=0,  # will be re-indexed below
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(majority),
            words=majority,
            width_ratio=main_width / content_w,
        )

        # Insert in left-to-right order
        if sub_x < main_x:
            result.append(sub_geo)
            result.append(main_geo)
        else:
            result.append(main_geo)
            result.append(sub_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at gap={best_gap}px, "
            f"minority={len(minority)} words (left={minority_is_left}), "
            f"majority={len(majority)} words"
        )

    # Re-index all geometries by left-to-right order.
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
|
||||
|
||||
|
||||
def _build_geometries_from_starts(
|
||||
col_starts: List[Tuple[int, int]],
|
||||
word_dicts: List[Dict],
|
||||
@@ -2727,6 +2869,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
|
||||
content_w = right_x - left_x
|
||||
|
||||
# Split sub-columns (e.g. page references) before classification
|
||||
geometries = _detect_sub_columns(geometries, content_w)
|
||||
|
||||
# Phase B: Content-based classification
|
||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||
left_x=left_x, right_x=right_x, inv=_inv)
|
||||
@@ -3841,7 +3986,7 @@ def build_cell_grid(
|
||||
return [], []
|
||||
|
||||
# Use columns only — skip ignore, header, footer, page_ref
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
|
||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||
if not relevant_cols:
|
||||
logger.warning("build_cell_grid: no usable columns found")
|
||||
@@ -4003,7 +4148,7 @@ def build_cell_grid_streaming(
|
||||
if not content_rows:
|
||||
return
|
||||
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
|
||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||
if not relevant_cols:
|
||||
return
|
||||
@@ -4055,11 +4200,13 @@ def _cells_to_vocab_entries(
|
||||
'column_en': 'english',
|
||||
'column_de': 'german',
|
||||
'column_example': 'example',
|
||||
'page_ref': 'source_page',
|
||||
}
|
||||
bbox_key_map = {
|
||||
'column_en': 'bbox_en',
|
||||
'column_de': 'bbox_de',
|
||||
'column_example': 'bbox_ex',
|
||||
'page_ref': 'bbox_ref',
|
||||
}
|
||||
|
||||
# Group cells by row_index
|
||||
@@ -4076,11 +4223,13 @@ def _cells_to_vocab_entries(
|
||||
'english': '',
|
||||
'german': '',
|
||||
'example': '',
|
||||
'source_page': '',
|
||||
'confidence': 0.0,
|
||||
'bbox': None,
|
||||
'bbox_en': None,
|
||||
'bbox_de': None,
|
||||
'bbox_ex': None,
|
||||
'bbox_ref': None,
|
||||
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user