fix(ocr-pipeline): tune column detection based on GT comparison
Address 5 weaknesses found via ground-truth comparison on session df3548d1:

- Add column_ignore for edge columns with < 3 words (margin detection)
- Absorb tiny clusters (< 5% width) into neighbors post-merge
- Restrict page_ref to left 35% of content area across all 3 levels
- Loosen marker thresholds (width < 6%, words <= 15) and add a strong
  marker score for very narrow non-edge columns (< 4%)
- Add EN/DE position tiebreaker when language signals are both weak

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1031,6 +1031,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
else:
|
||||
merged.append(s.copy())
|
||||
|
||||
# --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors ---
|
||||
i = 0
|
||||
absorbed_count = 0
|
||||
while i < len(merged) and len(merged) > 3:
|
||||
if i + 1 < len(merged):
|
||||
cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x']
|
||||
else:
|
||||
cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x'])
|
||||
if cluster_w / content_w < 0.05:
|
||||
# Absorb into neighbor (prefer left)
|
||||
if i > 0:
|
||||
target = merged[i - 1]
|
||||
else:
|
||||
target = merged[i + 1]
|
||||
target['count'] += merged[i]['count']
|
||||
target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
|
||||
target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
|
||||
target['y_min'] = min(target['y_min'], merged[i]['y_min'])
|
||||
target['y_max'] = max(target['y_max'], merged[i]['y_max'])
|
||||
target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
|
||||
del merged[i]
|
||||
absorbed_count += 1
|
||||
else:
|
||||
i += 1
|
||||
if absorbed_count:
|
||||
logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
|
||||
|
||||
_merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged]
|
||||
logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}")
|
||||
|
||||
@@ -1157,11 +1184,14 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
|
||||
if digit_ratio > 0.4:
|
||||
scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
|
||||
|
||||
# Marker: very narrow + few short entries
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 10:
|
||||
# Marker: narrow + few short entries
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
scores['marker'] = 0.7
|
||||
if avg_word_len < 4:
|
||||
scores['marker'] = 0.9
|
||||
# Very narrow non-edge column → strong marker regardless of word count
|
||||
if geom.width_ratio < 0.04 and geom.index > 0:
|
||||
scores['marker'] = max(scores['marker'], 0.9)
|
||||
|
||||
# Sentence: longer words + punctuation present
|
||||
if geom.width_ratio > 0.15 and has_punctuation > 2:
|
||||
@@ -1213,6 +1243,39 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
)]
|
||||
|
||||
# --- Pre-filter: first/last columns with very few words → column_ignore ---
|
||||
ignore_regions = []
|
||||
active_geometries = []
|
||||
for idx, g in enumerate(geometries):
|
||||
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3:
|
||||
ignore_regions.append(PageRegion(
|
||||
type='column_ignore', x=g.x, y=g.y,
|
||||
width=g.width, height=content_h,
|
||||
classification_confidence=0.95,
|
||||
classification_method='content',
|
||||
))
|
||||
logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
|
||||
else:
|
||||
active_geometries.append(g)
|
||||
|
||||
# Re-index active geometries for classification
|
||||
for new_idx, g in enumerate(active_geometries):
|
||||
g.index = new_idx
|
||||
geometries = active_geometries
|
||||
|
||||
# Handle edge case: all columns ignored or only 1 left
|
||||
if len(geometries) == 0:
|
||||
return ignore_regions
|
||||
if len(geometries) == 1:
|
||||
geom = geometries[0]
|
||||
ignore_regions.append(PageRegion(
|
||||
type='column_text', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=geom.height,
|
||||
classification_confidence=0.9,
|
||||
classification_method='content',
|
||||
))
|
||||
return ignore_regions
|
||||
|
||||
# --- Score all columns ---
|
||||
lang_scores = [_score_language(g.words) for g in geometries]
|
||||
role_scores = [_score_role(g) for g in geometries]
|
||||
@@ -1227,20 +1290,20 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
||||
if regions is not None:
|
||||
logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
# --- Level 2: Position + language enhanced ---
|
||||
regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
|
||||
if regions is not None:
|
||||
logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
# --- Level 3: Pure position fallback (old code, no regression) ---
|
||||
logger.info("ClassifyColumns: Level 3 (position fallback)")
|
||||
regions = _classify_by_position_fallback(geometries, content_w, content_h)
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
|
||||
def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
@@ -1257,8 +1320,12 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
assigned = set()
|
||||
|
||||
# Step 1: Assign structural roles first (reference, marker)
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
|
||||
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12:
|
||||
is_left_side = geom.x < left_35_threshold
|
||||
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1266,7 +1333,7 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08:
|
||||
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1274,6 +1341,15 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
elif geom.width_ratio < 0.05 and not is_left_side:
|
||||
# Narrow column on the right side → marker, not page_ref
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
classification_confidence=0.8,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
|
||||
# Step 2: Among remaining columns, find EN and DE by language scores
|
||||
remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
|
||||
@@ -1296,6 +1372,47 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
|
||||
de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
|
||||
|
||||
# Position tiebreaker: when language signals are weak, use left=EN, right=DE
|
||||
if (not en_candidates or not de_candidates) and len(remaining) >= 2:
|
||||
max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
|
||||
max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
|
||||
if max_eng < 0.15 and max_deu < 0.15:
|
||||
# Both signals weak — fall back to positional: left=EN, right=DE
|
||||
sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
|
||||
best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
|
||||
best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
|
||||
logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
|
||||
en_conf = 0.4
|
||||
de_conf = 0.4
|
||||
|
||||
regions.append(PageRegion(
|
||||
type='column_en', x=best_en[1].x, y=best_en[1].y,
|
||||
width=best_en[1].width, height=content_h,
|
||||
classification_confidence=en_conf,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(best_en[0])
|
||||
|
||||
regions.append(PageRegion(
|
||||
type='column_de', x=best_de[1].x, y=best_de[1].y,
|
||||
width=best_de[1].width, height=content_h,
|
||||
classification_confidence=de_conf,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(best_de[0])
|
||||
|
||||
# Assign remaining as example
|
||||
for i, geom, ls, rs in remaining:
|
||||
if i not in assigned:
|
||||
regions.append(PageRegion(
|
||||
type='column_example', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
classification_confidence=0.4,
|
||||
classification_method='content',
|
||||
))
|
||||
regions.sort(key=lambda r: r.x)
|
||||
return regions
|
||||
|
||||
if not en_candidates or not de_candidates:
|
||||
# Language signals too weak for content-based classification
|
||||
logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
|
||||
@@ -1363,10 +1480,12 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
|
||||
"""
|
||||
regions = []
|
||||
untyped = list(range(len(geometries)))
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
# Rule 1: Leftmost narrow column → page_ref
|
||||
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
|
||||
g0 = geometries[0]
|
||||
if g0.width_ratio < 0.12:
|
||||
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=g0.x, y=g0.y,
|
||||
width=g0.width, height=content_h,
|
||||
@@ -1378,7 +1497,7 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
|
||||
# Rule 2: Narrow columns with few words → marker
|
||||
for i in list(untyped):
|
||||
geom = geometries[i]
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 10:
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1463,10 +1582,12 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
||||
"""
|
||||
regions = []
|
||||
untyped = list(range(len(geometries)))
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
# Rule 1: Leftmost narrow column → page_ref
|
||||
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
|
||||
g0 = geometries[0]
|
||||
if g0.width_ratio < 0.12:
|
||||
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=g0.x, y=g0.y,
|
||||
width=g0.width, height=content_h,
|
||||
@@ -1478,7 +1599,7 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
||||
# Rule 2: Narrow + few words → marker
|
||||
for i in list(untyped):
|
||||
geom = geometries[i]
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 8:
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
|
||||
Reference in New Issue
Block a user