diag: add row-clustering logging for small/box zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Logs word positions, median height, Y tolerance, and resulting rows for zones with <= 30 words to diagnose row-merging issues.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -365,6 +365,29 @@ def _build_zone_grid(
|
||||
|
||||
# Cluster rows first (needed for column alignment analysis)
|
||||
rows = _cluster_rows(zone_words)
|
||||
|
||||
# Diagnostic logging for small zones (box zones typically)
|
||||
if len(zone_words) <= 30:
|
||||
import statistics as _st
|
||||
_heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
|
||||
_med_h = _st.median(_heights) if _heights else 20
|
||||
_y_tol = max(_med_h * 0.5, 5)
|
||||
logger.info(
|
||||
"zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f → %d rows",
|
||||
zone_index, len(zone_words), _med_h, _y_tol, len(rows),
|
||||
)
|
||||
for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
|
||||
logger.info(
|
||||
" zone %d word: y=%d x=%d h=%d w=%d '%s'",
|
||||
zone_index, w['top'], w['left'], w['height'], w['width'],
|
||||
w.get('text', '')[:40],
|
||||
)
|
||||
for r in rows:
|
||||
logger.info(
|
||||
" zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
|
||||
zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
|
||||
)
|
||||
|
||||
# Use global columns if provided, otherwise detect per zone
|
||||
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user