From 03fa186fec1bfac94ade49b8cdcc432db2009ea2 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 27 Feb 2026 20:19:09 +0100
Subject: [PATCH] fix(ocr-pipeline): increase merge distance to 6% for better column merging

Sub-alignments within a column (indented words, etc.) were 60-90px apart
and not getting merged at 3%. On a typical 5-col page (~1500px), 6% = ~90px
merges sub-alignments while keeping real column boundaries (~300px) separate.

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 4069a1e..b1820bd 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1010,8 +1010,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
         logger.info("ColumnGeometry: < 3 clusters after verticality filter, signaling fallback")
         return None
 
-    # --- Merge clusters that are very close (3% of content width) ---
-    merge_distance = max(20, int(content_w * 0.03))
+    # --- Merge clusters that are very close ---
+    # 6% of content width: on a typical 5-col vocab page (~1500px wide),
+    # this is ~90px, which merges sub-alignments within a single column
+    # while keeping real column boundaries (~300px apart) separate.
+    merge_distance = max(30, int(content_w * 0.06))
     merged = [significant[0].copy()]
     for s in significant[1:]:
         if s['mean_x'] - merged[-1]['mean_x'] < merge_distance: