From 164b35c06ab988ceae2644634f26e29ff5b1b537 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 27 Feb 2026 23:33:11 +0100 Subject: [PATCH] fix(ocr-pipeline): tighten page_ref constraints based on live testing - Reduce left-side threshold from 35% to 20% of content width - Strong language signal (eng/deu > 0.3) now prevents page_ref assignment - Increase column_ignore word threshold from 3 to 8 for edge columns - Apply language guard to Level 1 and Level 2 classification Fixes: column with deu=0.921 was misclassified as page_ref because reference score check ran before language analysis. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 27 +++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 790e960..6be9209 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1247,7 +1247,7 @@ def classify_column_types(geometries: List[ColumnGeometry], ignore_regions = [] active_geometries = [] for idx, g in enumerate(geometries): - if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3: + if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8: ignore_regions.append(PageRegion( type='column_ignore', x=g.x, y=g.y, width=g.width, height=content_h, @@ -1320,12 +1320,13 @@ def _classify_by_content(geometries: List[ColumnGeometry], assigned = set() # Step 1: Assign structural roles first (reference, marker) - first_x = geometries[0].x if geometries else 0 - left_35_threshold = first_x + content_w * 0.35 + # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref + left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0 - for i, (geom, rs) in enumerate(zip(geometries, role_scores)): - is_left_side = geom.x < left_35_threshold - if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side: + for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)): + is_left_side = geom.x < left_20_threshold + has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3 + if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language: regions.append(PageRegion( type='page_ref', x=geom.x, y=geom.y, width=geom.width, height=content_h, @@ -1481,11 +1482,13 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry], regions = [] untyped = list(range(len(geometries))) first_x = geometries[0].x if geometries else 0 - left_35_threshold = first_x + content_w * 0.35 + left_20_threshold = first_x + content_w * 0.20 - # Rule 1: Leftmost narrow column → page_ref (only if in left 35%) + # Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language) g0 = geometries[0] - if g0.width_ratio < 0.12 and g0.x < left_35_threshold: + ls0 = lang_scores[0] + has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3 + if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0: regions.append(PageRegion( type='page_ref', x=g0.x, y=g0.y, width=g0.width, height=content_h, @@ -1583,11 +1586,11 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry], regions = [] untyped = list(range(len(geometries))) first_x = geometries[0].x if geometries else 0 - left_35_threshold = first_x + content_w * 0.35 + left_20_threshold = first_x + content_w * 0.20 - # Rule 1: Leftmost narrow column → page_ref (only if in left 35%) + # Rule 1: Leftmost narrow column → page_ref (only if in left 20%) g0 = geometries[0] - if g0.width_ratio < 0.12 and g0.x < left_35_threshold: + if g0.width_ratio < 0.12 and g0.x < left_20_threshold: regions.append(PageRegion( type='page_ref', x=g0.x, y=g0.y, width=g0.width, height=content_h,