fix(ocr-pipeline): tighten page_ref constraints based on live testing

- Reduce left-side threshold from 35% to 20% of content width
- Strong language signal (eng or deu score > 0.3) now prevents page_ref assignment
- Increase column_ignore word threshold from 3 to 8 for edge columns
- Apply language guard to Level 1 and Level 2 classification

Fixes: a column with deu=0.921 was misclassified as page_ref because
the reference score check ran before the language analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-27 23:33:11 +01:00
parent 2297f66edb
commit 164b35c06a

View File

@@ -1247,7 +1247,7 @@ def classify_column_types(geometries: List[ColumnGeometry],
ignore_regions = []
active_geometries = []
for idx, g in enumerate(geometries):
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3:
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8:
ignore_regions.append(PageRegion(
type='column_ignore', x=g.x, y=g.y,
width=g.width, height=content_h,
@@ -1320,12 +1320,13 @@ def _classify_by_content(geometries: List[ColumnGeometry],
assigned = set()
# Step 1: Assign structural roles first (reference, marker)
first_x = geometries[0].x if geometries else 0
left_35_threshold = first_x + content_w * 0.35
# left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0
for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
is_left_side = geom.x < left_35_threshold
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side:
for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
is_left_side = geom.x < left_20_threshold
has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
regions.append(PageRegion(
type='page_ref', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
@@ -1481,11 +1482,13 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_35_threshold = first_x + content_w * 0.35
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
# Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language)
g0 = geometries[0]
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
ls0 = lang_scores[0]
has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
@@ -1583,11 +1586,11 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_35_threshold = first_x + content_w * 0.35
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
# Rule 1: Leftmost narrow column → page_ref (only if in left 20%)
g0 = geometries[0]
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,