From 4feec7c7b738f51312d9ffdb749246c5545ba1ba Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 23:17:08 +0100 Subject: [PATCH] Lower syllable pipe-ratio threshold from 5% to 1% Real dictionary pages have only ~3% OCR-detected pipes because the thin syllable divider lines are hard for OCR to read. The primary false-positive guard (article_col_index check) already blocks synonym dictionaries. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_syllable_detect.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py index 42fb162..9057afc 100644 --- a/klausur-service/backend/cv_syllable_detect.py +++ b/klausur-service/backend/cv_syllable_detect.py @@ -200,10 +200,9 @@ def insert_syllable_dividers( For dictionary pages: process all content column cells, strip existing pipes, merge pipe-gap spaces, and re-syllabify using pyphen. - Pre-check: at least 5% of content cells must already contain ``|`` from - OCR. This guards against false-positive dictionary detection on pages - like synonym dictionaries or alphabetical word lists that have no actual - syllable divider lines. + Pre-check: at least 1% of content cells must already contain ``|`` from + OCR. This guards against pages with zero pipe characters (the primary + guard — article_col_index — is checked at the call site). Returns the number of cells modified. """ @@ -227,10 +226,10 @@ def insert_syllable_dividers( if total_col_cells > 0: pipe_ratio = cells_with_pipes / total_col_cells - if pipe_ratio < 0.05: + if pipe_ratio < 0.01: logger.info( "build-grid session %s: skipping syllable insertion — " - "only %.1f%% of cells have existing pipes (need >=5%%)", + "only %.1f%% of cells have existing pipes (need >=1%%)", session_id, pipe_ratio * 100, ) return 0