fix: preserve pipe syllable dividers + detect alphabet sidebar columns

1. Pipe divider fix: Changed the OCR char-confusion regex so that a |
   directly preceded by a letter (syllable divider, e.g. Ka|me|rad) is
   NOT converted to I. Only pipes that are not preceded by a letter are
   converted (|ch → Ich, | want → I want); "|." and "|," stay untouched.

2. Alphabet sidebar detection improvements:
   - _filter_decorative_margin() now treats words of up to 2 chars as
     strip candidates (OCR reads "Aa", "Bb" from sidebars) and lowers
     the minimum strip size from 8 to 6 words
   - _filter_border_strip_words() lowered decorative threshold from 50%→45%
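   - New step 4f: grid-level thin-edge-column filter as a safety net —
     in zones with at least 3 columns, removes one edge column whose
     cell count is ≤35% of the median column cell count and whose cells
     are ≥60% short text (≤2 chars)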

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 13:52:11 +01:00
parent 19a5f69272
commit be86a7d14d
2 changed files with 61 additions and 8 deletions

View File

@@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number) # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want" (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them) # "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want # and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
] ]
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I" # Cross-language indicators: if DE has these, EN "1" is almost certainly "I"

View File

@@ -84,14 +84,14 @@ def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
break break
# Validate candidate strip: real border decorations are mostly short # Validate candidate strip: real border decorations are mostly short
# single-character words (alphabet letters, stray marks). Multi-word # words (alphabet letters like "A", "Bb", stray marks). Multi-word
# content like "der Ranzen" or "die Schals" (continuation of German # content like "der Ranzen" or "die Schals" (continuation of German
# translations) must NOT be removed. # translations) must NOT be removed.
def _is_decorative_strip(candidates: List[Dict]) -> bool: def _is_decorative_strip(candidates: List[Dict]) -> bool:
if not candidates: if not candidates:
return False return False
short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2) short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
return short / len(candidates) >= 0.5 return short / len(candidates) >= 0.45
strip_ids: set = set() strip_ids: set = set()
if left_count > 0 and left_count / total < 0.20: if left_count > 0 and left_count / total < 0.20:
@@ -1243,20 +1243,22 @@ def _filter_decorative_margin(
return no_strip return no_strip
margin_cutoff = img_w * 0.30 margin_cutoff = img_w * 0.30
# Phase 1: find candidate strips using single-char words # Phase 1: find candidate strips using short words (1-2 chars).
# OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
# rather than singles, so accept ≤2-char words as strip candidates.
left_strip = [ left_strip = [
w for w in words w for w in words
if len((w.get("text") or "").strip()) == 1 if len((w.get("text") or "").strip()) <= 2
and w["left"] + w.get("width", 0) / 2 < margin_cutoff and w["left"] + w.get("width", 0) / 2 < margin_cutoff
] ]
right_strip = [ right_strip = [
w for w in words w for w in words
if len((w.get("text") or "").strip()) == 1 if len((w.get("text") or "").strip()) <= 2
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
] ]
for strip, side in [(left_strip, "left"), (right_strip, "right")]: for strip, side in [(left_strip, "left"), (right_strip, "right")]:
if len(strip) < 8: if len(strip) < 6:
continue continue
# Check vertical distribution: should have many distinct Y positions # Check vertical distribution: should have many distinct Y positions
y_centers = sorted(set( y_centers = sorted(set(
@@ -2128,6 +2130,56 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
strip_gap, strip_count, total, strip_gap, strip_count, total,
) )
# 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
# If the leftmost or rightmost column has very few filled cells AND
# most of its text is short (≤2 chars), it's likely an alphabet sidebar
# that slipped through word-level pre-filters.
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
# Group cells by col_type
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
col_cells.setdefault(ct, []).append(cell)
# Find edge column types: first and last col_type in sorted (lexicographic) order
col_types_ordered = sorted(col_cells.keys())
if not col_types_ordered:
continue
# Median cell count across columns (upper median for an even number of columns; no rows are filtered out here)
col_counts = [len(v) for v in col_cells.values()]
median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
if median_count < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if not edge_cells_list:
continue
fill_ratio = len(edge_cells_list) / median_count
if fill_ratio > 0.35:
continue # well-filled column → not decorative
short_count = sum(
1 for c in edge_cells_list
if len((c.get("text") or "").strip()) <= 2
)
short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
if short_ratio < 0.6:
continue # too much real content → not decorative
# Remove this edge column
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed thin decorative edge column '%s' from zone %d "
"(%d cells, fill=%.0f%%, short=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
fill_ratio * 100, short_ratio * 100,
)
break # only remove one edge per zone
# 5. Color annotation on final word_boxes in cells # 5. Color annotation on final word_boxes in cells
if img_bgr is not None: if img_bgr is not None:
all_wb: List[Dict] = [] all_wb: List[Dict] = []