fix: preserve pipe syllable dividers + detect alphabet sidebar columns
1. Pipe divider fix: Changed the OCR char-confusion regex so that a | directly
preceded by a letter (syllable divider, e.g. Ka|me|rad) is NOT converted to I.
Only pipes that stand alone or start a word are converted (|ch → Ich, | want → I want).
2. Alphabet sidebar detection improvements:
- _filter_decorative_margin() now also treats 2-char words as strip candidates
(OCR reads "Aa", "Bb" from sidebars); minimum strip size lowered from 8 to 6 words
- _filter_border_strip_words() lowered the share of short (≤2-char) words required to treat a strip as decorative from 50% to 45%
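- New step 4f: grid-level thin-edge-column filter as safety net — in zones with
at least 3 columns, removes at most one edge column per zone when its cell count is ≤35% of the median column's and ≥60% of its cells hold short (≤2-char) text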
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [
|
|||||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||||
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
|
||||||
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||||
|
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||||
]
|
]
|
||||||
|
|
||||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||||
|
|||||||
@@ -84,14 +84,14 @@ def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Validate candidate strip: real border decorations are mostly short
|
# Validate candidate strip: real border decorations are mostly short
|
||||||
# single-character words (alphabet letters, stray marks). Multi-word
|
# words (alphabet letters like "A", "Bb", stray marks). Multi-word
|
||||||
# content like "der Ranzen" or "die Schals" (continuation of German
|
# content like "der Ranzen" or "die Schals" (continuation of German
|
||||||
# translations) must NOT be removed.
|
# translations) must NOT be removed.
|
||||||
def _is_decorative_strip(candidates: List[Dict]) -> bool:
|
def _is_decorative_strip(candidates: List[Dict]) -> bool:
|
||||||
if not candidates:
|
if not candidates:
|
||||||
return False
|
return False
|
||||||
short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
|
short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
|
||||||
return short / len(candidates) >= 0.5
|
return short / len(candidates) >= 0.45
|
||||||
|
|
||||||
strip_ids: set = set()
|
strip_ids: set = set()
|
||||||
if left_count > 0 and left_count / total < 0.20:
|
if left_count > 0 and left_count / total < 0.20:
|
||||||
@@ -1243,20 +1243,22 @@ def _filter_decorative_margin(
|
|||||||
return no_strip
|
return no_strip
|
||||||
|
|
||||||
margin_cutoff = img_w * 0.30
|
margin_cutoff = img_w * 0.30
|
||||||
# Phase 1: find candidate strips using single-char words
|
# Phase 1: find candidate strips using short words (1-2 chars).
|
||||||
|
# OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
|
||||||
|
# rather than singles, so accept ≤2-char words as strip candidates.
|
||||||
left_strip = [
|
left_strip = [
|
||||||
w for w in words
|
w for w in words
|
||||||
if len((w.get("text") or "").strip()) == 1
|
if len((w.get("text") or "").strip()) <= 2
|
||||||
and w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
and w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
||||||
]
|
]
|
||||||
right_strip = [
|
right_strip = [
|
||||||
w for w in words
|
w for w in words
|
||||||
if len((w.get("text") or "").strip()) == 1
|
if len((w.get("text") or "").strip()) <= 2
|
||||||
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
|
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
|
||||||
]
|
]
|
||||||
|
|
||||||
for strip, side in [(left_strip, "left"), (right_strip, "right")]:
|
for strip, side in [(left_strip, "left"), (right_strip, "right")]:
|
||||||
if len(strip) < 8:
|
if len(strip) < 6:
|
||||||
continue
|
continue
|
||||||
# Check vertical distribution: should have many distinct Y positions
|
# Check vertical distribution: should have many distinct Y positions
|
||||||
y_centers = sorted(set(
|
y_centers = sorted(set(
|
||||||
@@ -2128,6 +2130,56 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
strip_gap, strip_count, total,
|
strip_gap, strip_count, total,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
|
||||||
|
# If the leftmost or rightmost column has very few filled cells AND
|
||||||
|
# most of its text is short (≤2 chars), it's likely an alphabet sidebar
|
||||||
|
# that slipped through word-level pre-filters.
|
||||||
|
for z in zones_data:
|
||||||
|
columns = z.get("columns", [])
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
if len(columns) < 3 or not cells:
|
||||||
|
continue
|
||||||
|
# Group cells by col_type
|
||||||
|
col_cells: Dict[str, List[Dict]] = {}
|
||||||
|
for cell in cells:
|
||||||
|
ct = cell.get("col_type", "")
|
||||||
|
col_cells.setdefault(ct, []).append(cell)
|
||||||
|
# Find edge column types (first and last in sorted col_type order — assumes col_type names sort left-to-right; TODO confirm)
|
||||||
|
col_types_ordered = sorted(col_cells.keys())
|
||||||
|
if not col_types_ordered:
|
||||||
|
continue
|
||||||
|
# Median cell count across columns (upper median; NOTE: no heading-row filtering is applied here)
|
||||||
|
col_counts = [len(v) for v in col_cells.values()]
|
||||||
|
median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
|
||||||
|
if median_count < 3:
|
||||||
|
continue
|
||||||
|
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
||||||
|
edge_cells_list = col_cells.get(edge_ct, [])
|
||||||
|
if not edge_cells_list:
|
||||||
|
continue
|
||||||
|
fill_ratio = len(edge_cells_list) / median_count
|
||||||
|
if fill_ratio > 0.35:
|
||||||
|
continue # well-filled column → not decorative
|
||||||
|
short_count = sum(
|
||||||
|
1 for c in edge_cells_list
|
||||||
|
if len((c.get("text") or "").strip()) <= 2
|
||||||
|
)
|
||||||
|
short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
|
||||||
|
if short_ratio < 0.6:
|
||||||
|
continue # too much real content → not decorative
|
||||||
|
# Remove this edge column
|
||||||
|
removed_count = len(edge_cells_list)
|
||||||
|
edge_ids = {id(c) for c in edge_cells_list}
|
||||||
|
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
||||||
|
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
||||||
|
logger.info(
|
||||||
|
"Step 4f: removed thin decorative edge column '%s' from zone %d "
|
||||||
|
"(%d cells, fill=%.0f%%, short=%.0f%%)",
|
||||||
|
edge_ct, z.get("zone_index", 0), removed_count,
|
||||||
|
fill_ratio * 100, short_ratio * 100,
|
||||||
|
)
|
||||||
|
break # only remove one edge per zone
|
||||||
|
|
||||||
# 5. Color annotation on final word_boxes in cells
|
# 5. Color annotation on final word_boxes in cells
|
||||||
if img_bgr is not None:
|
if img_bgr is not None:
|
||||||
all_wb: List[Dict] = []
|
all_wb: List[Dict] = []
|
||||||
|
|||||||
Reference in New Issue
Block a user