fix: preserve pipe syllable dividers + detect alphabet sidebar columns

1. Pipe divider fix: changed the OCR char-confusion regex so that a | directly preceded by a letter (syllable divider, e.g. Ka|me|rad) is NOT converted to I. Only standalone or word-initial pipes are converted (|ch → Ich, | want → I want).

2. Alphabet sidebar detection improvements:
   - _filter_decorative_margin() now also considers 2-char words (OCR reads "Aa", "Bb" from sidebars) and lowers the minimum strip size from 8 to 6 words.
   - _filter_border_strip_words() lowers the decorative threshold from 50% to 45% short words.
   - New step 4f: grid-level thin-edge-column filter as a safety net. It removes an edge column whose cell count is at most 35% of the median column's and whose text is at least 60% short (≤2 characters).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 13:52:11 +01:00
parent 19a5f69272
commit be86a7d14d
2 changed files with 61 additions and 8 deletions
@@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [
    (re.compile(r'\b1([a-z])'), r'I\1'),           # 1ch → Ich, 1want → Iwant
    # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
    (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'),  # "1 want" → "I want"
-    # "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
+    # "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
-    (re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'),    # |ch → Ich, | want → I want
+    # and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
    (re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'),  # |ch → Ich, | want → I want
 ]
 # Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
@@ -84,14 +84,14 @@ def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
            break
    # Validate candidate strip: real border decorations are mostly short
-    # single-character words (alphabet letters, stray marks).  Multi-word
+    # words (alphabet letters like "A", "Bb", stray marks).  Multi-word
    # content like "der Ranzen" or "die Schals" (continuation of German
    # translations) must NOT be removed.
    def _is_decorative_strip(candidates: List[Dict]) -> bool:
        if not candidates:
            return False
        short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
-        return short / len(candidates) >= 0.5
+        return short / len(candidates) >= 0.45
    strip_ids: set = set()
    if left_count > 0 and left_count / total < 0.20:
@@ -1243,20 +1243,22 @@ def _filter_decorative_margin(
        return no_strip
    margin_cutoff = img_w * 0.30
-    # Phase 1: find candidate strips using single-char words
+    # Phase 1: find candidate strips using short words (1-2 chars).
    # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
    # rather than singles, so accept ≤2-char words as strip candidates.
    left_strip = [
        w for w in words
-        if len((w.get("text") or "").strip()) == 1
+        if len((w.get("text") or "").strip()) <= 2
        and w["left"] + w.get("width", 0) / 2 < margin_cutoff
    ]
    right_strip = [
        w for w in words
-        if len((w.get("text") or "").strip()) == 1
+        if len((w.get("text") or "").strip()) <= 2
        and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
    ]
    for strip, side in [(left_strip, "left"), (right_strip, "right")]:
-        if len(strip) < 8:
+        if len(strip) < 6:
            continue
        # Check vertical distribution: should have many distinct Y positions
        y_centers = sorted(set(
@@ -2128,6 +2130,56 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                strip_gap, strip_count, total,
            )
    # 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
    # If the leftmost or rightmost column has very few filled cells AND
    # most of its text is short (≤2 chars), it's likely an alphabet sidebar
    # that slipped through word-level pre-filters.
    for z in zones_data:
        columns = z.get("columns", [])
        cells = z.get("cells", [])
        if len(columns) < 3 or not cells:
            continue
        # Group cells by col_type
        col_cells: Dict[str, List[Dict]] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            col_cells.setdefault(ct, []).append(cell)
        # Find edge column types (first and last)
        col_types_ordered = sorted(col_cells.keys())
        if not col_types_ordered:
            continue
        # Median cell count across columns (every cell is counted, heading
        # rows included; for an even number of columns this is the upper median)
        col_counts = [len(v) for v in col_cells.values()]
        median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
        if median_count < 3:
            continue
        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
            edge_cells_list = col_cells.get(edge_ct, [])
            if not edge_cells_list:
                continue
            fill_ratio = len(edge_cells_list) / median_count
            if fill_ratio > 0.35:
                continue  # well-filled column → not decorative
            short_count = sum(
                1 for c in edge_cells_list
                if len((c.get("text") or "").strip()) <= 2
            )
            short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
            if short_ratio < 0.6:
                continue  # too much real content → not decorative
            # Remove this edge column
            removed_count = len(edge_cells_list)
            edge_ids = {id(c) for c in edge_cells_list}
            z["cells"] = [c for c in cells if id(c) not in edge_ids]
            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
            logger.info(
                "Step 4f: removed thin decorative edge column '%s' from zone %d "
                "(%d cells, fill=%.0f%%, short=%.0f%%)",
                edge_ct, z.get("zone_index", 0), removed_count,
                fill_ratio * 100, short_ratio * 100,
            )
            break  # only remove one edge per zone
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []