Preserve alphabetic marker columns, broaden junk filter, enable IPA in grid

- _merge_inline_marker_columns: skip merge when ≥50% of the column's words
  start with a letter (preserves "to", "in", "der" columns)
- Rule 2 (oversized stub): widen to ≤3 words / ≤5 chars total per row (catches "SEA &")
- IPA phonetics: temporarily map the longest-avg-text column to column_en so
  fix_cell_phonetics runs in the grid editor; the original col_type is
  restored afterwards
- ocr_pipeline_overlays: add missing split_page_into_zones import

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 11:08:23 +01:00
parent 962bbbe9f6
commit f139d0903e
2 changed files with 67 additions and 16 deletions

View File

@@ -326,6 +326,9 @@ def _filter_border_ghosts(
return filtered, len(words) - len(filtered) return filtered, len(words) - len(filtered)
_MARKER_CHARS = set("•*·-–—|~=+#>→►▸▪◆○●□■✓✗✔✘")
def _merge_inline_marker_columns( def _merge_inline_marker_columns(
columns: List[Dict], columns: List[Dict],
words: List[Dict], words: List[Dict],
@@ -335,6 +338,9 @@ def _merge_inline_marker_columns(
Bullet points (•, *, -) and numbering (1., 2.) create narrow columns Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
at the left edge of a zone. These are inline markers that indent text, at the left edge of a zone. These are inline markers that indent text,
not real separate columns. Merge them with their right neighbour. not real separate columns. Merge them with their right neighbour.
Does NOT merge columns containing alphabetic words like "to", "in",
"der", "die", "das" — those are legitimate content columns.
""" """
if len(columns) < 2: if len(columns) < 2:
return columns return columns
@@ -353,21 +359,38 @@ def _merge_inline_marker_columns(
] ]
col_width = col["x_max"] - col["x_min"] col_width = col["x_max"] - col["x_min"]
# Narrow column with mostly short words → likely inline markers # Narrow column with mostly short words → MIGHT be inline markers
if col_words and col_width < 80: if col_words and col_width < 80:
avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words) avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
if avg_len <= 2 and i + 1 < len(columns): if avg_len <= 2 and i + 1 < len(columns):
# Merge into next column # Check if words are actual markers (symbols/numbers) vs
next_col = columns[i + 1].copy() # real alphabetic words like "to", "in", "der", "die"
next_col["x_min"] = col["x_min"] texts = [(w.get("text") or "").strip() for w in col_words]
merged.append(next_col) alpha_count = sum(
skip.add(i + 1) 1 for t in texts
logger.info( if t and t[0].isalpha() and t not in _MARKER_CHARS
" merged inline marker column %d (w=%d, avg_len=%.1f) "
"into column %d",
i, col_width, avg_len, i + 1,
) )
continue alpha_ratio = alpha_count / len(texts) if texts else 0
# If ≥50% of words are alphabetic, this is a real column
if alpha_ratio >= 0.5:
logger.info(
" kept narrow column %d (w=%d, avg_len=%.1f, "
"alpha=%.0f%%) — contains real words",
i, col_width, avg_len, alpha_ratio * 100,
)
else:
# Merge into next column
next_col = columns[i + 1].copy()
next_col["x_min"] = col["x_min"]
merged.append(next_col)
skip.add(i + 1)
logger.info(
" merged inline marker column %d (w=%d, avg_len=%.1f) "
"into column %d",
i, col_width, avg_len, i + 1,
)
continue
merged.append(col) merged.append(col)
@@ -1096,12 +1119,13 @@ async def build_grid(session_id: str):
junk_row_indices.add(ri) junk_row_indices.add(ri)
continue continue
# Rule 2: oversized stub — ≤2 words, all short text (≤2 chars), # Rule 2: oversized stub — ≤3 words, short total text,
# and word height > 1.8× median (page numbers, stray marks) # and word height > 1.8× median (page numbers, stray marks,
if len(row_wbs) <= 2: # OCR from illustration labels like "SEA &")
if len(row_wbs) <= 3:
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
if len(total_text) <= 3 and max_h > median_wb_h * 1.8: if len(total_text) <= 5 and max_h > median_wb_h * 1.8:
junk_row_indices.add(ri) junk_row_indices.add(ri)
continue continue
@@ -1141,8 +1165,35 @@ async def build_grid(session_id: str):
# 5c. IPA phonetic correction — replace garbled OCR phonetics with # 5c. IPA phonetic correction — replace garbled OCR phonetics with
# correct IPA from the dictionary (same as in the OCR pipeline). # correct IPA from the dictionary (same as in the OCR pipeline).
# The grid uses generic col_types (column_1, column_2, ...) but
# fix_cell_phonetics expects column_en / column_text. Identify
# the English headword column (longest average text) and mark it.
all_cells = [cell for z in zones_data for cell in z.get("cells", [])] all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
# Find which col_type has the longest average text → English headwords
col_avg_len: Dict[str, List[int]] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
txt = cell.get("text", "")
col_avg_len.setdefault(ct, []).append(len(txt))
en_col_type = None
best_avg = 0
for ct, lengths in col_avg_len.items():
if not ct.startswith("column_"):
continue
avg = sum(lengths) / len(lengths) if lengths else 0
if avg > best_avg:
best_avg = avg
en_col_type = ct
if en_col_type:
for cell in all_cells:
if cell.get("col_type") == en_col_type:
cell["_orig_col_type"] = en_col_type
cell["col_type"] = "column_en"
fix_cell_phonetics(all_cells, pronunciation="british") fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells:
orig = cell.pop("_orig_col_type", None)
if orig:
cell["col_type"] = orig
duration = time.time() - t0 duration = time.time() - t0

View File

@@ -25,7 +25,7 @@ from ocr_pipeline_common import (
) )
from ocr_pipeline_session_store import get_session_db, get_session_image from ocr_pipeline_session_store import get_session_db, get_session_image
from cv_color_detect import _COLOR_HEX, _COLOR_RANGES from cv_color_detect import _COLOR_HEX, _COLOR_RANGES
from cv_box_detect import detect_boxes from cv_box_detect import detect_boxes, split_page_into_zones
from ocr_pipeline_rows import _draw_box_exclusion_overlay from ocr_pipeline_rows import _draw_box_exclusion_overlay
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)