Preserve alphabetic marker columns, broaden junk filter, enable IPA in grid
- _merge_inline_marker_columns: skip merge when ≥50% of words are alphabetic (preserves "to", "in", "der" columns)
- Rule 2 (oversized stub): widen to ≤3 words / ≤5 chars (catches "SEA &")
- IPA phonetics: map longest-avg-text column to column_en so fix_cell_phonetics runs in the grid editor
- ocr_pipeline_overlays: add missing split_page_into_zones import

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -326,6 +326,9 @@ def _filter_border_ghosts(
|
||||
return filtered, len(words) - len(filtered)
|
||||
|
||||
|
||||
_MARKER_CHARS = set("•*·-–—|~=+#>→►▸▪◆○●□■✓✗✔✘")
|
||||
|
||||
|
||||
def _merge_inline_marker_columns(
|
||||
columns: List[Dict],
|
||||
words: List[Dict],
|
||||
@@ -335,6 +338,9 @@ def _merge_inline_marker_columns(
|
||||
Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
|
||||
at the left edge of a zone. These are inline markers that indent text,
|
||||
not real separate columns. Merge them with their right neighbour.
|
||||
|
||||
Does NOT merge columns containing alphabetic words like "to", "in",
|
||||
"der", "die", "das" — those are legitimate content columns.
|
||||
"""
|
||||
if len(columns) < 2:
|
||||
return columns
|
||||
@@ -353,21 +359,38 @@ def _merge_inline_marker_columns(
|
||||
]
|
||||
col_width = col["x_max"] - col["x_min"]
|
||||
|
||||
# Narrow column with mostly short words → likely inline markers
|
||||
# Narrow column with mostly short words → MIGHT be inline markers
|
||||
if col_words and col_width < 80:
|
||||
avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
|
||||
if avg_len <= 2 and i + 1 < len(columns):
|
||||
# Merge into next column
|
||||
next_col = columns[i + 1].copy()
|
||||
next_col["x_min"] = col["x_min"]
|
||||
merged.append(next_col)
|
||||
skip.add(i + 1)
|
||||
logger.info(
|
||||
" merged inline marker column %d (w=%d, avg_len=%.1f) "
|
||||
"into column %d",
|
||||
i, col_width, avg_len, i + 1,
|
||||
# Check if words are actual markers (symbols/numbers) vs
|
||||
# real alphabetic words like "to", "in", "der", "die"
|
||||
texts = [(w.get("text") or "").strip() for w in col_words]
|
||||
alpha_count = sum(
|
||||
1 for t in texts
|
||||
if t and t[0].isalpha() and t not in _MARKER_CHARS
|
||||
)
|
||||
continue
|
||||
alpha_ratio = alpha_count / len(texts) if texts else 0
|
||||
|
||||
# If ≥50% of words are alphabetic, this is a real column
|
||||
if alpha_ratio >= 0.5:
|
||||
logger.info(
|
||||
" kept narrow column %d (w=%d, avg_len=%.1f, "
|
||||
"alpha=%.0f%%) — contains real words",
|
||||
i, col_width, avg_len, alpha_ratio * 100,
|
||||
)
|
||||
else:
|
||||
# Merge into next column
|
||||
next_col = columns[i + 1].copy()
|
||||
next_col["x_min"] = col["x_min"]
|
||||
merged.append(next_col)
|
||||
skip.add(i + 1)
|
||||
logger.info(
|
||||
" merged inline marker column %d (w=%d, avg_len=%.1f) "
|
||||
"into column %d",
|
||||
i, col_width, avg_len, i + 1,
|
||||
)
|
||||
continue
|
||||
|
||||
merged.append(col)
|
||||
|
||||
@@ -1096,12 +1119,13 @@ async def build_grid(session_id: str):
|
||||
junk_row_indices.add(ri)
|
||||
continue
|
||||
|
||||
# Rule 2: oversized stub — ≤2 words, all short text (≤2 chars),
|
||||
# and word height > 1.8× median (page numbers, stray marks)
|
||||
if len(row_wbs) <= 2:
|
||||
# Rule 2: oversized stub — ≤3 words, short total text,
|
||||
# and word height > 1.8× median (page numbers, stray marks,
|
||||
# OCR from illustration labels like "SEA &")
|
||||
if len(row_wbs) <= 3:
|
||||
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
|
||||
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
|
||||
if len(total_text) <= 3 and max_h > median_wb_h * 1.8:
|
||||
if len(total_text) <= 5 and max_h > median_wb_h * 1.8:
|
||||
junk_row_indices.add(ri)
|
||||
continue
|
||||
|
||||
@@ -1141,8 +1165,35 @@ async def build_grid(session_id: str):
|
||||
|
||||
# 5c. IPA phonetic correction — replace garbled OCR phonetics with
|
||||
# correct IPA from the dictionary (same as in the OCR pipeline).
|
||||
# The grid uses generic col_types (column_1, column_2, ...) but
|
||||
# fix_cell_phonetics expects column_en / column_text. Identify
|
||||
# the English headword column (longest average text) and mark it.
|
||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||
# Find which col_type has the longest average text → English headwords
|
||||
col_avg_len: Dict[str, List[int]] = {}
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
txt = cell.get("text", "")
|
||||
col_avg_len.setdefault(ct, []).append(len(txt))
|
||||
en_col_type = None
|
||||
best_avg = 0
|
||||
for ct, lengths in col_avg_len.items():
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
avg = sum(lengths) / len(lengths) if lengths else 0
|
||||
if avg > best_avg:
|
||||
best_avg = avg
|
||||
en_col_type = ct
|
||||
if en_col_type:
|
||||
for cell in all_cells:
|
||||
if cell.get("col_type") == en_col_type:
|
||||
cell["_orig_col_type"] = en_col_type
|
||||
cell["col_type"] = "column_en"
|
||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||
for cell in all_cells:
|
||||
orig = cell.pop("_orig_col_type", None)
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user